feat: add scrapers for vandriel, vanherk, vanoord
- vandriel (Schiedam): OG Online API, filtered by city=schiedam
- vanherk (Schiedam): SURE WordPress plugin (card-house), detail page kenmerken
- vanoord (Delft + Schiedam): Elementor WordPress, two filtered listing URLs, rw-object-features-list detail parsing
- makelaars.md: mark all three as done, add TODO for API scraper detail page enrichment

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,5 +1,9 @@
|
|||||||
# Verkoopmakelaars Delft, Leiden, Den Haag & Schiedam
|
# Verkoopmakelaars Delft, Leiden, Den Haag & Schiedam
|
||||||
|
|
||||||
|
## TODO
|
||||||
|
|
||||||
|
- **API scrapers need detail page enrichment**: OG Online API (bjornd, moerman, vandaal, elzenaar, doen, vandriel) sometimes omits fields like `energyLabel`. We should fetch the detail page for each listing and merge in missing fields (especially energielabel, bouwjaar). This is already done for SSR scrapers; needs to be added to API-based ones.
|
||||||
|
|
||||||
## Delft
|
## Delft
|
||||||
|
|
||||||
| Done | Naam | Website | Adres |
|
| Done | Naam | Website | Adres |
|
||||||
@@ -23,6 +27,7 @@
|
|||||||
| [ ] | Bergklis Makelaars | bergklis.nl | — |
|
| [ ] | Bergklis Makelaars | bergklis.nl | — |
|
||||||
| [ ] | Van Gulden Makelaardij | vanguldenmakelaardij.nl | Zaïrestraat 1 |
|
| [ ] | Van Gulden Makelaardij | vanguldenmakelaardij.nl | Zaïrestraat 1 |
|
||||||
| [ ] | Van der Togt Makelaardij | vdtmakelaardij.nl | — (Voorburg, actief in Delft) |
|
| [ ] | Van der Togt Makelaardij | vdtmakelaardij.nl | — (Voorburg, actief in Delft) |
|
||||||
|
| [x] | Van Oord Makelaardij | vanoordmakelaardij.nl | — (Delft + Schiedam) |
|
||||||
|
|
||||||
|
|
||||||
## Schiedam
|
## Schiedam
|
||||||
@@ -40,6 +45,8 @@
|
|||||||
| [x] | Moerman & De Jong Makelaars | moerman-dejong.nl | Lange Kerkstraat 80B |
|
| [x] | Moerman & De Jong Makelaars | moerman-dejong.nl | Lange Kerkstraat 80B |
|
||||||
| [ ] | Hagestein Makelaardij | — | Degerfors 54 |
|
| [ ] | Hagestein Makelaardij | — | Degerfors 54 |
|
||||||
| [x] | Schieland Borsboom NVM Makelaars | schielandborsboom.nl | (Rotterdam, actief in Schiedam) |
|
| [x] | Schieland Borsboom NVM Makelaars | schielandborsboom.nl | (Rotterdam, actief in Schiedam) |
|
||||||
|
| [x] | Vandriel Makelaardij | vandrielmakelaardij.nl | — |
|
||||||
|
| [x] | Van Herk Makelaars | vanherk.nl | — |
|
||||||
|
|
||||||
|
|
||||||
## Den Haag
|
## Den Haag
|
||||||
|
|||||||
@@ -436,6 +436,69 @@ def fetch_doen() -> list[RawListing]:
|
|||||||
return listings
|
return listings
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Vandriel Makelaardij (Schiedam) — OG Online / realtime-listings
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Site root; the relative listing paths returned by the API are appended to it.
_VANDRIEL_BASE = "https://www.vandrielmakelaardij.nl"
# statusOrig values for which fetch_vandriel drops the item entirely.
_VANDRIEL_SKIP = {"rented", "rented_ur"}

# Translates the API's statusOrig value into the status string stored on a
# RawListing; fetch_vandriel falls back to "beschikbaar" for unmapped values.
_VANDRIEL_STATUS_MAP = {
    "available": "beschikbaar",
    "under_bid": "onder_bod",
    "under_option": "onder_bod",
    "sold": "verkocht",
    "sold_ur": "verkocht",
}
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_vandriel() -> list[RawListing]:
    """Fetch Vandriel sale listings in Schiedam from the realtime-listings feed.

    Requests ``/nl/realtime-listings/consumer`` as JSON and keeps an item only
    when all of the following hold:

    * ``isSales`` is truthy,
    * ``statusOrig`` is not in ``_VANDRIEL_SKIP``,
    * ``city`` equals "schiedam" (case-insensitive),
    * ``salesPrice`` does not exceed ``config.MAX_PRICE``.

    Items without a ``url`` are skipped with a warning, because a listing
    cannot be built without its detail link.

    Returns:
        One ``RawListing`` per item that passed the filters.
    """
    data = fetch_json(
        f"{_VANDRIEL_BASE}/nl/realtime-listings/consumer",
        headers={"X-Requested-With": "XMLHttpRequest"},
    )

    listings = []
    for item in data:
        if not item.get("isSales"):
            continue
        if item.get("statusOrig") in _VANDRIEL_SKIP:
            continue
        if (item.get("city") or "").lower() != "schiedam":
            continue

        # The key may be present with a null value; `or 0` keeps the
        # comparison from raising TypeError on None.
        if (item.get("salesPrice") or 0) > config.MAX_PRICE:
            continue

        path = item.get("url")
        if not path:
            log.warning("vandriel: item zonder url overgeslagen")
            continue

        postcode = (item.get("zipcode") or "").replace(" ", "") or None

        # str() so that a numeric year from the JSON feed is accepted as well
        # as a string one; anything that is not all digits becomes None.
        raw_year = str(item.get("dateOfConstruction") or "")
        bouwjaar = int(raw_year) if raw_year.isdigit() else None

        listings.append(RawListing(
            url=_VANDRIEL_BASE + path,
            source_makelaar="vandriel",
            status=_VANDRIEL_STATUS_MAP.get(item.get("statusOrig", ""), "beschikbaar"),
            adres=item.get("address") or None,
            postcode=postcode,
            stad=item.get("city") or None,
            prijs=item.get("salesPrice") or None,
            woningtype=item.get("type") or None,
            woonoppervlak=item.get("livingSurface") or None,
            # `or None` already maps a plot surface of 0 to None.
            perceeloppervlak=item.get("plotSurface") or None,
            kamers=item.get("rooms") or None,
            slaapkamers=item.get("bedrooms") or None,
            bouwjaar=bouwjaar,
            energielabel=item.get("energyLabel") or None,
            hero_image_url=item.get("photo") or None,
        ))

    log.info("vandriel: %d koopwoningen opgehaald", len(listings))
    return listings
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# SCRAPERS — exporteer hier alle actieve API adapters
|
# SCRAPERS — exporteer hier alle actieve API adapters
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -447,4 +510,5 @@ SCRAPERS = {
|
|||||||
'vandaal': fetch_vandaal,
|
'vandaal': fetch_vandaal,
|
||||||
'elzenaar': fetch_elzenaar,
|
'elzenaar': fetch_elzenaar,
|
||||||
'doen': fetch_doen,
|
'doen': fetch_doen,
|
||||||
|
'vandriel': fetch_vandriel,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1874,6 +1874,265 @@ def fetch_borgdorff() -> list[RawListing]:
|
|||||||
return listings
|
return listings
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Van Herk Makelaars (Schiedam) — SURE WordPress plugin (card-house)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Listings filtered by city + price in URL; pagination via /page/{N}/.
|
||||||
|
# Detail page: div.features ul.unstyled li with two <span> (label + value).
|
||||||
|
|
||||||
|
# Site root; prepended to relative detail links and image paths.
_VANHERK_BASE = "https://www.vanherk.nl"
# Search URL with the city (schiedam) and price range (200000-300000) in the
# path; fetch_vanherk appends "page/{N}/" to it for pages after the first.
_VANHERK_LISTINGS = "https://www.vanherk.nl/wonen/aanbod/zoeken/schiedam/200000-300000/"

# Translates the lower-cased status text from the detail page into the status
# string stored on a RawListing.
_VANHERK_STATUS_MAP = {
    "beschikbaar": "beschikbaar",
    "onder bod": "onder_bod",
    "onder optie": "onder_bod",
    "verkocht": "verkocht",
}
|
||||||
|
|
||||||
|
|
||||||
|
def _vanherk_detail(detail_url: str) -> dict:
    """Read the feature list of a Van Herk detail page.

    Each ``div.features ul.unstyled li`` row with at least two ``<span>``
    children is treated as a (label, value) pair; labels are lower-cased.

    Returns:
        A dict with the keys ``status`` (lower-cased, "" when absent),
        ``bouwjaar``, ``woonoppervlak`` and ``slaapkamers`` (each ``None``
        when the page has no such row). An empty dict is returned when
        fetching or parsing raises; the error is logged as a warning.
    """
    try:
        page = fetch_soup(detail_url)
        features: dict[str, str] = {}
        for row in page.select("div.features ul.unstyled li"):
            cells = row.select("span")
            if len(cells) < 2:
                continue
            name = cells[0].get_text(strip=True).lower()
            features[name] = cells[1].get_text(strip=True)

        raw_status = features.get("status", "")
        return {
            "status": raw_status.lower(),
            "bouwjaar": features.get("bouwjaar"),
            "woonoppervlak": features.get("woonoppervlakte"),
            "slaapkamers": features.get("aantal slaapkamers"),
        }
    except Exception as exc:
        log.warning("vanherk: detail fetch fout %s: %s", detail_url, exc)
        return {}
|
||||||
|
|
||||||
|
|
||||||
|
def _vanherk_int(value: "str | None") -> "int | None":
    """Return *value* as an int when it is a non-empty digit string, else None."""
    return int(value) if value and value.isdigit() else None


def fetch_vanherk() -> list[RawListing]:
    """Scrape Van Herk listing cards and enrich them from their detail pages.

    Walks ``_VANHERK_LISTINGS`` and its ``page/{N}/`` successors. Address,
    city, price, hero image and the icon data (surface, bedrooms, energy
    label) come from each ``a.card-house`` card; status, construction year,
    surface and bedrooms come from the detail page, with the card values as
    fallback for surface and bedrooms.

    Cards priced above ``config.MAX_PRICE`` are skipped before the detail
    page is requested. A card that fails to parse is logged and skipped.

    Returns:
        The collected ``RawListing`` objects.

    Raises:
        Whatever ``fetch_soup`` raises for the first listing page. Failures
        on later pages end pagination and keep what was already collected.
    """
    listings = []
    seen: set[str] = set()
    page = 1

    while True:
        url = _VANHERK_LISTINGS if page == 1 else _VANHERK_LISTINGS + f"page/{page}/"
        try:
            soup = fetch_soup(url)
        except Exception as e:
            if page == 1:
                raise
            # A missing or broken follow-up page must not throw away the
            # listings gathered from the earlier pages.
            log.warning("vanherk: pagina %d fetch fout: %s", page, e)
            break

        cards = soup.select("a.card-house")
        if not cards:
            break

        new_on_page = 0
        for card in cards:
            try:
                href = card.get("href", "")
                if not href:
                    continue
                detail_url = href if href.startswith("http") else _VANHERK_BASE + href
                if detail_url in seen:
                    continue
                seen.add(detail_url)
                new_on_page += 1

                # City from lead paragraph
                lead = card.select_one("p.lead")
                stad = lead.get_text(strip=True) if lead else None

                # Address from h4; split/join collapses all whitespace runs,
                # including non-breaking spaces, into single spaces.
                h4 = card.select_one("h4")
                adres = " ".join(h4.get_text().split()) if h4 else None

                # Price from .subtitle
                subtitle = card.select_one("p.subtitle")
                prijs = parse_prijs(subtitle.get_text() if subtitle else None)
                if prijs and prijs > config.MAX_PRICE:
                    continue

                # Hero image: the <source> for the min-width:1280px breakpoint
                src_tag = card.select_one('picture source[media="(min-width:1280px)"]')
                hero = src_tag.get("srcset") if src_tag else None
                if hero and not hero.startswith("http"):
                    hero = _VANHERK_BASE + hero

                # Card data icons: surface, bedrooms, energy label
                woonoppervlak_card = None
                slaapkamers_card = None
                energielabel_card = None
                for data_div in card.select("div.data"):
                    classes = data_div.get("class") or []
                    if "d-none" in classes:
                        continue
                    if "data-energie" in classes:
                        inner = data_div.select_one(".date__inner")
                        energielabel_card = inner.get_text(strip=True) if inner else None
                    elif data_div.select_one("i.icon-surface"):
                        inner = data_div.select_one("span.date__inner")
                        woonoppervlak_card = parse_m2(inner.get_text(strip=True) if inner else None)
                    elif data_div.select_one("i.icon-bed"):
                        inner = data_div.select_one("span.date__inner")
                        txt = inner.get_text(strip=True) if inner else None
                        m = re.search(r"(\d+)", txt) if txt else None
                        slaapkamers_card = int(m.group(1)) if m else None

                kk = _vanherk_detail(detail_url)

                status = _VANHERK_STATUS_MAP.get(kk.get("status", ""), "beschikbaar")

                listings.append(RawListing(
                    url=detail_url,
                    source_makelaar="vanherk",
                    status=status,
                    adres=adres,
                    stad=stad,
                    prijs=prijs,
                    hero_image_url=hero,
                    # The detail dict holds None for absent rows, so the
                    # values go through a None-safe conversion.
                    bouwjaar=_vanherk_int(kk.get("bouwjaar")),
                    woonoppervlak=parse_m2(kk.get("woonoppervlak")) or woonoppervlak_card,
                    slaapkamers=_vanherk_int(kk.get("slaapkamers")) or slaapkamers_card,
                    energielabel=energielabel_card,
                ))
                # In dev, stop after the first card of this page that
                # produced a listing.
                if config.APP_ENV == "dev":
                    break
            except Exception as e:
                log.warning("vanherk: parse fout: %s", e)

        # Stop on a short page, or when the page yielded no unseen listing
        # (protects against a site that keeps serving the same page).
        if new_on_page == 0 or len(cards) < 15:
            break
        page += 1

    log.info("vanherk: %d listings opgehaald", len(listings))
    return listings
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Van Oord Makelaardij (Delft + Schiedam) — Elementor WordPress
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Two filtered listing URLs (one per city). Cards are div.e-loop-item.
|
||||||
|
# Detail page: ul.rw-object-features-list li with label/value spans.
|
||||||
|
|
||||||
|
# Site root; prepended to relative detail links.
_VANOORD_BASE = "https://www.vanoordmakelaardij.nl"
# One list-view URL per city (plaats=delft, plaats=schiedam), each carrying
# the price bounds prijs_vanaf=225000 and prijs_tot=300000 in the query.
_VANOORD_LISTINGS = [
    "https://www.vanoordmakelaardij.nl/aanbod/?view=list&plaats=delft&prijs_vanaf=225000&prijs_tot=300000",
    "https://www.vanoordmakelaardij.nl/aanbod/?view=list&plaats=schiedam&prijs_vanaf=225000&prijs_tot=300000",
]

# Translates lower-cased status text (from the card label or the detail page)
# into the status string stored on a RawListing.
_VANOORD_STATUS_MAP = {
    "beschikbaar": "beschikbaar",
    "onder bod": "onder_bod",
    "onder optie": "onder_bod",
    "verkocht": "verkocht",
}
|
||||||
|
|
||||||
|
|
||||||
|
def _vanoord_detail(detail_url: str) -> dict:
    """Read the feature list of a Van Oord detail page.

    Each ``ul.rw-object-features-list li`` row that has both a
    ``span.rw-object-list-label`` and a ``span.rw-object-list-value`` is
    treated as a (label, value) pair; labels are lower-cased.

    Returns:
        A dict with the keys ``status`` (lower-cased, "" when absent),
        ``bouwjaar``, ``woonoppervlak``, ``kamers``, ``slaapkamers`` and
        ``energielabel`` (each ``None`` when the page has no such row). An
        empty dict is returned when fetching or parsing raises; the error is
        logged as a warning.
    """
    try:
        page = fetch_soup(detail_url)
        features: dict[str, str] = {}
        for row in page.select("ul.rw-object-features-list li"):
            name_el = row.select_one("span.rw-object-list-label")
            text_el = row.select_one("span.rw-object-list-value")
            if not (name_el and text_el):
                continue
            name = name_el.get_text(strip=True).lower()
            features[name] = text_el.get_text(strip=True)

        raw_status = features.get("status", "")
        return {
            "status": raw_status.lower(),
            "bouwjaar": features.get("bouwjaar"),
            "woonoppervlak": features.get("woonoppervlakte"),
            "kamers": features.get("aantal kamers"),
            "slaapkamers": features.get("slaapkamers"),
            "energielabel": features.get("energieklasse"),
        }
    except Exception as exc:
        log.warning("vanoord: detail fetch fout %s: %s", detail_url, exc)
        return {}
|
||||||
|
|
||||||
|
|
||||||
|
def _vanoord_int(value: "str | None") -> "int | None":
    """Return *value* as an int when it is a non-empty digit string, else None."""
    return int(value) if value and value.isdigit() else None


def fetch_vanoord() -> list[RawListing]:
    """Scrape Van Oord listing cards and enrich them from their detail pages.

    Iterates the URLs in ``_VANOORD_LISTINGS``. Address, city, price, status
    and icon data (surface, rooms, energy label) come from each
    ``div.e-loop-item`` card. The detail page supplies construction year,
    bedrooms and, when present, overrides for status, surface, rooms and
    energy label.

    A detail URL already seen under an earlier listing URL is skipped. Cards
    priced above ``config.MAX_PRICE`` are skipped before the detail page is
    requested. A card that fails to parse is logged and skipped.

    Returns:
        The collected ``RawListing`` objects.
    """
    seen: set[str] = set()
    listings = []

    for listing_url in _VANOORD_LISTINGS:
        soup = fetch_soup(listing_url)
        cards = soup.select("div.e-loop-item")

        for card in cards:
            try:
                # Detail URL from h3 > a
                a_tag = card.select_one("h3.elementor-heading-title a[href]")
                if not a_tag:
                    continue
                detail_url = a_tag["href"]
                if not detail_url.startswith("http"):
                    detail_url = _VANOORD_BASE + detail_url
                if detail_url in seen:
                    continue
                seen.add(detail_url)

                # Status from rw-status-label widget class
                status_el = card.select_one("[class*='rw-status-label--']")
                status = "beschikbaar"
                if status_el:
                    status_text = status_el.get_text(strip=True).lower()
                    status = _VANOORD_STATUS_MAP.get(status_text, "beschikbaar")

                # City from h4
                h4 = card.select_one("h4.elementor-heading-title")
                stad = h4.get_text(strip=True) if h4 else None

                # Address from h3 > a text, with whitespace runs collapsed
                adres = " ".join(a_tag.get_text().split())

                # Price from the first h3 that has no <a> child
                prijs = None
                for h3 in card.select("h3.elementor-heading-title"):
                    if not h3.select_one("a"):
                        prijs = parse_prijs(h3.get_text())
                        break
                if prijs and prijs > config.MAX_PRICE:
                    continue

                # Card icon list: [0]=surface [1]=rooms [2]=energy
                icon_items = card.select("ul.elementor-icon-list-items li span.elementor-icon-list-text")
                woonoppervlak_card = parse_m2(icon_items[0].get_text()) if len(icon_items) > 0 else None
                kamers_card = None
                if len(icon_items) > 1:
                    m = re.search(r"(\d+)", icon_items[1].get_text())
                    kamers_card = int(m.group(1)) if m else None
                energielabel_card = icon_items[2].get_text(strip=True) if len(icon_items) > 2 else None

                kk = _vanoord_detail(detail_url)

                # A recognised status on the detail page wins over the card.
                detail_status = _VANOORD_STATUS_MAP.get(kk.get("status", ""), "")
                if detail_status:
                    status = detail_status

                listings.append(RawListing(
                    url=detail_url,
                    source_makelaar="vanoord",
                    status=status,
                    adres=adres,
                    stad=stad,
                    prijs=prijs,
                    # The detail dict holds None for absent rows, so the
                    # values go through a None-safe conversion.
                    bouwjaar=_vanoord_int(kk.get("bouwjaar")),
                    woonoppervlak=parse_m2(kk.get("woonoppervlak")) or woonoppervlak_card,
                    kamers=_vanoord_int(kk.get("kamers")) or kamers_card,
                    slaapkamers=_vanoord_int(kk.get("slaapkamers")),
                    energielabel=kk.get("energielabel") or energielabel_card,
                ))
                # In dev, stop after the first card of this listing URL that
                # produced a listing.
                if config.APP_ENV == "dev":
                    break
            except Exception as e:
                log.warning("vanoord: parse fout: %s", e)

    log.info("vanoord: %d listings opgehaald", len(listings))
    return listings
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# SCRAPERS — exporteer hier alle actieve SSR adapters
|
# SCRAPERS — exporteer hier alle actieve SSR adapters
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -1896,4 +2155,6 @@ SCRAPERS = {
|
|||||||
'olsthoorn': fetch_olsthoorn,
|
'olsthoorn': fetch_olsthoorn,
|
||||||
'88makelaars': fetch_88makelaars,
|
'88makelaars': fetch_88makelaars,
|
||||||
'borgdorff': fetch_borgdorff,
|
'borgdorff': fetch_borgdorff,
|
||||||
|
'vanherk': fetch_vanherk,
|
||||||
|
'vanoord': fetch_vanoord,
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user