refactor: split ssr.py into package, enrich OG Online detail pages, fix travel upsert

- Split src/adapters/ssr.py (2160 LOC) into ssr/ package grouped by CMS:
  realworks.py, sure.py, schiedam.py, denhaag.py, overige.py
- Add _og_detail() to api.py; all OG Online scrapers now fall back to
  detail page fetch when energielabel/bouwjaar are missing from the API
- Fix run() to recalculate travel times for existing listings where
  fiets_mark IS NULL; upsert() now writes travel cols on existing rows too
- Update tests/cache.py to patch fetch_soup in every ssr submodule
- Update docs to reflect new package structure and mark API enrichment TODO done

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-11 23:39:35 +02:00
parent 1011d9cf87
commit f74e9bcfb0
14 changed files with 2478 additions and 2199 deletions

View File

@@ -7,9 +7,11 @@ Voeg nieuwe toe onderaan en registreer in SCRAPERS.
import json
import logging
import re
import time
import httpx
from bs4 import BeautifulSoup
import config
from huizenbot import RawListing
@@ -40,8 +42,71 @@ def fetch_json(url: str, *, params: dict = None, headers: dict = None) -> dict |
return r.json()
raise RuntimeError(f"Blijvend 429 op {url}")
def _og_detail(url: str, makelaar: str) -> dict:
    """
    Fetch an OG Online detail page and extract fields the API omitted.

    Three extraction patterns are tried:
      1. An ``energielabel-<X>`` CSS class on any element.
      2. ``dt``/``dd`` pairs (each ``dt`` paired with its following ``dd``
         sibling; positional pairing is used only as a fallback).
      3. ``li`` elements holding at least two ``span`` children
         (label span, value span) — only when pattern 2 found nothing.

    Args:
        url: Absolute URL of the detail page.
        makelaar: Broker name, used only in the warning log line.

    Returns:
        ``{"energielabel": str | None, "bouwjaar": int | None}`` on success;
        an empty dict when the request or parsing fails.
    """
    try:
        r = httpx.get(
            url,
            headers={"User-Agent": config.USER_AGENT},
            timeout=15,
            follow_redirects=True,
        )
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")

        def _label(el) -> str:
            # Labels are often rendered as "Bouwjaar:"; drop the colon so the
            # lookups below ("bouwjaar", "energielabel", ...) can match.
            return el.get_text(strip=True).rstrip(":").strip().lower()

        # Pattern 1: energielabel CSS class on any element
        prefix = "energielabel-"
        energielabel = None
        for el in soup.select("[class]"):
            for cls in el.get("class", []):
                if cls.startswith(prefix) and len(cls) > len(prefix):
                    energielabel = cls[len(prefix):].upper()
                    break
            if energielabel:
                break

        # Pattern 2: dt/dd pairs. Pair each dt with its own following dd so a
        # stray dt or dd elsewhere on the page cannot shift every label onto
        # the wrong value (which a global positional zip would do).
        kv: dict[str, str] = {}
        for dt in soup.select("dt"):
            dd = dt.find_next_sibling("dd")
            if dd is not None:
                kv[_label(dt)] = dd.get_text(strip=True)
        if not kv:
            # Fallback for markup where dt and dd are not siblings.
            for dt, dd in zip(soup.select("dt"), soup.select("dd")):
                kv[_label(dt)] = dd.get_text(strip=True)

        # Pattern 3: ul.objectkenmerken / div.kenmerken span pairs
        if not kv:
            for li in soup.select("li"):
                spans = li.select("span")
                if len(spans) >= 2:
                    kv[_label(spans[0])] = spans[1].get_text(strip=True)

        if not energielabel:
            energielabel = (
                kv.get("energielabel")
                or kv.get("energieklasse")
                or kv.get("energie")
            ) or None

        # Accept values such as "ca. 1930" by taking the first 4-digit group.
        year_match = re.search(r"\b(\d{4})\b", kv.get("bouwjaar") or "")
        bouwjaar = int(year_match.group(1)) if year_match else None

        return {
            "energielabel": energielabel,
            "bouwjaar": bouwjaar,
        }
    except Exception as e:
        # Best-effort enrichment: callers treat {} as "nothing extra found".
        log.warning("%s: detail fetch fout %s: %s", makelaar, url, e)
        return {}
# ---------------------------------------------------------------------------
# Bjornd
# ---------------------------------------------------------------------------
@@ -56,26 +121,36 @@ _STATUS_MAP = {
"sold": "verkocht",
"sold_ur": "verkocht",
}
def fetch_bjornd() -> list[RawListing]:
data = fetch_json(
f"{_BJORND_BASE}/nl/realtime-listings/consumer",
headers={"X-Requested-With": "XMLHttpRequest"},
)
listings = []
for item in data:
if not item.get("isSales"):
continue
if item.get("statusOrig") in _BJORND_SKIP:
continue
if item.get('salesPrice')>config.MAX_PRICE:
if item.get("salesPrice", 0) > config.MAX_PRICE:
continue
detail_url = _BJORND_BASE + item["url"]
raw_year = item.get("dateOfConstruction") or ""
bouwjaar = int(raw_year) if raw_year.isdigit() else None
energielabel = item.get("energyLabel") or None
# Fetch detail page when API omits key fields
if not energielabel or not bouwjaar:
extra_kk = _og_detail(detail_url, "bjornd")
energielabel = energielabel or extra_kk.get("energielabel")
bouwjaar = bouwjaar or extra_kk.get("bouwjaar")
listings.append(RawListing(
url=_BJORND_BASE + item["url"],
url=detail_url,
source_makelaar="bjornd",
status=_STATUS_MAP.get(item.get("statusOrig", ""), "beschikbaar"),
adres=item.get("address") or None,
@@ -87,6 +162,8 @@ def fetch_bjornd() -> list[RawListing]:
perceeloppervlak=item.get("plotSurface") or None,
kamers=item.get("rooms") or None,
slaapkamers=item.get("bedrooms") or None,
bouwjaar=bouwjaar,
energielabel=energielabel,
hero_image_url=item.get("photo") or None,
extra=json.dumps({
"balcony": item.get("balcony"),
@@ -102,10 +179,13 @@ def fetch_bjornd() -> list[RawListing]:
"photos": item.get("photos"),
}, ensure_ascii=False),
))
if config.APP_ENV == "dev":
break
log.info("bjornd: %d koopwoningen opgehaald", len(listings))
return listings
# ---------------------------------------------------------------------------
# Ooms
# ---------------------------------------------------------------------------
@@ -221,9 +301,15 @@ def fetch_moerman() -> list[RawListing]:
raw_year = item.get("dateOfConstruction") or ""
bouwjaar = int(raw_year) if raw_year.isdigit() else None
energielabel = item.get("energyLabel") or None
detail_url = _MOERMAN_BASE + item["url"]
if not energielabel:
extra_kk = _og_detail(detail_url, "moerman")
energielabel = extra_kk.get("energielabel")
listings.append(RawListing(
url=_MOERMAN_BASE + item["url"],
url=detail_url,
source_makelaar="moerman",
status=_MOERMAN_STATUS_MAP.get(item.get("statusOrig", ""), "beschikbaar"),
adres=item.get("address") or None,
@@ -236,9 +322,11 @@ def fetch_moerman() -> list[RawListing]:
kamers=item.get("rooms") or None,
slaapkamers=item.get("bedrooms") or None,
bouwjaar=bouwjaar,
energielabel=item.get("energyLabel") or None,
energielabel=energielabel,
hero_image_url=item.get("photo") or None,
))
if config.APP_ENV == "dev":
break
log.info("moerman: %d koopwoningen opgehaald", len(listings))
return listings
@@ -284,9 +372,15 @@ def fetch_vandaal() -> list[RawListing]:
raw_year = item.get("dateOfConstruction") or ""
bouwjaar = int(raw_year) if raw_year.isdigit() else None
energielabel = item.get("energyLabel") or None
detail_url = _VANDAAL_BASE + item["url"]
if not energielabel:
extra_kk = _og_detail(detail_url, "vandaal")
energielabel = extra_kk.get("energielabel")
listings.append(RawListing(
url=_VANDAAL_BASE + item["url"],
url=detail_url,
source_makelaar="vandaal",
status=_VANDAAL_STATUS_MAP.get(item.get("statusOrig", ""), "beschikbaar"),
adres=item.get("address") or None,
@@ -299,9 +393,11 @@ def fetch_vandaal() -> list[RawListing]:
kamers=item.get("rooms") or None,
slaapkamers=item.get("bedrooms") or None,
bouwjaar=bouwjaar,
energielabel=item.get("energyLabel") or None,
energielabel=energielabel,
hero_image_url=item.get("photo") or None,
))
if config.APP_ENV == "dev":
break
log.info("vandaal: %d koopwoningen opgehaald", len(listings))
return listings
@@ -349,9 +445,15 @@ def fetch_elzenaar() -> list[RawListing]:
raw_year = item.get("dateOfConstruction") or ""
bouwjaar = int(raw_year) if raw_year.isdigit() else None
energielabel = item.get("energyLabel") or None
detail_url = _ELZENAAR_BASE + item["url"]
if not energielabel:
extra_kk = _og_detail(detail_url, "elzenaar")
energielabel = extra_kk.get("energielabel")
listings.append(RawListing(
url=_ELZENAAR_BASE + item["url"],
url=detail_url,
source_makelaar="elzenaar",
status=_ELZENAAR_STATUS_MAP.get(item.get("statusOrig", ""), "beschikbaar"),
adres=item.get("address") or None,
@@ -364,9 +466,11 @@ def fetch_elzenaar() -> list[RawListing]:
kamers=item.get("rooms") or None,
slaapkamers=item.get("bedrooms") or None,
bouwjaar=bouwjaar,
energielabel=item.get("energyLabel") or None,
energielabel=energielabel,
hero_image_url=item.get("photo") or None,
))
if config.APP_ENV == "dev":
break
log.info("elzenaar: %d koopwoningen opgehaald", len(listings))
return listings
@@ -413,9 +517,15 @@ def fetch_doen() -> list[RawListing]:
raw_year = item.get("dateOfConstruction") or ""
bouwjaar = int(raw_year) if raw_year.isdigit() else None
energielabel = item.get("energyLabel") or None
detail_url = _DOEN_BASE + item["url"]
if not energielabel:
extra_kk = _og_detail(detail_url, "doen")
energielabel = extra_kk.get("energielabel")
listings.append(RawListing(
url=_DOEN_BASE + item["url"],
url=detail_url,
source_makelaar="doen",
status=_DOEN_STATUS_MAP.get(item.get("statusOrig", ""), "beschikbaar"),
adres=item.get("address") or None,
@@ -428,9 +538,11 @@ def fetch_doen() -> list[RawListing]:
kamers=item.get("rooms") or None,
slaapkamers=item.get("bedrooms") or None,
bouwjaar=bouwjaar,
energielabel=item.get("energyLabel") or None,
energielabel=energielabel,
hero_image_url=item.get("photo") or None,
))
if config.APP_ENV == "dev":
break
log.info("doen: %d koopwoningen opgehaald", len(listings))
return listings
@@ -476,9 +588,15 @@ def fetch_vandriel() -> list[RawListing]:
raw_year = item.get("dateOfConstruction") or ""
bouwjaar = int(raw_year) if raw_year.isdigit() else None
energielabel = item.get("energyLabel") or None
detail_url = _VANDRIEL_BASE + item["url"]
if not energielabel:
extra_kk = _og_detail(detail_url, "vandriel")
energielabel = extra_kk.get("energielabel")
listings.append(RawListing(
url=_VANDRIEL_BASE + item["url"],
url=detail_url,
source_makelaar="vandriel",
status=_VANDRIEL_STATUS_MAP.get(item.get("statusOrig", ""), "beschikbaar"),
adres=item.get("address") or None,
@@ -491,9 +609,11 @@ def fetch_vandriel() -> list[RawListing]:
kamers=item.get("rooms") or None,
slaapkamers=item.get("bedrooms") or None,
bouwjaar=bouwjaar,
energielabel=item.get("energyLabel") or None,
energielabel=energielabel,
hero_image_url=item.get("photo") or None,
))
if config.APP_ENV == "dev":
break
log.info("vandriel: %d koopwoningen opgehaald", len(listings))
return listings

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,63 @@
"""
adapters/ssr — HTML/SSR-based makelaars
Elke scraper is een functie () -> list[RawListing].
Om een nieuwe makelaar toe te voegen:
1. Voeg een fetch_* functie toe in het juiste submodule
(realworks.py, sure.py, schiedam.py, denhaag.py, overige.py)
2. Importeer de functie hier en registreer in SCRAPERS.
CMS-typen per module:
realworks.py — Realworks CMS (li/div.aanbodEntry + span.kenmerk detail)
sure.py — SURE WordPress plugin (/wonen?sure_koop_huur=koop + #kenmerken)
schiedam.py — Custom Schiedam scrapers (diverse platforms)
denhaag.py — Den Haag scrapers (diverse platforms)
overige.py — Overige / multi-stad (OG Online WP, Elementor)
"""
from .realworks import (
fetch_ankebodewes,
fetch_woongoed,
fetch_vwmakelaars,
fetch_zomakelaars,
fetch_morris,
fetch_wassenaar,
fetch_roepman,
fetch_post,
)
from .sure import (
fetch_schielandborsboom,
fetch_olsthoorn,
fetch_vanherk,
fetch_borgdorff,
)
from .schiedam import (
fetch_dewittegarantiemakelaars,
fetch_dens,
fetch_3dmakelaars,
fetch_dupont,
)
from .denhaag import fetch_88makelaars
from .overige import fetch_vansilfhout, fetch_vanoord
# Registry of SSR scrapers: maps a makelaar key to the zero-argument fetch_*
# function imported above. New scrapers are added by importing the function
# and adding an entry here.
SCRAPERS = {
'ankebodewes': fetch_ankebodewes,
'woongoed': fetch_woongoed,
'dewittegarantiemakelaars': fetch_dewittegarantiemakelaars,
'wassenaar': fetch_wassenaar,
'dens': fetch_dens,
'3dmakelaars': fetch_3dmakelaars,
'dupont': fetch_dupont,
'schielandborsboom': fetch_schielandborsboom,
'vansilfhout': fetch_vansilfhout,
'vwmakelaars': fetch_vwmakelaars,
'roepman': fetch_roepman,
'zomakelaars': fetch_zomakelaars,
'post': fetch_post,
'morris': fetch_morris,
'olsthoorn': fetch_olsthoorn,
'88makelaars': fetch_88makelaars,
'borgdorff': fetch_borgdorff,
'vanherk': fetch_vanherk,
'vanoord': fetch_vanoord,
}

View File

@@ -0,0 +1,79 @@
"""Shared utilities for all SSR scrapers."""
import logging
import re
import time
import httpx
from bs4 import BeautifulSoup
import config
log = logging.getLogger("huizenbot.ssr")
def fetch_soup(url: str, *, params: dict = None) -> BeautifulSoup:
    """GET ``url`` and return the response body parsed as BeautifulSoup.

    A 429 response is retried (up to 3 attempts in total), sleeping for the
    number of seconds given by the ``Retry-After`` header. When the header is
    missing or is not a plain integer (it may also be an HTTP-date), a
    60-second wait is used instead of crashing on ``int()``.

    Args:
        url: URL to fetch.
        params: Optional query parameters passed to ``httpx.get``.

    Returns:
        The parsed document (``html.parser`` backend).

    Raises:
        httpx.HTTPStatusError: For non-429 error statuses (``raise_for_status``).
        RuntimeError: When every attempt was answered with 429.
    """
    max_attempts = 3
    for attempt in range(max_attempts):
        r = httpx.get(
            url,
            params=params,
            headers={"User-Agent": config.USER_AGENT},
            timeout=15,
            follow_redirects=True,
        )
        if r.status_code == 429:
            try:
                # Clamp to 0: time.sleep() rejects negative values.
                wait = max(0, int(r.headers.get("Retry-After", 60)))
            except (TypeError, ValueError):
                # Header present but not delta-seconds (e.g. an HTTP-date).
                wait = 60
            log.warning("429 op %s, wacht %ds", url, wait)
            # No point sleeping when there is no attempt left to make.
            if attempt < max_attempts - 1:
                time.sleep(wait)
            continue
        r.raise_for_status()
        return BeautifulSoup(r.text, "html.parser")
    raise RuntimeError(f"Blijvend 429 op {url}")
def parse_prijs(text: str | None) -> int | None:
"""'€ 325.000 k.k.' → 325000"""
if not text:
return None
digits = re.sub(r"[^\d]", "", text)
return int(digits) if digits else None
def parse_m2(text: str | None) -> int | None:
"""'87 m²' → 87"""
if not text:
return None
m = re.search(r"(\d+)", text.replace(".", ""))
return int(m.group(1)) if m else None
def _text(soup, selector: str) -> str | None:
el = soup.select_one(selector)
return el.get_text(strip=True) if el else None
def _src(soup, selector: str) -> str | None:
el = soup.select_one(selector)
if el is None:
return None
return el.get("src") or el.get("data-src")
def _extract_postcode(text: str | None) -> str | None:
if not text:
return None
m = re.search(r"\b(\d{4}\s?[A-Z]{2})\b", text)
return m.group(1).replace(" ", "") if m else None
def _infer_stad(postcode: str | None) -> str | None:
"""Simpele mapping op basis van postcode range — uitbreiden naar wens."""
if not postcode:
return None
code = int(postcode[:4])
if 2600 <= code <= 2629:
return "Delft"
if 3100 <= code <= 3135:
return "Schiedam"
return None

138
src/adapters/ssr/denhaag.py Normal file
View File

@@ -0,0 +1,138 @@
"""
Den Haag scrapers (custom platforms).
Scrapers: 88makelaars
Note: borgdorff also covers Den Haag but uses the SURE CMS → see sure.py.
"""
import re
import config
from huizenbot import RawListing
from ._shared import fetch_soup, parse_prijs, parse_m2, _text, log
# ---------------------------------------------------------------------------
# 88 Makelaars (Den Haag)
# ---------------------------------------------------------------------------
_88_BASE = "https://88makelaars.nl"
_88_STATUS_MAP = {
"te koop": "beschikbaar",
"beschikbaar": "beschikbaar",
"onder bod": "onder_bod",
"onder optie": "onder_bod",
"verkocht onder voorbehoud": "verkocht",
"verkocht": "verkocht",
}
def _88makelaars_detail(detail_url: str) -> dict:
    """Fetch an 88makelaars detail page and read its ``div.listing_detail`` rows.

    Each row whose text contains a colon is split at the first colon into a
    lower-cased label and a value. Returns a dict with postcode, slaapkamers,
    woonoppervlak, energielabel and woningtype (raw strings or None); an
    empty dict when fetching or parsing fails.
    """
    try:
        page = fetch_soup(detail_url)
        pairs: dict[str, str] = {}
        for row in page.select("div.listing_detail"):
            name, colon, content = row.get_text(strip=True).partition(":")
            if colon:
                pairs[name.strip().lower()] = content.strip()

        # Normalise the postcode to the compact "1234AB" form.
        postcode = None
        found = re.search(r"\d{4}\s*[A-Z]{2}", (pairs.get("postcode") or "").upper())
        if found:
            postcode = found.group(0).replace(" ", "")

        wanted = {
            "slaapkamers": "slaapkamers",
            "woonoppervlak": "woning grootte",
            "energielabel": "energieklasse",
            "woningtype": "soort woning",
        }
        result = {"postcode": postcode}
        for field, label in wanted.items():
            result[field] = pairs.get(label)
        return result
    except Exception as e:
        log.warning("88makelaars: detail fetch fout %s: %s", detail_url, e)
        return {}
def fetch_88makelaars() -> list[RawListing]:
    """Fetch 88 Makelaars listings (Den Haag only).

    Walks ``/ons-aanbod/`` and ``/ons-aanbod/page/{n}/`` until a page has no
    ``div.property_listing`` cards or fewer than 10 of them. Cards whose city
    is not "den haag" or whose price exceeds ``config.MAX_PRICE`` are skipped.
    For every remaining card the detail page is fetched for extra fields.

    A failure while parsing one card is logged and that card is skipped.

    Returns:
        The collected listings.
    """
    listings = []
    page = 1
    while True:
        if page == 1:
            url = f"{_88_BASE}/ons-aanbod/"
        else:
            url = f"{_88_BASE}/ons-aanbod/page/{page}/"
        soup = fetch_soup(url)
        cards = soup.select("div.property_listing")
        if not cards:
            break
        for card in cards:
            try:
                # URL from carousel
                a_tag = card.select_one(".property_unit_carousel a[href]")
                if not a_tag:
                    continue
                detail_url = a_tag["href"]
                if not detail_url.startswith("http"):
                    detail_url = _88_BASE + detail_url

                # City — last link in property_location_image
                loc_links = card.select(".property_location_image a")
                stad = loc_links[-1].get_text(strip=True) if loc_links else None
                if not stad or stad.lower() != "den haag":
                    continue

                # Price
                prijs = parse_prijs(_text(card, ".listing_unit_price_wrapper"))
                if prijs and prijs > config.MAX_PRICE:
                    continue

                # Status
                status_text = (_text(card, ".ribbon-inside") or "").lower()
                status = _88_STATUS_MAP.get(status_text, "beschikbaar")

                # Address
                adres = _text(card, "h4 a") or _text(card, "h4")

                # Surface + rooms
                woonoppervlak_card = parse_m2(_text(card, "span.infosize"))
                kamers_card = None
                rooms_txt = _text(card, "span.inforoom")
                if rooms_txt:
                    m = re.search(r"(\d+)", rooms_txt)
                    kamers_card = int(m.group(1)) if m else None

                # Hero: first active carousel image
                img = card.select_one(".item.active img")
                hero = (img.get("src") or img.get("data-original")) if img else None

                kk = _88makelaars_detail(detail_url)

                # Take the first number from the detail value instead of a
                # bare int(): free text there would otherwise raise and cause
                # the whole listing to be dropped by the except below.
                sk_match = re.search(r"(\d+)", kk.get("slaapkamers") or "")
                slaapkamers = int(sk_match.group(1)) if sk_match else None

                listings.append(RawListing(
                    url=detail_url,
                    source_makelaar="88makelaars",
                    status=status,
                    adres=adres,
                    postcode=kk.get("postcode"),
                    stad="Den Haag",
                    prijs=prijs,
                    hero_image_url=hero,
                    woningtype=kk.get("woningtype"),
                    woonoppervlak=parse_m2(kk.get("woonoppervlak")) or woonoppervlak_card,
                    kamers=kamers_card,
                    slaapkamers=slaapkamers,
                    energielabel=kk.get("energielabel"),
                ))
                # NOTE(review): in dev this only leaves the card loop; the
                # pagination loop still continues — confirm that is intended.
                if config.APP_ENV == "dev":
                    break
            except Exception as e:
                log.warning("88makelaars: parse fout: %s", e)
        if len(cards) < 10:
            break
        page += 1
    log.info("88makelaars: %d listings opgehaald", len(listings))
    return listings

280
src/adapters/ssr/overige.py Normal file
View File

@@ -0,0 +1,280 @@
"""
Overige SSR scrapers (no shared CMS platform, multi-city).
Scrapers: vansilfhout (OG Online WordPress), vanoord (Elementor/custom)
"""
import re
import config
from huizenbot import RawListing
from ._shared import fetch_soup, parse_prijs, parse_m2, _text, log
# ---------------------------------------------------------------------------
# Van Silfhout & Hogetoorn Wereldmakelaars (Delft) — OG Online WordPress
# ---------------------------------------------------------------------------
# All listings on one page. Postcode embedded in JS; detail has shortSpecs.
# Note: fetch_vwmakelaars and fetch_zomakelaars are not defined here; those
# brokers use the standard Realworks CMS — see realworks.py.
_VANSILFHOUT_BASE = "https://www.vansilfhout.nl"
_VANSILFHOUT_STATUS_MAP = {
"te koop": "beschikbaar",
"onder bod": "onder_bod",
"verkocht": "verkocht",
}
def _vansilfhout_detail(detail_url: str) -> dict:
    """Fetch a Van Silfhout detail page.

    The postcode is read from an ``objectZipcode': '....'`` fragment in the
    raw HTML; other fields come from ``.shortSpecs li`` rows that hold a label
    span and a value span. Returns raw strings (or None) for postcode,
    bouwjaar, woonoppervlak, kamers and slaapkamers; an empty dict on failure.
    """
    try:
        import httpx
        from bs4 import BeautifulSoup

        response = httpx.get(
            detail_url,
            headers={"User-Agent": config.USER_AGENT},
            timeout=15,
            follow_redirects=True,
        )
        response.raise_for_status()
        raw_html = response.text
        document = BeautifulSoup(raw_html, "html.parser")

        # Postcode embedded in JS: objectZipcode': '2624NP'
        zip_match = re.search(r"objectZipcode':\s*'([^']+)'", raw_html)
        postcode = None
        if zip_match:
            postcode = zip_match.group(1)

        # shortSpecs: <li><span>Label:</span><span>Value</span></li>
        specs: dict[str, str] = {}
        for row in document.select(".shortSpecs li"):
            cells = row.select("span")
            if len(cells) < 2:
                continue
            key = cells[0].get_text(strip=True).rstrip(":").lower()
            specs[key] = cells[-1].get_text(strip=True)

        result = {"postcode": postcode}
        for field, label in (
            ("bouwjaar", "bouwjaar"),
            ("woonoppervlak", "oppervlakte"),
            ("kamers", "kamers"),
            ("slaapkamers", "slaapkamers"),
        ):
            result[field] = specs.get(label)
        return result
    except Exception as e:
        log.warning("vansilfhout: detail fetch fout %s: %s", detail_url, e)
        return {}
def fetch_vansilfhout() -> list[RawListing]:
    """Fetch the Van Silfhout woningaanbod (all listings are on one page).

    Each ``article.row`` card yields URL, status, address, city, price and a
    hero image; the detail page adds postcode, bouwjaar, surface and room
    counts. Cards priced above ``config.MAX_PRICE`` are skipped. A failure
    while parsing one card is logged and that card is skipped.

    Returns:
        The collected listings.
    """
    soup = fetch_soup(f"{_VANSILFHOUT_BASE}/woningaanbod/")
    listings = []
    for card in soup.select("article.row"):
        try:
            a_tag = card.select_one("a.objectcontainerimg")
            if not a_tag or "href" not in a_tag.attrs:
                continue
            detail_url = a_tag["href"]
            if not detail_url.startswith("http"):
                detail_url = _VANSILFHOUT_BASE + detail_url

            # Status
            status_text = (_text(card, "span.objectstatus") or "").lower()
            status = _VANSILFHOUT_STATUS_MAP.get(status_text, "beschikbaar")

            # Address and city
            adres = _text(card, "h2.objecttitle")
            city_el = card.select("a.straatnaamwoonplaats span")
            stad = city_el[-1].get_text(strip=True) if city_el else None

            # Price from shortSpecs strong
            prijs = parse_prijs(_text(card, "ul.shortSpecs li strong"))
            if prijs and prijs > config.MAX_PRICE:
                continue

            # Area and rooms from shortSpecs
            woonoppervlak_card = None
            kamers_card = None
            for li in card.select("ul.shortSpecs li"):
                spans = li.select("span")
                if len(spans) >= 2:
                    label = spans[0].get_text(strip=True).lower()
                    val = spans[-1].get_text(strip=True)
                    if "oppervlakt" in label:
                        woonoppervlak_card = parse_m2(val)
                    elif "kamer" in label:
                        m = re.search(r"(\d+)", val)
                        kamers_card = int(m.group(1)) if m else None

            # Hero image: prefer data-lazy-src, fall back to noscript img src
            img_tag = card.select_one("a.objectcontainerimg img")
            hero = None
            if img_tag:
                hero = (img_tag.get("data-lazy-src")
                        or img_tag.get("src") or None)
                if hero and hero.startswith("data:"):
                    noscript = card.select_one("noscript img")
                    # .get(): a noscript img without src must not raise and
                    # discard the listing.
                    hero = noscript.get("src") if noscript else None

            kk = _vansilfhout_detail(detail_url)

            # Parse kamers/slaapkamers from detail
            kamers = kamers_card
            if kk.get("kamers"):
                m = re.search(r"(\d+)", kk["kamers"])
                kamers = int(m.group(1)) if m else kamers_card
            slaapkamers = None
            if kk.get("slaapkamers"):
                m = re.search(r"(\d+)", kk["slaapkamers"])
                slaapkamers = int(m.group(1)) if m else None

            # First 4-digit group, so text such as "ca. 1930" no longer
            # raises in int() and drops the listing.
            year_match = re.search(r"(\d{4})", kk.get("bouwjaar") or "")
            bouwjaar = int(year_match.group(1)) if year_match else None

            listings.append(RawListing(
                url=detail_url,
                source_makelaar="vansilfhout",
                status=status,
                adres=adres,
                postcode=kk.get("postcode"),
                stad=stad,
                prijs=prijs,
                hero_image_url=hero,
                bouwjaar=bouwjaar,
                woonoppervlak=parse_m2(kk.get("woonoppervlak")) or woonoppervlak_card,
                kamers=kamers,
                slaapkamers=slaapkamers,
            ))
            if config.APP_ENV == "dev":
                break
        except Exception as e:
            log.warning("vansilfhout: parse fout: %s", e)
    log.info("vansilfhout: %d listings opgehaald", len(listings))
    return listings
# ---------------------------------------------------------------------------
# Van Oord Makelaardij (Delft + Schiedam) — Elementor/custom WordPress
# ---------------------------------------------------------------------------
# Separate listing pages per city; detail page has rw-object-features-list.
_VANOORD_BASE = "https://www.vanoordmakelaardij.nl"
_VANOORD_LISTINGS = [
f"https://www.vanoordmakelaardij.nl/aanbod/?_price=0%2C{config.MAX_PRICE}&_city=Delft&_availability=Te+koop",
f"https://www.vanoordmakelaardij.nl/aanbod/?_price=0%2C{config.MAX_PRICE}&_city=Schiedam&_availability=Te+koop",
]
_VANOORD_STATUS_MAP = {
"te koop": "beschikbaar",
"onder bod": "onder_bod",
"verkocht": "verkocht",
}
def _vanoord_detail(detail_url: str) -> dict:
    """Fetch a Van Oord detail page and read ``ul.rw-object-features-list``.

    Every ``li`` that has both a label span and a value span contributes one
    lower-cased label → value entry. Returns status (lower-cased, "" when
    absent) plus raw strings or None for bouwjaar, woonoppervlak, kamers,
    slaapkamers and energielabel; an empty dict on failure.
    """
    try:
        page = fetch_soup(detail_url)
        features: dict[str, str] = {}
        for row in page.select("ul.rw-object-features-list li"):
            name_el = row.select_one("span.rw-object-list-label")
            data_el = row.select_one("span.rw-object-list-value")
            if not (name_el and data_el):
                continue
            features[name_el.get_text(strip=True).lower()] = data_el.get_text(strip=True)

        result = {"status": features.get("status", "").lower()}
        for field, label in (
            ("bouwjaar", "bouwjaar"),
            ("woonoppervlak", "woonoppervlakte"),
            ("kamers", "aantal kamers"),
            ("slaapkamers", "slaapkamers"),
            ("energielabel", "energieklasse"),
        ):
            result[field] = features.get(label)
        return result
    except Exception as e:
        log.warning("vanoord: detail fetch fout %s: %s", detail_url, e)
        return {}
def fetch_vanoord() -> list[RawListing]:
    """Fetch Van Oord listings; Delft and Schiedam, only koop.

    Iterates the URLs in ``_VANOORD_LISTINGS``, de-duplicates cards by detail
    URL, and enriches each card with its detail page. A status found on the
    detail page overrides the card status. Cards priced above
    ``config.MAX_PRICE`` are skipped. A failure while parsing one card is
    logged and that card is skipped.

    Returns:
        The collected listings.
    """
    seen: set[str] = set()
    listings = []
    for listing_url in _VANOORD_LISTINGS:
        soup = fetch_soup(listing_url)
        cards = soup.select("div.e-loop-item")
        for card in cards:
            try:
                # Detail URL from h3 > a
                a_tag = card.select_one("h3.elementor-heading-title a[href]")
                if not a_tag:
                    continue
                detail_url = a_tag["href"]
                if not detail_url.startswith("http"):
                    detail_url = _VANOORD_BASE + detail_url
                if detail_url in seen:
                    continue
                seen.add(detail_url)

                # Status from rw-status-label widget class
                status_el = card.select_one("[class*='rw-status-label--']")
                status = "beschikbaar"
                if status_el:
                    status_text = status_el.get_text(strip=True).lower()
                    status = _VANOORD_STATUS_MAP.get(status_text, "beschikbaar")

                # City from h4
                h4 = card.select_one("h4.elementor-heading-title")
                stad = h4.get_text(strip=True) if h4 else None

                # Address from h3 > a text
                adres = " ".join(a_tag.get_text().split())

                # Price from h3 without <a> child
                prijs = None
                for h3 in card.select("h3.elementor-heading-title"):
                    if not h3.select_one("a"):
                        prijs = parse_prijs(h3.get_text())
                        break
                if prijs and prijs > config.MAX_PRICE:
                    continue

                # Card icon list: [0]=surface [1]=rooms [2]=energy
                icon_items = card.select("ul.elementor-icon-list-items li span.elementor-icon-list-text")
                woonoppervlak_card = parse_m2(icon_items[0].get_text()) if len(icon_items) > 0 else None
                kamers_card = None
                if len(icon_items) > 1:
                    m = re.search(r"(\d+)", icon_items[1].get_text())
                    kamers_card = int(m.group(1)) if m else None
                energielabel_card = icon_items[2].get_text(strip=True) if len(icon_items) > 2 else None

                kk = _vanoord_detail(detail_url)
                detail_status = _VANOORD_STATUS_MAP.get(kk.get("status", ""), "")
                if detail_status:
                    status = detail_status

                # _vanoord_detail returns the keys with a None value when a
                # feature is missing, so kk.get(key, "") would hand back None
                # and .isdigit() would raise AttributeError, dropping the
                # listing. Coerce None to "" first.
                raw_bouwjaar = kk.get("bouwjaar") or ""
                raw_kamers = kk.get("kamers") or ""
                raw_slaapkamers = kk.get("slaapkamers") or ""

                listings.append(RawListing(
                    url=detail_url,
                    source_makelaar="vanoord",
                    status=status,
                    adres=adres,
                    stad=stad,
                    prijs=prijs,
                    bouwjaar=int(raw_bouwjaar) if raw_bouwjaar.isdigit() else None,
                    woonoppervlak=parse_m2(kk.get("woonoppervlak")) or woonoppervlak_card,
                    kamers=(int(raw_kamers) if raw_kamers.isdigit() else None) or kamers_card,
                    slaapkamers=int(raw_slaapkamers) if raw_slaapkamers.isdigit() else None,
                    energielabel=kk.get("energielabel") or energielabel_card,
                ))
                if config.APP_ENV == "dev":
                    break
            except Exception as e:
                log.warning("vanoord: parse fout: %s", e)
    log.info("vanoord: %d listings opgehaald", len(listings))
    return listings

View File

@@ -0,0 +1,502 @@
"""
Realworks CMS scrapers.
All makelaars here run the Realworks CMS. Listings come from paginated
/aanbod/woningaanbod/-{price}/koop/ pages; detail pages have span.kenmerk
label/value pairs. Some variants (Wassenaar, Roepman) expose listing-level
data via JSON-LD instead of card HTML.
Scrapers: ankebodewes, woongoed, vwmakelaars, zomakelaars, morris,
wassenaar, roepman, post
"""
import json as _json
import re
import config
from huizenbot import RawListing
from ._shared import fetch_soup, parse_prijs, parse_m2, _text, log
# ---------------------------------------------------------------------------
# Shared Realworks helpers
# ---------------------------------------------------------------------------
_REALWORKS_STATUS_MAP = {
"te koop": "beschikbaar",
"nieuw": "beschikbaar",
"onder bod": "onder_bod",
"onder optie": "onder_bod",
"verkocht o.v.": "verkocht",
"verkocht": "verkocht",
}
def _realworks_detail(detail_url: str, makelaar: str) -> dict:
    """Fetch a Realworks detail page and extract its kenmerken.

    Every ``span.kenmerk`` holding both a ``span.kenmerkName`` and a
    ``span.kenmerkValue`` contributes one lower-cased label → value entry.
    Returns raw strings (or None) for woningtype, bouwjaar, woonoppervlak,
    perceeloppervlak, kamers, slaapkamers and energielabel; an empty dict on
    failure. ``makelaar`` is used only in the warning log line.
    """
    field_labels = (
        ("woningtype", "type woning"),
        ("bouwjaar", "bouwjaar"),
        ("woonoppervlak", "woonoppervlakte"),
        ("perceeloppervlak", "perceeloppervlakte"),
        ("kamers", "aantal kamers"),
        ("slaapkamers", "aantal slaapkamers"),
        ("energielabel", "energieklasse"),
    )
    try:
        page = fetch_soup(detail_url)
        # Build a label→value map from all .kenmerk spans
        features: dict[str, str] = {}
        for entry in page.select("span.kenmerk"):
            name_el = entry.select_one("span.kenmerkName")
            data_el = entry.select_one("span.kenmerkValue")
            if not (name_el and data_el):
                continue
            features[name_el.get_text(strip=True).lower()] = data_el.get_text(strip=True)
        return {field: features.get(label) for field, label in field_labels}
    except Exception as e:
        log.warning("%s: detail fetch fout %s: %s", makelaar, detail_url, e)
        return {}
def fetch_realworks(base_url: str, makelaar: str) -> list[RawListing]:
    """
    Generic fetcher for Realworks CMS brokers.

    Paginates via ``/pagina-{n}/`` under
    ``/aanbod/woningaanbod/-{config.MAX_PRICE}/koop`` and fetches the detail
    page for every ``li.aanbodEntry`` card. Pagination stops on a page with
    no cards or with fewer than 10 cards. A failure while parsing one card is
    logged and that card is skipped.

    Args:
        base_url: Site root without trailing slash; prepended to card hrefs.
        makelaar: Broker key stored in ``source_makelaar`` and used in logs.

    Returns:
        The collected listings.
    """

    def _int_or_none(value) -> int | None:
        # Detail values are free text; take the first number rather than
        # letting int() raise and discard the entire listing.
        found = re.search(r"(\d+)", value or "")
        return int(found.group(1)) if found else None

    listings_path = f"/aanbod/woningaanbod/-{config.MAX_PRICE}/koop"
    listings = []
    page = 1
    while True:
        url = f"{base_url}{listings_path}/pagina-{page}/"
        soup = fetch_soup(url)
        cards = soup.select("li.aanbodEntry")
        if not cards:
            break
        for card in cards:
            try:
                a_tag = card.select_one("a.aanbodEntryLink")
                if not a_tag:
                    continue
                listing_url = base_url + a_tag["href"]
                adres = _text(card, ".street-address")
                postcode = (_text(card, ".postal-code") or "").replace(" ", "") or None
                stad = _text(card, ".locality")
                prijs = parse_prijs(_text(card, ".koopprijs .kenmerkValue"))
                status_text = (_text(card, ".objectstatusbanner") or "").lower()
                status = _REALWORKS_STATUS_MAP.get(status_text, "beschikbaar")
                img_tag = card.select_one(".hoofdfoto img")
                # .get(): an img without src should give no hero image, not
                # a KeyError that drops the listing.
                hero = img_tag.get("src") if img_tag else None
                kk = _realworks_detail(listing_url, makelaar)
                listings.append(RawListing(
                    url=listing_url,
                    source_makelaar=makelaar,
                    adres=adres,
                    postcode=postcode,
                    stad=stad,
                    prijs=prijs,
                    status=status,
                    hero_image_url=hero,
                    woningtype=kk.get("woningtype"),
                    bouwjaar=_int_or_none(kk.get("bouwjaar")),
                    woonoppervlak=parse_m2(kk.get("woonoppervlak")),
                    perceeloppervlak=parse_m2(kk.get("perceeloppervlak")),
                    kamers=_int_or_none(kk.get("kamers")),
                    slaapkamers=_int_or_none(kk.get("slaapkamers")),
                    energielabel=kk.get("energielabel"),
                ))
                # NOTE(review): in dev this only leaves the card loop; the
                # pagination loop still continues — confirm that is intended.
                if config.APP_ENV == "dev":
                    break
            except Exception as e:
                log.warning("%s: parse fout: %s", makelaar, e)
        if len(cards) < 10:
            break
        page += 1
    log.info("%s: %d listings opgehaald", makelaar, len(listings))
    return listings
# ---------------------------------------------------------------------------
# Simple Realworks wrappers (one-liners)
# ---------------------------------------------------------------------------
def fetch_ankebodewes() -> list[RawListing]:
    """Fetch listings from ankebodewes.nl via the generic Realworks fetcher."""
    return fetch_realworks(
        base_url="https://www.ankebodewes.nl",
        makelaar="ankebodewes",
    )


def fetch_woongoed() -> list[RawListing]:
    """Fetch listings from woongoedmakelaars.nl via the generic Realworks fetcher."""
    return fetch_realworks(
        base_url="https://www.woongoedmakelaars.nl",
        makelaar="woongoed",
    )


def fetch_vwmakelaars() -> list[RawListing]:
    """Fetch listings from vwmakelaars.nl via the generic Realworks fetcher."""
    return fetch_realworks(
        base_url="https://www.vwmakelaars.nl",
        makelaar="vwmakelaars",
    )


def fetch_zomakelaars() -> list[RawListing]:
    """Fetch listings from zomakelaars.nl via the generic Realworks fetcher."""
    return fetch_realworks(
        base_url="https://www.zomakelaars.nl",
        makelaar="zomakelaars",
    )


def fetch_morris() -> list[RawListing]:
    """Fetch listings from morrismakelaardij.nl via the generic Realworks fetcher."""
    return fetch_realworks(
        base_url="https://www.morrismakelaardij.nl",
        makelaar="morris",
    )
# ---------------------------------------------------------------------------
# Makelaardij Wassenaar (Schiedam) — Realworks CMS, JSON-LD listing page
# ---------------------------------------------------------------------------
# Listings page has JSON-LD (Residence) with url/address/price/photo.
# Detail pages have span.kenmerk with Wassenaar-specific label names.
_WASSENAAR_BASE = "https://www.makelaardijwassenaar.nl"
_WASSENAAR_STATUS_MAP = {
"te koop": "beschikbaar",
"nieuw": "beschikbaar",
"onder bod": "onder_bod",
"onder optie": "onder_bod",
"verkocht o.v.": "onder_bod",
"verkocht onder voorbehoud": "onder_bod",
"verkocht": "verkocht",
}
def _wassenaar_detail(detail_url: str) -> dict:
    """Fetch a Wassenaar (Realworks) detail page and extract its kenmerken.

    Same ``span.kenmerk`` name/value markup as the generic Realworks pages,
    but the woningtype is read from the "soort object" label. Returns raw
    strings (or None) for woningtype, bouwjaar, woonoppervlak,
    perceeloppervlak, kamers, slaapkamers and energielabel; an empty dict on
    failure.
    """
    field_labels = (
        ("woningtype", "soort object"),
        ("bouwjaar", "bouwjaar"),
        ("woonoppervlak", "woonoppervlakte"),
        ("perceeloppervlak", "perceeloppervlakte"),
        ("kamers", "aantal kamers"),
        ("slaapkamers", "aantal slaapkamers"),
        ("energielabel", "energieklasse"),
    )
    try:
        page = fetch_soup(detail_url)
        features: dict[str, str] = {}
        for entry in page.select("span.kenmerk"):
            name_el = entry.select_one("span.kenmerkName")
            data_el = entry.select_one("span.kenmerkValue")
            if not (name_el and data_el):
                continue
            features[name_el.get_text(strip=True).lower()] = data_el.get_text(strip=True)
        return {field: features.get(label) for field, label in field_labels}
    except Exception as e:
        log.warning("wassenaar: detail fetch fout %s: %s", detail_url, e)
        return {}
def fetch_wassenaar() -> list[RawListing]:
    """Scrape the Makelaardij Wassenaar koop listings page plus detail pages.

    A single listings page is fetched (the URL carries ``config.MAX_PRICE`` as
    upper price bound). Status and thumbnail come from the card links;
    url/address/price/photo come from the JSON-LD ``Residence`` blocks; the
    remaining kenmerken come from ``_wassenaar_detail``.

    Returns:
        The parsed listings. A JSON-LD block that raises while being processed
        is logged and skipped. When ``config.APP_ENV == "dev"`` the loop stops
        after the first appended listing.
    """
    soup = fetch_soup(f"{_WASSENAAR_BASE}/aanbod/woningaanbod/-{config.MAX_PRICE}/koop/")
    # First pass: collect status + thumbnail per relative url
    # Each listing has two a.aanbodEntryLink with the same href;
    # the first has the status banner + photo, the second has address + price.
    status_by_url: dict[str, str] = {}
    photo_by_url: dict[str, str] = {}
    for a in soup.select("a.aanbodEntryLink[href]"):
        href = a["href"]
        # Only the first link per href is inspected.
        if href in status_by_url:
            continue
        banner = a.select_one(".objectstatusbanner")
        status_text = banner.get_text(strip=True).lower() if banner else ""
        status_by_url[href] = _WASSENAAR_STATUS_MAP.get(status_text, "beschikbaar")
        img = a.select_one("span.hoofdfoto img")
        if img:
            src = img.get("src", "")
            # Skip sources whose URL contains the "geenfotobeschikbaar" marker.
            if "geenfotobeschikbaar" not in src:
                photo_by_url[href] = src
    # Second pass: parse JSON-LD blocks (one per listing)
    seen: set[str] = set()
    listings = []
    for tag in soup.select('script[type="application/ld+json"]'):
        try:
            ld = _json.loads(tag.string)
            if ld.get("@type") != "Residence":
                continue
            rel_url = ld.get("url", "")
            if not rel_url or rel_url in seen:
                continue
            seen.add(rel_url)
            detail_url = _WASSENAAR_BASE + rel_url
            address = ld.get("address", {})
            postcode = address.get("postalCode", "").replace(" ", "") or None
            # First potentialAction entry that carries a priceSpecification.
            price_spec = next(
                (a.get("priceSpecification", {}) for a in ld.get("potentialAction", [])
                 if a.get("priceSpecification")),
                {}
            )
            prijs = int(price_spec["price"]) if price_spec.get("price") else None
            # Filter on price before paying for the detail-page request.
            if prijs and prijs > config.MAX_PRICE:
                continue
            hero = ld.get("photo") or photo_by_url.get(rel_url)
            status = status_by_url.get(rel_url, "beschikbaar")
            kk = _wassenaar_detail(detail_url)
            listings.append(RawListing(
                url=detail_url,
                source_makelaar="wassenaar",
                status=status,
                adres=address.get("streetAddress") or None,
                postcode=postcode,
                stad=address.get("addressLocality") or None,
                prijs=prijs,
                hero_image_url=hero,
                woningtype=kk.get("woningtype"),
                bouwjaar=int(kk["bouwjaar"]) if kk.get("bouwjaar") else None,
                woonoppervlak=parse_m2(kk.get("woonoppervlak")),
                perceeloppervlak=parse_m2(kk.get("perceeloppervlak")),
                kamers=int(kk["kamers"]) if kk.get("kamers") else None,
                slaapkamers=int(kk["slaapkamers"]) if kk.get("slaapkamers") else None,
                energielabel=kk.get("energielabel"),
            ))
            # dev: one listing is enough.
            if config.APP_ENV == "dev":
                break
        except Exception as e:
            # Also reached when an int() conversion above fails; the listing is dropped.
            log.warning("wassenaar: parse fout: %s", e)
    log.info("wassenaar: %d listings opgehaald", len(listings))
    return listings
# ---------------------------------------------------------------------------
# Roepman Makelaardij NVM (Delft) — Realworks CMS, JSON-LD listing page
# ---------------------------------------------------------------------------
# Uses div.aanbodEntry instead of li.aanbodEntry; price from JSON-LD.

# Site root; listing-page paths and relative JSON-LD urls are appended to it.
_ROEPMAN_BASE = "https://www.roepman.nl"
def fetch_roepman() -> list[RawListing]:
    """Scrape the paginated Roepman koop listings plus detail pages.

    Pages ``/pagina-1/``, ``/pagina-2/`` ... are fetched until a page has no
    ``div.aanbodEntry`` cards or fewer than 10 of them. Per page, status and
    photo are read from the cards, url/address/price/photo from the JSON-LD
    ``Residence`` blocks, and the remaining kenmerken from
    ``_realworks_detail``.

    Returns:
        The parsed listings of all visited pages. A JSON-LD block that raises
        while being processed is logged and skipped.
    """
    listings_path = f"/aanbod/woningaanbod/-{config.MAX_PRICE}/koop"
    listings = []
    page = 1
    while True:
        url = f"{_ROEPMAN_BASE}{listings_path}/pagina-{page}/"
        soup = fetch_soup(url)
        cards = soup.select("div.aanbodEntry")
        if not cards:
            break
        # Collect status + photo per relative url
        status_by_url: dict[str, str] = {}
        photo_by_url: dict[str, str] = {}
        for card in cards:
            a_tag = card.select_one("a.aanbodEntryLink[href]")
            if not a_tag:
                continue
            href = a_tag["href"]
            if href in status_by_url:
                continue
            banner = card.select_one(".objectstatusbanner")
            status_text = banner.get_text(strip=True).lower() if banner else ""
            status_by_url[href] = _REALWORKS_STATUS_MAP.get(status_text, "beschikbaar")
            img = card.select_one("img")
            if img:
                src = img.get("src", "")
                # Skip sources whose URL contains the "geenfotobeschikbaar" marker.
                if "geenfotobeschikbaar" not in src:
                    photo_by_url[href] = src
        # Parse JSON-LD Residence blocks (one per listing)
        # NOTE: `seen` is reset per page, so de-duplication is per page only.
        seen: set[str] = set()
        for tag in soup.select('script[type="application/ld+json"]'):
            try:
                ld = _json.loads(tag.string)
                if ld.get("@type") != "Residence":
                    continue
                rel_url = ld.get("url", "")
                if not rel_url or rel_url in seen:
                    continue
                seen.add(rel_url)
                detail_url = _ROEPMAN_BASE + rel_url
                address = ld.get("address", {})
                postcode = address.get("postalCode", "").replace(" ", "") or None
                # First potentialAction entry that carries a priceSpecification.
                price_spec = next(
                    (a.get("priceSpecification", {}) for a in ld.get("potentialAction", [])
                     if a.get("priceSpecification")),
                    {}
                )
                prijs = int(price_spec["price"]) if price_spec.get("price") else None
                # Filter on price before paying for the detail-page request.
                if prijs and prijs > config.MAX_PRICE:
                    continue
                hero = ld.get("photo") or photo_by_url.get(rel_url)
                status = status_by_url.get(rel_url, "beschikbaar")
                kk = _realworks_detail(detail_url, "roepman")
                listings.append(RawListing(
                    url=detail_url,
                    source_makelaar="roepman",
                    status=status,
                    adres=address.get("streetAddress") or None,
                    postcode=postcode,
                    stad=address.get("addressLocality") or None,
                    prijs=prijs,
                    hero_image_url=hero,
                    woningtype=kk.get("woningtype"),
                    bouwjaar=int(kk["bouwjaar"]) if kk.get("bouwjaar") else None,
                    woonoppervlak=parse_m2(kk.get("woonoppervlak")),
                    perceeloppervlak=parse_m2(kk.get("perceeloppervlak")),
                    kamers=int(kk["kamers"]) if kk.get("kamers") else None,
                    slaapkamers=int(kk["slaapkamers"]) if kk.get("slaapkamers") else None,
                    energielabel=kk.get("energielabel"),
                ))
                # dev: this only exits the JSON-LD loop of the current page; the
                # page loop still continues when the page held 10 or more cards.
                # NOTE(review): confirm that paginating in dev is intended.
                if config.APP_ENV == "dev":
                    break
            except Exception as e:
                log.warning("roepman: parse fout: %s", e)
        # Fewer than 10 cards: treat this as the last page.
        if len(cards) < 10:
            break
        page += 1
    log.info("roepman: %d listings opgehaald", len(listings))
    return listings
# ---------------------------------------------------------------------------
# Post Makelaardij (Delft) — Realworks CMS, custom detail parser
# ---------------------------------------------------------------------------

# Site root; relative card links are appended to it.
_POST_BASE = "https://www.postmakelaardij.nl"

# Lower-cased span.status text -> internal status value.
# Texts that are not listed here are treated as "beschikbaar" by fetch_post.
_POST_STATUS_MAP = {
    "te koop": "beschikbaar",
    "onder bod": "onder_bod",
    "verkocht": "verkocht",
}
def _post_detail(detail_url: str) -> dict:
    """Fetch a Post Makelaardij detail page and extract kenmerken.

    The energy label is taken from the first CSS class of the form
    ``energielabel-{letter}``. Surface areas and bedroom count come from the
    ``span.object-info-icon-text`` icon captions.

    Args:
        detail_url: Absolute URL of the detail page.

    Returns:
        A dict with the keys woonoppervlak, perceeloppervlak, slaapkamers and
        energielabel (each None when not found). Any exception is logged and
        yields an empty dict.
    """
    try:
        soup = fetch_soup(detail_url)
        # Energielabel from CSS class: energielabel-{letter}
        energielabel = None
        for el in soup.select('[class]'):
            for cls in el.get('class', []):
                if cls.startswith('energielabel-') and cls != 'energielabel':
                    energielabel = cls.replace('energielabel-', '').upper()
                    break
            if energielabel:
                break
        # Woonoppervlak, perceeloppervlak, slaapkamers from icon spans
        woonoppervlak = None
        perceeloppervlak = None
        slaapkamers = None
        for span in soup.select('span.object-info-icon-text'):
            txt = span.get_text(strip=True)
            lowered = txt.lower()
            if 'slaapkamer' in lowered:
                m = re.search(r'(\d+)', txt)
                slaapkamers = int(m.group(1)) if m else None
            elif 'perceel' in lowered:
                perceeloppervlak = parse_m2(txt)
            # The unit must be matched explicitly: a membership test against
            # an empty string is always true and would let every remaining
            # caption overwrite the living area.
            elif 'm²' in lowered or 'm2' in lowered:
                woonoppervlak = parse_m2(txt)
        return {
            "woonoppervlak": woonoppervlak,
            "perceeloppervlak": perceeloppervlak,
            "slaapkamers": slaapkamers,
            "energielabel": energielabel,
        }
    except Exception as e:
        log.warning("post: detail fetch fout %s: %s", detail_url, e)
        return {}
def fetch_post() -> list[RawListing]:
    """Fetch Post Makelaardij listings; only Delft, only koop.

    Pages ``?page=1``, ``?page=2`` ... are fetched until a page has no
    ``article`` cards or fewer than 12 of them. Cards outside Delft or above
    ``config.MAX_PRICE`` are skipped before the detail page is requested.

    Returns:
        The parsed listings of all visited pages. A card that raises while
        being processed is logged and skipped.
    """
    listings = []
    page = 1
    while True:
        url = f"{_POST_BASE}/woningaanbod/koop?page={page}"
        soup = fetch_soup(url)
        cards = soup.select("article")
        if not cards:
            break
        for card in cards:
            try:
                # URL — first link in image slider
                a_tag = card.select_one("a[href]")
                if not a_tag:
                    continue
                href = a_tag["href"]
                detail_url = href if href.startswith("http") else _POST_BASE + href
                # Postcode + city from span.custom-postcode-text
                pc_el = card.select_one("span.custom-postcode-text")
                if not pc_el:
                    continue
                # Needs at least three whitespace-separated tokens:
                # two postcode parts plus the city.
                pc_parts = pc_el.get_text(strip=True).split()
                if len(pc_parts) < 3:
                    continue
                postcode = pc_parts[0] + pc_parts[1]  # "2613BD"
                stad = " ".join(pc_parts[2:])  # "Delft"
                # Filter: only Delft
                if stad.lower() != "delft":
                    continue
                # Price — filter early
                prijs = parse_prijs(_text(card, "span.price-block"))
                if prijs and prijs > config.MAX_PRICE:
                    continue
                # Status from span.status text
                status_text = (_text(card, "span.status") or "").lower()
                status = _POST_STATUS_MAP.get(status_text, "beschikbaar")
                # Address
                adres = _text(card, "h4.custom-address-text")
                # Hero: first img in article
                # NOTE: an <img> without a src attribute raises KeyError here,
                # which the except below turns into a skipped listing.
                img = card.select_one("img")
                hero = img["src"] if img else None
                kk = _post_detail(detail_url)
                listings.append(RawListing(
                    url=detail_url,
                    source_makelaar="post",
                    status=status,
                    adres=adres,
                    postcode=postcode,
                    stad=stad,
                    prijs=prijs,
                    hero_image_url=hero,
                    woonoppervlak=kk.get("woonoppervlak"),
                    perceeloppervlak=kk.get("perceeloppervlak"),
                    slaapkamers=kk.get("slaapkamers"),
                    energielabel=kk.get("energielabel"),
                ))
                # dev: this only exits the card loop of the current page; the
                # page loop still continues when the page held 12 or more cards.
                if config.APP_ENV == "dev":
                    break
            except Exception as e:
                log.warning("post: parse fout: %s", e)
        # Fewer than 12 cards: treat this as the last page.
        if len(cards) < 12:
            break
        page += 1
    log.info("post: %d listings opgehaald", len(listings))
    return listings

View File

@@ -0,0 +1,542 @@
"""
Custom Schiedam scrapers (no shared CMS platform).
Each makelaar here uses a bespoke site structure that required its own parser.
Scrapers: dewittegarantiemakelaars (JSON-LD), dens, 3dmakelaars, dupont
"""
import re
import config
from huizenbot import RawListing
from ._shared import (
fetch_soup, parse_prijs, parse_m2, _text,
_extract_postcode, _infer_stad, log,
)
# ---------------------------------------------------------------------------
# De Witte Garantiemakelaars (Schiedam)
# ---------------------------------------------------------------------------
# Listing cards have a pill badge for status. All detail data comes from
# JSON-LD (schema.org BuyAction/Offer) on the detail page.

# Site root; relative card links are appended to it.
_DEWITTE_BASE = "https://dewittegarantiemakelaars.nl"

# "bg-*" CSS class of the status pill -> internal status value.
# fetch_dewittegarantiemakelaars treats every other pill class (and a
# missing pill) as "onder_bod".
_DEWITTE_PILL_MAP = {
    "bg-fun-green": "beschikbaar",
    "bg-sold": "verkocht",
}

# JSON-LD "@type" of itemOffered -> woningtype; unknown types yield None.
_DEWITTE_TYPE_MAP = {
    "Apartment": "appartement",
    "House": "woning",
    "SingleFamilyResidence": "woning",
    "Residence": "woning",
}
def _dewitte_jsonld(detail_url: str) -> dict:
    """Load the first JSON-LD script of a De Witte detail page.

    Args:
        detail_url: Absolute URL of the detail page.

    Returns:
        The decoded JSON of the first ``application/ld+json`` script. An empty
        dict is returned (after a warning) when the page has no such script or
        when fetching/decoding raises.
    """
    import json

    try:
        script = fetch_soup(detail_url).select_one('script[type="application/ld+json"]')
        if script is not None:
            return json.loads(script.string)
        log.warning("dewitte: geen JSON-LD op %s", detail_url)
    except Exception as e:
        log.warning("dewitte: JSON-LD fout %s: %s", detail_url, e)
    return {}
def fetch_dewittegarantiemakelaars() -> list[RawListing]:
    """Scrape the paginated De Witte koop listings.

    The listing cards supply the detail URL, the status pill and a fallback
    thumbnail; everything else is read from the JSON-LD of the detail page
    (see ``_dewitte_jsonld``). Pagination stops at an empty page or a page
    with fewer than 10 cards.

    Returns:
        The parsed listings of all visited pages. Cards without usable
        JSON-LD are skipped; a card that raises is logged and skipped.
    """
    listings = []
    page = 1
    while True:
        url = (
            f"{_DEWITTE_BASE}/woningaanbod"
            f"?buy_rent=buy&buy_price=1-{config.MAX_PRICE}&page={page}"
        )
        soup = fetch_soup(url)
        cards = soup.select("div.card.card--property")
        if not cards:
            break
        for card in cards:
            try:
                a_tag = card.select_one("a.card__anchor")
                if not a_tag:
                    continue
                detail_url = a_tag["href"]
                if not detail_url.startswith("http"):
                    detail_url = _DEWITTE_BASE + detail_url
                # Status is encoded in the first "bg-*" class of the pill.
                pill = card.select_one("span.pill")
                pill_classes = pill.get("class", []) if pill else []
                status_key = next(
                    (c for c in pill_classes if c.startswith("bg-")), None
                )
                # Unknown pill class or no pill at all -> "onder_bod".
                status = _DEWITTE_PILL_MAP.get(status_key, "onder_bod")
                ld = _dewitte_jsonld(detail_url)
                if not ld:
                    continue
                offered = ld.get("itemOffered", {})
                address = offered.get("address", {})
                floor_size = offered.get("floorSize", {})
                postcode = address.get("postalCode", "").replace(" ", "") or None
                stad = address.get("addressLocality") or None
                adres = address.get("streetAddress") or None
                prijs = ld.get("price")
                if prijs and int(prijs) > config.MAX_PRICE:
                    continue
                woningtype = _DEWITTE_TYPE_MAP.get(offered.get("@type", ""))
                woonoppervlak = int(floor_size["value"]) if floor_size.get("value") else None
                kamers = offered.get("numberOfRooms")
                bouwjaar = offered.get("yearBuilt")
                # Full-res image from JSON-LD, fall back to card thumbnail
                hero = ld.get("image")
                if not hero:
                    img = card.select_one("picture img")
                    hero = img["src"] if img else None
                listings.append(RawListing(
                    url=detail_url,
                    source_makelaar="dewittegarantiemakelaars",
                    status=status,
                    adres=adres,
                    postcode=postcode,
                    stad=stad,
                    prijs=int(prijs) if prijs else None,
                    woningtype=woningtype,
                    woonoppervlak=woonoppervlak,
                    kamers=int(kamers) if kamers else None,
                    bouwjaar=int(bouwjaar) if bouwjaar else None,
                    hero_image_url=hero,
                ))
                # dev: this only exits the card loop of the current page; the
                # page loop still continues when the page held 10 or more cards.
                if config.APP_ENV == "dev":
                    break
            except Exception as e:
                log.warning("dewitte: parse fout: %s", e)
        # Fewer than 10 cards: treat this as the last page.
        if len(cards) < 10:
            break
        page += 1
    log.info("dewittegarantiemakelaars: %d listings opgehaald", len(listings))
    return listings
# ---------------------------------------------------------------------------
# D&S Makelaars (Schiedam)
# ---------------------------------------------------------------------------

# Site root; relative card links are appended to it.
_DS_BASE = "https://www.densmakelaars.nl"

# Lower-cased status text (card label or detail-page "status" row) ->
# internal status value.
_DS_STATUS_MAP = {
    "onder bod": "onder_bod",
    "te koop": "beschikbaar",
    "nieuw": "beschikbaar",
    "beschikbaar": "beschikbaar",
    "verkocht": "verkocht",
}
def _ds_detail(detail_url: str, html_text: str | None = None) -> dict:
    """Extract kenmerken and postcode from a D&S detail page.

    Kenmerken are read from the page's ``<dt>``/``<dd>`` elements, paired
    positionally. The postcode is taken from a ``q=...,NNNN AA,`` fragment in
    the raw HTML (the Google Maps embed URL).

    Args:
        detail_url: Absolute URL of the detail page (also used in log output).
        html_text: Pre-fetched HTML of the page. When None the page is
            downloaded from ``detail_url``.

    Returns:
        A dict with the keys status, woningtype, bouwjaar, woonoppervlak,
        kamers, slaapkamers, energielabel and postcode. ``status`` is the
        lower-cased text of the page's status row, or "" when the page has no
        such row, so callers can tell "not stated" apart from "beschikbaar".
        The other values are raw page text or None. Any exception, including
        an HTTP error status, is logged and yields an empty dict.
    """
    try:
        # If html_text not provided, fetch it
        if html_text is None:
            import httpx
            r = httpx.get(
                detail_url,
                headers={"User-Agent": config.USER_AGENT},
                timeout=15,
                follow_redirects=True,
            )
            # Without this an error page (404/500) would be parsed as if it
            # were a listing and silently produce empty kenmerken.
            r.raise_for_status()
            html_text = r.text
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(html_text, "html.parser")
        # Parse <dt>/<dd> pairs into a label → value map
        kv: dict[str, str] = {
            dt.get_text(strip=True).lower(): dd.get_text(strip=True)
            for dt, dd in zip(soup.select("dt"), soup.select("dd"))
        }
        # Extract postcode from Google Maps URL in iframe src
        # Pattern: q=...POSTCODE...,CITY where POSTCODE is 4 digits + 2 letters
        postcode = None
        m = re.search(r'q=.+?,(\d{4})\s+([A-Z]{2}),', html_text)
        if m:
            postcode = f"{m.group(1)}{m.group(2)}"
        return {
            # No "beschikbaar" default here: a made-up status would override
            # the status the caller already read from the listing card.
            "status": kv.get("status", "").lower(),
            "woningtype": kv.get("soort woning"),
            "bouwjaar": kv.get("bouwjaar"),
            "woonoppervlak": kv.get("woonoppervlakte"),
            "kamers": kv.get("aantal kamers"),
            "slaapkamers": kv.get("aantal slaapkamers"),
            "energielabel": kv.get("energielabel"),
            "postcode": postcode,
        }
    except Exception as e:
        log.warning("dens: detail fetch fout %s: %s", detail_url, e)
        return {}
def fetch_dens() -> list[RawListing]:
    """Fetch D&S Makelaars listings with full detail pages.

    Pages ``?page=1``, ``?page=2`` ... are fetched until a page has no cards
    or fewer than 10 of them. Card data (status, address, city, price, area,
    rooms, thumbnail) is combined with the detail-page data from
    ``_ds_detail``; detail values win where both are available.

    Returns:
        The parsed listings of all visited pages. A card that raises while
        being processed is logged and skipped.
    """
    listings = []
    page = 1
    while True:
        url = f"{_DS_BASE}/aanbod/koopwoningen?page={page}"
        soup = fetch_soup(url)
        cards = soup.select(".col-12.col-md-4.object-wrapper")
        if not cards:
            break
        for card in cards:
            try:
                # Extract URL
                a_tag = card.select_one("a.property")
                if not a_tag or "href" not in a_tag.attrs:
                    continue
                detail_url = a_tag["href"]
                if not detail_url.startswith("http"):
                    detail_url = _DS_BASE + detail_url
                # Extract listing page data
                status_label = _text(card, "span.label") or "beschikbaar"
                status_label = status_label.strip().lower()
                status = _DS_STATUS_MAP.get(status_label, "beschikbaar")
                adres = _text(card, "h3")
                stad = _text(card, "h4")
                prijs = parse_prijs(_text(card, "div.price"))
                # Extract area and rooms from footer
                woonoppervlak = None
                kamers = None
                for span in card.select("div.footer span"):
                    text = span.get_text(strip=True)
                    lowered = text.lower()
                    # Match the unit explicitly: a membership test against an
                    # empty string is always true, which made the "kamers"
                    # branch below unreachable.
                    if "m²" in lowered or "m2" in lowered:
                        woonoppervlak = parse_m2(text)
                    elif "kamers" in lowered:
                        m = re.search(r"(\d+)", text)
                        if m:
                            kamers = int(m.group(1))
                # Extract hero image (.get: an <img> without src must not
                # cost us the whole listing)
                img_tag = card.select_one("img")
                hero = img_tag.get("src") if img_tag else None
                # Fetch and parse detail page
                detail_data = _ds_detail(detail_url)
                # Use postcode from detail data (extracted from Google Maps URL)
                postcode = detail_data.get("postcode")
                # Determine status from detail page if available
                if detail_data.get("status"):
                    status = _DS_STATUS_MAP.get(detail_data["status"], status)
                listings.append(RawListing(
                    url=detail_url,
                    source_makelaar="dens",
                    adres=adres,
                    postcode=postcode,
                    stad=stad or _infer_stad(postcode),
                    prijs=prijs,
                    status=status,
                    hero_image_url=hero,
                    woningtype=detail_data.get("woningtype"),
                    bouwjaar=int(detail_data["bouwjaar"]) if detail_data.get("bouwjaar") else None,
                    woonoppervlak=parse_m2(detail_data.get("woonoppervlak")) or woonoppervlak,
                    kamers=int(detail_data["kamers"]) if detail_data.get("kamers") else kamers,
                    slaapkamers=int(detail_data["slaapkamers"]) if detail_data.get("slaapkamers") else None,
                    energielabel=detail_data.get("energielabel"),
                ))
                # dev: this only exits the card loop of the current page.
                if config.APP_ENV == "dev":
                    break
            except Exception as e:
                log.warning("dens: parse fout: %s", e)
        # Fewer than 10 cards: treat this as the last page.
        if len(cards) < 10:
            break
        page += 1
    log.info("dens: %d listings opgehaald", len(listings))
    return listings
# ---------------------------------------------------------------------------
# 3D Makelaars (Schiedam/Vlaardingen)
# ---------------------------------------------------------------------------

# Site root; the paths taken from the cards' onclick handlers and relative
# image paths are appended to it.
_3D_BASE = "https://3dmakelaars.nl"
def _3dmakelaars_detail(detail_url: str) -> dict:
    """Fetch a 3dmakelaars detail page and extract the structured info block.

    Numeric kenmerken are parsed leniently (first run of digits), so a value
    such as "5 kamers" or an unexpected year format cannot discard the other
    fields of the page.

    Args:
        detail_url: Absolute URL of the detail page.

    Returns:
        A dict with the keys kamers, slaapkamers, bouwjaar (ints or None),
        woningtype (raw text or None), woonoppervlak (result of ``parse_m2``)
        and postcode. Any exception is logged and yields an empty dict.
    """
    try:
        soup = fetch_soup(detail_url)
        # Parse structured info block: span (label) + p (value) pairs.
        # The selector is kept exactly as written in the scraper; presumably
        # the misspellings mirror the site's markup — do not "correct" them.
        kv: dict[str, str] = {}
        for li in soup.select("div.tl-adiltional-inforamtion ul.tl-adiltional-listed li"):
            label_el = li.select_one("span")
            value_el = li.select_one("p")
            if label_el and value_el:
                kv[label_el.get_text(strip=True).lower()] = value_el.get_text(strip=True)

        def first_int(label: str) -> int | None:
            """Return the first integer in the kenmerk's value, or None."""
            digits = re.search(r"\d+", kv.get(label, ""))
            return int(digits.group()) if digits else None

        # Extract postcode from first description paragraph
        postcode = None
        p_tag = soup.select_one(".omschrijving > p:nth-child(1)")
        if p_tag:
            postcode = _extract_postcode(p_tag.get_text())
        return {
            "kamers": first_int("aantal kamers"),
            "slaapkamers": first_int("aantal slaapkamers"),
            "bouwjaar": first_int("bouwjaar"),
            "woningtype": kv.get("bouwvorm"),
            "woonoppervlak": parse_m2(kv.get("oppervlakte")),
            "postcode": postcode,
        }
    except Exception as e:
        log.warning("3dmakelaars: detail fetch fout %s: %s", detail_url, e)
        return {}
def fetch_3dmakelaars() -> list[RawListing]:
    """Fetch 3D Makelaars listings with pagination.

    Pages are fetched until one has no ``div.tl-properties-item`` cards or
    fewer than 7 of them. The detail URL is taken from each card's
    ``onclick`` handler; card values for area and rooms win over the
    detail-page values from ``_3dmakelaars_detail``.

    Returns:
        The parsed listings of all visited pages. A card that raises while
        being processed is logged and skipped.
    """
    listings = []
    page = 1
    while True:
        url = (
            f"{_3D_BASE}/woningen-te-koop-in-schiedam-en-vlaardingen"
            f"?kamers=&oppervlakte=&woonplaats=&video=&prijs=3&page={page}"
        )
        soup = fetch_soup(url)
        cards = soup.select("div.tl-properties-item")
        if not cards:
            break
        for card in cards:
            try:
                # Extract detail URL from onclick attribute
                onclick = card.get("onclick", "")
                detail_url = None
                if "window.location" in onclick:
                    m = re.search(r"window\.location\s*=\s*['\"]([^'\"]+)['\"]", onclick)
                    if m:
                        target = m.group(1)
                        # Only prefix relative targets, like the other scrapers do.
                        detail_url = target if target.startswith("http") else _3D_BASE + target
                if not detail_url:
                    continue
                # Extract listing-level info
                # NOTE(review): the address is read from "h3.price" and the
                # price from "span.address". Presumably the site's class names
                # are swapped — confirm against the live markup.
                adres = _text(card, "h3.price")
                prijs = parse_prijs(_text(card, "span.address"))
                # Extract rooms and area from meta list
                kamers = None
                woonoppervlak = None
                for li in card.select("ul.tl-meta-listed > li"):
                    text = li.get_text(strip=True)
                    lowered = text.lower()
                    if "kamers" in lowered:
                        m = re.search(r"(\d+)", text)
                        if m:
                            kamers = int(m.group(1))
                    # Match the unit explicitly: a membership test against an
                    # empty string is always true and would let any other
                    # meta item overwrite the area.
                    elif "m²" in lowered or "m2" in lowered:
                        woonoppervlak = parse_m2(text)
                # Extract image
                img_tag = card.select_one("img")
                hero = img_tag.get("src") if img_tag else None
                if hero and not hero.startswith("http"):
                    hero = _3D_BASE + hero
                # Fetch detail page for full info
                detail_data = _3dmakelaars_detail(detail_url)
                # Postcode from detail page, fallback to extraction from address
                postcode = detail_data.get("postcode")
                if not postcode and adres:
                    postcode = _extract_postcode(adres)
                listings.append(RawListing(
                    url=detail_url,
                    source_makelaar="3dmakelaars",
                    adres=adres,
                    postcode=postcode,
                    stad=_infer_stad(postcode),
                    prijs=prijs,
                    woningtype=detail_data.get("woningtype"),
                    bouwjaar=detail_data.get("bouwjaar"),
                    woonoppervlak=woonoppervlak or detail_data.get("woonoppervlak"),
                    kamers=kamers or detail_data.get("kamers"),
                    slaapkamers=detail_data.get("slaapkamers"),
                    hero_image_url=hero,
                ))
                # dev: this only exits the card loop of the current page.
                if config.APP_ENV == "dev":
                    break
            except Exception as e:
                log.warning("3dmakelaars: parse fout: %s", e)
        # Fewer than 7 cards: treat this as the last page.
        if len(cards) < 7:
            break
        page += 1
    log.info("3dmakelaars: %d listings opgehaald", len(listings))
    return listings
# ---------------------------------------------------------------------------
# Dupont ERA Makelaars (Schiedam/Rotterdam)
# ---------------------------------------------------------------------------

# Site root; relative card links and image paths are appended to it.
_DUPONT_BASE = "https://www.dupont.nl"

# Lower-cased div.label text -> internal status value.
# Texts that are not listed here are treated as "beschikbaar" by fetch_dupont.
_DUPONT_STATUS_MAP = {
    "te koop": "beschikbaar",
    "nieuw": "beschikbaar",
    "onder bod": "onder_bod",
    "verkocht onder voorbehoud": "onder_bod",
    "verkocht": "verkocht",
}
def _dupont_detail(detail_url: str) -> dict:
    """Scrape kenmerken and postcode from a single Dupont detail page.

    The page's ``<dt>`` and ``<dd>`` elements are paired positionally into a
    lower-cased label -> text map; the postcode is extracted from the first
    ``section div.container-fluid small`` element.

    Args:
        detail_url: Absolute URL of the detail page.

    Returns:
        A dict with the keys postcode, woningtype, bouwjaar, woonoppervlak,
        kamers, slaapkamers and energielabel (raw page text or None). Any
        exception is logged and yields an empty dict.
    """
    # Result key -> label text as used on the Dupont site.
    wanted = (
        ("woningtype", "soort woning"),
        ("bouwjaar", "bouwjaar"),
        ("woonoppervlak", "woonoppervlakte"),
        ("kamers", "aantal kamers"),
        ("slaapkamers", "aantal slaapkamers"),
        ("energielabel", "energielabel"),
    )
    try:
        page = fetch_soup(detail_url)
        labelled: dict[str, str] = {
            term.get_text(strip=True).lower(): definition.get_text(strip=True)
            for term, definition in zip(page.select("dt"), page.select("dd"))
        }
        # Address line in the format "NNNN AA CITY".
        address_line = page.select_one("section div.container-fluid small")
        details = {
            "postcode": _extract_postcode(address_line.get_text()) if address_line else None,
        }
        details.update((field, labelled.get(label)) for field, label in wanted)
        return details
    except Exception as e:
        log.warning("dupont: detail fetch fout %s: %s", detail_url, e)
        return {}
def fetch_dupont() -> list[RawListing]:
    """Fetch Dupont ERA Makelaars listings with pagination and detail pages.

    Pages ``?page=1``, ``?page=2`` ... are fetched until a page has no
    ``article.object`` cards or fewer than 10 of them. Address, city, price,
    status and thumbnail come from the card; postcode and kenmerken come from
    ``_dupont_detail``.

    Returns:
        The parsed listings of all visited pages. A card that raises while
        being processed is logged and skipped.
    """
    listings = []
    page = 1
    while True:
        url = f"{_DUPONT_BASE}/aanbod/koopwoningen?page={page}"
        soup = fetch_soup(url)
        cards = soup.select("article.object")
        if not cards:
            break
        for card in cards:
            try:
                # Extract URL
                a_tag = card.select_one("a[href]")
                if not a_tag or "href" not in a_tag.attrs:
                    continue
                detail_url = a_tag["href"]
                if not detail_url.startswith("http"):
                    detail_url = _DUPONT_BASE + detail_url
                # Extract listing-level data
                # NOTE(review): unlike several sibling scrapers, no
                # config.MAX_PRICE filter is applied here — confirm that
                # filtering happens downstream.
                adres = _text(card, "h3")
                stad = _text(card, "h4")
                prijs_text = _text(card, "div.price")
                prijs = parse_prijs(prijs_text)
                # Extract status from label
                status_label = _text(card, "div.label") or "beschikbaar"
                status_label = status_label.strip().lower()
                status = _DUPONT_STATUS_MAP.get(status_label, "beschikbaar")
                # Extract image
                img_tag = card.select_one("img.img-responsive")
                hero = img_tag["src"] if img_tag else None
                if hero and not hero.startswith("http"):
                    hero = _DUPONT_BASE + hero
                # Fetch detail page for full data
                detail_data = _dupont_detail(detail_url)
                # Use postcode from detail if available
                postcode = detail_data.get("postcode")
                listings.append(RawListing(
                    url=detail_url,
                    source_makelaar="dupont",
                    adres=adres,
                    postcode=postcode,
                    stad=stad or _infer_stad(postcode),
                    prijs=prijs,
                    status=status,
                    hero_image_url=hero,
                    woningtype=detail_data.get("woningtype"),
                    bouwjaar=int(detail_data["bouwjaar"]) if detail_data.get("bouwjaar") else None,
                    woonoppervlak=parse_m2(detail_data.get("woonoppervlak")),
                    kamers=int(detail_data["kamers"]) if detail_data.get("kamers") else None,
                    slaapkamers=int(detail_data["slaapkamers"]) if detail_data.get("slaapkamers") else None,
                    energielabel=detail_data.get("energielabel"),
                ))
                # dev: this only exits the card loop of the current page; the
                # page loop still continues when the page held 10 or more cards.
                if config.APP_ENV == "dev":
                    break
            except Exception as e:
                # Also reached when an int() conversion above fails; the listing is dropped.
                log.warning("dupont: parse fout: %s", e)
        # Fewer than 10 cards: treat this as the last page.
        if len(cards) < 10:
            break
        page += 1
    log.info("dupont: %d listings opgehaald", len(listings))
    return listings

630
src/adapters/ssr/sure.py Normal file
View File

@@ -0,0 +1,630 @@
"""
SURE WordPress plugin scrapers.
All makelaars here use the SURE real estate plugin for WordPress. Listings
are at /wonen?sure_koop_huur=koop with pagination via /wonen/page/{N}/.
Cards use class a.card-house or div.card.card--house.
Detail pages have a #kenmerken section with label/value pairs.
Scrapers: schielandborsboom, olsthoorn, vanherk, borgdorff
"""
import re
import config
from huizenbot import RawListing
from ._shared import fetch_soup, parse_prijs, parse_m2, _text, _extract_postcode, log
# ---------------------------------------------------------------------------
# Schieland Borsboom NVM Makelaars (Rotterdam, active in Schiedam)
# ---------------------------------------------------------------------------

# Site root; relative card links and image paths are appended to it.
_SCHIELAND_BASE = "https://www.schielandborsboom.nl"

# CSS class on the card thumbnail -> internal status value.
# Keys are class names, not the human-readable status text.
_SCHIELAND_STATUS_MAP = {
    "sure-status-available": "beschikbaar",
    "sure-status-under_bid": "onder_bod",
    "sure-status-sold": "verkocht",
}
def _schieland_detail(detail_url: str) -> dict:
    """Scrape postcode and kenmerken from a Schieland Borsboom detail page.

    The postcode comes from ``div.house__status p``. Kenmerken are the
    ``<li><strong>label</strong><span>value</span></li>`` rows inside
    ``#kenmerken``; links nested in a value are removed before its text is
    read.

    Args:
        detail_url: Absolute URL of the detail page.

    Returns:
        A dict with the keys postcode, status (lower-cased text, "" when
        absent), woningtype, bouwjaar, woonoppervlak, perceeloppervlak,
        kamers, slaapkamers and energielabel (raw page text or None). Any
        exception is logged and yields an empty dict.
    """
    # Result key -> label text as used on the site.
    wanted = (
        ("woningtype", "soort bouw"),
        ("bouwjaar", "bouwjaar"),
        ("woonoppervlak", "woonoppervlakte"),
        ("perceeloppervlak", "perceeloppervlakte"),
        ("kamers", "aantal kamers"),
        ("slaapkamers", "aantal slaapkamers"),
        ("energielabel", "energielabel"),
    )
    try:
        page = fetch_soup(detail_url)
        # e.g. "3117 DP Schiedam"
        location = page.select_one("div.house__status p")
        postcode = None
        if location:
            postcode = _extract_postcode(location.get_text())

        labelled: dict[str, str] = {}
        section = page.select_one("#kenmerken")
        rows = section.select("li") if section else []
        for row in rows:
            name_node = row.select_one("strong")
            value_node = row.select_one("span")
            if not (name_node and value_node):
                continue
            # Drop nested links (e.g. "Hypotheek berekenen") from the value.
            for link in value_node.select("a"):
                link.decompose()
            labelled[name_node.get_text(strip=True).lower()] = value_node.get_text(strip=True)

        details = {
            "postcode": postcode,
            "status": labelled.get("status", "").lower(),
        }
        details.update((field, labelled.get(label)) for field, label in wanted)
        return details
    except Exception as e:
        log.warning("schielandborsboom: detail fetch fout %s: %s", detail_url, e)
        return {}
# Lower-cased status text from the detail page's kenmerken -> internal status.
# The card-level map above is keyed on CSS classes and can never match this
# text, so the detail page needs its own map.
_SCHIELAND_DETAIL_STATUS_MAP = {
    "beschikbaar": "beschikbaar",
    "te koop": "beschikbaar",
    "onder bod": "onder_bod",
    "onder optie": "onder_bod",
    "verkocht onder voorbehoud": "onder_bod",
    "verkocht": "verkocht",
}


def fetch_schielandborsboom() -> list[RawListing]:
    """Fetch Schieland Borsboom NVM listings (koop only, Schiedam).

    Page 1 is ``/wonen?sure_koop_huur=koop``; later pages use
    ``/wonen/page/{N}/``. Pagination stops at an empty page or a page with
    fewer than 18 cards. Cards outside Schiedam or above ``config.MAX_PRICE``
    are skipped before the detail page is requested. Detail-page values win
    over card values where both exist.

    Returns:
        The parsed listings of all visited pages. A card that raises while
        being processed is logged and skipped.
    """
    listings = []
    page = 1
    while True:
        if page == 1:
            url = f"{_SCHIELAND_BASE}/wonen?sure_koop_huur=koop"
        else:
            url = f"{_SCHIELAND_BASE}/wonen/page/{page}/?sure_koop_huur=koop"
        soup = fetch_soup(url)
        cards = soup.select("div.card.card--house")
        if not cards:
            break
        for card in cards:
            try:
                a_tag = card.select_one("a.card__anchor")
                if not a_tag or "href" not in a_tag.attrs:
                    continue
                detail_url = a_tag["href"]
                if not detail_url.startswith("http"):
                    detail_url = _SCHIELAND_BASE + detail_url
                # Filter: only Schiedam
                stad_el = card.select_one("p.house-place")
                stad = stad_el.get_text(strip=True) if stad_el else None
                if not stad or stad.lower() != "schiedam":
                    continue
                # Status from the thumbnail's status class. Pick the first
                # class that is a known status instead of the first class
                # that is not "card-house__thumb", so unrelated utility
                # classes cannot mask the real status.
                thumb = card.select_one("div.card-house__thumb")
                thumb_classes = thumb.get("class", []) if thumb else []
                status = next(
                    (
                        _SCHIELAND_STATUS_MAP[c.lower()]
                        for c in thumb_classes
                        if c.lower() in _SCHIELAND_STATUS_MAP
                    ),
                    "beschikbaar",
                )
                # Price
                prijs = parse_prijs(_text(card, "p.price"))
                if prijs and prijs > config.MAX_PRICE:
                    continue
                adres = _text(card, "h4.house-street")
                # Hero image from picture source (medium size), falling back
                # to the plain <img> when the source or its srcset is missing.
                src_tag = card.select_one('picture source[media="(min-width:100px)"]')
                hero = src_tag.get("srcset") if src_tag else None
                if not hero:
                    img = card.select_one("img")
                    hero = img.get("src") if img else None
                if hero and not hero.startswith("http"):
                    hero = _SCHIELAND_BASE + hero
                # Data icons on card: surface, bedrooms, energy label
                woonoppervlak_card = None
                slaapkamers_card = None
                energielabel_card = None
                for data_div in card.select("div.data"):
                    txt = data_div.get_text(strip=True)
                    if data_div.select_one("i.icon-surface"):
                        woonoppervlak_card = parse_m2(txt)
                    elif data_div.select_one("i.icon-bedrooms"):
                        m = re.search(r"(\d+)", txt)
                        slaapkamers_card = int(m.group(1)) if m else None
                    elif data_div.select_one("i.icon-label"):
                        energielabel_card = txt.strip() or None
                # Fetch detail page for full kenmerken
                kk = _schieland_detail(detail_url)
                # Refine status from detail page. The detail status is
                # human-readable text, so it is looked up in the text map
                # first; the class map is kept as a fallback and the card
                # status is retained when neither matches.
                detail_status = kk.get("status")
                if detail_status:
                    status = _SCHIELAND_DETAIL_STATUS_MAP.get(
                        detail_status,
                        _SCHIELAND_STATUS_MAP.get(detail_status, status),
                    )
                # Parse kamers: "5 kamers" → 5
                kamers = None
                if kk.get("kamers"):
                    m = re.search(r"(\d+)", kk["kamers"])
                    kamers = int(m.group(1)) if m else None
                # Parse slaapkamers: "3" or "3 slaapkamers" → 3
                slaapkamers = slaapkamers_card
                if kk.get("slaapkamers"):
                    m = re.search(r"(\d+)", kk["slaapkamers"])
                    slaapkamers = int(m.group(1)) if m else slaapkamers_card
                listings.append(RawListing(
                    url=detail_url,
                    source_makelaar="schielandborsboom",
                    status=status,
                    adres=adres,
                    postcode=kk.get("postcode"),
                    stad=stad,
                    prijs=prijs,
                    hero_image_url=hero,
                    woningtype=kk.get("woningtype"),
                    bouwjaar=int(kk["bouwjaar"]) if kk.get("bouwjaar") else None,
                    woonoppervlak=parse_m2(kk.get("woonoppervlak")) or woonoppervlak_card,
                    perceeloppervlak=parse_m2(kk.get("perceeloppervlak")),
                    kamers=kamers,
                    slaapkamers=slaapkamers,
                    energielabel=kk.get("energielabel") or energielabel_card,
                ))
                # dev: this only exits the card loop of the current page.
                if config.APP_ENV == "dev":
                    break
            except Exception as e:
                log.warning("schielandborsboom: parse fout: %s", e)
        # Fewer than 18 cards: treat this as the last page.
        if len(cards) < 18:
            break
        page += 1
    log.info("schielandborsboom: %d listings opgehaald", len(listings))
    return listings
# ---------------------------------------------------------------------------
# Olsthoorn Makelaars Delft (SURE WordPress plugin)
# ---------------------------------------------------------------------------
# Covers Delft, Den Haag, Naaldwijk etc — we filter for Delft only.
# Detail page has no postcode; leave as None.

# Site root; relative card links and image paths are appended to it.
_OLSTHOORN_BASE = "https://www.olsthoornmakelaars.nl"

# Badge CSS class on the card's label span -> internal status value.
_OLSTHOORN_STATUS_MAP = {
    "badge-available": "beschikbaar",
    "badge-bid": "onder_bod",
    "badge-option": "onder_bod",
    "badge-sold": "verkocht",
}

# Lower-cased status text from the detail page's kenmerken -> internal status
# value. Used to refine the status that was read from the card.
_OLSTHOORN_DETAIL_STATUS_MAP = {
    "beschikbaar": "beschikbaar",
    "onder bod": "onder_bod",
    "onder optie": "onder_bod",
    "verkocht": "verkocht",
}
def _olsthoorn_detail(detail_url: str) -> dict:
    """Scrape the kenmerken of a single Olsthoorn detail page.

    Every ``#kenmerken li`` with at least two ``<span>`` descendants
    contributes one pair: first span = label (lower-cased), second span =
    value.

    Args:
        detail_url: Absolute URL of the detail page.

    Returns:
        A dict with the keys status (lower-cased text, "" when absent),
        woningtype, bouwjaar, woonoppervlak, perceeloppervlak, kamers,
        slaapkamers and energielabel (raw page text or None). Any exception
        is logged and yields an empty dict.
    """
    try:
        page = fetch_soup(detail_url)
        labelled: dict[str, str] = {}
        for row in page.select("#kenmerken li"):
            cells = row.select("span")
            if len(cells) < 2:
                continue
            name_node, value_node = cells[0], cells[1]
            labelled[name_node.get_text(strip=True).lower()] = value_node.get_text(strip=True)

        # The type of dwelling appears under one of three labels; the first
        # non-empty one wins.
        woningtype = (
            labelled.get("soort object")
            or labelled.get("soort woning")
            or labelled.get("soort bouw")
        )
        return {
            "status": labelled.get("status", "").lower(),
            "woningtype": woningtype,
            "bouwjaar": labelled.get("bouwjaar"),
            "woonoppervlak": labelled.get("gebruiksoppervlakte"),
            "perceeloppervlak": labelled.get("perceeloppervlakte"),
            "kamers": labelled.get("aantal kamers"),
            "slaapkamers": labelled.get("aantal slaapkamers"),
            "energielabel": labelled.get("energielabel"),
        }
    except Exception as e:
        log.warning("olsthoorn: detail fetch fout %s: %s", detail_url, e)
        return {}
def fetch_olsthoorn() -> list[RawListing]:
    """Fetch Olsthoorn Makelaars listings; only Delft, only koop.

    Page 1 is ``/wonen?sure_koop_huur=koop``; later pages use
    ``/wonen/page/{N}/``. Pagination stops at an empty page or a page with
    fewer than 15 cards. Cards outside Delft or above ``config.MAX_PRICE``
    are skipped before the detail page is requested. Detail-page values win
    over card values where both exist; the postcode is always None.

    Returns:
        The parsed listings of all visited pages. A card that raises while
        being processed is logged and skipped.
    """
    listings = []
    page = 1
    while True:
        if page == 1:
            url = f"{_OLSTHOORN_BASE}/wonen?sure_koop_huur=koop"
        else:
            url = f"{_OLSTHOORN_BASE}/wonen/page/{page}/?sure_koop_huur=koop"
        soup = fetch_soup(url)
        cards = soup.select("a.card-house")
        if not cards:
            break
        for card in cards:
            try:
                href = card.get("href", "")
                if not href:
                    continue
                detail_url = href if href.startswith("http") else _OLSTHOORN_BASE + href
                # Filter: only Delft
                stad_el = card.select_one("h2.card__title")
                stad = stad_el.get_text(strip=True) if stad_el else None
                if not stad or stad.lower() != "delft":
                    continue
                # Price from bold tag — filter early before detail fetch
                prijs_b = card.select_one("b")
                prijs = parse_prijs(prijs_b.get_text() if prijs_b else None)
                if prijs and prijs > config.MAX_PRICE:
                    continue
                # Status from badge class on label span
                label_span = card.select_one("span.card-house__label")
                status = "beschikbaar"
                if label_span:
                    for cls in label_span.get("class", []):
                        if cls in _OLSTHOORN_STATUS_MAP:
                            status = _OLSTHOORN_STATUS_MAP[cls]
                            break
                # Address: first <p> that is a direct child of .short--info
                # (collapse internal whitespace).
                # NOTE(review): this comment used to say "second <p>", but the
                # code takes index 0 — confirm which one holds the address.
                adres_p = card.select("div.short--info > p")
                if adres_p:
                    adres = " ".join(adres_p[0].get_text().split())
                else:
                    adres = None
                # Hero image: largest source; the URL is read from data-srcset
                src_tag = card.select_one('picture source[media="(min-width:1024px)"]')
                hero = src_tag.get("data-srcset") if src_tag else None
                if hero and not hero.startswith("http"):
                    hero = _OLSTHOORN_BASE + hero
                # Woonoppervlak + kamers + energielabel from card data icons
                woonoppervlak_card = None
                kamers_card = None
                energielabel_card = None
                for data_div in card.select("div.data"):
                    inner = data_div.select_one("span.date__inner")
                    if not inner:
                        continue
                    txt = inner.get_text(strip=True)
                    if data_div.select_one("i.icon-sizes"):
                        woonoppervlak_card = parse_m2(txt)
                    elif data_div.select_one("i.icon-door"):
                        m = re.search(r"(\d+)", txt)
                        kamers_card = int(m.group(1)) if m else None
                    elif data_div.select_one("i.icon-energylabel"):
                        energielabel_card = txt or None
                kk = _olsthoorn_detail(detail_url)
                # Refine status from detail page
                detail_status = _OLSTHOORN_DETAIL_STATUS_MAP.get(kk.get("status", ""), "")
                if detail_status:
                    status = detail_status
                # NOTE: the int() conversions below raise ValueError on
                # non-numeric text (e.g. "5 kamers"); the except then drops
                # the whole listing.
                listings.append(RawListing(
                    url=detail_url,
                    source_makelaar="olsthoorn",
                    status=status,
                    adres=adres,
                    postcode=None,  # not exposed by broker
                    stad=stad,
                    prijs=prijs,
                    hero_image_url=hero,
                    woningtype=kk.get("woningtype"),
                    bouwjaar=int(kk["bouwjaar"]) if kk.get("bouwjaar") else None,
                    woonoppervlak=parse_m2(kk.get("woonoppervlak")) or woonoppervlak_card,
                    perceeloppervlak=parse_m2(kk.get("perceeloppervlak")),
                    kamers=int(kk["kamers"]) if kk.get("kamers") else kamers_card,
                    slaapkamers=int(kk["slaapkamers"]) if kk.get("slaapkamers") else None,
                    energielabel=kk.get("energielabel") or energielabel_card,
                ))
                # dev: this only exits the card loop of the current page; the
                # page loop still continues when the page held 15 or more cards.
                if config.APP_ENV == "dev":
                    break
            except Exception as e:
                log.warning("olsthoorn: parse fout: %s", e)
        # Fewer than 15 cards: treat this as the last page.
        if len(cards) < 15:
            break
        page += 1
    log.info("olsthoorn: %d listings opgehaald", len(listings))
    return listings
# ---------------------------------------------------------------------------
# Van Herk Makelaars (Schiedam) — SURE WordPress plugin (card-house)
# ---------------------------------------------------------------------------
# Listings filtered by city + price in URL; pagination via /page/{N}/.
# Detail page: div.features ul.unstyled li with two <span> (label + value).

# Site root; relative card links and image paths are appended to it.
_VANHERK_BASE = "https://www.vanherk.nl"

# First listings page; later pages append "page/{N}/".
# NOTE(review): the 200000-300000 price range is hard-coded in this URL
# rather than derived from config.MAX_PRICE — confirm this is intended.
_VANHERK_LISTINGS = "https://www.vanherk.nl/wonen/aanbod/zoeken/schiedam/200000-300000/"

# Lower-cased status text -> internal status value.
_VANHERK_STATUS_MAP = {
    "beschikbaar": "beschikbaar",
    "onder bod": "onder_bod",
    "onder optie": "onder_bod",
    "verkocht": "verkocht",
}
def _vanherk_detail(detail_url: str) -> dict:
    """Scrape the kenmerken list from a Van Herk detail page.

    Every ``div.features ul.unstyled li`` that contains at least two
    ``<span>`` elements is read as a (label, value) pair; labels are
    lower-cased and a repeated label keeps its last value.

    Returns:
        A dict with the keys ``status`` (lower-cased, ``""`` when absent),
        ``bouwjaar``, ``woonoppervlak`` and ``slaapkamers`` (each the raw
        page text, or ``None`` when the label is missing). If fetching or
        parsing raises, a warning is logged and ``{}`` is returned.
    """
    features: dict[str, str] = {}
    try:
        page = fetch_soup(detail_url)
        for item in page.select("div.features ul.unstyled li"):
            cells = item.select("span")
            if len(cells) < 2:
                # Not a label/value row.
                continue
            key = cells[0].get_text(strip=True).lower()
            features[key] = cells[1].get_text(strip=True)
    except Exception as e:
        log.warning("vanherk: detail fetch fout %s: %s", detail_url, e)
        return {}

    status_text = features.get("status", "")
    return {
        "status": status_text.lower(),
        "bouwjaar": features.get("bouwjaar"),
        "woonoppervlak": features.get("woonoppervlakte"),
        "slaapkamers": features.get("aantal slaapkamers"),
    }
def fetch_vanherk() -> list[RawListing]:
    """Fetch Van Herk listings; only Schiedam, only koop.

    Walks the paginated search results starting at ``_VANHERK_LISTINGS``
    (page 1 is the bare URL, later pages append ``page/{N}/``). For every
    ``a.card-house`` card the summary fields are read from the card and then
    enriched with the detail page via ``_vanherk_detail``. Cards priced above
    ``config.MAX_PRICE`` are skipped before the detail page is fetched.

    Paging stops when a page has no cards or fewer than 15 cards.

    Returns:
        The collected ``RawListing`` objects. A card that fails to parse is
        logged as a warning and skipped; it does not abort the run.
    """
    listings = []
    page = 1
    while True:
        if page == 1:
            url = _VANHERK_LISTINGS
        else:
            url = _VANHERK_LISTINGS + f"page/{page}/"
        soup = fetch_soup(url)
        cards = soup.select("a.card-house")
        if not cards:
            break
        for card in cards:
            try:
                href = card.get("href", "")
                if not href:
                    continue
                detail_url = href if href.startswith("http") else _VANHERK_BASE + href

                # City from lead paragraph
                lead = card.select_one("p.lead")
                stad = lead.get_text(strip=True) if lead else None

                # Address from h4 (normalize whitespace incl. &nbsp;)
                h4 = card.select_one("h4")
                adres = " ".join(h4.get_text().split()) if h4 else None

                # Price from .subtitle; filter early to avoid a detail fetch
                subtitle = card.select_one("p.subtitle")
                prijs = parse_prijs(subtitle.get_text() if subtitle else None)
                if prijs and prijs > config.MAX_PRICE:
                    continue

                # Hero image: largest srcset source
                src_tag = card.select_one('picture source[media="(min-width:1280px)"]')
                hero = src_tag.get("srcset") if src_tag else None
                if hero and not hero.startswith("http"):
                    hero = _VANHERK_BASE + hero

                # Card data icons: surface, bedrooms, energy label
                woonoppervlak_card = None
                slaapkamers_card = None
                energielabel_card = None
                for data_div in card.select("div.data"):
                    classes = data_div.get("class") or []
                    if "d-none" in classes:
                        # Skip data blocks carrying the d-none class.
                        continue
                    if "data-energie" in classes:
                        inner = data_div.select_one(".date__inner")
                        energielabel_card = inner.get_text(strip=True) if inner else None
                    elif data_div.select_one("i.icon-surface"):
                        inner = data_div.select_one("span.date__inner")
                        woonoppervlak_card = parse_m2(inner.get_text(strip=True) if inner else None)
                    elif data_div.select_one("i.icon-bed"):
                        inner = data_div.select_one("span.date__inner")
                        txt = inner.get_text(strip=True) if inner else None
                        m = re.search(r"(\d+)", txt) if txt else None
                        slaapkamers_card = int(m.group(1)) if m else None

                kk = _vanherk_detail(detail_url)
                status = _VANHERK_STATUS_MAP.get(kk.get("status", ""), "beschikbaar")

                # _vanherk_detail stores None for labels that are missing on
                # the page, so kk.get(key, "") would hand back None and
                # .isdigit() would raise, dropping the whole listing. Coerce
                # to "" first so a missing value simply yields None.
                bouwjaar_txt = kk.get("bouwjaar") or ""
                slaapkamers_txt = kk.get("slaapkamers") or ""
                bouwjaar = int(bouwjaar_txt) if bouwjaar_txt.isdigit() else None
                slaapkamers_detail = int(slaapkamers_txt) if slaapkamers_txt.isdigit() else None

                listings.append(RawListing(
                    url=detail_url,
                    source_makelaar="vanherk",
                    status=status,
                    adres=adres,
                    stad=stad,
                    prijs=prijs,
                    hero_image_url=hero,
                    bouwjaar=bouwjaar,
                    # Detail-page values win; card values are the fallback.
                    woonoppervlak=parse_m2(kk.get("woonoppervlak")) or woonoppervlak_card,
                    slaapkamers=slaapkamers_detail or slaapkamers_card,
                    energielabel=energielabel_card,
                ))
                # NOTE(review): in dev this only leaves the card loop after the
                # first stored listing; the page loop continues as usual.
                if config.APP_ENV == "dev":
                    break
            except Exception as e:
                log.warning("vanherk: parse fout: %s", e)
        # Fewer than 15 cards is treated as the last result page.
        if len(cards) < 15:
            break
        page += 1
    log.info("vanherk: %d listings opgehaald", len(listings))
    return listings
# ---------------------------------------------------------------------------
# Borgdorff Makelaars (Den Haag / Westland) — SURE WordPress plugin
# ---------------------------------------------------------------------------
# The site lists Den Haag ('s-gravenhage) alongside Monster, Naaldwijk etc.,
# so results are filtered down to Den Haag.
# Same SURE plugin as Schieland Borsboom, but the card selector is
# a.card--house (double dash).
# The detail page carries no postcode.
_BORGDORFF_BASE = "https://www.borgdorff.nl"
# Lower-cased city names that count as Den Haag.
_BORGDORFF_DEN_HAAG = {"den haag", "'s-gravenhage"}
# CSS class on the card label -> internal status value.
_BORGDORFF_BADGE_MAP = {
    "badge--info": "beschikbaar",
    "badge--warning": "onder_bod",
    "badge--danger": "verkocht",
}
# Lower-cased status text from the detail page -> internal status value.
_BORGDORFF_DETAIL_STATUS_MAP = {
    "beschikbaar": "beschikbaar",
    **dict.fromkeys(("onder bod", "onder optie"), "onder_bod"),
    "verkocht": "verkocht",
}
def _borgdorff_detail(detail_url: str) -> dict:
    """Scrape the ``#kenmerken`` label/value list from a Borgdorff detail page.

    Every ``#kenmerken li`` with at least two ``<span>`` elements is read as
    a (label, value) pair; labels are lower-cased and a repeated label keeps
    its last value.

    Returns:
        A dict with the keys ``status`` (lower-cased, ``""`` when absent),
        ``woningtype``, ``bouwjaar``, ``woonoppervlak``, ``perceeloppervlak``,
        ``slaapkamers`` and ``energielabel`` holding the raw page text.
        If fetching or parsing raises, a warning is logged and ``{}`` is
        returned.
    """
    try:
        page = fetch_soup(detail_url)
        span_groups = (item.select("span") for item in page.select("#kenmerken li"))
        features: dict[str, str] = {
            cells[0].get_text(strip=True).lower(): cells[1].get_text(strip=True)
            for cells in span_groups
            if len(cells) >= 2
        }
    except Exception as e:
        log.warning("borgdorff: detail fetch fout %s: %s", detail_url, e)
        return {}

    # The house type appears under one of several labels; first non-empty wins.
    woningtype = (
        features.get("soort woonhuis")
        or features.get("soort woning")
        or features.get("soort bouw")
    )
    # Prefer the specific "wonen" surface, fall back to the generic label.
    woonoppervlak = (
        features.get("gebruiksoppervlakte wonen")
        or features.get("gebruiksoppervlakte")
    )
    status_text = features.get("status", "")
    return {
        "status": status_text.lower(),
        "woningtype": woningtype,
        "bouwjaar": features.get("bouwjaar"),
        "woonoppervlak": woonoppervlak,
        "perceeloppervlak": features.get("perceeloppervlakte"),
        "slaapkamers": features.get("aantal slaapkamers"),
        "energielabel": features.get("energielabel"),
    }
def fetch_borgdorff() -> list[RawListing]:
    """Fetch Borgdorff listings; only Den Haag / 's-gravenhage, only koop.

    Walks the paginated ``/wonen`` results with ``sure_koop_huur=koop``.
    For every ``a.card--house`` card whose city is in ``_BORGDORFF_DEN_HAAG``
    and whose price does not exceed ``config.MAX_PRICE``, the summary fields
    are read from the card and enriched with the detail page via
    ``_borgdorff_detail``.

    Paging stops when a page has no cards or fewer than 15 cards.

    Returns:
        The collected ``RawListing`` objects. A card that fails to parse is
        logged as a warning and skipped; it does not abort the run.
    """
    listings = []
    page = 1
    while True:
        if page == 1:
            url = f"{_BORGDORFF_BASE}/wonen?sure_koop_huur=koop"
        else:
            url = f"{_BORGDORFF_BASE}/wonen/page/{page}/?sure_koop_huur=koop"
        soup = fetch_soup(url)
        cards = soup.select("a.card--house")
        if not cards:
            break
        for card in cards:
            try:
                href = card.get("href", "")
                if not href:
                    continue
                detail_url = href if href.startswith("http") else _BORGDORFF_BASE + href

                # Filter: only Den Haag
                stad_el = card.select_one("p.lead-two")
                stad = stad_el.get_text(strip=True) if stad_el else None
                if not stad or stad.lower() not in _BORGDORFF_DEN_HAAG:
                    continue

                # Price — filter early to avoid a detail fetch
                prijs = parse_prijs(_text(card, "p.strong"))
                if prijs and prijs > config.MAX_PRICE:
                    continue

                # Status from badge class; defaults to "beschikbaar"
                label_span = card.select_one("span.card-house__label")
                status = "beschikbaar"
                if label_span:
                    for cls in label_span.get("class", []):
                        if cls in _BORGDORFF_BADGE_MAP:
                            status = _BORGDORFF_BADGE_MAP[cls]
                            break

                # Address
                adres = _text(card, "h4")

                # Hero: largest source srcset, else the lazy-loaded img
                src_tag = card.select_one('picture source[media="(min-width:1280px)"]')
                hero = src_tag.get("srcset") if src_tag else None
                if not hero:
                    img = card.select_one("img[data-src]")
                    hero = img.get("data-src") if img else None
                if hero and not hero.startswith("http"):
                    hero = _BORGDORFF_BASE + hero

                # Surface + bedrooms from data icons
                woonoppervlak_card = None
                slaapkamers_card = None
                for data_div in card.select("div.data"):
                    inner = data_div.select_one("p.small")
                    if not inner:
                        continue
                    txt = inner.get_text(strip=True)
                    if data_div.select_one("i.icon-surface"):
                        woonoppervlak_card = parse_m2(txt)
                    elif data_div.select_one("i.icon-bed"):
                        m = re.search(r"(\d+)", txt)
                        slaapkamers_card = int(m.group(1)) if m else None

                kk = _borgdorff_detail(detail_url)

                # Refine status from detail page; unknown text keeps the badge status
                if kk.get("status"):
                    status = _BORGDORFF_DETAIL_STATUS_MAP.get(kk["status"], status)

                # Only convert purely numeric text. A bare int() on any other
                # value would raise and the except below would discard the
                # whole listing over a single unparseable field.
                bouwjaar_txt = kk.get("bouwjaar") or ""
                slaapkamers_txt = kk.get("slaapkamers") or ""
                bouwjaar = int(bouwjaar_txt) if bouwjaar_txt.isdigit() else None
                if slaapkamers_txt.isdigit():
                    slaapkamers = int(slaapkamers_txt)
                else:
                    slaapkamers = slaapkamers_card

                listings.append(RawListing(
                    url=detail_url,
                    source_makelaar="borgdorff",
                    status=status,
                    adres=adres,
                    postcode=None,  # not exposed by broker
                    stad=stad,
                    prijs=prijs,
                    hero_image_url=hero,
                    woningtype=kk.get("woningtype"),
                    bouwjaar=bouwjaar,
                    # Detail-page value wins; card value is the fallback.
                    woonoppervlak=parse_m2(kk.get("woonoppervlak")) or woonoppervlak_card,
                    perceeloppervlak=parse_m2(kk.get("perceeloppervlak")),
                    slaapkamers=slaapkamers,
                    energielabel=kk.get("energielabel"),
                ))
                # NOTE(review): in dev this only leaves the card loop after the
                # first stored listing; the page loop continues as usual.
                if config.APP_ENV == "dev":
                    break
            except Exception as e:
                log.warning("borgdorff: parse fout: %s", e)
        # Fewer than 15 cards is treated as the last result page.
        if len(cards) < 15:
            break
        page += 1
    log.info("borgdorff: %d listings opgehaald", len(listings))
    return listings

View File

@@ -159,9 +159,22 @@ def upsert(conn: sqlite3.Connection, listing: RawListing, travel: dict[str,int])
"extra": json.dumps(listing.extra) if listing.extra else None,
})
else:
_cursor = conn.execute("""
UPDATE woningen SET last_seen = ?, status = ? WHERE id = ?
""", (now, listing.status, lid))
if travel:
conn.execute("""
UPDATE woningen
SET last_seen = ?, status = ?,
fiets_mark = ?, fiets_michelle = ?, ov_mark = ?, ov_michelle = ?
WHERE id = ?
""", (
now, listing.status,
travel.get("fiets_mark"), travel.get("fiets_michelle"),
travel.get("ov_mark"), travel.get("ov_michelle"),
lid,
))
else:
conn.execute("""
UPDATE woningen SET last_seen = ?, status = ? WHERE id = ?
""", (now, listing.status, lid))
conn.commit()
return is_new
@@ -391,11 +404,13 @@ def run(scrapers: dict[str,Scraper], db_path: str) -> None:
travel = {}
try:
lid = listing_id(listing.url)
is_existing = conn.execute(
"SELECT id FROM woningen WHERE id = ?", (lid,)
).fetchone() is not None
row = conn.execute(
"SELECT fiets_mark FROM woningen WHERE id = ?", (lid,)
).fetchone()
is_existing = row is not None
needs_travel = not is_existing or row[0] is None
if not is_existing:
if needs_travel:
travel = bereken_reistijden(listing.postcode, listing.stad)
is_new = upsert(conn, listing, travel)