refactor: split ssr.py into package, enrich OG Online detail pages, fix travel upsert

- Split src/adapters/ssr.py (2160 LOC) into ssr/ package grouped by CMS:
  realworks.py, sure.py, schiedam.py, denhaag.py, overige.py
- Add _og_detail() to api.py; all OG Online scrapers now fall back to
  detail page fetch when energielabel/bouwjaar are missing from the API
- Fix run() to recalculate travel times for existing listings where
  fiets_mark IS NULL; upsert() now writes travel cols on existing rows too
- Update tests/cache.py to patch fetch_soup in every ssr submodule
- Update docs to reflect new package structure and mark API enrichment TODO done

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-11 23:39:35 +02:00
parent 1011d9cf87
commit f74e9bcfb0
14 changed files with 2478 additions and 2199 deletions

View File

@@ -7,9 +7,11 @@ Voeg nieuwe toe onderaan en registreer in SCRAPERS.
import json
import logging
import re
import time
import httpx
from bs4 import BeautifulSoup
import config
from huizenbot import RawListing
@@ -40,8 +42,71 @@ def fetch_json(url: str, *, params: dict = None, headers: dict = None) -> dict |
return r.json()
raise RuntimeError(f"Blijvend 429 op {url}")
def _og_detail(url: str, makelaar: str) -> dict:
    """
    Fetch an OG Online detail page and extract fields missing from the API.

    The page is downloaded with httpx and parsed with BeautifulSoup; the
    actual field extraction lives in ``_og_extract_fields``.

    Args:
        url: Absolute URL of the listing's detail page.
        makelaar: Short broker name; only used to prefix the warning log line.

    Returns:
        ``{"energielabel": str | None, "bouwjaar": int | None}`` when the page
        could be fetched and parsed, or an empty dict on any failure.
    """
    try:
        r = httpx.get(
            url,
            headers={"User-Agent": config.USER_AGENT},
            timeout=15,
            follow_redirects=True,
        )
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        return _og_extract_fields(soup)
    except Exception as e:
        # Deliberately broad: enrichment is best-effort, and a broken detail
        # page must never abort the scraper that called us.
        log.warning("%s: detail fetch fout %s: %s", makelaar, url, e)
        return {}


def _og_extract_fields(soup) -> dict:
    """
    Extract energielabel and bouwjaar from a parsed detail page.

    Three sources are tried:
      1. A CSS class of the form ``energielabel-<X>`` on any element.
      2. ``dt``/``dd`` pairs (definition lists).
      3. ``li`` elements holding at least two ``span`` children
         (label span followed by value span) — only when step 2 found nothing.
    """
    # Source 1: energielabel encoded in a CSS class (energielabel-a, ...).
    prefix = "energielabel-"
    energielabel = None
    for el in soup.select("[class]"):
        for cls in el.get("class", []):
            # Require something after the prefix so a bare "energielabel-"
            # class does not stop the search with an empty label.
            if cls.startswith(prefix) and len(cls) > len(prefix):
                energielabel = cls[len(prefix):].upper()
                break
        if energielabel:
            break

    # Source 2: dt/dd pairs. Pair each dt with the element that directly
    # follows it instead of zipping two document-wide lists: a single dt
    # without a dd (or with two) would otherwise shift every later pair.
    kv: dict[str, str] = {}
    for dt in soup.select("dt"):
        dd = dt.find_next_sibling(True)  # next sibling tag, skipping text nodes
        if dd is not None and dd.name == "dd":
            kv[_og_label_key(dt)] = dd.get_text(strip=True)

    # Source 3: <li><span>label</span><span>value</span></li>
    if not kv:
        for li in soup.select("li"):
            spans = li.select("span")
            if len(spans) >= 2:
                kv[_og_label_key(spans[0])] = spans[1].get_text(strip=True)

    if not energielabel:
        energielabel = (
            kv.get("energielabel")
            or kv.get("energieklasse")
            or kv.get("energie")
            or None
        )

    return {
        "energielabel": energielabel,
        "bouwjaar": _og_parse_year(kv.get("bouwjaar") or ""),
    }


def _og_label_key(tag) -> str:
    """Return the tag's text as a lookup key: lower-cased, without a trailing colon."""
    return tag.get_text(strip=True).lower().rstrip(": ").strip()


def _og_parse_year(text: str) -> int | None:
    """
    Parse a construction year from *text*; return None when there is none.

    Purely numeric text is converted as-is. Otherwise the text must contain
    exactly one 4-digit number and no other digits (e.g. "ca. 1930"); ranges
    such as "1906-1930" are ambiguous and yield None.
    """
    text = text.strip()
    # isdecimal() rather than isdigit(): isdigit() also accepts characters
    # like superscript digits, which int() cannot convert.
    if text.isdecimal():
        return int(text)
    match = re.fullmatch(r"\D*(\d{4})\D*", text)
    return int(match.group(1)) if match else None
# ---------------------------------------------------------------------------
# Bjornd
# ---------------------------------------------------------------------------
@@ -56,26 +121,36 @@ _STATUS_MAP = {
"sold": "verkocht",
"sold_ur": "verkocht",
}
def fetch_bjornd() -> list[RawListing]:
data = fetch_json(
f"{_BJORND_BASE}/nl/realtime-listings/consumer",
headers={"X-Requested-With": "XMLHttpRequest"},
)
listings = []
for item in data:
if not item.get("isSales"):
continue
if item.get("statusOrig") in _BJORND_SKIP:
continue
if item.get('salesPrice')>config.MAX_PRICE:
if item.get("salesPrice", 0) > config.MAX_PRICE:
continue
detail_url = _BJORND_BASE + item["url"]
raw_year = item.get("dateOfConstruction") or ""
bouwjaar = int(raw_year) if raw_year.isdigit() else None
energielabel = item.get("energyLabel") or None
# Fetch detail page when API omits key fields
if not energielabel or not bouwjaar:
extra_kk = _og_detail(detail_url, "bjornd")
energielabel = energielabel or extra_kk.get("energielabel")
bouwjaar = bouwjaar or extra_kk.get("bouwjaar")
listings.append(RawListing(
url=_BJORND_BASE + item["url"],
url=detail_url,
source_makelaar="bjornd",
status=_STATUS_MAP.get(item.get("statusOrig", ""), "beschikbaar"),
adres=item.get("address") or None,
@@ -87,6 +162,8 @@ def fetch_bjornd() -> list[RawListing]:
perceeloppervlak=item.get("plotSurface") or None,
kamers=item.get("rooms") or None,
slaapkamers=item.get("bedrooms") or None,
bouwjaar=bouwjaar,
energielabel=energielabel,
hero_image_url=item.get("photo") or None,
extra=json.dumps({
"balcony": item.get("balcony"),
@@ -102,10 +179,13 @@ def fetch_bjornd() -> list[RawListing]:
"photos": item.get("photos"),
}, ensure_ascii=False),
))
if config.APP_ENV == "dev":
break
log.info("bjornd: %d koopwoningen opgehaald", len(listings))
return listings
# ---------------------------------------------------------------------------
# Ooms
# ---------------------------------------------------------------------------
@@ -221,9 +301,15 @@ def fetch_moerman() -> list[RawListing]:
raw_year = item.get("dateOfConstruction") or ""
bouwjaar = int(raw_year) if raw_year.isdigit() else None
energielabel = item.get("energyLabel") or None
detail_url = _MOERMAN_BASE + item["url"]
if not energielabel:
extra_kk = _og_detail(detail_url, "moerman")
energielabel = extra_kk.get("energielabel")
listings.append(RawListing(
url=_MOERMAN_BASE + item["url"],
url=detail_url,
source_makelaar="moerman",
status=_MOERMAN_STATUS_MAP.get(item.get("statusOrig", ""), "beschikbaar"),
adres=item.get("address") or None,
@@ -236,9 +322,11 @@ def fetch_moerman() -> list[RawListing]:
kamers=item.get("rooms") or None,
slaapkamers=item.get("bedrooms") or None,
bouwjaar=bouwjaar,
energielabel=item.get("energyLabel") or None,
energielabel=energielabel,
hero_image_url=item.get("photo") or None,
))
if config.APP_ENV == "dev":
break
log.info("moerman: %d koopwoningen opgehaald", len(listings))
return listings
@@ -284,9 +372,15 @@ def fetch_vandaal() -> list[RawListing]:
raw_year = item.get("dateOfConstruction") or ""
bouwjaar = int(raw_year) if raw_year.isdigit() else None
energielabel = item.get("energyLabel") or None
detail_url = _VANDAAL_BASE + item["url"]
if not energielabel:
extra_kk = _og_detail(detail_url, "vandaal")
energielabel = extra_kk.get("energielabel")
listings.append(RawListing(
url=_VANDAAL_BASE + item["url"],
url=detail_url,
source_makelaar="vandaal",
status=_VANDAAL_STATUS_MAP.get(item.get("statusOrig", ""), "beschikbaar"),
adres=item.get("address") or None,
@@ -299,9 +393,11 @@ def fetch_vandaal() -> list[RawListing]:
kamers=item.get("rooms") or None,
slaapkamers=item.get("bedrooms") or None,
bouwjaar=bouwjaar,
energielabel=item.get("energyLabel") or None,
energielabel=energielabel,
hero_image_url=item.get("photo") or None,
))
if config.APP_ENV == "dev":
break
log.info("vandaal: %d koopwoningen opgehaald", len(listings))
return listings
@@ -349,9 +445,15 @@ def fetch_elzenaar() -> list[RawListing]:
raw_year = item.get("dateOfConstruction") or ""
bouwjaar = int(raw_year) if raw_year.isdigit() else None
energielabel = item.get("energyLabel") or None
detail_url = _ELZENAAR_BASE + item["url"]
if not energielabel:
extra_kk = _og_detail(detail_url, "elzenaar")
energielabel = extra_kk.get("energielabel")
listings.append(RawListing(
url=_ELZENAAR_BASE + item["url"],
url=detail_url,
source_makelaar="elzenaar",
status=_ELZENAAR_STATUS_MAP.get(item.get("statusOrig", ""), "beschikbaar"),
adres=item.get("address") or None,
@@ -364,9 +466,11 @@ def fetch_elzenaar() -> list[RawListing]:
kamers=item.get("rooms") or None,
slaapkamers=item.get("bedrooms") or None,
bouwjaar=bouwjaar,
energielabel=item.get("energyLabel") or None,
energielabel=energielabel,
hero_image_url=item.get("photo") or None,
))
if config.APP_ENV == "dev":
break
log.info("elzenaar: %d koopwoningen opgehaald", len(listings))
return listings
@@ -413,9 +517,15 @@ def fetch_doen() -> list[RawListing]:
raw_year = item.get("dateOfConstruction") or ""
bouwjaar = int(raw_year) if raw_year.isdigit() else None
energielabel = item.get("energyLabel") or None
detail_url = _DOEN_BASE + item["url"]
if not energielabel:
extra_kk = _og_detail(detail_url, "doen")
energielabel = extra_kk.get("energielabel")
listings.append(RawListing(
url=_DOEN_BASE + item["url"],
url=detail_url,
source_makelaar="doen",
status=_DOEN_STATUS_MAP.get(item.get("statusOrig", ""), "beschikbaar"),
adres=item.get("address") or None,
@@ -428,9 +538,11 @@ def fetch_doen() -> list[RawListing]:
kamers=item.get("rooms") or None,
slaapkamers=item.get("bedrooms") or None,
bouwjaar=bouwjaar,
energielabel=item.get("energyLabel") or None,
energielabel=energielabel,
hero_image_url=item.get("photo") or None,
))
if config.APP_ENV == "dev":
break
log.info("doen: %d koopwoningen opgehaald", len(listings))
return listings
@@ -476,9 +588,15 @@ def fetch_vandriel() -> list[RawListing]:
raw_year = item.get("dateOfConstruction") or ""
bouwjaar = int(raw_year) if raw_year.isdigit() else None
energielabel = item.get("energyLabel") or None
detail_url = _VANDRIEL_BASE + item["url"]
if not energielabel:
extra_kk = _og_detail(detail_url, "vandriel")
energielabel = extra_kk.get("energielabel")
listings.append(RawListing(
url=_VANDRIEL_BASE + item["url"],
url=detail_url,
source_makelaar="vandriel",
status=_VANDRIEL_STATUS_MAP.get(item.get("statusOrig", ""), "beschikbaar"),
adres=item.get("address") or None,
@@ -491,9 +609,11 @@ def fetch_vandriel() -> list[RawListing]:
kamers=item.get("rooms") or None,
slaapkamers=item.get("bedrooms") or None,
bouwjaar=bouwjaar,
energielabel=item.get("energyLabel") or None,
energielabel=energielabel,
hero_image_url=item.get("photo") or None,
))
if config.APP_ENV == "dev":
break
log.info("vandriel: %d koopwoningen opgehaald", len(listings))
return listings