refactor: split ssr.py into package, enrich OG Online detail pages, fix travel upsert
- Split src/adapters/ssr.py (2160 LOC) into an ssr/ package grouped by CMS: realworks.py, sure.py, schiedam.py, denhaag.py, overige.py
- Add _og_detail() to api.py; all OG Online scrapers now fall back to fetching the detail page when energielabel/bouwjaar are missing from the API
- Fix run() to recalculate travel times for existing listings where fiets_mark IS NULL; upsert() now writes the travel columns on existing rows too
- Update tests/cache.py to patch fetch_soup in every ssr submodule
- Update docs to reflect the new package structure and mark the API enrichment TODO as done

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -7,9 +7,11 @@ Voeg nieuwe toe onderaan en registreer in SCRAPERS.
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
import config
|
||||
from huizenbot import RawListing
|
||||
@@ -40,8 +42,71 @@ def fetch_json(url: str, *, params: dict = None, headers: dict = None) -> dict |
|
||||
return r.json()
|
||||
|
||||
raise RuntimeError(f"Blijvend 429 op {url}")
|
||||
|
||||
|
||||
|
||||
|
||||
def _og_detail(url: str, makelaar: str) -> dict:
    """
    Fetch an OG Online detail page and extract fields the listing API omitted.

    Three markup patterns are tried:

    1. An ``energielabel-<X>`` CSS class on any element; the label is ``<X>``
       upper-cased.
    2. ``dt``/``dd`` pairs. Each ``dt`` is matched with the ``dd`` that
       directly follows it, so a ``dt`` without a value cannot shift all
       later labels onto the wrong values.
    3. ``li`` elements containing at least two ``span``s (label, value);
       only consulted when pattern 2 produced nothing.

    Args:
        url: URL of the detail page to fetch.
        makelaar: Scraper name; only used as a prefix in the warning log.

    Returns:
        ``{"energielabel": str | None, "bouwjaar": int | None}`` on success,
        or an empty dict when fetching or parsing failed (a warning is
        logged; the error is never propagated to the caller).
    """
    # Deliberately best-effort: this is an optional enrichment step, so any
    # failure is logged and swallowed rather than aborting the whole scrape.
    try:
        r = httpx.get(
            url,
            headers={"User-Agent": config.USER_AGENT},
            timeout=15,
            follow_redirects=True,
        )
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")

        def _label(tag) -> str:
            # Normalise "Bouwjaar:" / " bouwjaar " to "bouwjaar".
            return tag.get_text(strip=True).lower().rstrip(":").strip()

        # Pattern 1: energielabel-<X> CSS class on any element.
        prefix = "energielabel-"
        energielabel = next(
            (
                cls[len(prefix):].upper()
                for el in soup.select("[class]")
                for cls in el.get("class", [])
                # Require a non-empty suffix so a bare "energielabel-" class
                # is not mistaken for a label.
                if cls.startswith(prefix) and len(cls) > len(prefix)
            ),
            None,
        )

        # Pattern 2: kenmerken as dt/dd pairs, matched per dt.
        kv: dict[str, str] = {}
        dts = soup.select("dt")
        for dt in dts:
            nxt = dt.find_next_sibling(["dt", "dd"])
            if nxt is not None and nxt.name == "dd":
                kv[_label(dt)] = nxt.get_text(strip=True)

        if not kv:
            # dt and dd are not siblings in this markup; positional pairing
            # is only trustworthy when both lists have the same length.
            dds = soup.select("dd")
            if len(dts) == len(dds):
                for dt, dd in zip(dts, dds):
                    kv[_label(dt)] = dd.get_text(strip=True)

        # Pattern 3: ul.objectkenmerken / div.kenmerken span pairs.
        if not kv:
            for li in soup.select("li"):
                spans = li.select("span")
                if len(spans) >= 2:
                    kv[_label(spans[0])] = spans[1].get_text(strip=True)

        if not energielabel:
            energielabel = (
                kv.get("energielabel")
                or kv.get("energieklasse")
                or kv.get("energie")
            ) or None

        raw_year = kv.get("bouwjaar") or ""
        if raw_year.isdecimal():
            # isdecimal(), not isdigit(): isdigit() also accepts characters
            # such as "²" that int() rejects with a ValueError.
            bouwjaar = int(raw_year)
        else:
            # Values like "ca. 1930" or "1930 (gerenoveerd)": take the first
            # standalone four-digit number.
            year_match = re.search(r"\b\d{4}\b", raw_year)
            bouwjaar = int(year_match.group()) if year_match else None

        return {
            "energielabel": energielabel,
            "bouwjaar": bouwjaar,
        }
    except Exception as e:
        log.warning("%s: detail fetch fout %s: %s", makelaar, url, e)
        return {}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Bjornd
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -56,26 +121,36 @@ _STATUS_MAP = {
|
||||
"sold": "verkocht",
|
||||
"sold_ur": "verkocht",
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
def fetch_bjornd() -> list[RawListing]:
|
||||
data = fetch_json(
|
||||
f"{_BJORND_BASE}/nl/realtime-listings/consumer",
|
||||
headers={"X-Requested-With": "XMLHttpRequest"},
|
||||
)
|
||||
|
||||
|
||||
listings = []
|
||||
for item in data:
|
||||
if not item.get("isSales"):
|
||||
continue
|
||||
if item.get("statusOrig") in _BJORND_SKIP:
|
||||
continue
|
||||
if item.get('salesPrice')>config.MAX_PRICE:
|
||||
if item.get("salesPrice", 0) > config.MAX_PRICE:
|
||||
continue
|
||||
|
||||
|
||||
|
||||
detail_url = _BJORND_BASE + item["url"]
|
||||
raw_year = item.get("dateOfConstruction") or ""
|
||||
bouwjaar = int(raw_year) if raw_year.isdigit() else None
|
||||
energielabel = item.get("energyLabel") or None
|
||||
|
||||
# Fetch detail page when API omits key fields
|
||||
if not energielabel or not bouwjaar:
|
||||
extra_kk = _og_detail(detail_url, "bjornd")
|
||||
energielabel = energielabel or extra_kk.get("energielabel")
|
||||
bouwjaar = bouwjaar or extra_kk.get("bouwjaar")
|
||||
|
||||
listings.append(RawListing(
|
||||
url=_BJORND_BASE + item["url"],
|
||||
url=detail_url,
|
||||
source_makelaar="bjornd",
|
||||
status=_STATUS_MAP.get(item.get("statusOrig", ""), "beschikbaar"),
|
||||
adres=item.get("address") or None,
|
||||
@@ -87,6 +162,8 @@ def fetch_bjornd() -> list[RawListing]:
|
||||
perceeloppervlak=item.get("plotSurface") or None,
|
||||
kamers=item.get("rooms") or None,
|
||||
slaapkamers=item.get("bedrooms") or None,
|
||||
bouwjaar=bouwjaar,
|
||||
energielabel=energielabel,
|
||||
hero_image_url=item.get("photo") or None,
|
||||
extra=json.dumps({
|
||||
"balcony": item.get("balcony"),
|
||||
@@ -102,10 +179,13 @@ def fetch_bjornd() -> list[RawListing]:
|
||||
"photos": item.get("photos"),
|
||||
}, ensure_ascii=False),
|
||||
))
|
||||
|
||||
if config.APP_ENV == "dev":
|
||||
break
|
||||
|
||||
log.info("bjornd: %d koopwoningen opgehaald", len(listings))
|
||||
return listings
|
||||
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Ooms
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -221,9 +301,15 @@ def fetch_moerman() -> list[RawListing]:
|
||||
|
||||
raw_year = item.get("dateOfConstruction") or ""
|
||||
bouwjaar = int(raw_year) if raw_year.isdigit() else None
|
||||
energielabel = item.get("energyLabel") or None
|
||||
|
||||
detail_url = _MOERMAN_BASE + item["url"]
|
||||
if not energielabel:
|
||||
extra_kk = _og_detail(detail_url, "moerman")
|
||||
energielabel = extra_kk.get("energielabel")
|
||||
|
||||
listings.append(RawListing(
|
||||
url=_MOERMAN_BASE + item["url"],
|
||||
url=detail_url,
|
||||
source_makelaar="moerman",
|
||||
status=_MOERMAN_STATUS_MAP.get(item.get("statusOrig", ""), "beschikbaar"),
|
||||
adres=item.get("address") or None,
|
||||
@@ -236,9 +322,11 @@ def fetch_moerman() -> list[RawListing]:
|
||||
kamers=item.get("rooms") or None,
|
||||
slaapkamers=item.get("bedrooms") or None,
|
||||
bouwjaar=bouwjaar,
|
||||
energielabel=item.get("energyLabel") or None,
|
||||
energielabel=energielabel,
|
||||
hero_image_url=item.get("photo") or None,
|
||||
))
|
||||
if config.APP_ENV == "dev":
|
||||
break
|
||||
|
||||
log.info("moerman: %d koopwoningen opgehaald", len(listings))
|
||||
return listings
|
||||
@@ -284,9 +372,15 @@ def fetch_vandaal() -> list[RawListing]:
|
||||
|
||||
raw_year = item.get("dateOfConstruction") or ""
|
||||
bouwjaar = int(raw_year) if raw_year.isdigit() else None
|
||||
energielabel = item.get("energyLabel") or None
|
||||
|
||||
detail_url = _VANDAAL_BASE + item["url"]
|
||||
if not energielabel:
|
||||
extra_kk = _og_detail(detail_url, "vandaal")
|
||||
energielabel = extra_kk.get("energielabel")
|
||||
|
||||
listings.append(RawListing(
|
||||
url=_VANDAAL_BASE + item["url"],
|
||||
url=detail_url,
|
||||
source_makelaar="vandaal",
|
||||
status=_VANDAAL_STATUS_MAP.get(item.get("statusOrig", ""), "beschikbaar"),
|
||||
adres=item.get("address") or None,
|
||||
@@ -299,9 +393,11 @@ def fetch_vandaal() -> list[RawListing]:
|
||||
kamers=item.get("rooms") or None,
|
||||
slaapkamers=item.get("bedrooms") or None,
|
||||
bouwjaar=bouwjaar,
|
||||
energielabel=item.get("energyLabel") or None,
|
||||
energielabel=energielabel,
|
||||
hero_image_url=item.get("photo") or None,
|
||||
))
|
||||
if config.APP_ENV == "dev":
|
||||
break
|
||||
|
||||
log.info("vandaal: %d koopwoningen opgehaald", len(listings))
|
||||
return listings
|
||||
@@ -349,9 +445,15 @@ def fetch_elzenaar() -> list[RawListing]:
|
||||
|
||||
raw_year = item.get("dateOfConstruction") or ""
|
||||
bouwjaar = int(raw_year) if raw_year.isdigit() else None
|
||||
energielabel = item.get("energyLabel") or None
|
||||
|
||||
detail_url = _ELZENAAR_BASE + item["url"]
|
||||
if not energielabel:
|
||||
extra_kk = _og_detail(detail_url, "elzenaar")
|
||||
energielabel = extra_kk.get("energielabel")
|
||||
|
||||
listings.append(RawListing(
|
||||
url=_ELZENAAR_BASE + item["url"],
|
||||
url=detail_url,
|
||||
source_makelaar="elzenaar",
|
||||
status=_ELZENAAR_STATUS_MAP.get(item.get("statusOrig", ""), "beschikbaar"),
|
||||
adres=item.get("address") or None,
|
||||
@@ -364,9 +466,11 @@ def fetch_elzenaar() -> list[RawListing]:
|
||||
kamers=item.get("rooms") or None,
|
||||
slaapkamers=item.get("bedrooms") or None,
|
||||
bouwjaar=bouwjaar,
|
||||
energielabel=item.get("energyLabel") or None,
|
||||
energielabel=energielabel,
|
||||
hero_image_url=item.get("photo") or None,
|
||||
))
|
||||
if config.APP_ENV == "dev":
|
||||
break
|
||||
|
||||
log.info("elzenaar: %d koopwoningen opgehaald", len(listings))
|
||||
return listings
|
||||
@@ -413,9 +517,15 @@ def fetch_doen() -> list[RawListing]:
|
||||
|
||||
raw_year = item.get("dateOfConstruction") or ""
|
||||
bouwjaar = int(raw_year) if raw_year.isdigit() else None
|
||||
energielabel = item.get("energyLabel") or None
|
||||
|
||||
detail_url = _DOEN_BASE + item["url"]
|
||||
if not energielabel:
|
||||
extra_kk = _og_detail(detail_url, "doen")
|
||||
energielabel = extra_kk.get("energielabel")
|
||||
|
||||
listings.append(RawListing(
|
||||
url=_DOEN_BASE + item["url"],
|
||||
url=detail_url,
|
||||
source_makelaar="doen",
|
||||
status=_DOEN_STATUS_MAP.get(item.get("statusOrig", ""), "beschikbaar"),
|
||||
adres=item.get("address") or None,
|
||||
@@ -428,9 +538,11 @@ def fetch_doen() -> list[RawListing]:
|
||||
kamers=item.get("rooms") or None,
|
||||
slaapkamers=item.get("bedrooms") or None,
|
||||
bouwjaar=bouwjaar,
|
||||
energielabel=item.get("energyLabel") or None,
|
||||
energielabel=energielabel,
|
||||
hero_image_url=item.get("photo") or None,
|
||||
))
|
||||
if config.APP_ENV == "dev":
|
||||
break
|
||||
|
||||
log.info("doen: %d koopwoningen opgehaald", len(listings))
|
||||
return listings
|
||||
@@ -476,9 +588,15 @@ def fetch_vandriel() -> list[RawListing]:
|
||||
|
||||
raw_year = item.get("dateOfConstruction") or ""
|
||||
bouwjaar = int(raw_year) if raw_year.isdigit() else None
|
||||
energielabel = item.get("energyLabel") or None
|
||||
|
||||
detail_url = _VANDRIEL_BASE + item["url"]
|
||||
if not energielabel:
|
||||
extra_kk = _og_detail(detail_url, "vandriel")
|
||||
energielabel = extra_kk.get("energielabel")
|
||||
|
||||
listings.append(RawListing(
|
||||
url=_VANDRIEL_BASE + item["url"],
|
||||
url=detail_url,
|
||||
source_makelaar="vandriel",
|
||||
status=_VANDRIEL_STATUS_MAP.get(item.get("statusOrig", ""), "beschikbaar"),
|
||||
adres=item.get("address") or None,
|
||||
@@ -491,9 +609,11 @@ def fetch_vandriel() -> list[RawListing]:
|
||||
kamers=item.get("rooms") or None,
|
||||
slaapkamers=item.get("bedrooms") or None,
|
||||
bouwjaar=bouwjaar,
|
||||
energielabel=item.get("energyLabel") or None,
|
||||
energielabel=energielabel,
|
||||
hero_image_url=item.get("photo") or None,
|
||||
))
|
||||
if config.APP_ENV == "dev":
|
||||
break
|
||||
|
||||
log.info("vandriel: %d koopwoningen opgehaald", len(listings))
|
||||
return listings
|
||||
|
||||
Reference in New Issue
Block a user