Files
huizenbot/src/adapters/api.py
Mark Kalsbeek f74e9bcfb0 refactor: split ssr.py into package, enrich OG Online detail pages, fix travel upsert
- Split src/adapters/ssr.py (2160 LOC) into ssr/ package grouped by CMS:
  realworks.py, sure.py, schiedam.py, denhaag.py, overige.py
- Add _og_detail() to api.py; all OG Online scrapers now fall back to
  detail page fetch when energielabel/bouwjaar are missing from the API
- Fix run() to recalculate travel times for existing listings where
  fiets_mark IS NULL; upsert() now writes travel cols on existing rows too
- Update tests/cache.py to patch fetch_soup in every ssr submodule
- Update docs to reflect new package structure and mark API enrichment TODO done

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-11 23:39:35 +02:00

635 lines
22 KiB
Python

"""
adapters/api.py — JSON/API-based makelaars
Elke scraper is een functie () -> list[RawListing].
Voeg nieuwe toe onderaan en registreer in SCRAPERS.
"""
import json
import logging
import re
import time
import httpx
from bs4 import BeautifulSoup
import config
from huizenbot import RawListing
log = logging.getLogger("huizenbot.api")
# ---------------------------------------------------------------------------
# Gedeelde HTTP helper
# ---------------------------------------------------------------------------
def fetch_json(url: str, *, params: dict | None = None, headers: dict | None = None) -> dict | list:
    """
    GET *url* and return the decoded JSON body.

    Sends ``config.USER_AGENT`` as User-Agent (extra *headers* are merged on
    top of it), uses a 15 second timeout and makes at most three attempts
    when the server answers 429, sleeping for the advertised ``Retry-After``
    between attempts.

    Args:
        url: Endpoint to request.
        params: Optional query-string parameters.
        headers: Optional extra request headers.

    Returns:
        Whatever ``Response.json()`` yields for the body.

    Raises:
        httpx.HTTPStatusError: for any non-429 error status.
        RuntimeError: when every attempt was answered with 429.
    """
    hdrs = {"User-Agent": config.USER_AGENT}
    if headers:
        hdrs.update(headers)
    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        r = httpx.get(url, params=params, headers=hdrs, timeout=15)
        if r.status_code != 429:
            r.raise_for_status()
            return r.json()
        if attempt == max_attempts:
            # We are giving up anyway; sleeping first would only delay the error.
            break
        # Retry-After is either delta-seconds or an HTTP-date. Only the
        # numeric form is interpreted here; a date, garbage or a missing
        # header falls back to 60 seconds instead of crashing on int().
        try:
            wait = max(0, int(r.headers.get("Retry-After", 60)))
        except (TypeError, ValueError):
            wait = 60
        log.warning("429 op %s, wacht %ds", url, wait)
        time.sleep(wait)
    raise RuntimeError(f"Blijvend 429 op {url}")
def _og_detail(url: str, makelaar: str) -> dict:
    """
    Fetch an OG Online detail page and extract fields the JSON feed omitted.

    The energy label is taken from a CSS class of the form ``energielabel-X``
    when one exists, otherwise from label/value pairs on the page
    (``<dt>``/``<dd>``, or the first two ``<span>`` elements of an ``<li>``).
    The year of construction is read from the ``bouwjaar`` pair.

    Args:
        url: URL of the detail page.
        makelaar: Source name; only used in the warning log line.

    Returns:
        ``{"energielabel": str | None, "bouwjaar": int | None}`` when the page
        was fetched and parsed; an empty dict on any failure.
    """
    try:
        r = httpx.get(
            url,
            headers={"User-Agent": config.USER_AGENT},
            timeout=15,
            follow_redirects=True,
        )
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")

        # Pattern 1: energielabel CSS class on any element.
        prefix = "energielabel-"
        energielabel = None
        for el in soup.select("[class]"):
            for cls in el.get("class", []):
                if cls.startswith(prefix):
                    energielabel = cls[len(prefix):].upper()
                    break
            if energielabel:
                break

        # Pattern 2: <dt>/<dd> pairs. Each <dt> is paired with the <dd> that
        # directly follows it; zipping the two flat lists would shift every
        # later value as soon as one term has no (or more than one) <dd>.
        kv: dict[str, str] = {}
        for dt in soup.select("dt"):
            nxt = dt.find_next_sibling(["dt", "dd"])
            if nxt is None or nxt.name != "dd":
                continue
            key = dt.get_text(strip=True).lower().rstrip(":").strip()
            kv[key] = nxt.get_text(strip=True)

        # Pattern 3: <li> items carrying a label span and a value span.
        if not kv:
            for li in soup.select("li"):
                spans = li.select("span")
                if len(spans) >= 2:
                    key = spans[0].get_text(strip=True).lower().rstrip(":").strip()
                    kv[key] = spans[1].get_text(strip=True)

        if not energielabel:
            label_text = (
                kv.get("energielabel")
                or kv.get("energieklasse")
                or kv.get("energie")
                or ""
            )
            # Upper-case so both extraction paths return the same form.
            energielabel = label_text.strip().upper() or None

        # Accept values such as "ca. 1985" by taking the first 4-digit run.
        year_match = re.search(r"\b\d{4}\b", kv.get("bouwjaar") or "")
        bouwjaar = int(year_match.group()) if year_match else None

        return {
            "energielabel": energielabel,
            "bouwjaar": bouwjaar,
        }
    except Exception as e:
        # Deliberately broad: enrichment is best-effort and must never abort
        # the scraper that called it.
        log.warning("%s: detail fetch fout %s: %s", makelaar, url, e)
        return {}
# ---------------------------------------------------------------------------
# Bjornd
# ---------------------------------------------------------------------------
# Site root; feed path and per-listing paths are appended to it.
_BJORND_BASE = "https://www.bjornd.nl"
# statusOrig values for which fetch_bjornd drops the listing.
_BJORND_SKIP = {
    "rented_ur",
    "rented",
}
# Feed statusOrig -> internal status value.
# NOTE(review): the Van Daal map also handles "is_bought"; confirm whether
# this feed can emit it too.
_STATUS_MAP = dict(
    available="beschikbaar",
    under_bid="onder_bod",
    under_option="onder_bod",
    sold="verkocht",
    sold_ur="verkocht",
)
def fetch_bjornd() -> list[RawListing]:
    """
    Fetch the for-sale listings of Bjornd from its realtime-listings feed.

    Skips non-sale items, statuses in ``_BJORND_SKIP`` and anything priced
    above ``config.MAX_PRICE``. When the feed lacks the energy label or the
    year of construction, the detail page is fetched via ``_og_detail`` to
    fill the gap. With ``config.APP_ENV == "dev"`` the loop stops after the
    first accepted listing.

    Returns:
        One ``RawListing`` per accepted feed item.
    """
    data = fetch_json(
        f"{_BJORND_BASE}/nl/realtime-listings/consumer",
        headers={"X-Requested-With": "XMLHttpRequest"},
    )
    listings = []
    for item in data:
        if not item.get("isSales"):
            continue
        if item.get("statusOrig") in _BJORND_SKIP:
            continue
        # A null salesPrice must not reach the comparison (None > int raises).
        if (item.get("salesPrice") or 0) > config.MAX_PRICE:
            continue

        detail_url = _BJORND_BASE + item["url"]
        # str() so a numeric JSON value is accepted as well as a string.
        raw_year = str(item.get("dateOfConstruction") or "")
        bouwjaar = int(raw_year) if raw_year.isdigit() else None
        energielabel = item.get("energyLabel") or None

        # Fetch detail page when API omits key fields.
        if not energielabel or not bouwjaar:
            extra_kk = _og_detail(detail_url, "bjornd")
            energielabel = energielabel or extra_kk.get("energielabel")
            bouwjaar = bouwjaar or extra_kk.get("bouwjaar")

        listings.append(RawListing(
            url=detail_url,
            source_makelaar="bjornd",
            status=_STATUS_MAP.get(item.get("statusOrig", ""), "beschikbaar"),
            adres=item.get("address") or None,
            # Spaces removed, matching the other scrapers in this module.
            postcode=(item.get("zipcode") or "").replace(" ", "") or None,
            stad=item.get("city") or None,
            prijs=item.get("salesPrice") or None,
            woningtype=item.get("type") or None,
            woonoppervlak=item.get("livingSurface") or None,
            perceeloppervlak=item.get("plotSurface") or None,
            kamers=item.get("rooms") or None,
            slaapkamers=item.get("bedrooms") or None,
            bouwjaar=bouwjaar,
            energielabel=energielabel,
            hero_image_url=item.get("photo") or None,
            # NOTE(review): fetch_ooms passes a plain dict for ``extra`` while
            # this passes a JSON string; confirm which form RawListing expects.
            extra=json.dumps({
                "balcony": item.get("balcony"),
                "garden": item.get("garden"),
                "mainType": item.get("mainType"),
                "buildType": item.get("buildType"),
                "district": item.get("district"),
                "lat": item.get("lat"),
                "lng": item.get("lng"),
                "isFurnished": item.get("isFurnished"),
                "hasOpenHouse": item.get("hasOpenHouse"),
                "description": item.get("description"),
                "photos": item.get("photos"),
            }, ensure_ascii=False),
        ))
        if config.APP_ENV == "dev":
            break
    log.info("bjornd: %d koopwoningen opgehaald", len(listings))
    return listings
# ---------------------------------------------------------------------------
# Ooms
# ---------------------------------------------------------------------------
# Site root; also prefixed to the image paths found in the feed.
_OOMS_BASE = "https://ooms.com"
# Only listings whose "place" is one of these are kept.
_OOMS_CITIES = {
    "Delft",
    "Leiden",
    "Pijnacker",
    "Rotterdam",
    "Schiedam",
    "Voorburg",
}
# availability_status values for which fetch_ooms drops the listing.
_OOMS_SKIP_STATUS = {"verhuurd onder voorbehoud", "verhuurd"}
# Feed availability_status -> internal status value.
_OOMS_STATUS_MAP = {
    "beschikbaar": "beschikbaar",
    **dict.fromkeys(("onder bod", "onder optie"), "onder_bod"),
    **dict.fromkeys(("verkocht", "verkocht onder voorbehoud"), "verkocht"),
}
def fetch_ooms() -> list[RawListing]:
    """
    Fetch Ooms for-sale listings from the JSON properties endpoint.

    Keeps only items with ``buy_or_rent == "buy"`` whose ``place`` is in
    ``_OOMS_CITIES``, priced at or below ``config.MAX_PRICE`` and not in a
    status from ``_OOMS_SKIP_STATUS``. Null values for the price, publish
    date, office, address parts and image metadata are tolerated.

    Returns:
        One ``RawListing`` per accepted feed item.
    """
    data = fetch_json(f"{_OOMS_BASE}/api/properties/available.json")
    listings = []
    for item in data.get("objects") or []:
        if item.get("buy_or_rent") != "buy":
            continue
        if item.get("place") not in _OOMS_CITIES:
            continue
        # A null buy_price must not reach the comparison (None > int raises).
        if (item.get("buy_price") or 0) > config.MAX_PRICE:
            continue
        status_raw = item.get("availability_status", "")
        if status_raw in _OOMS_SKIP_STATUS:
            continue

        # Build "<street> <number>[ <addition>]" without ever rendering "None".
        street = item.get("street_name") or ""
        hnr = item.get("house_number")
        hnr = "" if hnr is None else hnr
        add = item.get("house_number_addition") or ""
        adres = f"{street} {hnr}{f' {add}' if add else ''}".strip()

        # Hero image: widest rendition of the first (main) image, if any.
        main_images = item.get("realworks_main_images") or item.get("realworks_images") or []
        hero = None
        if main_images:
            sizes = main_images[0].get("sizes") or []
            best = max(sizes, key=lambda s: s.get("width") or 0, default=None)
            if best and best.get("imageUrl"):
                hero = _OOMS_BASE + best["imageUrl"]

        listings.append(RawListing(
            url=item["url"],
            source_makelaar="ooms",
            datum_aanmelding=(item.get("publish_date") or "")[:10] or None,
            status=_OOMS_STATUS_MAP.get(status_raw, "beschikbaar"),
            adres=adres or None,
            postcode=(item.get("zip_code") or "").replace(" ", "") or None,
            stad=item.get("place") or None,
            prijs=item.get("buy_price") or None,
            woningtype=item.get("appartment_characteristic") or item.get("residential_building_type") or None,
            woonoppervlak=item.get("usable_area_living_function") or None,
            # "or None" already maps a surface of 0 to None.
            perceeloppervlak=item.get("parcel_surface") or None,
            kamers=item.get("amount_of_rooms") or None,
            slaapkamers=item.get("amount_of_bedrooms") or None,
            hero_image_url=hero,
            # NOTE(review): fetch_bjornd passes ``extra`` as a JSON string
            # while this passes a dict; confirm which form RawListing expects.
            extra={
                "office": (item.get("office") or {}).get("name"),
                "locations": item.get("locations"),
                "garden_types": item.get("garden_types"),
                "lat": item.get("lat"),
                "lng": item.get("lng"),
                "object_code": item.get("object_code"),
            },
        ))
    log.info("ooms: %d listings opgehaald", len(listings))
    return listings
# ---------------------------------------------------------------------------
# Moerman & De Jong Makelaars (Schiedam)
# ---------------------------------------------------------------------------
# Zelfde OG Online / realtime-listings platform als Bjornd.
# Site root; feed path and per-listing paths are appended to it.
_MOERMAN_BASE = "https://www.moerman-dejong.nl"
# statusOrig values for which fetch_moerman drops the listing.
_MOERMAN_SKIP = {"rented_ur", "rented"}
# Feed statusOrig -> internal status value.
_MOERMAN_STATUS_MAP = {
    "available": "beschikbaar",
    **dict.fromkeys(("under_bid", "under_option"), "onder_bod"),
    **dict.fromkeys(("sold", "sold_ur"), "verkocht"),
}
def fetch_moerman() -> list[RawListing]:
    """
    Fetch the for-sale listings of Moerman & De Jong from its realtime-listings feed.

    Skips non-sale items, statuses in ``_MOERMAN_SKIP`` and anything priced
    above ``config.MAX_PRICE``. When the feed lacks the energy label or the
    year of construction, the detail page is fetched via ``_og_detail`` to
    fill the gap. With ``config.APP_ENV == "dev"`` the loop stops after the
    first accepted listing.

    Returns:
        One ``RawListing`` per accepted feed item.
    """
    data = fetch_json(
        f"{_MOERMAN_BASE}/nl/realtime-listings/consumer",
        headers={"X-Requested-With": "XMLHttpRequest"},
    )
    listings = []
    for item in data:
        if not item.get("isSales"):
            continue
        if item.get("statusOrig") in _MOERMAN_SKIP:
            continue
        # A null salesPrice must not reach the comparison (None > int raises).
        if (item.get("salesPrice") or 0) > config.MAX_PRICE:
            continue

        detail_url = _MOERMAN_BASE + item["url"]
        # str() so a numeric JSON value is accepted as well as a string.
        raw_year = str(item.get("dateOfConstruction") or "")
        bouwjaar = int(raw_year) if raw_year.isdigit() else None
        energielabel = item.get("energyLabel") or None

        # Same fallback rule as fetch_bjornd: enrich when either field is
        # missing, and never overwrite a value the feed did provide.
        if not energielabel or not bouwjaar:
            extra_kk = _og_detail(detail_url, "moerman")
            energielabel = energielabel or extra_kk.get("energielabel")
            bouwjaar = bouwjaar or extra_kk.get("bouwjaar")

        listings.append(RawListing(
            url=detail_url,
            source_makelaar="moerman",
            status=_MOERMAN_STATUS_MAP.get(item.get("statusOrig", ""), "beschikbaar"),
            adres=item.get("address") or None,
            postcode=(item.get("zipcode") or "").replace(" ", "") or None,
            stad=item.get("city") or None,
            prijs=item.get("salesPrice") or None,
            woningtype=item.get("type") or None,
            woonoppervlak=item.get("livingSurface") or None,
            # "or None" already maps a surface of 0 to None.
            perceeloppervlak=item.get("plotSurface") or None,
            kamers=item.get("rooms") or None,
            slaapkamers=item.get("bedrooms") or None,
            bouwjaar=bouwjaar,
            energielabel=energielabel,
            hero_image_url=item.get("photo") or None,
        ))
        if config.APP_ENV == "dev":
            break
    log.info("moerman: %d koopwoningen opgehaald", len(listings))
    return listings
# ---------------------------------------------------------------------------
# Van Daal Makelaardij (Delft)
# ---------------------------------------------------------------------------
# OG Online / realtime-listings platform.
# Site root; feed path and per-listing paths are appended to it.
_VANDAAL_BASE = "https://www.vandaalmakelaardij.nl"
# statusOrig values for which fetch_vandaal drops the listing.
_VANDAAL_SKIP = {"rented_ur", "rented"}
# Feed statusOrig -> internal status value ("is_bought" counts as sold here).
_VANDAAL_STATUS_MAP = {
    "available": "beschikbaar",
    **dict.fromkeys(("under_bid", "under_option"), "onder_bod"),
    **dict.fromkeys(("is_bought", "sold", "sold_ur"), "verkocht"),
}
def fetch_vandaal() -> list[RawListing]:
    """
    Fetch the for-sale listings of Van Daal Makelaardij from its realtime-listings feed.

    Skips non-sale items, statuses in ``_VANDAAL_SKIP`` and anything priced
    above ``config.MAX_PRICE``. When the feed lacks the energy label or the
    year of construction, the detail page is fetched via ``_og_detail`` to
    fill the gap. With ``config.APP_ENV == "dev"`` the loop stops after the
    first accepted listing.

    Returns:
        One ``RawListing`` per accepted feed item.
    """
    data = fetch_json(
        f"{_VANDAAL_BASE}/nl/realtime-listings/consumer",
        headers={"X-Requested-With": "XMLHttpRequest"},
    )
    listings = []
    for item in data:
        if not item.get("isSales"):
            continue
        if item.get("statusOrig") in _VANDAAL_SKIP:
            continue
        # A null salesPrice must not reach the comparison (None > int raises).
        if (item.get("salesPrice") or 0) > config.MAX_PRICE:
            continue

        detail_url = _VANDAAL_BASE + item["url"]
        # str() so a numeric JSON value is accepted as well as a string.
        raw_year = str(item.get("dateOfConstruction") or "")
        bouwjaar = int(raw_year) if raw_year.isdigit() else None
        energielabel = item.get("energyLabel") or None

        # Same fallback rule as fetch_bjornd: enrich when either field is
        # missing, and never overwrite a value the feed did provide.
        if not energielabel or not bouwjaar:
            extra_kk = _og_detail(detail_url, "vandaal")
            energielabel = energielabel or extra_kk.get("energielabel")
            bouwjaar = bouwjaar or extra_kk.get("bouwjaar")

        listings.append(RawListing(
            url=detail_url,
            source_makelaar="vandaal",
            status=_VANDAAL_STATUS_MAP.get(item.get("statusOrig", ""), "beschikbaar"),
            adres=item.get("address") or None,
            postcode=(item.get("zipcode") or "").replace(" ", "") or None,
            stad=item.get("city") or None,
            prijs=item.get("salesPrice") or None,
            woningtype=item.get("type") or None,
            woonoppervlak=item.get("livingSurface") or None,
            # "or None" already maps a surface of 0 to None.
            perceeloppervlak=item.get("plotSurface") or None,
            kamers=item.get("rooms") or None,
            slaapkamers=item.get("bedrooms") or None,
            bouwjaar=bouwjaar,
            energielabel=energielabel,
            hero_image_url=item.get("photo") or None,
        ))
        if config.APP_ENV == "dev":
            break
    log.info("vandaal: %d koopwoningen opgehaald", len(listings))
    return listings
# ---------------------------------------------------------------------------
# Elzenaar NVM Makelaars (Den Haag) — OG Online platform
# ---------------------------------------------------------------------------
# Zelfde platform als bjornd/moerman/vandaal.
# Site root; feed path and per-listing paths are appended to it.
_ELZENAAR_BASE = "https://www.elzenaar.com"
# statusOrig values for which fetch_elzenaar drops the listing.
_ELZENAAR_SKIP = {"rented_ur", "rented"}
# Only listings whose "city" is one of these are kept.
_ELZENAAR_CITIES = {
    "Den Haag",
    "Rijswijk",
    "Voorburg",
}
# Feed statusOrig -> internal status value.
_ELZENAAR_STATUS_MAP = dict(
    available="beschikbaar",
    under_bid="onder_bod",
    under_option="onder_bod",
    sold="verkocht",
    sold_ur="verkocht",
)
def fetch_elzenaar() -> list[RawListing]:
    """
    Fetch the for-sale listings of Elzenaar from its realtime-listings feed.

    Skips non-sale items, statuses in ``_ELZENAAR_SKIP``, cities outside
    ``_ELZENAAR_CITIES`` and anything priced above ``config.MAX_PRICE``. When
    the feed lacks the energy label or the year of construction, the detail
    page is fetched via ``_og_detail`` to fill the gap. With
    ``config.APP_ENV == "dev"`` the loop stops after the first accepted
    listing.

    Returns:
        One ``RawListing`` per accepted feed item.
    """
    data = fetch_json(
        f"{_ELZENAAR_BASE}/nl/realtime-listings/consumer",
        headers={"X-Requested-With": "XMLHttpRequest"},
    )
    listings = []
    for item in data:
        if not item.get("isSales"):
            continue
        if item.get("statusOrig") in _ELZENAAR_SKIP:
            continue
        if item.get("city") not in _ELZENAAR_CITIES:
            continue
        # A null salesPrice must not reach the comparison (None > int raises).
        if (item.get("salesPrice") or 0) > config.MAX_PRICE:
            continue

        detail_url = _ELZENAAR_BASE + item["url"]
        # str() so a numeric JSON value is accepted as well as a string.
        raw_year = str(item.get("dateOfConstruction") or "")
        bouwjaar = int(raw_year) if raw_year.isdigit() else None
        energielabel = item.get("energyLabel") or None

        # Same fallback rule as fetch_bjornd: enrich when either field is
        # missing, and never overwrite a value the feed did provide.
        if not energielabel or not bouwjaar:
            extra_kk = _og_detail(detail_url, "elzenaar")
            energielabel = energielabel or extra_kk.get("energielabel")
            bouwjaar = bouwjaar or extra_kk.get("bouwjaar")

        listings.append(RawListing(
            url=detail_url,
            source_makelaar="elzenaar",
            status=_ELZENAAR_STATUS_MAP.get(item.get("statusOrig", ""), "beschikbaar"),
            adres=item.get("address") or None,
            postcode=(item.get("zipcode") or "").replace(" ", "") or None,
            stad=item.get("city") or None,
            prijs=item.get("salesPrice") or None,
            woningtype=item.get("type") or None,
            woonoppervlak=item.get("livingSurface") or None,
            # "or None" already maps a surface of 0 to None.
            perceeloppervlak=item.get("plotSurface") or None,
            kamers=item.get("rooms") or None,
            slaapkamers=item.get("bedrooms") or None,
            bouwjaar=bouwjaar,
            energielabel=energielabel,
            hero_image_url=item.get("photo") or None,
        ))
        if config.APP_ENV == "dev":
            break
    log.info("elzenaar: %d koopwoningen opgehaald", len(listings))
    return listings
# ---------------------------------------------------------------------------
# DOEN NVM Makelaars (Den Haag / Leiden / Voorburg) — OG Online platform
# ---------------------------------------------------------------------------
# Site root; feed path and per-listing paths are appended to it.
_DOEN_BASE = "https://www.doenmakelaars.com"
# statusOrig values for which fetch_doen drops the listing.
_DOEN_SKIP = {"rented_ur", "rented"}
# Only listings whose "city" is one of these are kept.
_DOEN_CITIES = {
    "Den Haag",
    "Leiden",
    "Leidschendam",
    "Rijswijk",
    "Voorburg",
    "Wassenaar",
    "Zoetermeer",
}
# Feed statusOrig -> internal status value.
_DOEN_STATUS_MAP = {
    "available": "beschikbaar",
    **dict.fromkeys(("under_bid", "under_option"), "onder_bod"),
    **dict.fromkeys(("sold", "sold_ur"), "verkocht"),
}
def fetch_doen() -> list[RawListing]:
    """
    Fetch the for-sale listings of DOEN Makelaars from its realtime-listings feed.

    Skips non-sale items, statuses in ``_DOEN_SKIP``, cities outside
    ``_DOEN_CITIES`` and anything priced above ``config.MAX_PRICE``. When the
    feed lacks the energy label or the year of construction, the detail page
    is fetched via ``_og_detail`` to fill the gap. With
    ``config.APP_ENV == "dev"`` the loop stops after the first accepted
    listing.

    Returns:
        One ``RawListing`` per accepted feed item.
    """
    data = fetch_json(
        f"{_DOEN_BASE}/nl/realtime-listings/consumer",
        headers={"X-Requested-With": "XMLHttpRequest"},
    )
    listings = []
    for item in data:
        if not item.get("isSales"):
            continue
        if item.get("statusOrig") in _DOEN_SKIP:
            continue
        if item.get("city") not in _DOEN_CITIES:
            continue
        # A null salesPrice must not reach the comparison (None > int raises).
        if (item.get("salesPrice") or 0) > config.MAX_PRICE:
            continue

        detail_url = _DOEN_BASE + item["url"]
        # str() so a numeric JSON value is accepted as well as a string.
        raw_year = str(item.get("dateOfConstruction") or "")
        bouwjaar = int(raw_year) if raw_year.isdigit() else None
        energielabel = item.get("energyLabel") or None

        # Same fallback rule as fetch_bjornd: enrich when either field is
        # missing, and never overwrite a value the feed did provide.
        if not energielabel or not bouwjaar:
            extra_kk = _og_detail(detail_url, "doen")
            energielabel = energielabel or extra_kk.get("energielabel")
            bouwjaar = bouwjaar or extra_kk.get("bouwjaar")

        listings.append(RawListing(
            url=detail_url,
            source_makelaar="doen",
            status=_DOEN_STATUS_MAP.get(item.get("statusOrig", ""), "beschikbaar"),
            adres=item.get("address") or None,
            postcode=(item.get("zipcode") or "").replace(" ", "") or None,
            stad=item.get("city") or None,
            prijs=item.get("salesPrice") or None,
            woningtype=item.get("type") or None,
            woonoppervlak=item.get("livingSurface") or None,
            # "or None" already maps a surface of 0 to None.
            perceeloppervlak=item.get("plotSurface") or None,
            kamers=item.get("rooms") or None,
            slaapkamers=item.get("bedrooms") or None,
            bouwjaar=bouwjaar,
            energielabel=energielabel,
            hero_image_url=item.get("photo") or None,
        ))
        if config.APP_ENV == "dev":
            break
    log.info("doen: %d koopwoningen opgehaald", len(listings))
    return listings
# ---------------------------------------------------------------------------
# Vandriel Makelaardij (Schiedam) — OG Online / realtime-listings
# ---------------------------------------------------------------------------
# Site root; feed path and per-listing paths are appended to it.
_VANDRIEL_BASE = "https://www.vandrielmakelaardij.nl"
# statusOrig values for which fetch_vandriel drops the listing.
_VANDRIEL_SKIP = {"rented_ur", "rented"}
# Feed statusOrig -> internal status value.
_VANDRIEL_STATUS_MAP = dict(
    available="beschikbaar",
    under_bid="onder_bod",
    under_option="onder_bod",
    sold="verkocht",
    sold_ur="verkocht",
)
def fetch_vandriel() -> list[RawListing]:
    """
    Fetch the for-sale listings of Vandriel Makelaardij from its realtime-listings feed.

    Skips non-sale items, statuses in ``_VANDRIEL_SKIP``, any city other than
    "schiedam" (compared case-insensitively) and anything priced above
    ``config.MAX_PRICE``. When the feed lacks the energy label or the year of
    construction, the detail page is fetched via ``_og_detail`` to fill the
    gap. With ``config.APP_ENV == "dev"`` the loop stops after the first
    accepted listing.

    Returns:
        One ``RawListing`` per accepted feed item.
    """
    data = fetch_json(
        f"{_VANDRIEL_BASE}/nl/realtime-listings/consumer",
        headers={"X-Requested-With": "XMLHttpRequest"},
    )
    listings = []
    for item in data:
        if not item.get("isSales"):
            continue
        if item.get("statusOrig") in _VANDRIEL_SKIP:
            continue
        if (item.get("city") or "").lower() != "schiedam":
            continue
        # A null salesPrice must not reach the comparison (None > int raises).
        if (item.get("salesPrice") or 0) > config.MAX_PRICE:
            continue

        detail_url = _VANDRIEL_BASE + item["url"]
        # str() so a numeric JSON value is accepted as well as a string.
        raw_year = str(item.get("dateOfConstruction") or "")
        bouwjaar = int(raw_year) if raw_year.isdigit() else None
        energielabel = item.get("energyLabel") or None

        # Same fallback rule as fetch_bjornd: enrich when either field is
        # missing, and never overwrite a value the feed did provide.
        if not energielabel or not bouwjaar:
            extra_kk = _og_detail(detail_url, "vandriel")
            energielabel = energielabel or extra_kk.get("energielabel")
            bouwjaar = bouwjaar or extra_kk.get("bouwjaar")

        listings.append(RawListing(
            url=detail_url,
            source_makelaar="vandriel",
            status=_VANDRIEL_STATUS_MAP.get(item.get("statusOrig", ""), "beschikbaar"),
            adres=item.get("address") or None,
            postcode=(item.get("zipcode") or "").replace(" ", "") or None,
            stad=item.get("city") or None,
            prijs=item.get("salesPrice") or None,
            woningtype=item.get("type") or None,
            woonoppervlak=item.get("livingSurface") or None,
            # "or None" already maps a surface of 0 to None.
            perceeloppervlak=item.get("plotSurface") or None,
            kamers=item.get("rooms") or None,
            slaapkamers=item.get("bedrooms") or None,
            bouwjaar=bouwjaar,
            energielabel=energielabel,
            hero_image_url=item.get("photo") or None,
        ))
        if config.APP_ENV == "dev":
            break
    log.info("vandriel: %d koopwoningen opgehaald", len(listings))
    return listings
# ---------------------------------------------------------------------------
# SCRAPERS — exporteer hier alle actieve API adapters
# ---------------------------------------------------------------------------
# Registry of active API adapters: source name -> zero-argument fetch function.
SCRAPERS = dict(
    bjornd=fetch_bjornd,
    ooms=fetch_ooms,
    moerman=fetch_moerman,
    vandaal=fetch_vandaal,
    elzenaar=fetch_elzenaar,
    doen=fetch_doen,
    vandriel=fetch_vandriel,
)