From c6328cee46492ac9fa84eca56c2902ac371ebd39 Mon Sep 17 00:00:00 2001 From: Mark Kalsbeek Date: Sat, 11 Apr 2026 23:53:17 +0200 Subject: [PATCH] fix: extract postcode for vanoord and vanherk scrapers Van Oord: postcode is in the first .elementor-heading-title on detail pages. Van Herk: postcode extracted via regex from the <title> tag; also pick up kamers and energielabel from the features list which were previously ignored. Test output now includes woonoppervlak and energielabel fields. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> --- src/adapters/ssr/overige.py | 10 +++++++++- src/adapters/ssr/sure.py | 13 ++++++++++++- tests/test_adapters.py | 2 +- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/src/adapters/ssr/overige.py b/src/adapters/ssr/overige.py index 57e133f..8c28d05 100644 --- a/src/adapters/ssr/overige.py +++ b/src/adapters/ssr/overige.py @@ -174,7 +174,7 @@ _VANOORD_STATUS_MAP = { def _vanoord_detail(detail_url: str) -> dict: - """Fetch Van Oord detail page; extract kenmerken from rw-object-features-list.""" + """Fetch Van Oord detail page; extract kenmerken from rw-object-features-list and postcode.""" try: soup = fetch_soup(detail_url) kv: dict[str, str] = {} @@ -185,8 +185,14 @@ def _vanoord_detail(detail_url: str) -> dict: label = label_el.get_text(strip=True).lower() value = value_el.get_text(strip=True) kv[label] = value + # Postcode is in first .elementor-heading-title (e.g. 
"3562 TN,") + headings = soup.select(".elementor-heading-title") + postcode = None + if headings: + postcode = headings[0].get_text(strip=True).rstrip(",").strip() return { "status": kv.get("status", "").lower(), + "postcode": postcode, "bouwjaar": kv.get("bouwjaar"), "woonoppervlak": kv.get("woonoppervlakte"), "kamers": kv.get("aantal kamers"), @@ -198,6 +204,7 @@ def _vanoord_detail(detail_url: str) -> dict: return {} + def fetch_vanoord() -> list[RawListing]: """Fetch Van Oord listings; Delft and Schiedam, only koop.""" seen: set[str] = set() @@ -263,6 +270,7 @@ def fetch_vanoord() -> list[RawListing]: source_makelaar="vanoord", status=status, adres=adres, + postcode=kk.get("postcode"), stad=stad, prijs=prijs, bouwjaar=int(kk["bouwjaar"]) if kk.get("bouwjaar", "").isdigit() else None, diff --git a/src/adapters/ssr/sure.py b/src/adapters/ssr/sure.py index be2a021..bfc89f7 100644 --- a/src/adapters/ssr/sure.py +++ b/src/adapters/ssr/sure.py @@ -373,11 +373,20 @@ def _vanherk_detail(detail_url: str) -> dict: label = spans[0].get_text(strip=True).lower() value = spans[1].get_text(strip=True) kv[label] = value + # Postcode is in <title>: "Lorentzlaan 19 B, 3112 KE SCHIEDAM - Van Herk Makelaars" + postcode = None + if soup.title: + m = re.search(r"\b(\d{4}\s*[A-Z]{2})\b", soup.title.get_text()) + if m: + postcode = m.group(1).replace(" ", " ").strip() return { "status": kv.get("status", "").lower(), + "postcode": postcode, "bouwjaar": kv.get("bouwjaar"), "woonoppervlak": kv.get("woonoppervlakte"), + "kamers": kv.get("aantal kamers"), "slaapkamers": kv.get("aantal slaapkamers"), + "energielabel": kv.get("energielabel"), } except Exception as e: log.warning("vanherk: detail fetch fout %s: %s", detail_url, e) @@ -456,13 +465,15 @@ def fetch_vanherk() -> list[RawListing]: source_makelaar="vanherk", status=status, adres=adres, + postcode=kk.get("postcode"), stad=stad, prijs=prijs, hero_image_url=hero, bouwjaar=int(kk["bouwjaar"]) if kk.get("bouwjaar", "").isdigit() else 
None, woonoppervlak=parse_m2(kk.get("woonoppervlak")) or woonoppervlak_card, + kamers=int(kk["kamers"]) if kk.get("kamers", "").isdigit() else None, slaapkamers=(int(kk["slaapkamers"]) if kk.get("slaapkamers", "").isdigit() else None) or slaapkamers_card, - energielabel=energielabel_card, + energielabel=kk.get("energielabel") or energielabel_card, )) if config.APP_ENV == "dev": break diff --git a/tests/test_adapters.py b/tests/test_adapters.py index f959d64..e640e4a 100644 --- a/tests/test_adapters.py +++ b/tests/test_adapters.py @@ -23,4 +23,4 @@ if __name__ == "__main__": listings = ADAPTER() print(f"Got {len(listings)} listings\n") for l in listings: - print(f" {l.adres}, {l.postcode}, {l.stad} — €{l.prijs} — {l.kamers} rooms — {l.url}") + print(f" {l.adres}, {l.postcode}, {l.stad} — €{l.prijs} — {l.kamers} rooms — {l.woonoppervlak}m2 — {l.energielabel} — {l.url}")