fix: extract postcode for vanoord and vanherk scrapers

Van Oord: postcode is read from the first .elementor-heading-title element on
detail pages.
Van Herk: postcode is extracted via regex from the <title> tag; the kamers and
energielabel fields, which were previously ignored, are now also picked up from
the features list.
Test output now includes the woonoppervlak and energielabel fields.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-11 23:53:17 +02:00
parent f74e9bcfb0
commit c6328cee46
3 changed files with 22 additions and 3 deletions

View File

@@ -174,7 +174,7 @@ _VANOORD_STATUS_MAP = {
def _vanoord_detail(detail_url: str) -> dict:
"""Fetch Van Oord detail page; extract kenmerken from rw-object-features-list."""
"""Fetch Van Oord detail page; extract kenmerken from rw-object-features-list and postcode."""
try:
soup = fetch_soup(detail_url)
kv: dict[str, str] = {}
@@ -185,8 +185,14 @@ def _vanoord_detail(detail_url: str) -> dict:
label = label_el.get_text(strip=True).lower()
value = value_el.get_text(strip=True)
kv[label] = value
# Postcode is in first .elementor-heading-title (e.g. "3562 TN,")
headings = soup.select(".elementor-heading-title")
postcode = None
if headings:
postcode = headings[0].get_text(strip=True).rstrip(",").strip()
return {
"status": kv.get("status", "").lower(),
"postcode": postcode,
"bouwjaar": kv.get("bouwjaar"),
"woonoppervlak": kv.get("woonoppervlakte"),
"kamers": kv.get("aantal kamers"),
@@ -198,6 +204,7 @@ def _vanoord_detail(detail_url: str) -> dict:
return {}
def fetch_vanoord() -> list[RawListing]:
"""Fetch Van Oord listings; Delft and Schiedam, only koop."""
seen: set[str] = set()
@@ -263,6 +270,7 @@ def fetch_vanoord() -> list[RawListing]:
source_makelaar="vanoord",
status=status,
adres=adres,
postcode=kk.get("postcode"),
stad=stad,
prijs=prijs,
bouwjaar=int(kk["bouwjaar"]) if kk.get("bouwjaar", "").isdigit() else None,

View File

@@ -373,11 +373,20 @@ def _vanherk_detail(detail_url: str) -> dict:
label = spans[0].get_text(strip=True).lower()
value = spans[1].get_text(strip=True)
kv[label] = value
# Postcode is in <title>: "Lorentzlaan 19 B, 3112 KE SCHIEDAM - Van Herk Makelaars"
postcode = None
if soup.title:
m = re.search(r"\b(\d{4}\s*[A-Z]{2})\b", soup.title.get_text())
if m:
postcode = m.group(1).replace(" ", " ").strip()
return {
"status": kv.get("status", "").lower(),
"postcode": postcode,
"bouwjaar": kv.get("bouwjaar"),
"woonoppervlak": kv.get("woonoppervlakte"),
"kamers": kv.get("aantal kamers"),
"slaapkamers": kv.get("aantal slaapkamers"),
"energielabel": kv.get("energielabel"),
}
except Exception as e:
log.warning("vanherk: detail fetch fout %s: %s", detail_url, e)
@@ -456,13 +465,15 @@ def fetch_vanherk() -> list[RawListing]:
source_makelaar="vanherk",
status=status,
adres=adres,
postcode=kk.get("postcode"),
stad=stad,
prijs=prijs,
hero_image_url=hero,
bouwjaar=int(kk["bouwjaar"]) if kk.get("bouwjaar", "").isdigit() else None,
woonoppervlak=parse_m2(kk.get("woonoppervlak")) or woonoppervlak_card,
kamers=int(kk["kamers"]) if kk.get("kamers", "").isdigit() else None,
slaapkamers=(int(kk["slaapkamers"]) if kk.get("slaapkamers", "").isdigit() else None) or slaapkamers_card,
energielabel=energielabel_card,
energielabel=kk.get("energielabel") or energielabel_card,
))
if config.APP_ENV == "dev":
break

View File

@@ -23,4 +23,4 @@ if __name__ == "__main__":
listings = ADAPTER()
print(f"Got {len(listings)} listings\n")
for l in listings:
print(f" {l.adres}, {l.postcode}, {l.stad} — €{l.prijs}{l.kamers} rooms — {l.url}")
print(f" {l.adres}, {l.postcode}, {l.stad} — €{l.prijs}{l.kamers} rooms — {l.woonoppervlak}m2 — {l.energielabel}{l.url}")