fix: extract postcode for vanoord and vanherk scrapers
Van Oord: the postcode is taken from the first .elementor-heading-title element on the detail page. Van Herk: the postcode is extracted from the <title> tag with a regex; kamers and energielabel, which were present in the features list but previously ignored, are now picked up as well. The test output now also includes the woonoppervlak and energielabel fields. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -174,7 +174,7 @@ _VANOORD_STATUS_MAP = {
|
||||
|
||||
|
||||
def _vanoord_detail(detail_url: str) -> dict:
|
||||
"""Fetch Van Oord detail page; extract kenmerken from rw-object-features-list."""
|
||||
"""Fetch Van Oord detail page; extract kenmerken from rw-object-features-list and postcode."""
|
||||
try:
|
||||
soup = fetch_soup(detail_url)
|
||||
kv: dict[str, str] = {}
|
||||
@@ -185,8 +185,14 @@ def _vanoord_detail(detail_url: str) -> dict:
|
||||
label = label_el.get_text(strip=True).lower()
|
||||
value = value_el.get_text(strip=True)
|
||||
kv[label] = value
|
||||
# Postcode is in first .elementor-heading-title (e.g. "3562 TN,")
|
||||
headings = soup.select(".elementor-heading-title")
|
||||
postcode = None
|
||||
if headings:
|
||||
postcode = headings[0].get_text(strip=True).rstrip(",").strip()
|
||||
return {
|
||||
"status": kv.get("status", "").lower(),
|
||||
"postcode": postcode,
|
||||
"bouwjaar": kv.get("bouwjaar"),
|
||||
"woonoppervlak": kv.get("woonoppervlakte"),
|
||||
"kamers": kv.get("aantal kamers"),
|
||||
@@ -198,6 +204,7 @@ def _vanoord_detail(detail_url: str) -> dict:
|
||||
return {}
|
||||
|
||||
|
||||
|
||||
def fetch_vanoord() -> list[RawListing]:
|
||||
"""Fetch Van Oord listings; Delft and Schiedam, only koop."""
|
||||
seen: set[str] = set()
|
||||
@@ -263,6 +270,7 @@ def fetch_vanoord() -> list[RawListing]:
|
||||
source_makelaar="vanoord",
|
||||
status=status,
|
||||
adres=adres,
|
||||
postcode=kk.get("postcode"),
|
||||
stad=stad,
|
||||
prijs=prijs,
|
||||
bouwjaar=int(kk["bouwjaar"]) if kk.get("bouwjaar", "").isdigit() else None,
|
||||
|
||||
@@ -373,11 +373,20 @@ def _vanherk_detail(detail_url: str) -> dict:
|
||||
label = spans[0].get_text(strip=True).lower()
|
||||
value = spans[1].get_text(strip=True)
|
||||
kv[label] = value
|
||||
# Postcode is in <title>: "Lorentzlaan 19 B, 3112 KE SCHIEDAM - Van Herk Makelaars"
|
||||
postcode = None
|
||||
if soup.title:
|
||||
m = re.search(r"\b(\d{4}\s*[A-Z]{2})\b", soup.title.get_text())
|
||||
if m:
|
||||
postcode = m.group(1).replace(" ", " ").strip()
|
||||
return {
|
||||
"status": kv.get("status", "").lower(),
|
||||
"postcode": postcode,
|
||||
"bouwjaar": kv.get("bouwjaar"),
|
||||
"woonoppervlak": kv.get("woonoppervlakte"),
|
||||
"kamers": kv.get("aantal kamers"),
|
||||
"slaapkamers": kv.get("aantal slaapkamers"),
|
||||
"energielabel": kv.get("energielabel"),
|
||||
}
|
||||
except Exception as e:
|
||||
log.warning("vanherk: detail fetch fout %s: %s", detail_url, e)
|
||||
@@ -456,13 +465,15 @@ def fetch_vanherk() -> list[RawListing]:
|
||||
source_makelaar="vanherk",
|
||||
status=status,
|
||||
adres=adres,
|
||||
postcode=kk.get("postcode"),
|
||||
stad=stad,
|
||||
prijs=prijs,
|
||||
hero_image_url=hero,
|
||||
bouwjaar=int(kk["bouwjaar"]) if kk.get("bouwjaar", "").isdigit() else None,
|
||||
woonoppervlak=parse_m2(kk.get("woonoppervlak")) or woonoppervlak_card,
|
||||
kamers=int(kk["kamers"]) if kk.get("kamers", "").isdigit() else None,
|
||||
slaapkamers=(int(kk["slaapkamers"]) if kk.get("slaapkamers", "").isdigit() else None) or slaapkamers_card,
|
||||
energielabel=energielabel_card,
|
||||
energielabel=kk.get("energielabel") or energielabel_card,
|
||||
))
|
||||
if config.APP_ENV == "dev":
|
||||
break
|
||||
|
||||
@@ -23,4 +23,4 @@ if __name__ == "__main__":
|
||||
listings = ADAPTER()
|
||||
print(f"Got {len(listings)} listings\n")
|
||||
for l in listings:
|
||||
print(f" {l.adres}, {l.postcode}, {l.stad} — €{l.prijs} — {l.kamers} rooms — {l.url}")
|
||||
print(f" {l.adres}, {l.postcode}, {l.stad} — €{l.prijs} — {l.kamers} rooms — {l.woonoppervlak}m2 — {l.energielabel} — {l.url}")
|
||||
|
||||
Reference in New Issue
Block a user