fix: extract postcode for vanoord and vanherk scrapers
Van Oord: the postcode is in the first .elementor-heading-title element on detail pages.
Van Herk: the postcode is extracted via regex from the <title> tag; kamers and energielabel, which were previously ignored, are now also picked up from the features list.
Test output now includes the woonoppervlak and energielabel fields.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -174,7 +174,7 @@ _VANOORD_STATUS_MAP = {
|
|||||||
|
|
||||||
|
|
||||||
def _vanoord_detail(detail_url: str) -> dict:
|
def _vanoord_detail(detail_url: str) -> dict:
|
||||||
"""Fetch Van Oord detail page; extract kenmerken from rw-object-features-list."""
|
"""Fetch Van Oord detail page; extract kenmerken from rw-object-features-list and postcode."""
|
||||||
try:
|
try:
|
||||||
soup = fetch_soup(detail_url)
|
soup = fetch_soup(detail_url)
|
||||||
kv: dict[str, str] = {}
|
kv: dict[str, str] = {}
|
||||||
@@ -185,8 +185,14 @@ def _vanoord_detail(detail_url: str) -> dict:
|
|||||||
label = label_el.get_text(strip=True).lower()
|
label = label_el.get_text(strip=True).lower()
|
||||||
value = value_el.get_text(strip=True)
|
value = value_el.get_text(strip=True)
|
||||||
kv[label] = value
|
kv[label] = value
|
||||||
|
# Postcode is in first .elementor-heading-title (e.g. "3562 TN,")
|
||||||
|
headings = soup.select(".elementor-heading-title")
|
||||||
|
postcode = None
|
||||||
|
if headings:
|
||||||
|
postcode = headings[0].get_text(strip=True).rstrip(",").strip()
|
||||||
return {
|
return {
|
||||||
"status": kv.get("status", "").lower(),
|
"status": kv.get("status", "").lower(),
|
||||||
|
"postcode": postcode,
|
||||||
"bouwjaar": kv.get("bouwjaar"),
|
"bouwjaar": kv.get("bouwjaar"),
|
||||||
"woonoppervlak": kv.get("woonoppervlakte"),
|
"woonoppervlak": kv.get("woonoppervlakte"),
|
||||||
"kamers": kv.get("aantal kamers"),
|
"kamers": kv.get("aantal kamers"),
|
||||||
@@ -198,6 +204,7 @@ def _vanoord_detail(detail_url: str) -> dict:
|
|||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def fetch_vanoord() -> list[RawListing]:
|
def fetch_vanoord() -> list[RawListing]:
|
||||||
"""Fetch Van Oord listings; Delft and Schiedam, only koop."""
|
"""Fetch Van Oord listings; Delft and Schiedam, only koop."""
|
||||||
seen: set[str] = set()
|
seen: set[str] = set()
|
||||||
@@ -263,6 +270,7 @@ def fetch_vanoord() -> list[RawListing]:
|
|||||||
source_makelaar="vanoord",
|
source_makelaar="vanoord",
|
||||||
status=status,
|
status=status,
|
||||||
adres=adres,
|
adres=adres,
|
||||||
|
postcode=kk.get("postcode"),
|
||||||
stad=stad,
|
stad=stad,
|
||||||
prijs=prijs,
|
prijs=prijs,
|
||||||
bouwjaar=int(kk["bouwjaar"]) if kk.get("bouwjaar", "").isdigit() else None,
|
bouwjaar=int(kk["bouwjaar"]) if kk.get("bouwjaar", "").isdigit() else None,
|
||||||
|
|||||||
@@ -373,11 +373,20 @@ def _vanherk_detail(detail_url: str) -> dict:
|
|||||||
label = spans[0].get_text(strip=True).lower()
|
label = spans[0].get_text(strip=True).lower()
|
||||||
value = spans[1].get_text(strip=True)
|
value = spans[1].get_text(strip=True)
|
||||||
kv[label] = value
|
kv[label] = value
|
||||||
|
# Postcode is in <title>: "Lorentzlaan 19 B, 3112 KE SCHIEDAM - Van Herk Makelaars"
|
||||||
|
postcode = None
|
||||||
|
if soup.title:
|
||||||
|
m = re.search(r"\b(\d{4}\s*[A-Z]{2})\b", soup.title.get_text())
|
||||||
|
if m:
|
||||||
|
postcode = m.group(1).replace(" ", " ").strip()
|
||||||
return {
|
return {
|
||||||
"status": kv.get("status", "").lower(),
|
"status": kv.get("status", "").lower(),
|
||||||
|
"postcode": postcode,
|
||||||
"bouwjaar": kv.get("bouwjaar"),
|
"bouwjaar": kv.get("bouwjaar"),
|
||||||
"woonoppervlak": kv.get("woonoppervlakte"),
|
"woonoppervlak": kv.get("woonoppervlakte"),
|
||||||
|
"kamers": kv.get("aantal kamers"),
|
||||||
"slaapkamers": kv.get("aantal slaapkamers"),
|
"slaapkamers": kv.get("aantal slaapkamers"),
|
||||||
|
"energielabel": kv.get("energielabel"),
|
||||||
}
|
}
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.warning("vanherk: detail fetch fout %s: %s", detail_url, e)
|
log.warning("vanherk: detail fetch fout %s: %s", detail_url, e)
|
||||||
@@ -456,13 +465,15 @@ def fetch_vanherk() -> list[RawListing]:
|
|||||||
source_makelaar="vanherk",
|
source_makelaar="vanherk",
|
||||||
status=status,
|
status=status,
|
||||||
adres=adres,
|
adres=adres,
|
||||||
|
postcode=kk.get("postcode"),
|
||||||
stad=stad,
|
stad=stad,
|
||||||
prijs=prijs,
|
prijs=prijs,
|
||||||
hero_image_url=hero,
|
hero_image_url=hero,
|
||||||
bouwjaar=int(kk["bouwjaar"]) if kk.get("bouwjaar", "").isdigit() else None,
|
bouwjaar=int(kk["bouwjaar"]) if kk.get("bouwjaar", "").isdigit() else None,
|
||||||
woonoppervlak=parse_m2(kk.get("woonoppervlak")) or woonoppervlak_card,
|
woonoppervlak=parse_m2(kk.get("woonoppervlak")) or woonoppervlak_card,
|
||||||
|
kamers=int(kk["kamers"]) if kk.get("kamers", "").isdigit() else None,
|
||||||
slaapkamers=(int(kk["slaapkamers"]) if kk.get("slaapkamers", "").isdigit() else None) or slaapkamers_card,
|
slaapkamers=(int(kk["slaapkamers"]) if kk.get("slaapkamers", "").isdigit() else None) or slaapkamers_card,
|
||||||
energielabel=energielabel_card,
|
energielabel=kk.get("energielabel") or energielabel_card,
|
||||||
))
|
))
|
||||||
if config.APP_ENV == "dev":
|
if config.APP_ENV == "dev":
|
||||||
break
|
break
|
||||||
|
|||||||
@@ -23,4 +23,4 @@ if __name__ == "__main__":
|
|||||||
listings = ADAPTER()
|
listings = ADAPTER()
|
||||||
print(f"Got {len(listings)} listings\n")
|
print(f"Got {len(listings)} listings\n")
|
||||||
for l in listings:
|
for l in listings:
|
||||||
print(f" {l.adres}, {l.postcode}, {l.stad} — €{l.prijs} — {l.kamers} rooms — {l.url}")
|
print(f" {l.adres}, {l.postcode}, {l.stad} — €{l.prijs} — {l.kamers} rooms — {l.woonoppervlak}m2 — {l.energielabel} — {l.url}")
|
||||||
|
|||||||
Reference in New Issue
Block a user