refactor: split ssr.py into package, enrich OG Online detail pages, fix travel upsert
- Split src/adapters/ssr.py (2160 LOC) into ssr/ package grouped by CMS: realworks.py, sure.py, schiedam.py, denhaag.py, overige.py
- Add _og_detail() to api.py; all OG Online scrapers now fall back to a detail-page fetch when energielabel/bouwjaar are missing from the API
- Fix run() to recalculate travel times for existing listings where fiets_mark IS NULL; upsert() now writes travel cols on existing rows too
- Update tests/cache.py to patch fetch_soup in every ssr submodule
- Update docs to reflect new package structure and mark API enrichment TODO done

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
542
src/adapters/ssr/schiedam.py
Normal file
542
src/adapters/ssr/schiedam.py
Normal file
@@ -0,0 +1,542 @@
|
||||
"""
|
||||
Custom Schiedam scrapers (no shared CMS platform).
|
||||
|
||||
Each makelaar here uses a bespoke site structure that required its own parser.
|
||||
|
||||
Scrapers: dewittegarantiemakelaars (JSON-LD), dens, 3dmakelaars, dupont
|
||||
"""
|
||||
import re
|
||||
|
||||
import config
|
||||
from huizenbot import RawListing
|
||||
|
||||
from ._shared import (
|
||||
fetch_soup, parse_prijs, parse_m2, _text,
|
||||
_extract_postcode, _infer_stad, log,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# De Witte Garantiemakelaars (Schiedam)
|
||||
# ---------------------------------------------------------------------------
|
||||
# Listing cards have a pill badge for status. All detail data comes from
|
||||
# JSON-LD (schema.org BuyAction/Offer) on the detail page.
|
||||
|
||||
# Root URL of the De Witte Garantiemakelaars site; relative links are joined onto it.
_DEWITTE_BASE = "https://dewittegarantiemakelaars.nl"

# CSS background class of a card's status pill -> normalised listing status.
# Classes that are not listed fall through to the default chosen by the caller.
_DEWITTE_PILL_MAP = {
    "bg-fun-green": "beschikbaar",
    "bg-sold": "verkocht",
}

# schema.org "@type" of the offered item -> internal woningtype.
_DEWITTE_TYPE_MAP = {
    "Apartment": "appartement",
    "House": "woning",
    "SingleFamilyResidence": "woning",
    "Residence": "woning",
}
|
||||
|
||||
|
||||
def _dewitte_jsonld(detail_url: str) -> dict:
    """Fetch a De Witte detail page and return its JSON-LD offer as a dict.

    Every ``application/ld+json`` script on the page is parsed. Top-level
    arrays and ``@graph`` members are unpacked, and the first object that
    carries an ``itemOffered`` key is preferred; without one, the first
    JSON-LD object on the page is returned.

    Args:
        detail_url: URL of the listing detail page.

    Returns:
        The selected JSON-LD object, or ``{}`` when the page cannot be
        fetched, holds no JSON-LD object, or the JSON cannot be parsed.
    """
    import json

    try:
        soup = fetch_soup(detail_url)

        objects = []
        for tag in soup.select('script[type="application/ld+json"]'):
            # .string is None when the script has more than one child node.
            raw = tag.string or tag.get_text() or ""
            if not raw.strip():
                continue
            data = json.loads(raw)
            if isinstance(data, list):
                objects.extend(data)
                continue
            objects.append(data)
            if isinstance(data, dict) and isinstance(data.get("@graph"), list):
                objects.extend(data["@graph"])

        # Callers use .get() on the result, so only dict payloads are usable.
        candidates = [obj for obj in objects if isinstance(obj, dict)]
        if not candidates:
            log.warning("dewitte: geen JSON-LD op %s", detail_url)
            return {}

        return next((obj for obj in candidates if "itemOffered" in obj), candidates[0])
    except Exception as e:
        # Best effort: a broken detail page must not abort the whole scrape.
        log.warning("dewitte: JSON-LD fout %s: %s", detail_url, e)
        return {}
|
||||
|
||||
|
||||
def _dewitte_int(value):
    """Coerce a JSON-LD number to ``int``; return ``None`` if absent or malformed.

    Accepts ints, floats and numeric strings with at most two decimals
    (``"350000"``, ``"350000.00"``, ``"85,5"``). The fraction is truncated.
    Strings such as ``"350.000"`` are rejected rather than guessed at.
    """
    if not value or isinstance(value, bool):
        return None
    if isinstance(value, (int, float)):
        return int(value)
    m = re.fullmatch(r"\s*(\d+)(?:[.,]\d{1,2})?\s*", str(value))
    return int(m.group(1)) if m else None


def fetch_dewittegarantiemakelaars() -> list[RawListing]:
    """Fetch De Witte Garantiemakelaars listings priced up to ``config.MAX_PRICE``.

    The overview pages supply the detail URL, the status pill and a fallback
    thumbnail; all other fields come from the JSON-LD on the detail page.
    Paging stops at the first page with no cards or with fewer than 10 cards.
    When ``config.APP_ENV == "dev"`` only the first successfully parsed card
    of each page is kept.

    Returns:
        One ``RawListing`` per card whose detail page yielded JSON-LD and
        whose price does not exceed ``config.MAX_PRICE``.
    """
    listings = []
    page = 1

    while True:
        url = (
            f"{_DEWITTE_BASE}/woningaanbod"
            f"?buy_rent=buy&buy_price=1-{config.MAX_PRICE}&page={page}"
        )
        soup = fetch_soup(url)
        cards = soup.select("div.card.card--property")
        if not cards:
            break

        for card in cards:
            try:
                a_tag = card.select_one("a.card__anchor")
                detail_url = a_tag.get("href") if a_tag else None
                if not detail_url:
                    continue
                if not detail_url.startswith("http"):
                    detail_url = _DEWITTE_BASE + detail_url

                # The pill's "bg-*" class encodes the status; anything not in
                # the map (including a missing pill) is treated as "onder_bod".
                pill = card.select_one("span.pill")
                pill_classes = pill.get("class", []) if pill else []
                status_key = next(
                    (c for c in pill_classes if c.startswith("bg-")), None
                )
                status = _DEWITTE_PILL_MAP.get(status_key, "onder_bod")

                ld = _dewitte_jsonld(detail_url)
                if not ld:
                    continue

                # "or {}" also covers keys that are present but null.
                offered = ld.get("itemOffered") or {}
                address = offered.get("address") or {}
                floor_size = offered.get("floorSize") or {}

                postcode = (address.get("postalCode") or "").replace(" ", "") or None
                stad = address.get("addressLocality") or None
                adres = address.get("streetAddress") or None

                prijs = _dewitte_int(ld.get("price"))
                if prijs is not None and prijs > config.MAX_PRICE:
                    continue

                woningtype = _DEWITTE_TYPE_MAP.get(offered.get("@type", ""))

                # Full-res image from JSON-LD, fall back to card thumbnail.
                # schema.org allows "image" to be a URL, an object or a list.
                hero = ld.get("image")
                if isinstance(hero, list):
                    hero = hero[0] if hero else None
                if isinstance(hero, dict):
                    hero = hero.get("url") or hero.get("contentUrl")
                if not hero:
                    img = card.select_one("picture img")
                    hero = img.get("src") if img else None

                listings.append(RawListing(
                    url=detail_url,
                    source_makelaar="dewittegarantiemakelaars",
                    status=status,
                    adres=adres,
                    postcode=postcode,
                    stad=stad,
                    prijs=prijs,
                    woningtype=woningtype,
                    woonoppervlak=_dewitte_int(floor_size.get("value")),
                    kamers=_dewitte_int(offered.get("numberOfRooms")),
                    bouwjaar=_dewitte_int(offered.get("yearBuilt")),
                    hero_image_url=hero,
                ))
                if config.APP_ENV == "dev":
                    # Leaves the card loop only; paging below still runs.
                    break

            except Exception as e:
                # One malformed card must not abort the rest of the page.
                log.warning("dewitte: parse fout: %s", e)

        if len(cards) < 10:
            break
        page += 1

    log.info("dewittegarantiemakelaars: %d listings opgehaald", len(listings))
    return listings
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# D&S Makelaars (Schiedam)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Root URL of the D&S Makelaars site; relative links are joined onto it.
_DS_BASE = "https://www.densmakelaars.nl"

# Lower-cased status label as shown on the site -> normalised listing status.
_DS_STATUS_MAP = {
    "onder bod": "onder_bod",
    "te koop": "beschikbaar",
    "nieuw": "beschikbaar",
    "beschikbaar": "beschikbaar",
    "verkocht": "verkocht",
}
|
||||
|
||||
|
||||
def _ds_detail(detail_url: str, html_text: str = None) -> dict:
    """Extract kenmerken and postcode from a D&S detail page.

    Labels and values are read from the page's ``<dt>``/``<dd>`` elements,
    paired by position. The postcode is taken from the Google Maps query
    string found in the raw HTML.

    Args:
        detail_url: URL of the detail page; used for fetching and in log lines.
        html_text: Already-downloaded HTML of the page. When ``None`` the page
            is fetched with httpx.

    Returns:
        A dict with the keys ``status``, ``woningtype``, ``bouwjaar``,
        ``woonoppervlak``, ``kamers``, ``slaapkamers``, ``energielabel`` and
        ``postcode``. Values are raw strings, or ``None`` when the page does
        not provide them. ``status`` is lower-cased and is ``None`` when the
        page has no status row, so a caller can keep a status it already
        knows. Returns ``{}`` when fetching or parsing fails.
    """
    try:
        if html_text is None:
            import httpx

            r = httpx.get(
                detail_url,
                headers={"User-Agent": config.USER_AGENT},
                timeout=15,
                follow_redirects=True,
            )
            # Do not parse an error page as if it were a listing.
            r.raise_for_status()
            html_text = r.text

        from bs4 import BeautifulSoup

        soup = BeautifulSoup(html_text, "html.parser")

        # Parse <dt>/<dd> pairs into a label -> value map (n-th dt with n-th dd).
        kv: dict[str, str] = {}
        for dt, dd in zip(soup.select("dt"), soup.select("dd")):
            kv[dt.get_text(strip=True).lower()] = dd.get_text(strip=True)

        # Extract postcode from the Google Maps URL embedded in the page.
        # Pattern: q=...,NNNN AA,... where the postcode is 4 digits + 2 letters.
        postcode = None
        m = re.search(r'q=.+?,(\d{4})\s+([A-Z]{2}),', html_text)
        if m:
            postcode = f"{m.group(1)}{m.group(2)}"

        status = kv.get("status")
        return {
            # No fabricated default here: a made-up "beschikbaar" would
            # override the status the caller read from the overview card.
            "status": status.lower() if status else None,
            "woningtype": kv.get("soort woning"),
            "bouwjaar": kv.get("bouwjaar"),
            "woonoppervlak": kv.get("woonoppervlakte"),
            "kamers": kv.get("aantal kamers"),
            "slaapkamers": kv.get("aantal slaapkamers"),
            "energielabel": kv.get("energielabel"),
            "postcode": postcode,
        }
    except Exception as e:
        # Best effort: the caller falls back to the overview-card data.
        log.warning("dens: detail fetch fout %s: %s", detail_url, e)
        return {}
|
||||
|
||||
|
||||
def _ds_int(value):
    """Return the first run of digits in *value* as ``int``, or ``None`` if there is none."""
    m = re.search(r"\d+", value or "")
    return int(m.group()) if m else None


def fetch_dens() -> list[RawListing]:
    """Fetch D&S Makelaars listings, enriching every card with its detail page.

    Address, city, price, status, thumbnail and the footer's area/room counts
    come from the overview card. The detail page supplies postcode,
    woningtype, bouwjaar, slaapkamers and energielabel, and takes precedence
    for status, woonoppervlak and kamers when it provides them. Paging stops
    at the first page with no cards or with fewer than 10 cards. When
    ``config.APP_ENV == "dev"`` only the first successfully parsed card of
    each page is kept.

    Returns:
        One ``RawListing`` per card that has a detail link.
    """
    listings = []
    page = 1

    while True:
        url = f"{_DS_BASE}/aanbod/koopwoningen?page={page}"
        soup = fetch_soup(url)
        cards = soup.select(".col-12.col-md-4.object-wrapper")
        if not cards:
            break

        for card in cards:
            try:
                # Detail URL
                a_tag = card.select_one("a.property")
                if not a_tag or "href" not in a_tag.attrs:
                    continue
                detail_url = a_tag["href"]
                if not detail_url.startswith("http"):
                    detail_url = _DS_BASE + detail_url

                # Overview-card data
                status_label = _text(card, "span.label") or "beschikbaar"
                status_label = status_label.strip().lower()
                status = _DS_STATUS_MAP.get(status_label, "beschikbaar")

                adres = _text(card, "h3")
                stad = _text(card, "h4")
                prijs = parse_prijs(_text(card, "div.price"))

                # Area and room count from the card footer
                woonoppervlak = None
                kamers = None
                for span in card.select("div.footer span"):
                    text = span.get_text(strip=True)
                    if "m²" in text:
                        woonoppervlak = parse_m2(text)
                    elif "kamers" in text.lower():
                        kamers = _ds_int(text)

                # A thumbnail without a src attribute must not drop the listing.
                img_tag = card.select_one("img")
                hero = img_tag.get("src") if img_tag else None

                # Detail page ({} when it could not be fetched or parsed)
                detail_data = _ds_detail(detail_url)
                postcode = detail_data.get("postcode")

                # A status on the detail page wins over the card label.
                if detail_data.get("status"):
                    status = _DS_STATUS_MAP.get(detail_data["status"], status)

                # Detail values are free text; take the leading number so that
                # e.g. "ca. 1930" does not raise and discard the listing.
                detail_kamers = _ds_int(detail_data.get("kamers"))

                listings.append(RawListing(
                    url=detail_url,
                    source_makelaar="dens",
                    adres=adres,
                    postcode=postcode,
                    stad=stad or _infer_stad(postcode),
                    prijs=prijs,
                    status=status,
                    hero_image_url=hero,
                    woningtype=detail_data.get("woningtype"),
                    bouwjaar=_ds_int(detail_data.get("bouwjaar")),
                    woonoppervlak=parse_m2(detail_data.get("woonoppervlak")) or woonoppervlak,
                    kamers=detail_kamers if detail_kamers is not None else kamers,
                    slaapkamers=_ds_int(detail_data.get("slaapkamers")),
                    energielabel=detail_data.get("energielabel"),
                ))
                if config.APP_ENV == "dev":
                    # Leaves the card loop only; paging below still runs.
                    break
            except Exception as e:
                # One malformed card must not abort the rest of the page.
                log.warning("dens: parse fout: %s", e)

        if len(cards) < 10:
            break
        page += 1

    log.info("dens: %d listings opgehaald", len(listings))
    return listings
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 3D Makelaars (Schiedam/Vlaardingen)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Root URL of the 3D Makelaars site; prefixed to relative detail and image paths.
_3D_BASE = "https://3dmakelaars.nl"
|
||||
|
||||
|
||||
def _3dmakelaars_int(value):
    """Return the first run of digits in *value* as ``int``, or ``None`` if there is none."""
    m = re.search(r"\d+", value or "")
    return int(m.group()) if m else None


def _3dmakelaars_detail(detail_url: str) -> dict:
    """Fetch a 3D Makelaars detail page and extract its structured info block.

    Each ``<li>`` of the info list holds a ``<span>`` label and a ``<p>``
    value. The postcode is looked up in the first paragraph of the
    description.

    Args:
        detail_url: URL of the listing detail page.

    Returns:
        A dict with the keys ``kamers``, ``slaapkamers``, ``bouwjaar``
        (ints or ``None``), ``woningtype``, ``woonoppervlak`` and
        ``postcode``. Returns ``{}`` when fetching or parsing fails.
    """
    try:
        soup = fetch_soup(detail_url)

        # Parse structured info block: span (label) + p (value) pairs.
        # The misspelled class names are the ones the selector must match.
        kv: dict[str, str] = {}
        for li in soup.select("div.tl-adiltional-inforamtion ul.tl-adiltional-listed li"):
            label_el = li.select_one("span")
            value_el = li.select_one("p")
            if label_el and value_el:
                kv[label_el.get_text(strip=True).lower()] = value_el.get_text(strip=True)

        # Extract postcode from first description paragraph
        postcode = None
        p_tag = soup.select_one(".omschrijving > p:nth-child(1)")
        if p_tag:
            postcode = _extract_postcode(p_tag.get_text())

        # Numbers are parsed leniently so that one odd value (empty cell,
        # year range) cannot discard the other fields via the except below.
        return {
            "kamers": _3dmakelaars_int(kv.get("aantal kamers")),
            "slaapkamers": _3dmakelaars_int(kv.get("aantal slaapkamers")),
            "bouwjaar": _3dmakelaars_int(kv.get("bouwjaar")),
            "woningtype": kv.get("bouwvorm"),
            "woonoppervlak": parse_m2(kv.get("oppervlakte")),
            "postcode": postcode,
        }
    except Exception as e:
        # Best effort: the caller falls back to the overview-card data.
        log.warning("3dmakelaars: detail fetch fout %s: %s", detail_url, e)
        return {}
|
||||
|
||||
|
||||
def fetch_3dmakelaars() -> list[RawListing]:
    """Fetch 3D Makelaars listings with pagination and detail pages.

    The detail URL is taken from each card's ``onclick`` handler. Room count
    and area from the card take precedence over the detail page; woningtype,
    bouwjaar, slaapkamers and postcode come from the detail page. Paging stops
    at the first page with no cards or with fewer than 7 cards. When
    ``config.APP_ENV == "dev"`` only the first successfully parsed card of
    each page is kept.

    Returns:
        One ``RawListing`` per card whose ``onclick`` yields a detail URL.
    """
    listings = []
    page = 1

    while True:
        url = (
            f"{_3D_BASE}/woningen-te-koop-in-schiedam-en-vlaardingen"
            f"?kamers=&oppervlakte=&woonplaats=&video=&prijs=3&page={page}"
        )
        soup = fetch_soup(url)
        cards = soup.select("div.tl-properties-item")
        if not cards:
            break

        for card in cards:
            try:
                # Extract detail URL from onclick attribute
                onclick = card.get("onclick", "")
                detail_url = None
                if "window.location" in onclick:
                    m = re.search(r"window\.location\s*=\s*['\"]([^'\"]+)['\"]", onclick)
                    if m:
                        target = m.group(1)
                        # Only relative targets need the site root.
                        detail_url = target if target.startswith("http") else _3D_BASE + target

                if not detail_url:
                    continue

                # NOTE(review): the address is read from "h3.price" and the
                # price from "span.address". This looks swapped but may mirror
                # the site's markup; confirm against a live page.
                adres = _text(card, "h3.price")
                prijs = parse_prijs(_text(card, "span.address"))

                # Extract rooms and area from meta list
                kamers = None
                woonoppervlak = None
                for li in card.select("ul.tl-meta-listed > li"):
                    text = li.get_text(strip=True)
                    if "kamers" in text.lower():
                        m = re.search(r"(\d+)", text)
                        if m:
                            kamers = int(m.group(1))
                    elif "m²" in text or "m2" in text:
                        woonoppervlak = parse_m2(text)

                # A thumbnail without a src attribute must not drop the listing.
                img_tag = card.select_one("img")
                hero = img_tag.get("src") if img_tag else None
                if hero and not hero.startswith("http"):
                    hero = _3D_BASE + hero

                # Detail page ({} when it could not be fetched or parsed)
                detail_data = _3dmakelaars_detail(detail_url)

                # Postcode from detail page, fallback to extraction from address
                postcode = detail_data.get("postcode")
                if not postcode and adres:
                    postcode = _extract_postcode(adres)

                listings.append(RawListing(
                    url=detail_url,
                    source_makelaar="3dmakelaars",
                    adres=adres,
                    postcode=postcode,
                    stad=_infer_stad(postcode),
                    prijs=prijs,
                    woningtype=detail_data.get("woningtype"),
                    bouwjaar=detail_data.get("bouwjaar"),
                    woonoppervlak=woonoppervlak or detail_data.get("woonoppervlak"),
                    kamers=kamers or detail_data.get("kamers"),
                    slaapkamers=detail_data.get("slaapkamers"),
                    hero_image_url=hero,
                ))
                if config.APP_ENV == "dev":
                    # Leaves the card loop only; paging below still runs.
                    break
            except Exception as e:
                # One malformed card must not abort the rest of the page.
                log.warning("3dmakelaars: parse fout: %s", e)

        if len(cards) < 7:
            break
        page += 1

    log.info("3dmakelaars: %d listings opgehaald", len(listings))
    return listings
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Dupont ERA Makelaars (Schiedam/Rotterdam)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Root URL of the Dupont ERA site; relative links and image paths are joined onto it.
_DUPONT_BASE = "https://www.dupont.nl"

# Lower-cased status label as shown on the site -> normalised listing status.
_DUPONT_STATUS_MAP = {
    "te koop": "beschikbaar",
    "nieuw": "beschikbaar",
    "onder bod": "onder_bod",
    "verkocht onder voorbehoud": "onder_bod",
    "verkocht": "verkocht",
}
|
||||
|
||||
|
||||
def _dupont_detail(detail_url: str) -> dict:
    """Fetch a Dupont detail page and collect its kenmerken and postcode.

    Args:
        detail_url: URL of the listing detail page.

    Returns:
        A dict with the keys ``postcode``, ``woningtype``, ``bouwjaar``,
        ``woonoppervlak``, ``kamers``, ``slaapkamers`` and ``energielabel``.
        Kenmerken are raw strings or ``None`` when absent. Returns ``{}``
        when fetching or parsing fails.
    """
    try:
        page = fetch_soup(detail_url)

        # Pair the n-th <dt> with the n-th <dd>; labels are lower-cased for lookup.
        kenmerken: dict[str, str] = {
            term.get_text(strip=True).lower(): desc.get_text(strip=True)
            for term, desc in zip(page.select("dt"), page.select("dd"))
        }

        # The <small> tag holds the address line (format: "NNNN AA CITY").
        address_line = page.select_one("section div.container-fluid small")
        postcode = _extract_postcode(address_line.get_text()) if address_line else None

        # Output key -> label as it appears on the page.
        wanted = (
            ("woningtype", "soort woning"),
            ("bouwjaar", "bouwjaar"),
            ("woonoppervlak", "woonoppervlakte"),
            ("kamers", "aantal kamers"),
            ("slaapkamers", "aantal slaapkamers"),
            ("energielabel", "energielabel"),
        )
        result = {"postcode": postcode}
        for out_key, label in wanted:
            result[out_key] = kenmerken.get(label)
        return result
    except Exception as e:
        log.warning("dupont: detail fetch fout %s: %s", detail_url, e)
        return {}
|
||||
|
||||
|
||||
def _dupont_int(value):
    """Return the first run of digits in *value* as ``int``, or ``None`` if there is none."""
    m = re.search(r"\d+", value or "")
    return int(m.group()) if m else None


def fetch_dupont() -> list[RawListing]:
    """Fetch Dupont ERA Makelaars listings with pagination and detail pages.

    Address, city, price, status and thumbnail come from the overview card;
    postcode and all kenmerken come from the detail page. Paging stops at the
    first page with no cards or with fewer than 10 cards. When
    ``config.APP_ENV == "dev"`` only the first successfully parsed card of
    each page is kept.

    Returns:
        One ``RawListing`` per card that has a link.
    """
    listings = []
    page = 1

    while True:
        url = f"{_DUPONT_BASE}/aanbod/koopwoningen?page={page}"
        soup = fetch_soup(url)
        cards = soup.select("article.object")
        if not cards:
            break

        for card in cards:
            try:
                # Detail URL
                a_tag = card.select_one("a[href]")
                if not a_tag or "href" not in a_tag.attrs:
                    continue
                detail_url = a_tag["href"]
                if not detail_url.startswith("http"):
                    detail_url = _DUPONT_BASE + detail_url

                # Overview-card data
                adres = _text(card, "h3")
                stad = _text(card, "h4")
                prijs = parse_prijs(_text(card, "div.price"))

                # Status label; unknown or missing labels count as available.
                status_label = _text(card, "div.label") or "beschikbaar"
                status_label = status_label.strip().lower()
                status = _DUPONT_STATUS_MAP.get(status_label, "beschikbaar")

                # A thumbnail without a src attribute must not drop the listing.
                img_tag = card.select_one("img.img-responsive")
                hero = img_tag.get("src") if img_tag else None
                if hero and not hero.startswith("http"):
                    hero = _DUPONT_BASE + hero

                # Detail page ({} when it could not be fetched or parsed)
                detail_data = _dupont_detail(detail_url)
                postcode = detail_data.get("postcode")

                # Detail values are free text; take the leading number so that
                # e.g. "ca. 1930" does not raise and discard the listing.
                listings.append(RawListing(
                    url=detail_url,
                    source_makelaar="dupont",
                    adres=adres,
                    postcode=postcode,
                    stad=stad or _infer_stad(postcode),
                    prijs=prijs,
                    status=status,
                    hero_image_url=hero,
                    woningtype=detail_data.get("woningtype"),
                    bouwjaar=_dupont_int(detail_data.get("bouwjaar")),
                    woonoppervlak=parse_m2(detail_data.get("woonoppervlak")),
                    kamers=_dupont_int(detail_data.get("kamers")),
                    slaapkamers=_dupont_int(detail_data.get("slaapkamers")),
                    energielabel=detail_data.get("energielabel"),
                ))
                if config.APP_ENV == "dev":
                    # Leaves the card loop only; paging below still runs.
                    break

            except Exception as e:
                # One malformed card must not abort the rest of the page.
                log.warning("dupont: parse fout: %s", e)

        if len(cards) < 10:
            break
        page += 1

    log.info("dupont: %d listings opgehaald", len(listings))
    return listings
|
||||
Reference in New Issue
Block a user