Files
huizenbot/src/adapters/ssr/schiedam.py
Mark Kalsbeek f74e9bcfb0 refactor: split ssr.py into package, enrich OG Online detail pages, fix travel upsert
- Split src/adapters/ssr.py (2160 LOC) into ssr/ package grouped by CMS:
  realworks.py, sure.py, schiedam.py, denhaag.py, overige.py
- Add _og_detail() to api.py; all OG Online scrapers now fall back to
  detail page fetch when energielabel/bouwjaar are missing from the API
- Fix run() to recalculate travel times for existing listings where
  fiets_mark IS NULL; upsert() now writes travel cols on existing rows too
- Update tests/cache.py to patch fetch_soup in every ssr submodule
- Update docs to reflect new package structure and mark API enrichment TODO done

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-11 23:39:35 +02:00

543 lines
20 KiB
Python

"""
Custom Schiedam scrapers (no shared CMS platform).
Each makelaar here uses a bespoke site structure that required its own parser.
Scrapers: dewittegarantiemakelaars (JSON-LD), dens, 3dmakelaars, dupont
"""
import re
import config
from huizenbot import RawListing
from ._shared import (
fetch_soup, parse_prijs, parse_m2, _text,
_extract_postcode, _infer_stad, log,
)
# ---------------------------------------------------------------------------
# De Witte Garantiemakelaars (Schiedam)
# ---------------------------------------------------------------------------
# Listing cards have a pill badge for status. All detail data comes from
# JSON-LD (schema.org BuyAction/Offer) on the detail page.
_DEWITTE_BASE = "https://dewittegarantiemakelaars.nl"
_DEWITTE_PILL_MAP = {
"bg-fun-green": "beschikbaar",
"bg-sold": "verkocht",
}
_DEWITTE_TYPE_MAP = {
"Apartment": "appartement",
"House": "woning",
"SingleFamilyResidence": "woning",
"Residence": "woning",
}
def _dewitte_jsonld(detail_url: str) -> dict:
    """Fetch a De Witte detail page and return its JSON-LD object.

    Only the first ``<script type="application/ld+json">`` tag on the page
    is read.  JSON-LD may legally be a top-level array; in that case the
    first object that carries an ``itemOffered`` key is returned, because
    that is the node the caller reads its data from.

    Args:
        detail_url: Absolute URL of the listing detail page.

    Returns:
        The parsed JSON-LD object as a dict, or ``{}`` (after logging a
        warning) when the page cannot be fetched, has no JSON-LD tag, the
        payload is not valid JSON, or it contains no usable object.
        This function does not raise.
    """
    import json

    try:
        soup = fetch_soup(detail_url)
        tag = soup.select_one('script[type="application/ld+json"]')
        if not tag:
            log.warning("dewitte: geen JSON-LD op %s", detail_url)
            return {}

        payload = json.loads(tag.string)

        # A top-level array would otherwise reach the caller as a list and
        # blow up on the first ``.get`` call.
        if isinstance(payload, list):
            payload = next(
                (
                    node
                    for node in payload
                    if isinstance(node, dict) and "itemOffered" in node
                ),
                None,
            )

        # Enforce the documented return type for scalars / empty arrays.
        if not isinstance(payload, dict):
            log.warning("dewitte: onbruikbare JSON-LD op %s", detail_url)
            return {}
        return payload
    except Exception as e:
        # Best-effort helper: the caller skips the card on an empty dict.
        log.warning("dewitte: JSON-LD fout %s: %s", detail_url, e)
        return {}
def fetch_dewittegarantiemakelaars() -> list[RawListing]:
    """Collect koop listings from De Witte Garantiemakelaars.

    Pages through the overview (price-filtered through ``config.MAX_PRICE``
    in the query string) and builds one ``RawListing`` per card from the
    JSON-LD on that card's detail page.

    Cards without an anchor, without usable JSON-LD, or priced above
    ``config.MAX_PRICE`` are skipped.  A card that raises while being parsed
    is logged and skipped.  Paging ends on an empty page or on a page with
    fewer than 10 cards.  When ``config.APP_ENV`` is ``"dev"`` the card loop
    of a page ends after the first listing that gets appended.
    """
    listings = []
    page = 1
    while True:
        url = (
            f"{_DEWITTE_BASE}/woningaanbod"
            f"?buy_rent=buy&buy_price=1-{config.MAX_PRICE}&page={page}"
        )
        cards = fetch_soup(url).select("div.card.card--property")
        if not cards:
            break

        for card in cards:
            try:
                anchor = card.select_one("a.card__anchor")
                if not anchor:
                    continue
                link = anchor["href"]
                if not link.startswith("http"):
                    link = _DEWITTE_BASE + link

                # The first "bg-*" class on the pill identifies the status;
                # an unknown or missing class is treated as "onder_bod".
                badge = card.select_one("span.pill")
                badge_classes = badge.get("class", []) if badge else []
                status_key = None
                for css_class in badge_classes:
                    if css_class.startswith("bg-"):
                        status_key = css_class
                        break
                status = _DEWITTE_PILL_MAP.get(status_key, "onder_bod")

                ld = _dewitte_jsonld(link)
                if not ld:
                    continue

                item = ld.get("itemOffered", {})
                location = item.get("address", {})
                size = item.get("floorSize", {})

                postcode = location.get("postalCode", "").replace(" ", "") or None
                stad = location.get("addressLocality") or None
                adres = location.get("streetAddress") or None

                raw_price = ld.get("price")
                if raw_price and int(raw_price) > config.MAX_PRICE:
                    continue

                woningtype = _DEWITTE_TYPE_MAP.get(item.get("@type", ""))
                if size.get("value"):
                    woonoppervlak = int(size["value"])
                else:
                    woonoppervlak = None
                raw_rooms = item.get("numberOfRooms")
                raw_year = item.get("yearBuilt")

                # Prefer the JSON-LD image; otherwise use the card thumbnail.
                hero = ld.get("image")
                if not hero:
                    thumb = card.select_one("picture img")
                    hero = thumb["src"] if thumb else None

                prijs = int(raw_price) if raw_price else None
                kamers = int(raw_rooms) if raw_rooms else None
                bouwjaar = int(raw_year) if raw_year else None

                listings.append(RawListing(
                    url=link,
                    source_makelaar="dewittegarantiemakelaars",
                    status=status,
                    adres=adres,
                    postcode=postcode,
                    stad=stad,
                    prijs=prijs,
                    woningtype=woningtype,
                    woonoppervlak=woonoppervlak,
                    kamers=kamers,
                    bouwjaar=bouwjaar,
                    hero_image_url=hero,
                ))
                if config.APP_ENV == "dev":
                    break
            except Exception as e:
                log.warning("dewitte: parse fout: %s", e)

        if len(cards) < 10:
            break
        page += 1

    log.info("dewittegarantiemakelaars: %d listings opgehaald", len(listings))
    return listings
# ---------------------------------------------------------------------------
# D&S Makelaars (Schiedam)
# ---------------------------------------------------------------------------
_DS_BASE = "https://www.densmakelaars.nl"
_DS_STATUS_MAP = {
"onder bod": "onder_bod",
"te koop": "beschikbaar",
"nieuw": "beschikbaar",
"beschikbaar": "beschikbaar",
"verkocht": "verkocht",
}
def _ds_detail(detail_url: str, html_text: str = None) -> dict:
    """Extract the kenmerken and the postcode from a D&S detail page.

    Args:
        detail_url: Absolute URL of the detail page.  It is downloaded with
            httpx unless ``html_text`` is supplied, and is used in log
            messages.
        html_text: Raw HTML of the page.  When ``None`` the page is fetched.

    Returns:
        A dict with the keys ``status``, ``woningtype``, ``bouwjaar``,
        ``woonoppervlak``, ``kamers``, ``slaapkamers``, ``energielabel`` and
        ``postcode``.  Values are the raw strings found on the page (status
        lower-cased, postcode as four digits plus two letters without the
        space), or ``None`` when the page does not provide them.  Returns
        ``{}`` after logging a warning when fetching or parsing fails.
    """
    try:
        if html_text is None:
            import httpx

            r = httpx.get(
                detail_url,
                headers={"User-Agent": config.USER_AGENT},
                timeout=15,
                follow_redirects=True,
            )
            # Do not parse a 4xx/5xx error page as if it were a listing.
            r.raise_for_status()
            html_text = r.text

        from bs4 import BeautifulSoup

        soup = BeautifulSoup(html_text, "html.parser")

        # Pair every <dt> with the <dd> at the same position into a
        # lower-cased label → value map.
        kv: dict[str, str] = {}
        for dt, dd in zip(soup.select("dt"), soup.select("dd")):
            kv[dt.get_text(strip=True).lower()] = dd.get_text(strip=True)

        # The postcode is matched in the raw page source: "q=...,NNNN AA,"
        # (four digits, whitespace, two capitals) as it appears in the
        # Google Maps URL.
        postcode = None
        m = re.search(r'q=.+?,(\d{4})\s+([A-Z]{2}),', html_text)
        if m:
            postcode = f"{m.group(1)}{m.group(2)}"

        # Report a missing status as None.  Defaulting to "beschikbaar"
        # here made the caller treat the status as "available from the
        # detail page" and overwrite the status read from the overview card.
        raw_status = kv.get("status")

        return {
            "status": raw_status.lower() if raw_status else None,
            "woningtype": kv.get("soort woning"),
            "bouwjaar": kv.get("bouwjaar"),
            "woonoppervlak": kv.get("woonoppervlakte"),
            "kamers": kv.get("aantal kamers"),
            "slaapkamers": kv.get("aantal slaapkamers"),
            "energielabel": kv.get("energielabel"),
            "postcode": postcode,
        }
    except Exception as e:
        # Best-effort enrichment: the caller copes with an empty dict.
        log.warning("dens: detail fetch fout %s: %s", detail_url, e)
        return {}
def fetch_dens() -> list[RawListing]:
    """Fetch D&S Makelaars koop listings, enriched from their detail pages.

    Each overview card supplies the URL, status label, address, city, price,
    hero image and (from its footer) living area and room count.  The detail
    page supplies postcode, woningtype, bouwjaar, bedrooms and energielabel,
    and takes precedence over the card for status, area and rooms when it
    provides them.

    Paging ends on an empty page or on a page with fewer than 10 cards.
    When ``config.APP_ENV`` is ``"dev"`` the card loop of a page ends after
    the first listing that gets appended.  A card that raises while being
    parsed is logged and skipped.

    Returns:
        The collected listings; possibly empty.
    """
    listings = []
    page = 1
    while True:
        url = f"{_DS_BASE}/aanbod/koopwoningen?page={page}"
        soup = fetch_soup(url)
        cards = soup.select(".col-12.col-md-4.object-wrapper")
        if not cards:
            break

        for card in cards:
            try:
                a_tag = card.select_one("a.property")
                if not a_tag or "href" not in a_tag.attrs:
                    continue
                detail_url = a_tag["href"]
                if not detail_url.startswith("http"):
                    detail_url = _DS_BASE + detail_url

                # Overview-level data.
                status_label = _text(card, "span.label") or "beschikbaar"
                status_label = status_label.strip().lower()
                status = _DS_STATUS_MAP.get(status_label, "beschikbaar")
                adres = _text(card, "h3")
                stad = _text(card, "h4")
                prijs = parse_prijs(_text(card, "div.price"))

                # Footer spans hold the area and the room count.  The area
                # test used to be ``"" in text``, which is always true: the
                # room branch was unreachable and every span overwrote the
                # area.  Test for an explicit unit marker instead.
                woonoppervlak = None
                kamers = None
                for span in card.select("div.footer span"):
                    text = span.get_text(strip=True)
                    lowered = text.lower()
                    if "m²" in lowered or "m2" in lowered:
                        woonoppervlak = parse_m2(text)
                    elif "kamers" in lowered:
                        m = re.search(r"(\d+)", text)
                        if m:
                            kamers = int(m.group(1))

                # A missing ``src`` attribute should not drop the listing.
                img_tag = card.select_one("img")
                hero = img_tag.get("src") if img_tag else None

                detail_data = _ds_detail(detail_url)
                postcode = detail_data.get("postcode")

                # Only override the card status when the detail page
                # actually reports one.
                if detail_data.get("status"):
                    status = _DS_STATUS_MAP.get(detail_data["status"], status)

                listings.append(RawListing(
                    url=detail_url,
                    source_makelaar="dens",
                    adres=adres,
                    postcode=postcode,
                    stad=stad or _infer_stad(postcode),
                    prijs=prijs,
                    status=status,
                    hero_image_url=hero,
                    woningtype=detail_data.get("woningtype"),
                    bouwjaar=int(detail_data["bouwjaar"]) if detail_data.get("bouwjaar") else None,
                    woonoppervlak=parse_m2(detail_data.get("woonoppervlak")) or woonoppervlak,
                    kamers=int(detail_data["kamers"]) if detail_data.get("kamers") else kamers,
                    slaapkamers=int(detail_data["slaapkamers"]) if detail_data.get("slaapkamers") else None,
                    energielabel=detail_data.get("energielabel"),
                ))
                if config.APP_ENV == "dev":
                    break
            except Exception as e:
                log.warning("dens: parse fout: %s", e)

        if len(cards) < 10:
            break
        page += 1

    log.info("dens: %d listings opgehaald", len(listings))
    return listings
# ---------------------------------------------------------------------------
# 3D Makelaars (Schiedam/Vlaardingen)
# ---------------------------------------------------------------------------
_3D_BASE = "https://3dmakelaars.nl"
def _3dmakelaars_detail(detail_url: str) -> dict:
    """Fetch a 3D Makelaars detail page and extract its structured info block.

    Numeric fields are parsed one by one and tolerantly (first run of digits
    in the value).  Previously a single malformed value such as "ca. 1930"
    or an empty field raised and discarded the whole result, postcode
    included.

    Args:
        detail_url: Absolute URL of the listing detail page.

    Returns:
        A dict with ``kamers``, ``slaapkamers`` and ``bouwjaar`` (ints or
        ``None``), ``woningtype`` (raw "bouwvorm" string or ``None``),
        ``woonoppervlak`` (result of ``parse_m2`` on the "oppervlakte"
        value) and ``postcode`` (result of ``_extract_postcode`` on the
        first description paragraph, or ``None``).  Returns ``{}`` after
        logging a warning when the page cannot be fetched or parsed.
    """
    try:
        soup = fetch_soup(detail_url)

        # Info block: each <li> holds a <span> label and a <p> value.
        # The misspelled class names below are the site's own.
        kv: dict[str, str] = {}
        for li in soup.select("div.tl-adiltional-inforamtion ul.tl-adiltional-listed li"):
            label_el = li.select_one("span")
            value_el = li.select_one("p")
            if label_el and value_el:
                kv[label_el.get_text(strip=True).lower()] = value_el.get_text(strip=True)

        def first_int(label: str):
            """Return the first integer found in ``kv[label]``, else None."""
            found = re.search(r"\d+", kv.get(label, ""))
            return int(found.group()) if found else None

        # Postcode is taken from the first paragraph of the description.
        postcode = None
        p_tag = soup.select_one(".omschrijving > p:nth-child(1)")
        if p_tag:
            postcode = _extract_postcode(p_tag.get_text())

        return {
            "kamers": first_int("aantal kamers"),
            "slaapkamers": first_int("aantal slaapkamers"),
            "bouwjaar": first_int("bouwjaar"),
            "woningtype": kv.get("bouwvorm"),
            "woonoppervlak": parse_m2(kv.get("oppervlakte")),
            "postcode": postcode,
        }
    except Exception as e:
        # Best-effort enrichment: the caller copes with an empty dict.
        log.warning("3dmakelaars: detail fetch fout %s: %s", detail_url, e)
        return {}
def fetch_3dmakelaars() -> list[RawListing]:
    """Fetch 3D Makelaars koop listings with pagination and detail pages.

    Each overview card supplies the detail URL (from its ``onclick``
    handler), address, price, room count, living area and hero image.  The
    detail page supplies postcode, woningtype, bouwjaar and bedrooms, and
    acts as fallback for area and room count.  When the detail page yields
    no postcode, one is extracted from the address text.

    Paging ends on an empty page or on a page with fewer than 7 cards.
    When ``config.APP_ENV`` is ``"dev"`` the card loop of a page ends after
    the first listing that gets appended.  A card that raises while being
    parsed is logged and skipped.

    Returns:
        The collected listings; possibly empty.
    """
    listings = []
    page = 1
    while True:
        url = (
            f"{_3D_BASE}/woningen-te-koop-in-schiedam-en-vlaardingen"
            f"?kamers=&oppervlakte=&woonplaats=&video=&prijs=3&page={page}"
        )
        soup = fetch_soup(url)
        cards = soup.select("div.tl-properties-item")
        if not cards:
            break

        for card in cards:
            try:
                # The card is not a link; the target sits in its onclick
                # handler as ``window.location = '...'``.
                onclick = card.get("onclick", "")
                m = re.search(r"window\.location\s*=\s*['\"]([^'\"]+)['\"]", onclick)
                if not m:
                    continue
                detail_url = m.group(1)
                # Only prefix the site root for relative targets, as the
                # other scrapers in this module do.
                if not detail_url.startswith("http"):
                    detail_url = _3D_BASE + detail_url

                # NOTE(review): these selectors look swapped relative to
                # their class names (address from ``h3.price``, price from
                # ``span.address``) — left as-is; confirm against the live
                # markup.
                adres = _text(card, "h3.price")
                prijs = parse_prijs(_text(card, "span.address"))

                # Meta list holds the room count and the living area.  The
                # area test used to include ``"" in text``, which is always
                # true, so every non-room item overwrote the area.
                kamers = None
                woonoppervlak = None
                for li in card.select("ul.tl-meta-listed > li"):
                    text = li.get_text(strip=True)
                    lowered = text.lower()
                    if "kamers" in lowered:
                        m = re.search(r"(\d+)", text)
                        if m:
                            kamers = int(m.group(1))
                    elif "m²" in lowered or "m2" in lowered:
                        woonoppervlak = parse_m2(text)

                # A missing ``src`` attribute should not drop the listing.
                img_tag = card.select_one("img")
                hero = img_tag.get("src") if img_tag else None
                if hero and not hero.startswith("http"):
                    hero = _3D_BASE + hero

                detail_data = _3dmakelaars_detail(detail_url)

                postcode = detail_data.get("postcode")
                if not postcode and adres:
                    postcode = _extract_postcode(adres)

                listings.append(RawListing(
                    url=detail_url,
                    source_makelaar="3dmakelaars",
                    adres=adres,
                    postcode=postcode,
                    stad=_infer_stad(postcode),
                    prijs=prijs,
                    woningtype=detail_data.get("woningtype"),
                    bouwjaar=detail_data.get("bouwjaar"),
                    woonoppervlak=woonoppervlak or detail_data.get("woonoppervlak"),
                    kamers=kamers or detail_data.get("kamers"),
                    slaapkamers=detail_data.get("slaapkamers"),
                    hero_image_url=hero,
                ))
                if config.APP_ENV == "dev":
                    break
            except Exception as e:
                log.warning("3dmakelaars: parse fout: %s", e)

        if len(cards) < 7:
            break
        page += 1

    log.info("3dmakelaars: %d listings opgehaald", len(listings))
    return listings
# ---------------------------------------------------------------------------
# Dupont ERA Makelaars (Schiedam/Rotterdam)
# ---------------------------------------------------------------------------
_DUPONT_BASE = "https://www.dupont.nl"
_DUPONT_STATUS_MAP = {
"te koop": "beschikbaar",
"nieuw": "beschikbaar",
"onder bod": "onder_bod",
"verkocht onder voorbehoud": "onder_bod",
"verkocht": "verkocht",
}
def _dupont_detail(detail_url: str) -> dict:
    """Scrape the kenmerken and the postcode from a Dupont detail page.

    Returns a dict with ``postcode``, ``woningtype``, ``bouwjaar``,
    ``woonoppervlak``, ``kamers``, ``slaapkamers`` and ``energielabel``.
    Apart from the postcode (result of ``_extract_postcode``), values are
    the raw strings from the page, or ``None`` when the label is absent.
    Any error is logged as a warning and yields ``{}``.
    """
    try:
        page = fetch_soup(detail_url)

        # Pair every <dt> with the <dd> at the same position; labels are
        # lower-cased before they are used as keys.
        terms = page.select("dt")
        definitions = page.select("dd")
        kenmerken: dict[str, str] = {
            term.get_text(strip=True).lower(): definition.get_text(strip=True)
            for term, definition in zip(terms, definitions)
        }

        # Postcode comes from a <small> tag (format: "NNNN AA CITY").
        small = page.select_one("section div.container-fluid small")
        postcode = _extract_postcode(small.get_text()) if small else None

        # Output key -> label as it appears on the page.
        label_for = (
            ("woningtype", "soort woning"),
            ("bouwjaar", "bouwjaar"),
            ("woonoppervlak", "woonoppervlakte"),
            ("kamers", "aantal kamers"),
            ("slaapkamers", "aantal slaapkamers"),
            ("energielabel", "energielabel"),
        )
        result = {"postcode": postcode}
        for key, label in label_for:
            result[key] = kenmerken.get(label)
        return result
    except Exception as e:
        log.warning("dupont: detail fetch fout %s: %s", detail_url, e)
        return {}
def fetch_dupont() -> list[RawListing]:
    """Collect Dupont ERA Makelaars koop listings, page by page.

    The overview card supplies URL, address, city, price, status label and
    image; the detail page supplies postcode, woningtype, bouwjaar, living
    area, rooms, bedrooms and energielabel.

    Paging ends on an empty page or on a page with fewer than 10 cards.
    When ``config.APP_ENV`` is ``"dev"`` the card loop of a page ends after
    the first listing that gets appended.  A card that raises while being
    parsed is logged and skipped.
    """

    def as_int(raw):
        # Falsy values (missing key, empty string) become None.
        return int(raw) if raw else None

    listings = []
    page = 1
    while True:
        url = f"{_DUPONT_BASE}/aanbod/koopwoningen?page={page}"
        cards = fetch_soup(url).select("article.object")
        if not cards:
            break

        for card in cards:
            try:
                anchor = card.select_one("a[href]")
                if not (anchor and "href" in anchor.attrs):
                    continue
                link = anchor["href"]
                if not link.startswith("http"):
                    link = _DUPONT_BASE + link

                # Overview-level fields.
                adres = _text(card, "h3")
                stad = _text(card, "h4")
                prijs_text = _text(card, "div.price")
                prijs = parse_prijs(prijs_text)

                # Unknown or missing labels are treated as "beschikbaar".
                label = _text(card, "div.label") or "beschikbaar"
                status = _DUPONT_STATUS_MAP.get(label.strip().lower(), "beschikbaar")

                image = card.select_one("img.img-responsive")
                hero = image["src"] if image else None
                if hero and not hero.startswith("http"):
                    hero = _DUPONT_BASE + hero

                # Detail-level fields.
                detail = _dupont_detail(link)
                postcode = detail.get("postcode")

                plaats = stad or _infer_stad(postcode)
                bouwjaar = as_int(detail.get("bouwjaar"))
                woonoppervlak = parse_m2(detail.get("woonoppervlak"))
                kamers = as_int(detail.get("kamers"))
                slaapkamers = as_int(detail.get("slaapkamers"))

                listings.append(RawListing(
                    url=link,
                    source_makelaar="dupont",
                    adres=adres,
                    postcode=postcode,
                    stad=plaats,
                    prijs=prijs,
                    status=status,
                    hero_image_url=hero,
                    woningtype=detail.get("woningtype"),
                    bouwjaar=bouwjaar,
                    woonoppervlak=woonoppervlak,
                    kamers=kamers,
                    slaapkamers=slaapkamers,
                    energielabel=detail.get("energielabel"),
                ))
                if config.APP_ENV == "dev":
                    break
            except Exception as e:
                log.warning("dupont: parse fout: %s", e)

        if len(cards) < 10:
            break
        page += 1

    log.info("dupont: %d listings opgehaald", len(listings))
    return listings