1619 lines
60 KiB
Python
1619 lines
60 KiB
Python
"""
|
|
adapters/ssr.py — HTML/SSR-based makelaars
|
|
|
|
Elke scraper is een functie () -> list[RawListing].
|
|
Voeg nieuwe toe onderaan en registreer in SCRAPERS.
|
|
"""
|
|
|
|
import logging
|
|
import re
|
|
import time
|
|
|
|
import httpx
|
|
from bs4 import BeautifulSoup
|
|
|
|
import config
|
|
from huizenbot import RawListing
|
|
|
|
log = logging.getLogger("huizenbot.ssr")
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Gedeelde HTTP helper
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def fetch_soup(url: str, *, params: dict | None = None) -> BeautifulSoup:
    """
    Fetch ``url`` with a GET request and return the body parsed as HTML.

    A 429 response is retried, for at most three attempts in total, after
    sleeping for the delay announced in the ``Retry-After`` header.

    Args:
        url: Address to request.
        params: Optional query parameters forwarded to ``httpx.get``.

    Returns:
        The response text parsed with the ``html.parser`` backend.

    Raises:
        RuntimeError: When every attempt was answered with 429.
        Exception: Whatever ``raise_for_status`` raises for any other
            error status.
    """
    max_attempts = 3
    for attempt in range(max_attempts):
        r = httpx.get(
            url,
            params=params,
            headers={"User-Agent": config.USER_AGENT},
            timeout=15,
            follow_redirects=True,
        )
        if r.status_code != 429:
            r.raise_for_status()
            return BeautifulSoup(r.text, "html.parser")

        if attempt == max_attempts - 1:
            # Out of attempts: sleeping again would only delay the error.
            break

        # Retry-After may also be an HTTP-date instead of a number of
        # seconds; int() raises on that, so fall back to the 60 s default.
        try:
            wait = max(0, int(r.headers.get("Retry-After", 60)))
        except (TypeError, ValueError):
            wait = 60
        log.warning("429 op %s, wacht %ds", url, wait)
        time.sleep(wait)

    raise RuntimeError(f"Blijvend 429 op {url}")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Parse helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def parse_prijs(text: str | None) -> int | None:
|
|
"""'€ 325.000 k.k.' → 325000"""
|
|
if not text:
|
|
return None
|
|
digits = re.sub(r"[^\d]", "", text)
|
|
return int(digits) if digits else None
|
|
|
|
|
|
def parse_m2(text: str | None) -> int | None:
|
|
"""'87 m²' → 87"""
|
|
if not text:
|
|
return None
|
|
m = re.search(r"(\d+)", text.replace(".", ""))
|
|
return int(m.group(1)) if m else None
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Realworks CMS (shared)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Lower-cased Realworks status banner text -> internal listing status.
# NOTE(review): "verkocht o.v." counts as "verkocht" here, while the Wassenaar
# map in this file treats it as "onder_bod" — confirm which is intended.
_REALWORKS_STATUS_MAP = {
    **dict.fromkeys(("te koop", "nieuw"), "beschikbaar"),
    **dict.fromkeys(("onder bod", "onder optie"), "onder_bod"),
    **dict.fromkeys(("verkocht o.v.", "verkocht"), "verkocht"),
}
|
|
|
|
|
|
def _realworks_detail(detail_url: str, makelaar: str) -> dict:
    """
    Read the kenmerken of one Realworks detail page.

    Args:
        detail_url: Address of the listing's detail page.
        makelaar: Broker identifier, used only in the warning log.

    Returns:
        A dict with the raw (string or None) values for woningtype, bouwjaar,
        woonoppervlak, perceeloppervlak, kamers, slaapkamers and
        energielabel; an empty dict when fetching or parsing fails.
    """
    # Result key -> lower-cased label as shown on the page.
    wanted = {
        "woningtype": "type woning",
        "bouwjaar": "bouwjaar",
        "woonoppervlak": "woonoppervlakte",
        "perceeloppervlak": "perceeloppervlakte",
        "kamers": "aantal kamers",
        "slaapkamers": "aantal slaapkamers",
        "energielabel": "energieklasse",
    }
    try:
        page = fetch_soup(detail_url)

        features: dict[str, str] = {}
        for row in page.select("span.kenmerk"):
            name_el = row.select_one("span.kenmerkName")
            value_el = row.select_one("span.kenmerkValue")
            if not (name_el and value_el):
                continue
            features[name_el.get_text(strip=True).lower()] = value_el.get_text(strip=True)

        return {field: features.get(label) for field, label in wanted.items()}
    except Exception as e:
        log.warning("%s: detail fetch fout %s: %s", makelaar, detail_url, e)
        return {}
|
|
|
|
|
|
def fetch_realworks(base_url: str, makelaar: str) -> list[RawListing]:
    """
    Scrape the for-sale listings of a broker whose site runs on Realworks CMS.

    Walks ``{base_url}/aanbod/woningaanbod/-{MAX_PRICE}/koop/pagina-{n}/``
    from page 1 and fetches the detail page of every listing card. Paging
    stops at the first page that has no cards or fewer than 10 cards.

    Args:
        base_url: Scheme and host of the broker site, without trailing slash.
        makelaar: Broker identifier stored on every listing and used in logs.

    Returns:
        The listings that could be parsed. A card that raises while being
        parsed is logged and skipped.
    """

    def as_int(value: str | None) -> int | None:
        # First run of digits, so values such as "5 kamers" or "ca. 1930" no
        # longer raise ValueError and drop the whole listing.
        found = re.search(r"\d+", value or "")
        return int(found.group()) if found else None

    listings_path = f"/aanbod/woningaanbod/-{config.MAX_PRICE}/koop"
    listings = []
    page = 1

    while True:
        soup = fetch_soup(f"{base_url}{listings_path}/pagina-{page}/")
        cards = soup.select("li.aanbodEntry")
        if not cards:
            break

        for card in cards:
            try:
                a_tag = card.select_one("a.aanbodEntryLink")
                href = a_tag.get("href") if a_tag else None
                if not href:
                    continue
                # Same convention as the other scrapers in this file: only
                # site-relative links get the host prefixed.
                listing_url = href if href.startswith("http") else base_url + href

                adres = _text(card, ".street-address")
                postcode = (_text(card, ".postal-code") or "").replace(" ", "") or None
                stad = _text(card, ".locality")
                prijs = parse_prijs(_text(card, ".koopprijs .kenmerkValue"))

                status_text = (_text(card, ".objectstatusbanner") or "").lower()
                status = _REALWORKS_STATUS_MAP.get(status_text, "beschikbaar")

                # _src copes with a missing src attribute (falls back to
                # data-src); img["src"] raised KeyError and lost the listing.
                hero = _src(card, ".hoofdfoto img")

                kk = _realworks_detail(listing_url, makelaar)

                listings.append(RawListing(
                    url=listing_url,
                    source_makelaar=makelaar,
                    adres=adres,
                    postcode=postcode,
                    stad=stad,
                    prijs=prijs,
                    status=status,
                    hero_image_url=hero,
                    woningtype=kk.get("woningtype"),
                    bouwjaar=as_int(kk.get("bouwjaar")),
                    woonoppervlak=parse_m2(kk.get("woonoppervlak")),
                    perceeloppervlak=parse_m2(kk.get("perceeloppervlak")),
                    kamers=as_int(kk.get("kamers")),
                    slaapkamers=as_int(kk.get("slaapkamers")),
                    energielabel=kk.get("energielabel"),
                ))
                if config.APP_ENV == "dev":
                    # NOTE(review): this only leaves the card loop, so paging
                    # continues in dev — confirm that is intended.
                    break
            except Exception as e:
                log.warning("%s: parse fout: %s", makelaar, e)

        if len(cards) < 10:
            break
        page += 1

    log.info("%s: %d listings opgehaald", makelaar, len(listings))
    return listings
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Anke Bodewes Makelaardij
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def fetch_ankebodewes() -> list[RawListing]:
    """Scrape Anke Bodewes Makelaardij through the shared Realworks fetcher."""
    site = "https://www.ankebodewes.nl"
    return fetch_realworks(site, "ankebodewes")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Woongoed Makelaars Schiedam
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def fetch_woongoed() -> list[RawListing]:
    """Scrape Woongoed Makelaars through the shared Realworks fetcher."""
    site = "https://www.woongoedmakelaars.nl"
    return fetch_realworks(site, "woongoed")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# De Witte Garantiemakelaars
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Scheme and host of the De Witte site; relative links are appended to it.
_DEWITTE_BASE = "https://dewittegarantiemakelaars.nl"

# "bg-*" CSS class found on a card's status pill -> internal listing status.
_DEWITTE_PILL_MAP = dict([
    ("bg-fun-green", "beschikbaar"),
    ("bg-sold", "verkocht"),
])

# JSON-LD "@type" of the offered item -> internal woningtype.
_DEWITTE_TYPE_MAP = {
    "Apartment": "appartement",
    **dict.fromkeys(("House", "SingleFamilyResidence", "Residence"), "woning"),
}
|
|
|
|
|
|
def _dewitte_jsonld(detail_url: str) -> dict:
    """
    Fetch a De Witte detail page and return its first JSON-LD block.

    Args:
        detail_url: Address of the listing's detail page.

    Returns:
        The parsed JSON-LD object as a dict, or ``{}`` when the page has no
        JSON-LD script, the payload is not an object, or anything fails.
    """
    import json

    try:
        soup = fetch_soup(detail_url)
        tag = soup.select_one('script[type="application/ld+json"]')
        if not tag:
            log.warning("dewitte: geen JSON-LD op %s", detail_url)
            return {}
        data = json.loads(tag.string)
        if isinstance(data, list):
            # JSON-LD may be a top-level array of nodes; callers expect one
            # object, so take the first dict instead of returning the list.
            data = next((item for item in data if isinstance(item, dict)), {})
        if not isinstance(data, dict):
            log.warning("dewitte: geen JSON-LD op %s", detail_url)
            return {}
        return data
    except Exception as e:
        log.warning("dewitte: JSON-LD fout %s: %s", detail_url, e)
        return {}
|
|
|
|
|
|
def fetch_dewittegarantiemakelaars() -> list[RawListing]:
    """
    Scrape the for-sale listings of De Witte Garantiemakelaars.

    Pages through the overview (filtered on ``config.MAX_PRICE``), reads the
    status from each card's pill and takes all other fields from the JSON-LD
    on the detail page. Paging stops at the first page with no cards or
    fewer than 10 cards.

    Returns:
        The listings that could be parsed. Cards without a link or without
        JSON-LD, and listings priced above ``config.MAX_PRICE``, are skipped;
        a card that raises is logged and skipped.
    """
    listings = []
    page = 1

    while True:
        url = (
            f"{_DEWITTE_BASE}/woningaanbod"
            f"?buy_rent=buy&buy_price=1-{config.MAX_PRICE}&page={page}"
        )
        soup = fetch_soup(url)
        cards = soup.select("div.card.card--property")
        if not cards:
            break

        for card in cards:
            try:
                a_tag = card.select_one("a.card__anchor")
                detail_url = a_tag.get("href") if a_tag else None
                if not detail_url:
                    continue
                if not detail_url.startswith("http"):
                    detail_url = _DEWITTE_BASE + detail_url

                pill = card.select_one("span.pill")
                pill_classes = pill.get("class", []) if pill else []
                status_key = next(
                    (c for c in pill_classes if c.startswith("bg-")), None
                )
                # NOTE(review): a missing or unknown pill colour is treated
                # as "onder_bod" — confirm that default.
                status = _DEWITTE_PILL_MAP.get(status_key, "onder_bod")

                ld = _dewitte_jsonld(detail_url)
                if not ld:
                    continue

                # `or {}` / `or ""`: a JSON null arrives as None, on which
                # .get()/.replace() would raise and drop the listing.
                offered = ld.get("itemOffered") or {}
                address = offered.get("address") or {}
                floor_size = offered.get("floorSize") or {}

                postcode = (address.get("postalCode") or "").replace(" ", "") or None
                stad = address.get("addressLocality") or None
                adres = address.get("streetAddress") or None

                prijs = ld.get("price")
                if prijs and int(prijs) > config.MAX_PRICE:
                    continue

                woningtype = _DEWITTE_TYPE_MAP.get(offered.get("@type", ""))
                woonoppervlak = int(floor_size["value"]) if floor_size.get("value") else None
                kamers = offered.get("numberOfRooms")
                bouwjaar = offered.get("yearBuilt")

                # Full-res image from JSON-LD. "image" may be a URL, a list
                # of them, or an object carrying the URL; reduce it to a
                # single string.
                hero = ld.get("image")
                if isinstance(hero, list):
                    hero = hero[0] if hero else None
                if isinstance(hero, dict):
                    hero = hero.get("url") or hero.get("contentUrl")
                if not hero:
                    # Fall back to the card thumbnail (src or data-src).
                    hero = _src(card, "picture img")

                listings.append(RawListing(
                    url=detail_url,
                    source_makelaar="dewittegarantiemakelaars",
                    status=status,
                    adres=adres,
                    postcode=postcode,
                    stad=stad,
                    prijs=int(prijs) if prijs else None,
                    woningtype=woningtype,
                    woonoppervlak=woonoppervlak,
                    kamers=int(kamers) if kamers else None,
                    bouwjaar=int(bouwjaar) if bouwjaar else None,
                    hero_image_url=hero,
                ))
                if config.APP_ENV == "dev":
                    # NOTE(review): only leaves the card loop; paging goes on.
                    break

            except Exception as e:
                log.warning("dewitte: parse fout: %s", e)

        if len(cards) < 10:
            break
        page += 1

    log.info("dewittegarantiemakelaars: %d listings opgehaald", len(listings))
    return listings
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Makelaardij Wassenaar (Schiedam)
|
|
# ---------------------------------------------------------------------------
|
|
# Realworks CMS. Listings page has JSON-LD (Residence) with url/address/price/photo.
|
|
# Detail pages have span.kenmerk with Wassenaar-specific label names.
|
|
|
|
# Scheme and host of the Wassenaar site; relative links are appended to it.
_WASSENAAR_BASE = "https://www.makelaardijwassenaar.nl"

# Lower-cased status banner text -> internal listing status.
_WASSENAAR_STATUS_MAP = {
    **dict.fromkeys(("te koop", "nieuw"), "beschikbaar"),
    **dict.fromkeys(
        ("onder bod", "onder optie", "verkocht o.v.", "verkocht onder voorbehoud"),
        "onder_bod",
    ),
    "verkocht": "verkocht",
}
|
|
|
|
|
|
def _wassenaar_detail(detail_url: str) -> dict:
    """
    Read the kenmerken of one Wassenaar (Realworks) detail page.

    Args:
        detail_url: Address of the listing's detail page.

    Returns:
        A dict with the raw (string or None) values for woningtype, bouwjaar,
        woonoppervlak, perceeloppervlak, kamers, slaapkamers and
        energielabel; an empty dict when fetching or parsing fails.
    """
    # Result key -> lower-cased label as used on the Wassenaar pages.
    wanted = {
        "woningtype": "soort object",
        "bouwjaar": "bouwjaar",
        "woonoppervlak": "woonoppervlakte",
        "perceeloppervlak": "perceeloppervlakte",
        "kamers": "aantal kamers",
        "slaapkamers": "aantal slaapkamers",
        "energielabel": "energieklasse",
    }
    try:
        page = fetch_soup(detail_url)
        features: dict[str, str] = {}
        for row in page.select("span.kenmerk"):
            name_el = row.select_one("span.kenmerkName")
            value_el = row.select_one("span.kenmerkValue")
            if not (name_el and value_el):
                continue
            features[name_el.get_text(strip=True).lower()] = value_el.get_text(strip=True)
        return {field: features.get(label) for field, label in wanted.items()}
    except Exception as e:
        log.warning("wassenaar: detail fetch fout %s: %s", detail_url, e)
        return {}
|
|
|
|
|
|
def fetch_wassenaar() -> list[RawListing]:
    """
    Scrape the for-sale listings of Makelaardij Wassenaar.

    Reads a single overview page: status and thumbnail come from the listing
    anchors, address and price from the JSON-LD ``Residence`` blocks, and the
    remaining kenmerken from each detail page.

    Returns:
        The listings that could be parsed. Listings priced above
        ``config.MAX_PRICE`` are skipped; a block that raises is logged and
        skipped.
    """
    import json as _json

    def as_int(value: str | None) -> int | None:
        # First run of digits, so "5 kamers" or "ca. 1930" no longer raise
        # ValueError and drop the whole listing.
        found = re.search(r"\d+", value or "")
        return int(found.group()) if found else None

    soup = fetch_soup(f"{_WASSENAAR_BASE}/aanbod/woningaanbod/-{config.MAX_PRICE}/koop/")

    # First pass: collect status + thumbnail per relative url.
    # Each listing has two a.aanbodEntryLink with the same href; the first
    # has the status banner + photo, the second has address + price.
    status_by_url: dict[str, str] = {}
    photo_by_url: dict[str, str] = {}
    for a in soup.select("a.aanbodEntryLink[href]"):
        href = a["href"]
        if href in status_by_url:
            continue
        banner = a.select_one(".objectstatusbanner")
        status_text = banner.get_text(strip=True).lower() if banner else ""
        status_by_url[href] = _WASSENAAR_STATUS_MAP.get(status_text, "beschikbaar")
        img = a.select_one("span.hoofdfoto img")
        if img:
            src = img.get("src", "")
            # Skip the placeholder image and images without a src.
            if src and "geenfotobeschikbaar" not in src:
                photo_by_url[href] = src

    # Second pass: parse JSON-LD blocks (one per listing).
    seen: set[str] = set()
    listings = []
    for tag in soup.select('script[type="application/ld+json"]'):
        try:
            if not tag.string:
                continue
            ld = _json.loads(tag.string)
            # Blocks that are not a single Residence object (including
            # top-level arrays) are not listings; skip them quietly instead
            # of logging an AttributeError.
            if not isinstance(ld, dict) or ld.get("@type") != "Residence":
                continue
            rel_url = ld.get("url", "")
            if not rel_url or rel_url in seen:
                continue
            seen.add(rel_url)

            detail_url = rel_url if rel_url.startswith("http") else _WASSENAAR_BASE + rel_url
            address = ld.get("address") or {}
            postcode = (address.get("postalCode") or "").replace(" ", "") or None

            price_spec = next(
                (
                    action.get("priceSpecification", {})
                    for action in (ld.get("potentialAction") or [])
                    if action.get("priceSpecification")
                ),
                {},
            )
            prijs = int(price_spec["price"]) if price_spec.get("price") else None
            if prijs and prijs > config.MAX_PRICE:
                continue

            hero = ld.get("photo") or photo_by_url.get(rel_url)
            status = status_by_url.get(rel_url, "beschikbaar")
            kk = _wassenaar_detail(detail_url)

            listings.append(RawListing(
                url=detail_url,
                source_makelaar="wassenaar",
                status=status,
                adres=address.get("streetAddress") or None,
                postcode=postcode,
                stad=address.get("addressLocality") or None,
                prijs=prijs,
                hero_image_url=hero,
                woningtype=kk.get("woningtype"),
                bouwjaar=as_int(kk.get("bouwjaar")),
                woonoppervlak=parse_m2(kk.get("woonoppervlak")),
                perceeloppervlak=parse_m2(kk.get("perceeloppervlak")),
                kamers=as_int(kk.get("kamers")),
                slaapkamers=as_int(kk.get("slaapkamers")),
                energielabel=kk.get("energielabel"),
            ))
            if config.APP_ENV == "dev":
                break
        except Exception as e:
            log.warning("wassenaar: parse fout: %s", e)

    log.info("wassenaar: %d listings opgehaald", len(listings))
    return listings
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# SSR helper utils
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _text(soup, selector: str) -> str | None:
    """Return the stripped text of the first match for ``selector``, or None if nothing matches."""
    match = soup.select_one(selector)
    if not match:
        return None
    return match.get_text(strip=True)
|
|
|
|
|
|
def _src(soup, selector: str) -> str | None:
    """Return ``src`` (or else ``data-src``) of the first match for ``selector``; None if nothing matches."""
    found = soup.select_one(selector)
    return None if found is None else (found.get("src") or found.get("data-src"))
|
|
|
|
|
|
def _extract_postcode(text: str | None) -> str | None:
|
|
if not text:
|
|
return None
|
|
m = re.search(r"\b(\d{4}\s?[A-Z]{2})\b", text)
|
|
return m.group(1).replace(" ", "") if m else None
|
|
|
|
|
|
def _infer_stad(postcode: str | None) -> str | None:
|
|
"""Simpele mapping op basis van postcode range — uitbreiden naar wens."""
|
|
if not postcode:
|
|
return None
|
|
code = int(postcode[:4])
|
|
if 2600 <= code <= 2629:
|
|
return "Delft"
|
|
if 3100 <= code <= 3135:
|
|
return "Schiedam"
|
|
return None
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# D&S Makelaars (Schiedam)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Scheme and host of the D&S site; relative links are appended to it.
_DS_BASE = "https://www.densmakelaars.nl"

# Lower-cased status label -> internal listing status.
_DS_STATUS_MAP = {
    "onder bod": "onder_bod",
    **dict.fromkeys(("te koop", "nieuw", "beschikbaar"), "beschikbaar"),
    "verkocht": "verkocht",
}
|
|
|
|
|
|
def _ds_detail(detail_url: str, html_text: str | None = None) -> dict:
    """
    Extract kenmerken and postcode from a D&S detail page.

    Args:
        detail_url: Address of the detail page; fetched when ``html_text``
            is not given, and used in the warning log.
        html_text: Already downloaded HTML of the page, if available.

    Returns:
        A dict with status (lower-cased, or None when the page has no status
        row), woningtype, bouwjaar, woonoppervlak, kamers, slaapkamers,
        energielabel (raw strings or None) and postcode; an empty dict when
        fetching or parsing fails.
    """
    try:
        if html_text is None:
            r = httpx.get(
                detail_url,
                headers={"User-Agent": config.USER_AGENT},
                timeout=15,
                follow_redirects=True,
            )
            # Without this an error page would be parsed as if it were a
            # listing.
            r.raise_for_status()
            html_text = r.text

        soup = BeautifulSoup(html_text, "html.parser")

        # Pair every <dt> with the <dd> at the same position in the page.
        # NOTE(review): this assumes dt/dd strictly alternate; one missing
        # <dd> would shift all later pairs — confirm against the site.
        kv: dict[str, str] = {
            dt.get_text(strip=True).lower(): dd.get_text(strip=True)
            for dt, dd in zip(soup.select("dt"), soup.select("dd"))
        }

        # Postcode from the Google Maps URL in the iframe src:
        # q=...,<4 digits> <2 letters>,...
        postcode = None
        m = re.search(r'q=.+?,(\d{4})\s+([A-Z]{2}),', html_text)
        if m:
            postcode = f"{m.group(1)}{m.group(2)}"

        return {
            # None rather than a default "beschikbaar": a default here made
            # the caller overwrite the status it read from the overview card
            # even when the detail page did not state one.
            "status": kv.get("status", "").lower() or None,
            "woningtype": kv.get("soort woning"),
            "bouwjaar": kv.get("bouwjaar"),
            "woonoppervlak": kv.get("woonoppervlakte"),
            "kamers": kv.get("aantal kamers"),
            "slaapkamers": kv.get("aantal slaapkamers"),
            "energielabel": kv.get("energielabel"),
            "postcode": postcode,
        }
    except Exception as e:
        log.warning("dens: detail fetch fout %s: %s", detail_url, e)
        return {}
|
|
|
|
|
|
def fetch_dens() -> list[RawListing]:
    """
    Scrape the for-sale listings of D&S Makelaars, including detail pages.

    Pages through ``/aanbod/koopwoningen?page={n}``; paging stops at the
    first page with no cards or fewer than 10 cards. Values from the detail
    page take precedence over those on the overview card.

    Returns:
        The listings that could be parsed; a card that raises is logged and
        skipped.
    """

    def as_int(value: str | None) -> int | None:
        # First run of digits, so "5 kamers" or "ca. 1930" no longer raise
        # ValueError and drop the whole listing.
        found = re.search(r"\d+", value or "")
        return int(found.group()) if found else None

    listings = []
    page = 1

    while True:
        soup = fetch_soup(f"{_DS_BASE}/aanbod/koopwoningen?page={page}")
        cards = soup.select(".col-12.col-md-4.object-wrapper")
        if not cards:
            break

        for card in cards:
            try:
                a_tag = card.select_one("a.property")
                if not a_tag or "href" not in a_tag.attrs:
                    continue
                detail_url = a_tag["href"]
                if not detail_url.startswith("http"):
                    detail_url = _DS_BASE + detail_url

                # Overview-card data
                status_label = (_text(card, "span.label") or "beschikbaar").strip().lower()
                status = _DS_STATUS_MAP.get(status_label, "beschikbaar")

                adres = _text(card, "h3")
                stad = _text(card, "h4")
                prijs = parse_prijs(_text(card, "div.price"))

                # Area and room count from the card footer
                woonoppervlak = None
                kamers = None
                for span in card.select("div.footer span"):
                    text = span.get_text(strip=True)
                    if "m²" in text:
                        woonoppervlak = parse_m2(text)
                    elif "kamers" in text.lower():
                        count = as_int(text)
                        if count is not None:
                            kamers = count

                # _src copes with a missing src attribute (falls back to
                # data-src); img["src"] raised KeyError and lost the listing.
                hero = _src(card, "img")

                detail_data = _ds_detail(detail_url)

                # Postcode is only available on the detail page.
                postcode = detail_data.get("postcode")

                # A status from the detail page that maps to a known value
                # replaces the one from the card.
                if detail_data.get("status"):
                    status = _DS_STATUS_MAP.get(detail_data["status"], status)

                detail_kamers = as_int(detail_data.get("kamers"))

                listings.append(RawListing(
                    url=detail_url,
                    source_makelaar="dens",
                    adres=adres,
                    postcode=postcode,
                    stad=stad or _infer_stad(postcode),
                    prijs=prijs,
                    status=status,
                    hero_image_url=hero,
                    woningtype=detail_data.get("woningtype"),
                    bouwjaar=as_int(detail_data.get("bouwjaar")),
                    woonoppervlak=parse_m2(detail_data.get("woonoppervlak")) or woonoppervlak,
                    kamers=detail_kamers if detail_kamers is not None else kamers,
                    slaapkamers=as_int(detail_data.get("slaapkamers")),
                    energielabel=detail_data.get("energielabel"),
                ))
                if config.APP_ENV == "dev":
                    # NOTE(review): only leaves the card loop; paging goes on.
                    break
            except Exception as e:
                log.warning("dens: parse fout: %s", e)

        if len(cards) < 10:
            break
        page += 1

    log.info("dens: %d listings opgehaald", len(listings))
    return listings
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 3D Makelaars (Schiedam/Vlaardingen)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Scheme and host of the 3D Makelaars site; site-relative links are appended to it.
_3D_BASE = "https://3dmakelaars.nl"
|
|
|
|
|
|
def _3dmakelaars_detail(detail_url: str) -> dict:
    """
    Extract the structured info block of a 3D Makelaars detail page.

    Args:
        detail_url: Address of the listing's detail page.

    Returns:
        A dict with kamers, slaapkamers, bouwjaar and woonoppervlak (ints or
        None), woningtype (string or None) and postcode; an empty dict when
        fetching or parsing fails.
    """

    def first_int(value: str | None) -> int | None:
        # Tolerant number parsing: with plain int() a single odd value
        # ("ca. 1930", "") raised and discarded every other field as well.
        found = re.search(r"\d+", value or "")
        return int(found.group()) if found else None

    try:
        soup = fetch_soup(detail_url)

        # Info block: each <li> holds a <span> label and a <p> value.
        # (The misspelt class names are the ones this scraper targets.)
        kv: dict[str, str] = {}
        for li in soup.select("div.tl-adiltional-inforamtion ul.tl-adiltional-listed li"):
            label_el = li.select_one("span")
            value_el = li.select_one("p")
            if label_el and value_el:
                kv[label_el.get_text(strip=True).lower()] = value_el.get_text(strip=True)

        # Postcode is looked up in the first paragraph of the description.
        p_tag = soup.select_one(".omschrijving > p:nth-child(1)")
        postcode = _extract_postcode(p_tag.get_text()) if p_tag else None

        return {
            "kamers": first_int(kv.get("aantal kamers")),
            "slaapkamers": first_int(kv.get("aantal slaapkamers")),
            "bouwjaar": first_int(kv.get("bouwjaar")),
            "woningtype": kv.get("bouwvorm"),
            "woonoppervlak": parse_m2(kv.get("oppervlakte")),
            "postcode": postcode,
        }
    except Exception as e:
        log.warning("3dmakelaars: detail fetch fout %s: %s", detail_url, e)
        return {}
|
|
|
|
|
|
def fetch_3dmakelaars() -> list[RawListing]:
    """
    Scrape the for-sale listings of 3D Makelaars, page by page.

    The detail link of a card is taken from its ``onclick`` handler. Paging
    stops at the first page with no cards or fewer than 7 cards.

    Returns:
        The listings that could be parsed; a card that raises is logged and
        skipped. No status is set on these listings.
    """
    listings = []
    page = 1

    while True:
        url = (
            f"{_3D_BASE}/woningen-te-koop-in-schiedam-en-vlaardingen"
            f"?kamers=&oppervlakte=&woonplaats=&video=&prijs=3&page={page}"
        )
        soup = fetch_soup(url)
        cards = soup.select("div.tl-properties-item")
        if not cards:
            break

        for card in cards:
            try:
                # Detail URL lives in onclick="window.location='...'"
                onclick = card.get("onclick", "")
                detail_url = None
                if "window.location" in onclick:
                    m = re.search(r"window\.location\s*=\s*['\"]([^'\"]+)['\"]", onclick)
                    if m:
                        target = m.group(1)
                        # Only site-relative targets get the host prefixed.
                        detail_url = target if target.startswith("http") else _3D_BASE + target

                if not detail_url:
                    continue

                # NOTE(review): address is read from "h3.price" and price
                # from "span.address" — looks swapped, but may simply mirror
                # the site's markup. Confirm against a live page.
                adres = _text(card, "h3.price")
                prijs = parse_prijs(_text(card, "span.address"))

                # Rooms and area from the meta list
                kamers = None
                woonoppervlak = None
                for li in card.select("ul.tl-meta-listed > li"):
                    text = li.get_text(strip=True)
                    if "kamers" in text.lower():
                        m = re.search(r"(\d+)", text)
                        if m:
                            kamers = int(m.group(1))
                    elif "m²" in text or "m2" in text:
                        woonoppervlak = parse_m2(text)

                # _src copes with a missing src attribute (falls back to
                # data-src); img["src"] raised KeyError and lost the listing.
                hero = _src(card, "img")
                if hero and not hero.startswith("http"):
                    hero = _3D_BASE + hero

                detail_data = _3dmakelaars_detail(detail_url)

                # Postcode from the detail page, else try the address text.
                postcode = detail_data.get("postcode")
                if not postcode and adres:
                    postcode = _extract_postcode(adres)

                listings.append(RawListing(
                    url=detail_url,
                    source_makelaar="3dmakelaars",
                    adres=adres,
                    postcode=postcode,
                    stad=_infer_stad(postcode),
                    prijs=prijs,
                    woningtype=detail_data.get("woningtype"),
                    bouwjaar=detail_data.get("bouwjaar"),
                    woonoppervlak=woonoppervlak or detail_data.get("woonoppervlak"),
                    kamers=kamers or detail_data.get("kamers"),
                    slaapkamers=detail_data.get("slaapkamers"),
                    hero_image_url=hero,
                ))
                if config.APP_ENV == "dev":
                    # NOTE(review): only leaves the card loop; paging goes on.
                    break
            except Exception as e:
                log.warning("3dmakelaars: parse fout: %s", e)

        if len(cards) < 7:
            break
        page += 1

    log.info("3dmakelaars: %d listings opgehaald", len(listings))
    return listings
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Dupont ERA Makelaars (Schiedam/Rotterdam)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Scheme and host of the Dupont site; relative links are appended to it.
_DUPONT_BASE = "https://www.dupont.nl"

# Lower-cased status label -> internal listing status.
_DUPONT_STATUS_MAP = {
    **dict.fromkeys(("te koop", "nieuw"), "beschikbaar"),
    **dict.fromkeys(("onder bod", "verkocht onder voorbehoud"), "onder_bod"),
    "verkocht": "verkocht",
}
|
|
|
|
|
|
def _dupont_detail(detail_url: str) -> dict:
    """
    Read postcode and kenmerken from a Dupont detail page.

    Args:
        detail_url: Address of the listing's detail page.

    Returns:
        A dict with postcode plus the raw (string or None) values for
        woningtype, bouwjaar, woonoppervlak, kamers, slaapkamers and
        energielabel; an empty dict when fetching or parsing fails.
    """
    # Result key -> lower-cased <dt> label.
    wanted = (
        ("woningtype", "soort woning"),
        ("bouwjaar", "bouwjaar"),
        ("woonoppervlak", "woonoppervlakte"),
        ("kamers", "aantal kamers"),
        ("slaapkamers", "aantal slaapkamers"),
        ("energielabel", "energielabel"),
    )
    try:
        page = fetch_soup(detail_url)

        # The n-th <dt> on the page is paired with the n-th <dd>.
        features: dict[str, str] = {
            term.get_text(strip=True).lower(): desc.get_text(strip=True)
            for term, desc in zip(page.select("dt"), page.select("dd"))
        }

        # The <small> tag holds a line of the form "NNNN AA CITY".
        place_line = page.select_one("section div.container-fluid small")
        postcode = None
        if place_line:
            postcode = _extract_postcode(place_line.get_text())

        details = {"postcode": postcode}
        details.update((field, features.get(label)) for field, label in wanted)
        return details
    except Exception as e:
        log.warning("dupont: detail fetch fout %s: %s", detail_url, e)
        return {}
|
|
|
|
|
|
def fetch_dupont() -> list[RawListing]:
    """
    Scrape the for-sale listings of Dupont ERA Makelaars, with detail pages.

    Pages through ``/aanbod/koopwoningen?page={n}``; paging stops at the
    first page with no cards or fewer than 10 cards.

    Returns:
        The listings that could be parsed; a card that raises is logged and
        skipped.
    """

    def as_int(value: str | None) -> int | None:
        # First run of digits, so "5 kamers" or "ca. 1930" no longer raise
        # ValueError and drop the whole listing.
        found = re.search(r"\d+", value or "")
        return int(found.group()) if found else None

    listings = []
    page = 1

    while True:
        soup = fetch_soup(f"{_DUPONT_BASE}/aanbod/koopwoningen?page={page}")
        cards = soup.select("article.object")
        if not cards:
            break

        for card in cards:
            try:
                # The selector already guarantees an href attribute.
                a_tag = card.select_one("a[href]")
                if not a_tag:
                    continue
                detail_url = a_tag["href"]
                if not detail_url.startswith("http"):
                    detail_url = _DUPONT_BASE + detail_url

                # Overview-card data
                adres = _text(card, "h3")
                stad = _text(card, "h4")
                prijs = parse_prijs(_text(card, "div.price"))

                status_label = (_text(card, "div.label") or "beschikbaar").strip().lower()
                status = _DUPONT_STATUS_MAP.get(status_label, "beschikbaar")

                # _src copes with a missing src attribute (falls back to
                # data-src); img["src"] raised KeyError and lost the listing.
                hero = _src(card, "img.img-responsive")
                if hero and not hero.startswith("http"):
                    hero = _DUPONT_BASE + hero

                detail_data = _dupont_detail(detail_url)

                # Postcode is only available on the detail page.
                postcode = detail_data.get("postcode")

                listings.append(RawListing(
                    url=detail_url,
                    source_makelaar="dupont",
                    adres=adres,
                    postcode=postcode,
                    stad=stad or _infer_stad(postcode),
                    prijs=prijs,
                    status=status,
                    hero_image_url=hero,
                    woningtype=detail_data.get("woningtype"),
                    bouwjaar=as_int(detail_data.get("bouwjaar")),
                    woonoppervlak=parse_m2(detail_data.get("woonoppervlak")),
                    kamers=as_int(detail_data.get("kamers")),
                    slaapkamers=as_int(detail_data.get("slaapkamers")),
                    energielabel=detail_data.get("energielabel"),
                ))
                if config.APP_ENV == "dev":
                    # NOTE(review): only leaves the card loop; paging goes on.
                    break

            except Exception as e:
                log.warning("dupont: parse fout: %s", e)

        if len(cards) < 10:
            break
        page += 1

    log.info("dupont: %d listings opgehaald", len(listings))
    return listings
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Schieland Borsboom NVM Makelaars (Rotterdam, actief in Schiedam)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Scheme and host of the Schieland Borsboom site; relative links are appended to it.
_SCHIELAND_BASE = "https://www.schielandborsboom.nl"

# Lower-cased status text -> internal listing status.
_SCHIELAND_STATUS_MAP = {
    "beschikbaar": "beschikbaar",
    **dict.fromkeys(("onder bod", "onder optie"), "onder_bod"),
    **dict.fromkeys(("verkocht o.v.", "verkocht"), "verkocht"),
}
|
|
|
|
|
|
def _schieland_detail(detail_url: str) -> dict:
    """
    Read postcode and kenmerken from a Schieland Borsboom detail page.

    Args:
        detail_url: Address of the listing's detail page.

    Returns:
        A dict with postcode, status (lower-cased, ``""`` when absent) and
        the raw (string or None) values for woningtype, bouwjaar,
        woonoppervlak, perceeloppervlak, kamers, slaapkamers and
        energielabel; an empty dict when fetching or parsing fails.
    """
    # Result key -> lower-cased label in the #kenmerken section.
    wanted = (
        ("woningtype", "soort bouw"),
        ("bouwjaar", "bouwjaar"),
        ("woonoppervlak", "woonoppervlakte"),
        ("perceeloppervlak", "perceeloppervlakte"),
        ("kamers", "aantal kamers"),
        ("slaapkamers", "aantal slaapkamers"),
        ("energielabel", "energielabel"),
    )
    try:
        page = fetch_soup(detail_url)

        # The paragraph reads like "3117 DP Schiedam".
        place_line = page.select_one("div.house__status p")
        postcode = None
        if place_line:
            postcode = _extract_postcode(place_line.get_text())

        # Rows look like <li><strong>label</strong><span>value</span></li>.
        features: dict[str, str] = {}
        section = page.select_one("#kenmerken")
        rows = section.select("li") if section else []
        for row in rows:
            name_el = row.select_one("strong")
            value_el = row.select_one("span")
            if not (name_el and value_el):
                continue
            # Remove nested links (e.g. "Hypotheek berekenen") from the value.
            for link in value_el.select("a"):
                link.decompose()
            features[name_el.get_text(strip=True).lower()] = value_el.get_text(strip=True)

        details = {
            "postcode": postcode,
            "status": features.get("status", "").lower(),
        }
        details.update((field, features.get(label)) for field, label in wanted)
        return details
    except Exception as e:
        log.warning("schielandborsboom: detail fetch fout %s: %s", detail_url, e)
        return {}
|
|
|
|
|
|
def fetch_schielandborsboom() -> list[RawListing]:
    """
    Scrape the Schieland Borsboom for-sale listings located in Schiedam.

    Pages through ``/wonen`` (``/wonen/page/{n}/`` from page 2); paging stops
    at the first page with no cards or fewer than 18 cards. Values from the
    detail page take precedence over those on the overview card.

    Returns:
        The listings that could be parsed. Cards outside Schiedam or priced
        above ``config.MAX_PRICE`` are skipped; a card that raises is logged
        and skipped.
    """

    def as_int(value: str | None) -> int | None:
        # First run of digits: "5 kamers" -> 5, "ca. 1930" -> 1930.
        found = re.search(r"\d+", value or "")
        return int(found.group()) if found else None

    listings = []
    page = 1

    while True:
        if page == 1:
            url = f"{_SCHIELAND_BASE}/wonen?sure_koop_huur=koop"
        else:
            url = f"{_SCHIELAND_BASE}/wonen/page/{page}/?sure_koop_huur=koop"

        soup = fetch_soup(url)
        cards = soup.select("div.card.card--house")
        if not cards:
            break

        for card in cards:
            try:
                a_tag = card.select_one("a.card__anchor")
                if not a_tag or "href" not in a_tag.attrs:
                    continue
                detail_url = a_tag["href"]
                if not detail_url.startswith("http"):
                    detail_url = _SCHIELAND_BASE + detail_url

                # Filter: only Schiedam
                stad_el = card.select_one("p.house-place")
                stad = stad_el.get_text(strip=True) if stad_el else None
                if not stad or stad.lower() != "schiedam":
                    continue

                # Status from the extra classes on card-house__thumb.
                # Classes are split on whitespace, so a status such as
                # "onder bod" arrives as two tokens and never matched as a
                # single class. Try the joined text first, then each token.
                thumb = card.select_one("div.card-house__thumb")
                extra = [
                    c.lower()
                    for c in (thumb.get("class", []) if thumb else [])
                    if c != "card-house__thumb"
                ]
                status = next(
                    (
                        _SCHIELAND_STATUS_MAP[c]
                        for c in [" ".join(extra), *extra]
                        if c in _SCHIELAND_STATUS_MAP
                    ),
                    "beschikbaar",
                )

                # Price
                prijs = parse_prijs(_text(card, "p.price"))
                if prijs and prijs > config.MAX_PRICE:
                    continue

                adres = _text(card, "h4.house-street")

                # Hero image from the picture source (medium size). srcset is
                # a candidate list ("url 1x, url 2x"), not a URL, so keep
                # only the first URL; fall back to the <img> otherwise.
                src_tag = card.select_one('picture source[media="(min-width:100px)"]')
                srcset = src_tag.get("srcset") if src_tag else None
                if srcset and srcset.split():
                    hero = srcset.split()[0].rstrip(",")
                else:
                    hero = _src(card, "img")
                if hero and not hero.startswith("http"):
                    hero = _SCHIELAND_BASE + hero

                # Data icons on the card: surface, bedrooms, energy label
                woonoppervlak_card = None
                slaapkamers_card = None
                energielabel_card = None
                for data_div in card.select("div.data"):
                    txt = data_div.get_text(strip=True)
                    if data_div.select_one("i.icon-surface"):
                        woonoppervlak_card = parse_m2(txt)
                    elif data_div.select_one("i.icon-bedrooms"):
                        slaapkamers_card = as_int(txt)
                    elif data_div.select_one("i.icon-label"):
                        energielabel_card = txt.strip() or None

                # Detail page for the full kenmerken
                kk = _schieland_detail(detail_url)

                # A status from the detail page that maps to a known value
                # replaces the one from the card.
                if kk.get("status"):
                    status = _SCHIELAND_STATUS_MAP.get(kk["status"], status)

                # "5 kamers" -> 5
                kamers = as_int(kk.get("kamers"))

                # "3" or "3 slaapkamers" -> 3, else the value from the card
                slaapkamers = as_int(kk.get("slaapkamers"))
                if slaapkamers is None:
                    slaapkamers = slaapkamers_card

                listings.append(RawListing(
                    url=detail_url,
                    source_makelaar="schielandborsboom",
                    status=status,
                    adres=adres,
                    postcode=kk.get("postcode"),
                    stad=stad,
                    prijs=prijs,
                    hero_image_url=hero,
                    woningtype=kk.get("woningtype"),
                    # int() raised on values like "ca. 1930" and dropped the
                    # listing; as_int takes the first number instead.
                    bouwjaar=as_int(kk.get("bouwjaar")),
                    woonoppervlak=parse_m2(kk.get("woonoppervlak")) or woonoppervlak_card,
                    perceeloppervlak=parse_m2(kk.get("perceeloppervlak")),
                    kamers=kamers,
                    slaapkamers=slaapkamers,
                    energielabel=kk.get("energielabel") or energielabel_card,
                ))
                if config.APP_ENV == "dev":
                    # NOTE(review): only leaves the card loop; paging goes on.
                    break
            except Exception as e:
                log.warning("schielandborsboom: parse fout: %s", e)

        if len(cards) < 18:
            break
        page += 1

    log.info("schielandborsboom: %d listings opgehaald", len(listings))
    return listings
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Van Silfhout & Hogetoorn Wereldmakelaars (Delft)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Scheme and host of the Van Silfhout site; relative links are appended to it.
_VANSILFHOUT_BASE = "https://www.vansilfhout.nl"

# Lower-cased status text -> internal listing status.
_VANSILFHOUT_STATUS_MAP = {
    "te koop": "beschikbaar",
    "onder bod": "onder_bod",
    **dict.fromkeys(("verkocht onder voorbehoud", "verkocht"), "verkocht"),
}
|
|
|
|
|
|
def _vansilfhout_detail(detail_url: str) -> dict:
    """Fetch a Van Silfhout detail page and extract the postcode and key specs.

    The postcode is read from an inline JavaScript snippet of the form
    ``objectZipcode': '2624NP'``. The other values come from the
    ``.shortSpecs`` list (``<li><span>Label:</span><span>Value</span></li>``).

    Args:
        detail_url: URL of the listing's detail page.

    Returns:
        A dict with the keys ``postcode``, ``bouwjaar``, ``woonoppervlak``,
        ``kamers`` and ``slaapkamers``. Each value is the raw text found on
        the page, or ``None`` when it is absent. An empty dict is returned
        when the page cannot be fetched or parsed; the failure is logged and
        never raised.
    """
    try:
        # The raw HTML is needed for the postcode regex, so the request is
        # made directly with the module-level httpx import.
        response = httpx.get(
            detail_url,
            headers={"User-Agent": config.USER_AGENT},
            timeout=15,
            follow_redirects=True,
        )
        response.raise_for_status()
        html = response.text

        # Postcode embedded in JS: objectZipcode': '2624NP'
        zipcode_match = re.search(r"objectZipcode':\s*'([^']+)'", html)
        postcode = zipcode_match.group(1) if zipcode_match else None

        # shortSpecs: label in the first span, value in the last span.
        specs: dict[str, str] = {}
        for item in BeautifulSoup(html, "html.parser").select(".shortSpecs li"):
            spans = item.select("span")
            if len(spans) < 2:
                continue
            label = spans[0].get_text(strip=True).rstrip(":").lower()
            specs[label] = spans[-1].get_text(strip=True)

        return {
            "postcode": postcode,
            "bouwjaar": specs.get("bouwjaar"),
            "woonoppervlak": specs.get("oppervlakte"),
            "kamers": specs.get("kamers"),
            "slaapkamers": specs.get("slaapkamers"),
        }
    except Exception as e:
        # Best effort: the caller falls back to the card data.
        log.warning("vansilfhout: detail fetch fout %s: %s", detail_url, e)
        return {}
|
|
|
|
|
|
def _vansilfhout_int(text: str | None) -> int | None:
    """Return the first run of digits in *text* as an int, or None if there is none."""
    match = re.search(r"\d+", text or "")
    return int(match.group()) if match else None


def fetch_vansilfhout() -> list[RawListing]:
    """Fetch the Van Silfhout woningaanbod (all listings are on a single page).

    Status, address, city, price, area, room counts and the hero image are
    read from each overview card. ``_vansilfhout_detail`` then supplies the
    postcode and detail-page specs, which take precedence over the card
    values. Listings priced above ``config.MAX_PRICE`` are skipped before the
    detail page is requested. When ``config.APP_ENV`` is ``"dev"`` the loop
    stops after the first listing.

    Returns:
        The parsed listings. A card that fails to parse is logged and skipped.
    """
    soup = fetch_soup(f"{_VANSILFHOUT_BASE}/woningaanbod/")
    listings = []

    for card in soup.select("article.row"):
        try:
            a_tag = card.select_one("a.objectcontainerimg")
            if not a_tag or "href" not in a_tag.attrs:
                continue
            detail_url = a_tag["href"]
            if not detail_url.startswith("http"):
                detail_url = _VANSILFHOUT_BASE + detail_url

            # Status
            status_text = (_text(card, "span.objectstatus") or "").lower()
            status = _VANSILFHOUT_STATUS_MAP.get(status_text, "beschikbaar")

            # Address and city
            adres = _text(card, "h2.objecttitle")
            city_el = card.select("a.straatnaamwoonplaats span")
            stad = city_el[-1].get_text(strip=True) if city_el else None

            # Price from shortSpecs strong; filter before the detail fetch
            prijs = parse_prijs(_text(card, "ul.shortSpecs li strong"))
            if prijs and prijs > config.MAX_PRICE:
                continue

            # Area and rooms from shortSpecs
            woonoppervlak_card = None
            kamers_card = None
            slaapkamers_card = None
            for li in card.select("ul.shortSpecs li"):
                spans = li.select("span")
                if len(spans) < 2:
                    continue
                label = spans[0].get_text(strip=True).lower()
                val = spans[-1].get_text(strip=True)
                if "oppervlakt" in label:
                    woonoppervlak_card = parse_m2(val)
                elif "slaapkamer" in label:
                    # Must be tested before "kamer": "slaapkamers" contains
                    # "kamer" and would otherwise overwrite the room count.
                    slaapkamers_card = _vansilfhout_int(val)
                elif "kamer" in label:
                    kamers_card = _vansilfhout_int(val)

            # Hero image: prefer data-lazy-src, fall back to noscript img src
            img_tag = card.select_one("a.objectcontainerimg img")
            hero = None
            if img_tag:
                hero = img_tag.get("data-lazy-src") or img_tag.get("src") or None
                if hero and hero.startswith("data:"):
                    noscript = card.select_one("noscript img")
                    # .get(): a noscript <img> without src must not drop the listing.
                    hero = noscript.get("src") if noscript else None

            kk = _vansilfhout_detail(detail_url)

            # Detail-page values win; card values are the fallback.
            kamers = _vansilfhout_int(kk.get("kamers"))
            if kamers is None:
                kamers = kamers_card
            slaapkamers = _vansilfhout_int(kk.get("slaapkamers"))
            if slaapkamers is None:
                slaapkamers = slaapkamers_card

            listings.append(RawListing(
                url=detail_url,
                source_makelaar="vansilfhout",
                status=status,
                adres=adres,
                postcode=kk.get("postcode"),
                stad=stad,
                prijs=prijs,
                hero_image_url=hero,
                # Digit extraction tolerates text around the year.
                bouwjaar=_vansilfhout_int(kk.get("bouwjaar")),
                woonoppervlak=parse_m2(kk.get("woonoppervlak")) or woonoppervlak_card,
                kamers=kamers,
                slaapkamers=slaapkamers,
            ))
            if config.APP_ENV == "dev":
                break
        except Exception as e:
            log.warning("vansilfhout: parse fout: %s", e)

    log.info("vansilfhout: %d listings opgehaald", len(listings))
    return listings
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# V&W Makelaars Delft / ZO Makelaars (Delft) — Realworks CMS
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def fetch_vwmakelaars() -> list[RawListing]:
    """Fetch the V&W Makelaars Delft listings via the shared Realworks scraper."""
    base_url, makelaar = "https://www.vwmakelaars.nl", "vwmakelaars"
    return fetch_realworks(base_url, makelaar)
|
|
|
|
|
|
def fetch_zomakelaars() -> list[RawListing]:
    """Fetch the ZO Makelaars (Delft) listings via the shared Realworks scraper."""
    base_url, makelaar = "https://www.zomakelaars.nl", "zomakelaars"
    return fetch_realworks(base_url, makelaar)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Roepman Makelaardij NVM (Delft)
|
|
# ---------------------------------------------------------------------------
|
|
# Realworks CMS, but with div.aanbodEntry instead of li.aanbodEntry.
# The price is in JSON-LD (same structure as Wassenaar).

# Root URL of the Roepman website.
_ROEPMAN_BASE = "https://www.roepman.nl"
|
|
|
|
|
|
def fetch_roepman() -> list[RawListing]:
    """Fetch Roepman koop listings priced up to ``config.MAX_PRICE``.

    Each results page is read in two passes. The ``div.aanbodEntry`` cards
    supply the status banner and a photo per listing URL. The JSON-LD
    ``Residence`` blocks supply the address and price. The remaining kenmerken
    come from ``_realworks_detail``. Paging stops on a page without cards or
    with fewer than 10 cards.

    Returns:
        The parsed listings. A JSON-LD block that fails to parse is logged
        and skipped.
    """
    import json

    listings_path = f"/aanbod/woningaanbod/-{config.MAX_PRICE}/koop"
    listings = []
    page = 1

    while True:
        soup = fetch_soup(f"{_ROEPMAN_BASE}{listings_path}/pagina-{page}/")
        cards = soup.select("div.aanbodEntry")
        if not cards:
            break

        # Collect status + photo per relative url (first card for a url wins).
        status_by_url: dict[str, str] = {}
        photo_by_url: dict[str, str] = {}
        for card in cards:
            a_tag = card.select_one("a.aanbodEntryLink[href]")
            if not a_tag:
                continue
            href = a_tag["href"]
            if href in status_by_url:
                continue
            banner = card.select_one(".objectstatusbanner")
            status_text = banner.get_text(strip=True).lower() if banner else ""
            status_by_url[href] = _REALWORKS_STATUS_MAP.get(status_text, "beschikbaar")
            img = card.select_one("img")
            src = img.get("src", "") if img else ""
            # Skip the placeholder image and empty src values.
            if src and "geenfotobeschikbaar" not in src:
                photo_by_url[href] = src

        # Parse JSON-LD Residence blocks (one per listing)
        seen: set[str] = set()
        for tag in soup.select('script[type="application/ld+json"]'):
            try:
                # An empty script tag has .string None; treat it as JSON null.
                ld = json.loads(tag.string or "null")
                if not isinstance(ld, dict) or ld.get("@type") != "Residence":
                    continue
                rel_url = ld.get("url", "")
                if not rel_url or rel_url in seen:
                    continue
                seen.add(rel_url)

                detail_url = _ROEPMAN_BASE + rel_url
                # "or" guards against explicit JSON nulls, not only missing keys.
                address = ld.get("address") or {}
                postcode = (address.get("postalCode") or "").replace(" ", "") or None

                price_spec = next(
                    (
                        action.get("priceSpecification", {})
                        for action in ld.get("potentialAction", [])
                        if action.get("priceSpecification")
                    ),
                    {},
                )
                raw_price = price_spec.get("price")
                # float() first so a decimal string such as "325000.00" parses.
                prijs = int(float(raw_price)) if raw_price else None
                if prijs and prijs > config.MAX_PRICE:
                    continue

                hero = ld.get("photo") or photo_by_url.get(rel_url)
                status = status_by_url.get(rel_url, "beschikbaar")
                kk = _realworks_detail(detail_url, "roepman")

                listings.append(RawListing(
                    url=detail_url,
                    source_makelaar="roepman",
                    status=status,
                    adres=address.get("streetAddress") or None,
                    postcode=postcode,
                    stad=address.get("addressLocality") or None,
                    prijs=prijs,
                    hero_image_url=hero,
                    woningtype=kk.get("woningtype"),
                    bouwjaar=int(kk["bouwjaar"]) if kk.get("bouwjaar") else None,
                    woonoppervlak=parse_m2(kk.get("woonoppervlak")),
                    perceeloppervlak=parse_m2(kk.get("perceeloppervlak")),
                    kamers=int(kk["kamers"]) if kk.get("kamers") else None,
                    slaapkamers=int(kk["slaapkamers"]) if kk.get("slaapkamers") else None,
                    energielabel=kk.get("energielabel"),
                ))
                if config.APP_ENV == "dev":
                    break
            except Exception as e:
                log.warning("roepman: parse fout: %s", e)

        if len(cards) < 10:
            break
        page += 1

    log.info("roepman: %d listings opgehaald", len(listings))
    return listings
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Post Makelaardij (v/h Bayense) — Delft & omgeving
|
|
# ---------------------------------------------------------------------------
|
|
# Custom Tailwind CSS site; covers Delft, Pijnacker, Rijswijk etc.
|
|
# Filter for Delft only.
|
|
|
|
# Root URL of the Post Makelaardij website.
_POST_BASE = "https://www.postmakelaardij.nl"

# Lower-cased status text on a listing card -> internal status value.
# Texts that are not listed here fall back to "beschikbaar" at the call site.
_POST_STATUS_MAP = {
    "te koop": "beschikbaar",
    "onder bod": "onder_bod",
    "verkocht": "verkocht",
}
|
|
|
|
|
|
def _post_detail(detail_url: str) -> dict:
    """Fetch a Post Makelaardij detail page and extract kenmerken.

    The energielabel is taken from the first CSS class of the form
    ``energielabel-{letter}``. Areas and the bedroom count are taken from the
    ``span.object-info-icon-text`` elements.

    Args:
        detail_url: URL of the listing's detail page.

    Returns:
        A dict with the keys ``woonoppervlak``, ``perceeloppervlak``,
        ``slaapkamers`` (ints or ``None``) and ``energielabel`` (upper-cased
        string or ``None``). An empty dict is returned when fetching or
        parsing fails; the failure is logged and never raised.
    """
    try:
        soup = fetch_soup(detail_url)

        # Energielabel from CSS class: energielabel-{letter}
        prefix = "energielabel-"
        energielabel = next(
            (
                cls.removeprefix(prefix).upper()
                for el in soup.select("[class]")
                for cls in el.get("class", [])
                # A bare "energielabel-" carries no label; keep searching.
                if cls.startswith(prefix) and cls != prefix
            ),
            None,
        )

        # Woonoppervlak, perceeloppervlak, slaapkamers from icon spans
        woonoppervlak = None
        perceeloppervlak = None
        slaapkamers = None
        for span in soup.select("span.object-info-icon-text"):
            txt = span.get_text(strip=True)
            # Compare case-insensitively: a capitalised "Perceel ... m²" would
            # otherwise reach the m² branch and overwrite the living area.
            lowered = txt.lower()
            if "slaapkamer" in lowered:
                m = re.search(r"(\d+)", txt)
                slaapkamers = int(m.group(1)) if m else None
            elif "perceel" in lowered:
                perceeloppervlak = parse_m2(txt)
            elif "m²" in lowered or "m2" in lowered:
                woonoppervlak = parse_m2(txt)

        return {
            "woonoppervlak": woonoppervlak,
            "perceeloppervlak": perceeloppervlak,
            "slaapkamers": slaapkamers,
            "energielabel": energielabel,
        }
    except Exception as e:
        # Best effort: the caller receives an empty dict and keeps the listing.
        log.warning("post: detail fetch fout %s: %s", detail_url, e)
        return {}
|
|
|
|
|
|
def fetch_post() -> list[RawListing]:
    """Fetch Post Makelaardij listings; only Delft, only koop.

    Each ``article`` card on the paginated overview is filtered on city
    (Delft) and on ``config.MAX_PRICE`` before the detail page is requested
    through ``_post_detail``. Paging stops on a page without cards or with
    fewer than 12 cards.

    Returns:
        The parsed listings. A card that fails to parse is logged and skipped.
    """
    listings = []
    page = 1

    while True:
        soup = fetch_soup(f"{_POST_BASE}/woningaanbod/koop?page={page}")
        cards = soup.select("article")
        if not cards:
            break

        for card in cards:
            try:
                # URL: first link in the card
                a_tag = card.select_one("a[href]")
                if not a_tag:
                    continue
                href = a_tag["href"]
                detail_url = href if href.startswith("http") else _POST_BASE + href

                # Postcode + city from span.custom-postcode-text
                pc_el = card.select_one("span.custom-postcode-text")
                if not pc_el:
                    continue
                pc_parts = pc_el.get_text(strip=True).split()
                if len(pc_parts) < 3:
                    continue
                postcode = pc_parts[0] + pc_parts[1]  # "2613BD"
                stad = " ".join(pc_parts[2:])         # "Delft"

                # Filter: only Delft
                if stad.lower() != "delft":
                    continue

                # Price: filter early, before the detail fetch
                prijs = parse_prijs(_text(card, "span.price-block"))
                if prijs and prijs > config.MAX_PRICE:
                    continue

                # Status from span.status text
                status_text = (_text(card, "span.status") or "").lower()
                status = _POST_STATUS_MAP.get(status_text, "beschikbaar")

                # Address
                adres = _text(card, "h4.custom-address-text")

                # Hero: first img in article. .get() so that an <img> without
                # a src attribute yields None instead of dropping the listing.
                img = card.select_one("img")
                hero = (img.get("src") or None) if img else None

                kk = _post_detail(detail_url)

                listings.append(RawListing(
                    url=detail_url,
                    source_makelaar="post",
                    status=status,
                    adres=adres,
                    postcode=postcode,
                    stad=stad,
                    prijs=prijs,
                    hero_image_url=hero,
                    woonoppervlak=kk.get("woonoppervlak"),
                    perceeloppervlak=kk.get("perceeloppervlak"),
                    slaapkamers=kk.get("slaapkamers"),
                    energielabel=kk.get("energielabel"),
                ))
                if config.APP_ENV == "dev":
                    break
            except Exception as e:
                log.warning("post: parse fout: %s", e)

        if len(cards) < 12:
            break
        page += 1

    log.info("post: %d listings opgehaald", len(listings))
    return listings
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Morris NVM Makelaars (Delft) — Realworks CMS
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def fetch_morris() -> list[RawListing]:
    """Fetch the Morris NVM Makelaars listings via the shared Realworks scraper."""
    base_url, makelaar = "https://www.morrismakelaardij.nl", "morris"
    return fetch_realworks(base_url, makelaar)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Olsthoorn Makelaars Delft (SURE WordPress plugin)
|
|
# ---------------------------------------------------------------------------
|
|
# Covers Delft, Den Haag, Naaldwijk etc — we filter for Delft only.
|
|
# Detail page has no postcode; leave as None.
|
|
|
|
# Root URL of the Olsthoorn Makelaars website.
_OLSTHOORN_BASE = "https://www.olsthoornmakelaars.nl"

# CSS badge class on the card's label span -> internal status value.
_OLSTHOORN_STATUS_MAP = {
    "badge-available": "beschikbaar",
    "badge-bid": "onder_bod",
    "badge-option": "onder_bod",
    "badge-sold": "verkocht",
}

# Lower-cased "status" kenmerk on the detail page -> internal status value.
_OLSTHOORN_DETAIL_STATUS_MAP = {
    "beschikbaar": "beschikbaar",
    "onder bod": "onder_bod",
    "onder optie": "onder_bod",
    "verkocht": "verkocht",
}
|
|
|
|
|
|
def _olsthoorn_detail(detail_url: str) -> dict:
    """Fetch an Olsthoorn detail page and read the ``#kenmerken`` label/value pairs.

    Every ``<li>`` under ``#kenmerken`` with at least two ``<span>`` elements
    contributes one pair: the first span is the lower-cased label, the second
    span is the value.

    Args:
        detail_url: URL of the listing's detail page.

    Returns:
        A dict with the keys ``status`` (lower-cased, ``""`` when absent),
        ``woningtype``, ``bouwjaar``, ``woonoppervlak``, ``perceeloppervlak``,
        ``kamers``, ``slaapkamers`` and ``energielabel``. Apart from
        ``status``, the values are raw page text or ``None``. An empty dict
        is returned when fetching or parsing fails; the failure is logged.
    """
    try:
        page = fetch_soup(detail_url)
        span_groups = (row.select("span") for row in page.select("#kenmerken li"))
        kenmerken: dict[str, str] = {
            cells[0].get_text(strip=True).lower(): cells[1].get_text(strip=True)
            for cells in span_groups
            if len(cells) >= 2
        }
        # The house type can appear under any of three labels; first hit wins.
        woningtype = (
            kenmerken.get("soort object")
            or kenmerken.get("soort woning")
            or kenmerken.get("soort bouw")
        )
        return {
            "status": kenmerken.get("status", "").lower(),
            "woningtype": woningtype,
            "bouwjaar": kenmerken.get("bouwjaar"),
            "woonoppervlak": kenmerken.get("gebruiksoppervlakte"),
            "perceeloppervlak": kenmerken.get("perceeloppervlakte"),
            "kamers": kenmerken.get("aantal kamers"),
            "slaapkamers": kenmerken.get("aantal slaapkamers"),
            "energielabel": kenmerken.get("energielabel"),
        }
    except Exception as exc:
        log.warning("olsthoorn: detail fetch fout %s: %s", detail_url, exc)
        return {}
|
|
|
|
|
|
def _olsthoorn_int(text: str | None) -> int | None:
    """Return the first run of digits in *text* as an int, or None if there is none."""
    match = re.search(r"\d+", text or "")
    return int(match.group()) if match else None


def fetch_olsthoorn() -> list[RawListing]:
    """Fetch Olsthoorn Makelaars listings; only Delft, only koop.

    Each ``a.card-house`` card on the paginated overview is filtered on city
    (Delft) and on ``config.MAX_PRICE`` before the detail page is requested
    through ``_olsthoorn_detail``. Detail-page values take precedence over
    card values. Paging stops on a page without cards, on a page with fewer
    than 15 cards, or on an HTTP 404 for a page after the first.

    Returns:
        The parsed listings. A card that fails to parse is logged and skipped.

    Raises:
        httpx.HTTPStatusError: For any HTTP error status other than a 404 on
            a page after the first.
    """
    listings = []
    page = 1

    while True:
        if page == 1:
            url = f"{_OLSTHOORN_BASE}/wonen?sure_koop_huur=koop"
        else:
            url = f"{_OLSTHOORN_BASE}/wonen/page/{page}/?sure_koop_huur=koop"

        try:
            soup = fetch_soup(url)
        except httpx.HTTPStatusError as exc:
            # A 404 after the first page means we ran past the last page
            # (e.g. the previous page held exactly 15 cards). Keep what was
            # collected instead of failing the whole scrape.
            if page > 1 and exc.response.status_code == 404:
                log.info("olsthoorn: pagina %d bestaat niet, stop", page)
                break
            raise

        cards = soup.select("a.card-house")
        if not cards:
            break

        for card in cards:
            try:
                href = card.get("href", "")
                if not href:
                    continue
                detail_url = href if href.startswith("http") else _OLSTHOORN_BASE + href

                # Filter: only Delft
                stad_el = card.select_one("h2.card__title")
                stad = stad_el.get_text(strip=True) if stad_el else None
                if not stad or stad.lower() != "delft":
                    continue

                # Price from bold tag: filter early, before the detail fetch
                prijs_b = card.select_one("b")
                prijs = parse_prijs(prijs_b.get_text() if prijs_b else None)
                if prijs and prijs > config.MAX_PRICE:
                    continue

                # Status from the first known badge class on the label span
                label_span = card.select_one("span.card-house__label")
                badge_classes = label_span.get("class", []) if label_span else []
                status = next(
                    (
                        _OLSTHOORN_STATUS_MAP[cls]
                        for cls in badge_classes
                        if cls in _OLSTHOORN_STATUS_MAP
                    ),
                    "beschikbaar",
                )

                # Address: first <p> directly under .short--info
                # (internal whitespace collapsed)
                adres_p = card.select("div.short--info > p")
                adres = " ".join(adres_p[0].get_text().split()) if adres_p else None

                # Hero image: data-srcset of the min-width:1024px source
                src_tag = card.select_one('picture source[media="(min-width:1024px)"]')
                hero = src_tag.get("data-srcset") if src_tag else None
                if hero and not hero.startswith("http"):
                    hero = _OLSTHOORN_BASE + hero

                # Woonoppervlak + kamers + energielabel from card data icons
                woonoppervlak_card = None
                kamers_card = None
                energielabel_card = None
                for data_div in card.select("div.data"):
                    inner = data_div.select_one("span.date__inner")
                    if not inner:
                        continue
                    txt = inner.get_text(strip=True)
                    if data_div.select_one("i.icon-sizes"):
                        woonoppervlak_card = parse_m2(txt)
                    elif data_div.select_one("i.icon-door"):
                        kamers_card = _olsthoorn_int(txt)
                    elif data_div.select_one("i.icon-energylabel"):
                        energielabel_card = txt or None

                kk = _olsthoorn_detail(detail_url)

                # Refine status from detail page
                detail_status = _OLSTHOORN_DETAIL_STATUS_MAP.get(kk.get("status", ""), "")
                if detail_status:
                    status = detail_status

                # The kenmerken values are raw page text. Extracting digits
                # avoids dropping the listing when a value carries extra text.
                kamers = _olsthoorn_int(kk.get("kamers"))
                if kamers is None:
                    kamers = kamers_card

                listings.append(RawListing(
                    url=detail_url,
                    source_makelaar="olsthoorn",
                    status=status,
                    adres=adres,
                    postcode=None,  # not exposed by broker
                    stad=stad,
                    prijs=prijs,
                    hero_image_url=hero,
                    woningtype=kk.get("woningtype"),
                    bouwjaar=_olsthoorn_int(kk.get("bouwjaar")),
                    woonoppervlak=parse_m2(kk.get("woonoppervlak")) or woonoppervlak_card,
                    perceeloppervlak=parse_m2(kk.get("perceeloppervlak")),
                    kamers=kamers,
                    slaapkamers=_olsthoorn_int(kk.get("slaapkamers")),
                    energielabel=kk.get("energielabel") or energielabel_card,
                ))
                if config.APP_ENV == "dev":
                    break
            except Exception as e:
                log.warning("olsthoorn: parse fout: %s", e)

        if len(cards) < 15:
            break
        page += 1

    log.info("olsthoorn: %d listings opgehaald", len(listings))
    return listings
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# SCRAPERS — exporteer hier alle actieve SSR adapters
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Registry of active SSR adapters: makelaar key -> zero-argument fetch function.
# Insertion order is kept as-is; add new adapters at the end.
SCRAPERS = {
    "ankebodewes": fetch_ankebodewes,
    "woongoed": fetch_woongoed,
    "dewittegarantiemakelaars": fetch_dewittegarantiemakelaars,
    "wassenaar": fetch_wassenaar,
    "dens": fetch_dens,
    "3dmakelaars": fetch_3dmakelaars,
    "dupont": fetch_dupont,
    "schielandborsboom": fetch_schielandborsboom,
    "vansilfhout": fetch_vansilfhout,
    "vwmakelaars": fetch_vwmakelaars,
    "roepman": fetch_roepman,
    "zomakelaars": fetch_zomakelaars,
    "post": fetch_post,
    "morris": fetch_morris,
    "olsthoorn": fetch_olsthoorn,
}
|