diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..7731eca --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,45 @@ +# Huizenbot + +## Doel + +Periodiek scrapen van makelaarswebsites in Delft en Schiedam, nieuwe woningen opslaan in SQLite, en pushnotificaties sturen via Home Assistant. Draait als één Docker container op homelab met cron. + +Dit draait op het moment al, dus we zijn nu enkel bezig met uitbreidingen en verbeteringen. + + +# AIDE - An IDE for your Agent +This project uses AIDE to support agents; it increases the robustness of edits and reduces token costs. + +You can always use `aide help` for further info, and use it also on each subcommand. If you do, edit what you learned into the agents.md so we don't have to spend tokens on it repeatedly. + + +## Using aide effectively + +**Always start with aide for codebase exploration — not Read or Grep:** +- Use `aide outline ` first to get the function map of any file before reading it +- Use `aide source ` to read individual functions — never Read a whole large file just to find one function +- This is especially important for large files where Read truncates (the former `ssr.py` was 84KB+ before it was split into the `src/adapters/ssr/` package) + +**For edits:** `aide insert` is fragile with large inputs (see the `aide insert` row in the quick reference below) — fall back to the `Edit` tool for anything non-trivial. `aide replace` is fine for small targeted changes. 
+ +## What aide can do (quick reference) + +| Command | What it replaces | +|---------|-----------------| +| `aide outline ` | `Read` whole file for structure; `ls` + loop | +| `aide source ` | `Read` whole file for one function | +| `aide callers ` | `Grep` for call sites | +| `aide search ` | `Grep` across the project | +| `aide replace ` | `Edit` / `sed` for symbol-level changes | +| `aide replace … --lines N-M ` | `Edit` for intra-function line edits | +| `aide remove ` | Manual splice to delete a symbol | +| `aide insert --after ` | Manual splice to add a new symbol — **insert one function at a time**; large messages cause bash to be killed | +| `aide rename ` | Manual find-and-replace of a name | +| `aide log` | Log related to the undo command; see which files changed in which order | +| `aide annotate ` | Persist a non-obvious invariant or gotcha for a symbol | +| `aide context ` | Read the stored annotation before editing | +| `aide review [path]` | Check for annotations invalidated by recent edits | + +Line numbers in `--lines N-M` are **1-based and relative to the symbol's first +line** (line 1 is the signature / opening line of the symbol). This means they +are stable across edits elsewhere in the file. diff --git a/add_scraper_context.md b/add_scraper_context.md index 52b8a90..c7d6e79 100644 --- a/add_scraper_context.md +++ b/add_scraper_context.md @@ -96,8 +96,13 @@ def fetch_bjornd() -> list[RawListing]: - `fetch_json(url, *, params=None, headers=None)` — GET with User-Agent, timeout, Retry-After handling - Built-in logging via `log = logging.getLogger("huizenbot.api")` -#### 2. **SSR/HTML-based** (`src/adapters/ssr.py`) -For brokers with server-side rendered HTML. +#### 2. **SSR/HTML-based** (`src/adapters/ssr/` package) +For brokers with server-side rendered HTML. 
The package is split by CMS platform: +- `realworks.py` — Realworks CMS (li/div.aanbodEntry cards + span.kenmerk detail) +- `sure.py` — SURE WordPress plugin (/wonen?sure_koop_huur=koop + #kenmerken detail) +- `schiedam.py` — Custom Schiedam scrapers (diverse platforms) +- `denhaag.py` — Den Haag scrapers (diverse platforms) +- `overige.py` — Other / multi-city scrapers (OG Online WP, Elementor) **Pattern:** ```python @@ -144,18 +149,22 @@ def fetch_vdaal() -> list[RawListing]: ## Registration -Both `api.py` and `ssr.py` have a `SCRAPERS` dict at the bottom: +**API scrapers** (`src/adapters/api.py`): Add your function and register in the `SCRAPERS` dict at the bottom of the file. + +**SSR scrapers**: Add your function to the appropriate submodule (`realworks.py`, `sure.py`, `schiedam.py`, `denhaag.py`, or `overige.py`), then import it in `src/adapters/ssr/__init__.py` and add it to the `SCRAPERS` dict there. ```python -# api.py +# api.py — SCRAPERS dict SCRAPERS = { 'bjornd': fetch_bjornd, 'your_broker': fetch_your_broker, # ← Add here } -# ssr.py +# ssr/__init__.py — import + register +from .realworks import fetch_your_broker # ← import from the right submodule + SCRAPERS = { - 'bjornd_demo': fetch_bjornd_demo, + ... 'your_broker': fetch_your_broker, # ← Add here } ``` @@ -173,7 +182,7 @@ The human will help you: - Write exploratory curl requests (for APIs) or BeautifulSoup inspections ### 2. Develop & Test Locally -- Add your scraper function to the appropriate file (`api.py` or `ssr.py`) +- Add your scraper function to the appropriate file (`api.py` or the right `ssr/` submodule) - Register it in the `SCRAPERS` dict - The human updates `tests/test_adapters.py` to point to your adapter: ```python @@ -208,6 +217,8 @@ Secrets (API keys, webhook URLs) are **environment variables**, not in config. Before investigating a broker's HTML manually, check for known platforms in this order: ### 1. 
OG Online / realtime-listings (API — fastest) +**File:** `src/adapters/api.py` + Check if `https:///nl/realtime-listings/consumer` returns JSON (with header `X-Requested-With: XMLHttpRequest`). If yes, this is a 10-line addition to `api.py`. Known brokers: bjornd, moerman, vandaal, elzenaar, doen. Fields: `isSales`, `statusOrig`, `salesPrice`, `address`, `zipcode`, `city`, `rooms`, `bedrooms`, `livingSurface`, `plotSurface`, `dateOfConstruction`, `energyLabel`, `type`, `photo`, `url`. @@ -215,6 +226,8 @@ Fields: `isSales`, `statusOrig`, `salesPrice`, `address`, `zipcode`, `city`, `ro Add a `_CITIES` set to filter by city if the broker covers a wide area. Skip statuses `"rented"` and `"rented_ur"`. ### 2. Realworks CMS (SSR — one liner) +**File:** `src/adapters/ssr/realworks.py` + Run `autoscraper.py` or check HTML for `li.aanbodEntry`. If detected: ```python def fetch_mybroker() -> list[RawListing]: @@ -222,6 +235,8 @@ def fetch_mybroker() -> list[RawListing]: ``` ### 3. SURE WordPress Plugin (SSR — ~50 lines) +**File:** `src/adapters/ssr/sure.py` + Check HTML for `sure-` CSS classes or `?sure_koop_huur=koop` filter. Two card variants: - `a.card-house` (single dash) — e.g. Olsthoorn - `a.card--house` (double dash) — e.g. Borgdorff @@ -231,6 +246,8 @@ Both use `?sure_koop_huur=koop` to filter buy listings and `/page/{N}/` paginati Terminate pagination when `len(cards) < expected_per_page` (typically 15 for SURE). ### 4. Unknown CMS +**File:** `src/adapters/ssr/schiedam.py`, `denhaag.py`, or `overige.py` depending on city — or add a new file if needed. + Run the autoscraper tool: ```bash python autoscraper.py listings diff --git a/makelaars.md b/makelaars.md index e2af64f..80deb8e 100644 --- a/makelaars.md +++ b/makelaars.md @@ -2,7 +2,7 @@ ## TODO -- **API scrapers need detail page enrichment**: OG Online API (bjornd, moerman, vandaal, elzenaar, doen, vandriel) sometimes omits fields like `energyLabel`. 
We should fetch the detail page for each listing and merge in missing fields (especially energielabel, bouwjaar). This is already done for SSR scrapers; needs to be added to API-based ones. +- ~~**API scrapers need detail page enrichment**: OG Online API (bjornd, moerman, vandaal, elzenaar, doen, vandriel) sometimes omits fields like `energyLabel`. We should fetch the detail page for each listing and merge in missing fields (especially energielabel, bouwjaar). This is already done for SSR scrapers; needs to be added to API-based ones.~~ ✅ Done — `_og_detail()` added to `api.py` ## Delft diff --git a/src/adapters/api.py b/src/adapters/api.py index b9b36be..ea1c9d2 100644 --- a/src/adapters/api.py +++ b/src/adapters/api.py @@ -7,9 +7,11 @@ Voeg nieuwe toe onderaan en registreer in SCRAPERS. import json import logging +import re import time import httpx +from bs4 import BeautifulSoup import config from huizenbot import RawListing @@ -40,8 +42,71 @@ def fetch_json(url: str, *, params: dict = None, headers: dict = None) -> dict | return r.json() raise RuntimeError(f"Blijvend 429 op {url}") - - + + +def _og_detail(url: str, makelaar: str) -> dict: + """ + Fetch an OG Online detail page and extract missing fields. + + OG Online sites typically expose kenmerken in one of two patterns: + 1. A table/list with dt/dd or label/value span pairs + 2. An energielabel CSS class (energielabel-A, energielabel-B, etc.) + + Returns a dict with any fields found; empty dict on failure. 
+ """ + try: + r = httpx.get( + url, + headers={"User-Agent": config.USER_AGENT}, + timeout=15, + follow_redirects=True, + ) + r.raise_for_status() + soup = BeautifulSoup(r.text, "html.parser") + + # Pattern 1: energielabel CSS class on any element + energielabel = None + for el in soup.select("[class]"): + for cls in el.get("class", []): + if cls.startswith("energielabel-") and cls != "energielabel": + energielabel = cls.replace("energielabel-", "").upper() + break + if energielabel: + break + + # Pattern 2: kenmerken table — try dt/dd pairs first + kv: dict[str, str] = {} + dts = soup.select("dt") + dds = soup.select("dd") + for dt, dd in zip(dts, dds): + kv[dt.get_text(strip=True).lower()] = dd.get_text(strip=True) + + # Pattern 3: ul.objectkenmerken / div.kenmerken span pairs + if not kv: + for li in soup.select("li"): + spans = li.select("span") + if len(spans) >= 2: + kv[spans[0].get_text(strip=True).lower()] = spans[1].get_text(strip=True) + + if not energielabel: + energielabel = ( + kv.get("energielabel") + or kv.get("energieklasse") + or kv.get("energie") + ) or None + + raw_year = kv.get("bouwjaar") or "" + bouwjaar = int(raw_year) if raw_year.isdigit() else None + + return { + "energielabel": energielabel, + "bouwjaar": bouwjaar, + } + except Exception as e: + log.warning("%s: detail fetch fout %s: %s", makelaar, url, e) + return {} + + # --------------------------------------------------------------------------- # Bjornd # --------------------------------------------------------------------------- @@ -56,26 +121,36 @@ _STATUS_MAP = { "sold": "verkocht", "sold_ur": "verkocht", } - - + + def fetch_bjornd() -> list[RawListing]: data = fetch_json( f"{_BJORND_BASE}/nl/realtime-listings/consumer", headers={"X-Requested-With": "XMLHttpRequest"}, ) - + listings = [] for item in data: if not item.get("isSales"): continue if item.get("statusOrig") in _BJORND_SKIP: continue - if item.get('salesPrice')>config.MAX_PRICE: + if item.get("salesPrice", 0) > 
config.MAX_PRICE: continue - - + + detail_url = _BJORND_BASE + item["url"] + raw_year = item.get("dateOfConstruction") or "" + bouwjaar = int(raw_year) if raw_year.isdigit() else None + energielabel = item.get("energyLabel") or None + + # Fetch detail page when API omits key fields + if not energielabel or not bouwjaar: + extra_kk = _og_detail(detail_url, "bjornd") + energielabel = energielabel or extra_kk.get("energielabel") + bouwjaar = bouwjaar or extra_kk.get("bouwjaar") + listings.append(RawListing( - url=_BJORND_BASE + item["url"], + url=detail_url, source_makelaar="bjornd", status=_STATUS_MAP.get(item.get("statusOrig", ""), "beschikbaar"), adres=item.get("address") or None, @@ -87,6 +162,8 @@ def fetch_bjornd() -> list[RawListing]: perceeloppervlak=item.get("plotSurface") or None, kamers=item.get("rooms") or None, slaapkamers=item.get("bedrooms") or None, + bouwjaar=bouwjaar, + energielabel=energielabel, hero_image_url=item.get("photo") or None, extra=json.dumps({ "balcony": item.get("balcony"), @@ -102,10 +179,13 @@ def fetch_bjornd() -> list[RawListing]: "photos": item.get("photos"), }, ensure_ascii=False), )) - + if config.APP_ENV == "dev": + break + log.info("bjornd: %d koopwoningen opgehaald", len(listings)) return listings - + + # --------------------------------------------------------------------------- # Ooms # --------------------------------------------------------------------------- @@ -221,9 +301,15 @@ def fetch_moerman() -> list[RawListing]: raw_year = item.get("dateOfConstruction") or "" bouwjaar = int(raw_year) if raw_year.isdigit() else None + energielabel = item.get("energyLabel") or None + + detail_url = _MOERMAN_BASE + item["url"] + if not energielabel: + extra_kk = _og_detail(detail_url, "moerman") + energielabel = extra_kk.get("energielabel") listings.append(RawListing( - url=_MOERMAN_BASE + item["url"], + url=detail_url, source_makelaar="moerman", status=_MOERMAN_STATUS_MAP.get(item.get("statusOrig", ""), "beschikbaar"), 
adres=item.get("address") or None, @@ -236,9 +322,11 @@ def fetch_moerman() -> list[RawListing]: kamers=item.get("rooms") or None, slaapkamers=item.get("bedrooms") or None, bouwjaar=bouwjaar, - energielabel=item.get("energyLabel") or None, + energielabel=energielabel, hero_image_url=item.get("photo") or None, )) + if config.APP_ENV == "dev": + break log.info("moerman: %d koopwoningen opgehaald", len(listings)) return listings @@ -284,9 +372,15 @@ def fetch_vandaal() -> list[RawListing]: raw_year = item.get("dateOfConstruction") or "" bouwjaar = int(raw_year) if raw_year.isdigit() else None + energielabel = item.get("energyLabel") or None + + detail_url = _VANDAAL_BASE + item["url"] + if not energielabel: + extra_kk = _og_detail(detail_url, "vandaal") + energielabel = extra_kk.get("energielabel") listings.append(RawListing( - url=_VANDAAL_BASE + item["url"], + url=detail_url, source_makelaar="vandaal", status=_VANDAAL_STATUS_MAP.get(item.get("statusOrig", ""), "beschikbaar"), adres=item.get("address") or None, @@ -299,9 +393,11 @@ def fetch_vandaal() -> list[RawListing]: kamers=item.get("rooms") or None, slaapkamers=item.get("bedrooms") or None, bouwjaar=bouwjaar, - energielabel=item.get("energyLabel") or None, + energielabel=energielabel, hero_image_url=item.get("photo") or None, )) + if config.APP_ENV == "dev": + break log.info("vandaal: %d koopwoningen opgehaald", len(listings)) return listings @@ -349,9 +445,15 @@ def fetch_elzenaar() -> list[RawListing]: raw_year = item.get("dateOfConstruction") or "" bouwjaar = int(raw_year) if raw_year.isdigit() else None + energielabel = item.get("energyLabel") or None + + detail_url = _ELZENAAR_BASE + item["url"] + if not energielabel: + extra_kk = _og_detail(detail_url, "elzenaar") + energielabel = extra_kk.get("energielabel") listings.append(RawListing( - url=_ELZENAAR_BASE + item["url"], + url=detail_url, source_makelaar="elzenaar", status=_ELZENAAR_STATUS_MAP.get(item.get("statusOrig", ""), "beschikbaar"), 
adres=item.get("address") or None, @@ -364,9 +466,11 @@ def fetch_elzenaar() -> list[RawListing]: kamers=item.get("rooms") or None, slaapkamers=item.get("bedrooms") or None, bouwjaar=bouwjaar, - energielabel=item.get("energyLabel") or None, + energielabel=energielabel, hero_image_url=item.get("photo") or None, )) + if config.APP_ENV == "dev": + break log.info("elzenaar: %d koopwoningen opgehaald", len(listings)) return listings @@ -413,9 +517,15 @@ def fetch_doen() -> list[RawListing]: raw_year = item.get("dateOfConstruction") or "" bouwjaar = int(raw_year) if raw_year.isdigit() else None + energielabel = item.get("energyLabel") or None + + detail_url = _DOEN_BASE + item["url"] + if not energielabel: + extra_kk = _og_detail(detail_url, "doen") + energielabel = extra_kk.get("energielabel") listings.append(RawListing( - url=_DOEN_BASE + item["url"], + url=detail_url, source_makelaar="doen", status=_DOEN_STATUS_MAP.get(item.get("statusOrig", ""), "beschikbaar"), adres=item.get("address") or None, @@ -428,9 +538,11 @@ def fetch_doen() -> list[RawListing]: kamers=item.get("rooms") or None, slaapkamers=item.get("bedrooms") or None, bouwjaar=bouwjaar, - energielabel=item.get("energyLabel") or None, + energielabel=energielabel, hero_image_url=item.get("photo") or None, )) + if config.APP_ENV == "dev": + break log.info("doen: %d koopwoningen opgehaald", len(listings)) return listings @@ -476,9 +588,15 @@ def fetch_vandriel() -> list[RawListing]: raw_year = item.get("dateOfConstruction") or "" bouwjaar = int(raw_year) if raw_year.isdigit() else None + energielabel = item.get("energyLabel") or None + + detail_url = _VANDRIEL_BASE + item["url"] + if not energielabel: + extra_kk = _og_detail(detail_url, "vandriel") + energielabel = extra_kk.get("energielabel") listings.append(RawListing( - url=_VANDRIEL_BASE + item["url"], + url=detail_url, source_makelaar="vandriel", status=_VANDRIEL_STATUS_MAP.get(item.get("statusOrig", ""), "beschikbaar"), adres=item.get("address") or None, 
@@ -491,9 +609,11 @@ def fetch_vandriel() -> list[RawListing]: kamers=item.get("rooms") or None, slaapkamers=item.get("bedrooms") or None, bouwjaar=bouwjaar, - energielabel=item.get("energyLabel") or None, + energielabel=energielabel, hero_image_url=item.get("photo") or None, )) + if config.APP_ENV == "dev": + break log.info("vandriel: %d koopwoningen opgehaald", len(listings)) return listings diff --git a/src/adapters/ssr.py b/src/adapters/ssr.py deleted file mode 100644 index 529bb7c..0000000 --- a/src/adapters/ssr.py +++ /dev/null @@ -1,2160 +0,0 @@ -""" -adapters/ssr.py — HTML/SSR-based makelaars - -Elke scraper is een functie () -> list[RawListing]. -Voeg nieuwe toe onderaan en registreer in SCRAPERS. -""" - -import logging -import re -import time - -import httpx -from bs4 import BeautifulSoup - -import config -from huizenbot import RawListing - -log = logging.getLogger("huizenbot.ssr") - -# --------------------------------------------------------------------------- -# Gedeelde HTTP helper -# --------------------------------------------------------------------------- - -def fetch_soup(url: str, *, params: dict = None) -> BeautifulSoup: - """ - GET request → BeautifulSoup. Handelt 429 af met Retry-After. - """ - for attempt in range(3): - r = httpx.get( - url, - params=params, - headers={"User-Agent": config.USER_AGENT}, - timeout=15, - follow_redirects=True, - ) - if r.status_code == 429: - wait = int(r.headers.get("Retry-After", 60)) - log.warning("429 op %s, wacht %ds", url, wait) - time.sleep(wait) - continue - r.raise_for_status() - return BeautifulSoup(r.text, "html.parser") - - raise RuntimeError(f"Blijvend 429 op {url}") - - -# --------------------------------------------------------------------------- -# Parse helpers -# --------------------------------------------------------------------------- - -def parse_prijs(text: str | None) -> int | None: - """'€ 325.000 k.k.' 
→ 325000""" - if not text: - return None - digits = re.sub(r"[^\d]", "", text) - return int(digits) if digits else None - - -def parse_m2(text: str | None) -> int | None: - """'87 m²' → 87""" - if not text: - return None - m = re.search(r"(\d+)", text.replace(".", "")) - return int(m.group(1)) if m else None - - -# --------------------------------------------------------------------------- -# Realworks CMS (shared) -# --------------------------------------------------------------------------- - -_REALWORKS_STATUS_MAP = { - "te koop": "beschikbaar", - "nieuw": "beschikbaar", - "onder bod": "onder_bod", - "onder optie": "onder_bod", - "verkocht o.v.": "verkocht", - "verkocht": "verkocht", -} - - -def _realworks_detail(detail_url: str, makelaar: str) -> dict: - """Fetch a Realworks detail page and extract kenmerken. Returns empty dict on failure.""" - try: - soup = fetch_soup(detail_url) - - # Build a label→value map from all .kenmerk spans - kv: dict[str, str] = {} - for kenmerk in soup.select("span.kenmerk"): - label_el = kenmerk.select_one("span.kenmerkName") - value_el = kenmerk.select_one("span.kenmerkValue") - if label_el and value_el: - label = label_el.get_text(strip=True).lower() - value = value_el.get_text(strip=True) - kv[label] = value - - return { - "woningtype": kv.get("type woning"), - "bouwjaar": kv.get("bouwjaar"), - "woonoppervlak": kv.get("woonoppervlakte"), - "perceeloppervlak": kv.get("perceeloppervlakte"), - "kamers": kv.get("aantal kamers"), - "slaapkamers": kv.get("aantal slaapkamers"), - "energielabel": kv.get("energieklasse"), - } - except Exception as e: - log.warning("%s: detail fetch fout %s: %s", makelaar, detail_url, e) - return {} - - -def fetch_realworks(base_url: str, makelaar: str) -> list[RawListing]: - """ - Generic fetcher for Realworks CMS brokers. - Paginates via /pagina-{n}/, fetches detail page per listing. 
- """ - listings_path = f"/aanbod/woningaanbod/-{config.MAX_PRICE}/koop" - listings = [] - page = 1 - - while True: - url = f"{base_url}{listings_path}/pagina-{page}/" - soup = fetch_soup(url) - cards = soup.select("li.aanbodEntry") - if not cards: - break - - for card in cards: - try: - a_tag = card.select_one("a.aanbodEntryLink") - if not a_tag: - continue - listing_url = base_url + a_tag["href"] - - adres = _text(card, ".street-address") - postcode = (_text(card, ".postal-code") or "").replace(" ", "") or None - stad = _text(card, ".locality") - prijs = parse_prijs(_text(card, ".koopprijs .kenmerkValue")) - - status_text = (_text(card, ".objectstatusbanner") or "").lower() - status = _REALWORKS_STATUS_MAP.get(status_text, "beschikbaar") - - img_tag = card.select_one(".hoofdfoto img") - hero = img_tag["src"] if img_tag else None - - kk = _realworks_detail(listing_url, makelaar) - - listings.append(RawListing( - url=listing_url, - source_makelaar=makelaar, - adres=adres, - postcode=postcode, - stad=stad, - prijs=prijs, - status=status, - hero_image_url=hero, - woningtype=kk.get("woningtype"), - bouwjaar=int(kk["bouwjaar"]) if kk.get("bouwjaar") else None, - woonoppervlak=parse_m2(kk.get("woonoppervlak")), - perceeloppervlak=parse_m2(kk.get("perceeloppervlak")), - kamers=int(kk["kamers"]) if kk.get("kamers") else None, - slaapkamers=int(kk["slaapkamers"]) if kk.get("slaapkamers") else None, - energielabel=kk.get("energielabel"), - )) - if config.APP_ENV == "dev": - break - except Exception as e: - log.warning("%s: parse fout: %s", makelaar, e) - - if len(cards) < 10: - break - page += 1 - - log.info("%s: %d listings opgehaald", makelaar, len(listings)) - return listings - - -# --------------------------------------------------------------------------- -# Anke Bodewes Makelaardij -# --------------------------------------------------------------------------- - -def fetch_ankebodewes() -> list[RawListing]: - return fetch_realworks("https://www.ankebodewes.nl", 
"ankebodewes") - - -# --------------------------------------------------------------------------- -# Woongoed Makelaars Schiedam -# --------------------------------------------------------------------------- - -def fetch_woongoed() -> list[RawListing]: - return fetch_realworks("https://www.woongoedmakelaars.nl", "woongoed") - - -# --------------------------------------------------------------------------- -# De Witte Garantiemakelaars -# --------------------------------------------------------------------------- - -_DEWITTE_BASE = "https://dewittegarantiemakelaars.nl" - -_DEWITTE_PILL_MAP = { - "bg-fun-green": "beschikbaar", - "bg-sold": "verkocht", -} - -_DEWITTE_TYPE_MAP = { - "Apartment": "appartement", - "House": "woning", - "SingleFamilyResidence": "woning", - "Residence": "woning", -} - - -def _dewitte_jsonld(detail_url: str) -> dict: - """Fetch detail page and return parsed JSON-LD dict, or {} on failure.""" - import json - try: - soup = fetch_soup(detail_url) - tag = soup.select_one('script[type="application/ld+json"]') - if not tag: - log.warning("dewitte: geen JSON-LD op %s", detail_url) - return {} - return json.loads(tag.string) - except Exception as e: - log.warning("dewitte: JSON-LD fout %s: %s", detail_url, e) - return {} - - -def fetch_dewittegarantiemakelaars() -> list[RawListing]: - listings = [] - page = 1 - - while True: - url = ( - f"{_DEWITTE_BASE}/woningaanbod" - f"?buy_rent=buy&buy_price=1-{config.MAX_PRICE}&page={page}" - ) - soup = fetch_soup(url) - cards = soup.select("div.card.card--property") - if not cards: - break - - for card in cards: - try: - a_tag = card.select_one("a.card__anchor") - if not a_tag: - continue - detail_url = a_tag["href"] - if not detail_url.startswith("http"): - detail_url = _DEWITTE_BASE + detail_url - - pill = card.select_one("span.pill") - pill_classes = pill.get("class", []) if pill else [] - status_key = next( - (c for c in pill_classes if c.startswith("bg-")), None - ) - status = 
_DEWITTE_PILL_MAP.get(status_key, "onder_bod") - - ld = _dewitte_jsonld(detail_url) - if not ld: - continue - - offered = ld.get("itemOffered", {}) - address = offered.get("address", {}) - floor_size = offered.get("floorSize", {}) - - postcode = address.get("postalCode", "").replace(" ", "") or None - stad = address.get("addressLocality") or None - adres = address.get("streetAddress") or None - - prijs = ld.get("price") - if prijs and int(prijs) > config.MAX_PRICE: - continue - - woningtype = _DEWITTE_TYPE_MAP.get(offered.get("@type", "")) - woonoppervlak = int(floor_size["value"]) if floor_size.get("value") else None - kamers = offered.get("numberOfRooms") - bouwjaar = offered.get("yearBuilt") - - # Full-res image from JSON-LD, fall back to card thumbnail - hero = ld.get("image") - if not hero: - img = card.select_one("picture img") - hero = img["src"] if img else None - - listings.append(RawListing( - url=detail_url, - source_makelaar="dewittegarantiemakelaars", - status=status, - adres=adres, - postcode=postcode, - stad=stad, - prijs=int(prijs) if prijs else None, - woningtype=woningtype, - woonoppervlak=woonoppervlak, - kamers=int(kamers) if kamers else None, - bouwjaar=int(bouwjaar) if bouwjaar else None, - hero_image_url=hero, - )) - if config.APP_ENV == "dev": - break - - except Exception as e: - log.warning("dewitte: parse fout: %s", e) - - if len(cards) < 10: - break - page += 1 - - log.info("dewittegarantiemakelaars: %d listings opgehaald", len(listings)) - return listings - - -# --------------------------------------------------------------------------- -# Makelaardij Wassenaar (Schiedam) -# --------------------------------------------------------------------------- -# Realworks CMS. Listings page has JSON-LD (Residence) with url/address/price/photo. -# Detail pages have span.kenmerk with Wassenaar-specific label names. 
- -_WASSENAAR_BASE = "https://www.makelaardijwassenaar.nl" - -_WASSENAAR_STATUS_MAP = { - "te koop": "beschikbaar", - "nieuw": "beschikbaar", - "onder bod": "onder_bod", - "onder optie": "onder_bod", - "verkocht o.v.": "onder_bod", - "verkocht onder voorbehoud": "onder_bod", - "verkocht": "verkocht", -} - - -def _wassenaar_detail(detail_url: str) -> dict: - """Fetch Realworks detail page; extract kenmerken with Wassenaar-specific labels.""" - try: - soup = fetch_soup(detail_url) - kv: dict[str, str] = {} - for kenmerk in soup.select("span.kenmerk"): - label_el = kenmerk.select_one("span.kenmerkName") - value_el = kenmerk.select_one("span.kenmerkValue") - if label_el and value_el: - kv[label_el.get_text(strip=True).lower()] = value_el.get_text(strip=True) - return { - "woningtype": kv.get("soort object"), - "bouwjaar": kv.get("bouwjaar"), - "woonoppervlak": kv.get("woonoppervlakte"), - "perceeloppervlak": kv.get("perceeloppervlakte"), - "kamers": kv.get("aantal kamers"), - "slaapkamers": kv.get("aantal slaapkamers"), - "energielabel": kv.get("energieklasse"), - } - except Exception as e: - log.warning("wassenaar: detail fetch fout %s: %s", detail_url, e) - return {} - - -def fetch_wassenaar() -> list[RawListing]: - import json as _json - soup = fetch_soup(f"{_WASSENAAR_BASE}/aanbod/woningaanbod/-{config.MAX_PRICE}/koop/") - - # First pass: collect status + thumbnail per relative url - # Each listing has two a.aanbodEntryLink with the same href; - # the first has the status banner + photo, the second has address + price. 
- status_by_url: dict[str, str] = {} - photo_by_url: dict[str, str] = {} - for a in soup.select("a.aanbodEntryLink[href]"): - href = a["href"] - if href in status_by_url: - continue - banner = a.select_one(".objectstatusbanner") - status_text = banner.get_text(strip=True).lower() if banner else "" - status_by_url[href] = _WASSENAAR_STATUS_MAP.get(status_text, "beschikbaar") - img = a.select_one("span.hoofdfoto img") - if img: - src = img.get("src", "") - if "geenfotobeschikbaar" not in src: - photo_by_url[href] = src - - # Second pass: parse JSON-LD blocks (one per listing) - seen: set[str] = set() - listings = [] - for tag in soup.select('script[type="application/ld+json"]'): - try: - ld = _json.loads(tag.string) - if ld.get("@type") != "Residence": - continue - rel_url = ld.get("url", "") - if not rel_url or rel_url in seen: - continue - seen.add(rel_url) - - detail_url = _WASSENAAR_BASE + rel_url - address = ld.get("address", {}) - postcode = address.get("postalCode", "").replace(" ", "") or None - - price_spec = next( - (a.get("priceSpecification", {}) for a in ld.get("potentialAction", []) - if a.get("priceSpecification")), - {} - ) - prijs = int(price_spec["price"]) if price_spec.get("price") else None - if prijs and prijs > config.MAX_PRICE: - continue - - hero = ld.get("photo") or photo_by_url.get(rel_url) - status = status_by_url.get(rel_url, "beschikbaar") - kk = _wassenaar_detail(detail_url) - - listings.append(RawListing( - url=detail_url, - source_makelaar="wassenaar", - status=status, - adres=address.get("streetAddress") or None, - postcode=postcode, - stad=address.get("addressLocality") or None, - prijs=prijs, - hero_image_url=hero, - woningtype=kk.get("woningtype"), - bouwjaar=int(kk["bouwjaar"]) if kk.get("bouwjaar") else None, - woonoppervlak=parse_m2(kk.get("woonoppervlak")), - perceeloppervlak=parse_m2(kk.get("perceeloppervlak")), - kamers=int(kk["kamers"]) if kk.get("kamers") else None, - slaapkamers=int(kk["slaapkamers"]) if 
kk.get("slaapkamers") else None, - energielabel=kk.get("energielabel"), - )) - if config.APP_ENV == "dev": - break - except Exception as e: - log.warning("wassenaar: parse fout: %s", e) - - log.info("wassenaar: %d listings opgehaald", len(listings)) - return listings - -# --------------------------------------------------------------------------- -# SSR helper utils -# --------------------------------------------------------------------------- - -def _text(soup, selector: str) -> str | None: - el = soup.select_one(selector) - return el.get_text(strip=True) if el else None - - -def _src(soup, selector: str) -> str | None: - el = soup.select_one(selector) - if el is None: - return None - return el.get("src") or el.get("data-src") - - -def _extract_postcode(text: str | None) -> str | None: - if not text: - return None - m = re.search(r"\b(\d{4}\s?[A-Z]{2})\b", text) - return m.group(1).replace(" ", "") if m else None - - -def _infer_stad(postcode: str | None) -> str | None: - """Simpele mapping op basis van postcode range — uitbreiden naar wens.""" - if not postcode: - return None - code = int(postcode[:4]) - if 2600 <= code <= 2629: - return "Delft" - if 3100 <= code <= 3135: - return "Schiedam" - return None - - -# --------------------------------------------------------------------------- -# D&S Makelaars (Schiedam) -# --------------------------------------------------------------------------- - -_DS_BASE = "https://www.densmakelaars.nl" - -_DS_STATUS_MAP = { - "onder bod": "onder_bod", - "te koop": "beschikbaar", - "nieuw": "beschikbaar", - "beschikbaar": "beschikbaar", - "verkocht": "verkocht", -} - - -def _ds_detail(detail_url: str, html_text: str = None) -> dict: - """Fetch D&S detail page and extract all kenmerken from
/
pairs and postcode from maps URL.""" - try: - # If html_text not provided, fetch it - if html_text is None: - import httpx - r = httpx.get( - detail_url, - headers={"User-Agent": config.USER_AGENT}, - timeout=15, - follow_redirects=True, - ) - html_text = r.text - - soup = BeautifulSoup(html_text, "html.parser") - - # Parse
/
pairs into a label → value map - kv: dict[str, str] = {} - dts = soup.select("dt") - dds = soup.select("dd") - - for dt, dd in zip(dts, dds): - label = dt.get_text(strip=True).lower() - value = dd.get_text(strip=True) - kv[label] = value - - # Extract postcode from Google Maps URL in iframe src - # Pattern: q=...POSTCODE...,CITY where POSTCODE is 4 digits + 2 letters - postcode = None - m = re.search(r'q=.+?,(\d{4})\s+([A-Z]{2}),', html_text) - if m: - postcode = f"{m.group(1)}{m.group(2)}" - - # Extract specific fields - result = { - "status": kv.get("status", "beschikbaar").lower(), - "woningtype": kv.get("soort woning"), - "bouwjaar": kv.get("bouwjaar"), - "woonoppervlak": kv.get("woonoppervlakte"), - "kamers": kv.get("aantal kamers"), - "slaapkamers": kv.get("aantal slaapkamers"), - "energielabel": kv.get("energielabel"), - "postcode": postcode, - } - return result - except Exception as e: - log.warning("dens: detail fetch fout %s: %s", detail_url, e) - return {} - - -def fetch_dens() -> list[RawListing]: - """Fetch D&S Makelaars listings with full detail pages.""" - listings = [] - page = 1 - - while True: - url = f"{_DS_BASE}/aanbod/koopwoningen?page={page}" - soup = fetch_soup(url) - cards = soup.select(".col-12.col-md-4.object-wrapper") - if not cards: - break - - for card in cards: - try: - # Extract URL - a_tag = card.select_one("a.property") - if not a_tag or "href" not in a_tag.attrs: - continue - detail_url = a_tag["href"] - if not detail_url.startswith("http"): - detail_url = _DS_BASE + detail_url - - # Extract listing page data - status_label = _text(card, "span.label") or "beschikbaar" - status_label = status_label.strip().lower() - status = _DS_STATUS_MAP.get(status_label, "beschikbaar") - - adres = _text(card, "h3") - stad = _text(card, "h4") - prijs_text = _text(card, "div.price") - prijs = parse_prijs(prijs_text) - - # Extract area and rooms from footer - footer_spans = card.select("div.footer span") - woonoppervlak = None - kamers = None - for 
span in footer_spans: - text = span.get_text(strip=True) - if "m²" in text: - woonoppervlak = parse_m2(text) - elif "kamers" in text.lower(): - m = re.search(r"(\d+)", text) - if m: - kamers = int(m.group(1)) - - # Extract hero image - img_tag = card.select_one("img") - hero = img_tag["src"] if img_tag else None - - # Fetch and parse detail page - detail_data = _ds_detail(detail_url) - - # Use postcode from detail data (extracted from Google Maps URL) - postcode = detail_data.get("postcode") - - # Determine status from detail page if available - if detail_data.get("status"): - status = _DS_STATUS_MAP.get(detail_data["status"], status) - - # Build listing - listings.append(RawListing( - url=detail_url, - source_makelaar="dens", - adres=adres, - postcode=postcode, - stad=stad or _infer_stad(postcode), - prijs=prijs, - status=status, - hero_image_url=hero, - woningtype=detail_data.get("woningtype"), - bouwjaar=int(detail_data["bouwjaar"]) if detail_data.get("bouwjaar") else None, - woonoppervlak=parse_m2(detail_data.get("woonoppervlak")) or woonoppervlak, - kamers=int(detail_data["kamers"]) if detail_data.get("kamers") else kamers, - slaapkamers=int(detail_data["slaapkamers"]) if detail_data.get("slaapkamers") else None, - energielabel=detail_data.get("energielabel"), - )) - if config.APP_ENV == "dev": - break - except Exception as e: - log.warning("dens: parse fout: %s", e) - - if len(cards) < 10: - break - page += 1 - - log.info("dens: %d listings opgehaald", len(listings)) - return listings - - -# --------------------------------------------------------------------------- -# 3D Makelaars (Schiedam/Vlaardingen) -# --------------------------------------------------------------------------- - -_3D_BASE = "https://3dmakelaars.nl" - - -def _3dmakelaars_detail(detail_url: str) -> dict: - """Fetch 3dmakelaars detail page and extract structured info block.""" - try: - soup = fetch_soup(detail_url) - - # Parse structured info block: span (label) + p (value) pairs - kv: 
dict[str, str] = {} - for li in soup.select("div.tl-adiltional-inforamtion ul.tl-adiltional-listed li"): - label_el = li.select_one("span") - value_el = li.select_one("p") - if label_el and value_el: - label = label_el.get_text(strip=True).lower() - value = value_el.get_text(strip=True) - kv[label] = value - - # Extract postcode from first description paragraph - postcode = None - p_tag = soup.select_one(".omschrijving > p:nth-child(1)") - if p_tag: - text = p_tag.get_text() - postcode = _extract_postcode(text) - - return { - "kamers": int(kv["aantal kamers"].split()[0]) if "aantal kamers" in kv else None, - "slaapkamers": int(kv["aantal slaapkamers"].split()[0]) if "aantal slaapkamers" in kv else None, - "bouwjaar": int(kv["bouwjaar"]) if "bouwjaar" in kv else None, - "woningtype": kv.get("bouwvorm"), - "woonoppervlak": parse_m2(kv.get("oppervlakte")), - "postcode": postcode, - } - except Exception as e: - log.warning("3dmakelaars: detail fetch fout %s: %s", detail_url, e) - return {} - - -def fetch_3dmakelaars() -> list[RawListing]: - """Fetch 3D Makelaars listings with pagination.""" - listings = [] - page = 1 - - while True: - url = ( - f"{_3D_BASE}/woningen-te-koop-in-schiedam-en-vlaardingen" - f"?kamers=&oppervlakte=&woonplaats=&video=&prijs=3&page={page}" - ) - soup = fetch_soup(url) - cards = soup.select("div.tl-properties-item") - if not cards: - break - - for card in cards: - try: - # Extract detail URL from onclick attribute - onclick = card.get("onclick", "") - detail_url = None - if "window.location" in onclick: - m = re.search(r"window\.location\s*=\s*['\"]([^'\"]+)['\"]", onclick) - if m: - detail_url = _3D_BASE + m.group(1) - - if not detail_url: - continue - - # Extract listing-level info - adres = _text(card, "h3.price") - prijs_text = _text(card, "span.address") - prijs = parse_prijs(prijs_text) - - # Extract rooms and area from meta list - kamers = None - woonoppervlak = None - for li in card.select("ul.tl-meta-listed > li"): - text = 
li.get_text(strip=True) - if "kamers" in text.lower(): - m = re.search(r"(\d+)", text) - if m: - kamers = int(m.group(1)) - elif "m²" in text or "m2" in text: - woonoppervlak = parse_m2(text) - - # Extract image - img_tag = card.select_one("img") - hero = img_tag["src"] if img_tag else None - if hero and not hero.startswith("http"): - hero = _3D_BASE + hero - - # Fetch detail page for full info - detail_data = _3dmakelaars_detail(detail_url) - - # Postcode from detail page, fallback to extraction from address - postcode = detail_data.get("postcode") - if not postcode and adres: - postcode = _extract_postcode(adres) - - listings.append(RawListing( - url=detail_url, - source_makelaar="3dmakelaars", - adres=adres, - postcode=postcode, - stad=_infer_stad(postcode), - prijs=prijs, - woningtype=detail_data.get("woningtype"), - bouwjaar=detail_data.get("bouwjaar"), - woonoppervlak=woonoppervlak or detail_data.get("woonoppervlak"), - kamers=kamers or detail_data.get("kamers"), - slaapkamers=detail_data.get("slaapkamers"), - hero_image_url=hero, - )) - if config.APP_ENV == "dev": - break - except Exception as e: - log.warning("3dmakelaars: parse fout: %s", e) - - if len(cards) < 7: - break - page += 1 - - log.info("3dmakelaars: %d listings opgehaald", len(listings)) - return listings - - -# --------------------------------------------------------------------------- -# Dupont ERA Makelaars (Schiedam/Rotterdam) -# --------------------------------------------------------------------------- - -_DUPONT_BASE = "https://www.dupont.nl" - -_DUPONT_STATUS_MAP = { - "te koop": "beschikbaar", - "nieuw": "beschikbaar", - "onder bod": "onder_bod", - "verkocht onder voorbehoud": "onder_bod", - "verkocht": "verkocht", -} - - -def _dupont_detail(detail_url: str) -> dict: - """Fetch Dupont detail page and extract kenmerken from dt/dd pairs.""" - try: - soup = fetch_soup(detail_url) - - # Parse dt/dd pairs into label → value map - kv: dict[str, str] = {} - dts = soup.select("dt") - dds = 
soup.select("dd") - - for dt, dd in zip(dts, dds): - label = dt.get_text(strip=True).lower() - value = dd.get_text(strip=True) - kv[label] = value - - # Extract postcode from small tag (format: "NNNN AA CITY") - postcode = None - small_tag = soup.select_one("section div.container-fluid small") - if small_tag: - postcode = _extract_postcode(small_tag.get_text()) - - return { - "postcode": postcode, - "woningtype": kv.get("soort woning"), - "bouwjaar": kv.get("bouwjaar"), - "woonoppervlak": kv.get("woonoppervlakte"), - "kamers": kv.get("aantal kamers"), - "slaapkamers": kv.get("aantal slaapkamers"), - "energielabel": kv.get("energielabel"), - } - except Exception as e: - log.warning("dupont: detail fetch fout %s: %s", detail_url, e) - return {} - - -def fetch_dupont() -> list[RawListing]: - """Fetch Dupont ERA Makelaars listings with pagination and detail pages.""" - listings = [] - page = 1 - - while True: - url = f"{_DUPONT_BASE}/aanbod/koopwoningen?page={page}" - soup = fetch_soup(url) - cards = soup.select("article.object") - if not cards: - break - - for card in cards: - try: - # Extract URL - a_tag = card.select_one("a[href]") - if not a_tag or "href" not in a_tag.attrs: - continue - detail_url = a_tag["href"] - if not detail_url.startswith("http"): - detail_url = _DUPONT_BASE + detail_url - - # Extract listing-level data - adres = _text(card, "h3") - stad = _text(card, "h4") - prijs_text = _text(card, "div.price") - prijs = parse_prijs(prijs_text) - - # Extract status from label - status_label = _text(card, "div.label") or "beschikbaar" - status_label = status_label.strip().lower() - status = _DUPONT_STATUS_MAP.get(status_label, "beschikbaar") - - # Extract image - img_tag = card.select_one("img.img-responsive") - hero = img_tag["src"] if img_tag else None - if hero and not hero.startswith("http"): - hero = _DUPONT_BASE + hero - - # Fetch detail page for full data - detail_data = _dupont_detail(detail_url) - - # Use postcode from detail if available - postcode 
= detail_data.get("postcode") - - listings.append(RawListing( - url=detail_url, - source_makelaar="dupont", - adres=adres, - postcode=postcode, - stad=stad or _infer_stad(postcode), - prijs=prijs, - status=status, - hero_image_url=hero, - woningtype=detail_data.get("woningtype"), - bouwjaar=int(detail_data["bouwjaar"]) if detail_data.get("bouwjaar") else None, - woonoppervlak=parse_m2(detail_data.get("woonoppervlak")), - kamers=int(detail_data["kamers"]) if detail_data.get("kamers") else None, - slaapkamers=int(detail_data["slaapkamers"]) if detail_data.get("slaapkamers") else None, - energielabel=detail_data.get("energielabel"), - )) - if config.APP_ENV == "dev": - break - - except Exception as e: - log.warning("dupont: parse fout: %s", e) - - if len(cards) < 10: - break - page += 1 - - log.info("dupont: %d listings opgehaald", len(listings)) - return listings - - -# --------------------------------------------------------------------------- -# Schieland Borsboom NVM Makelaars (Rotterdam, actief in Schiedam) -# --------------------------------------------------------------------------- - -_SCHIELAND_BASE = "https://www.schielandborsboom.nl" - -_SCHIELAND_STATUS_MAP = { - "beschikbaar": "beschikbaar", - "onder bod": "onder_bod", - "onder optie": "onder_bod", - "verkocht o.v.": "verkocht", - "verkocht": "verkocht", -} - - -def _schieland_detail(detail_url: str) -> dict: - """Fetch Schieland Borsboom detail page and extract kenmerken.""" - try: - soup = fetch_soup(detail_url) - - # Postcode from house__status p (e.g. "3117 DP Schiedam") - postcode_el = soup.select_one("div.house__status p") - postcode = _extract_postcode(postcode_el.get_text()) if postcode_el else None - - # Parse #kenmerken section:
  • labelvalue
  • - kv: dict[str, str] = {} - kenmerken = soup.select_one("#kenmerken") - if kenmerken: - for li in kenmerken.select("li"): - label_el = li.select_one("strong") - value_el = li.select_one("span") - if label_el and value_el: - # Strip nested links (e.g. "Hypotheek berekenen") - for a in value_el.select("a"): - a.decompose() - kv[label_el.get_text(strip=True).lower()] = value_el.get_text(strip=True) - - return { - "postcode": postcode, - "status": kv.get("status", "").lower(), - "woningtype": kv.get("soort bouw"), - "bouwjaar": kv.get("bouwjaar"), - "woonoppervlak": kv.get("woonoppervlakte"), - "perceeloppervlak": kv.get("perceeloppervlakte"), - "kamers": kv.get("aantal kamers"), - "slaapkamers": kv.get("aantal slaapkamers"), - "energielabel": kv.get("energielabel"), - } - except Exception as e: - log.warning("schielandborsboom: detail fetch fout %s: %s", detail_url, e) - return {} - - -def fetch_schielandborsboom() -> list[RawListing]: - """Fetch Schieland Borsboom NVM listings (koop only, Schiedam).""" - listings = [] - page = 1 - - while True: - if page == 1: - url = f"{_SCHIELAND_BASE}/wonen?sure_koop_huur=koop" - else: - url = f"{_SCHIELAND_BASE}/wonen/page/{page}/?sure_koop_huur=koop" - - soup = fetch_soup(url) - cards = soup.select("div.card.card--house") - if not cards: - break - - for card in cards: - try: - a_tag = card.select_one("a.card__anchor") - if not a_tag or "href" not in a_tag.attrs: - continue - detail_url = a_tag["href"] - if not detail_url.startswith("http"): - detail_url = _SCHIELAND_BASE + detail_url - - # Filter: only Schiedam - stad_el = card.select_one("p.house-place") - stad = stad_el.get_text(strip=True) if stad_el else None - if not stad or stad.lower() != "schiedam": - continue - - # Status from card-house__thumb second class - thumb = card.select_one("div.card-house__thumb") - status_classes = thumb.get("class", []) if thumb else [] - status_text = next( - (c for c in status_classes if c != "card-house__thumb"), "beschikbaar" - 
).lower() - status = _SCHIELAND_STATUS_MAP.get(status_text, "beschikbaar") - - # Price - prijs = parse_prijs(_text(card, "p.price")) - if prijs and prijs > config.MAX_PRICE: - continue - - adres = _text(card, "h4.house-street") - - # Hero image from picture source (medium size) - src_tag = card.select_one('picture source[media="(min-width:100px)"]') - hero = src_tag["srcset"] if src_tag else _src(card, "img") - if hero and not hero.startswith("http"): - hero = _SCHIELAND_BASE + hero - - # Data icons on card: surface, bedrooms, energy label - woonoppervlak_card = None - slaapkamers_card = None - energielabel_card = None - for data_div in card.select("div.data"): - txt = data_div.get_text(strip=True) - if data_div.select_one("i.icon-surface"): - woonoppervlak_card = parse_m2(txt) - elif data_div.select_one("i.icon-bedrooms"): - m = re.search(r"(\d+)", txt) - slaapkamers_card = int(m.group(1)) if m else None - elif data_div.select_one("i.icon-label"): - energielabel_card = txt.strip() or None - - # Fetch detail page for full kenmerken - kk = _schieland_detail(detail_url) - - # Refine status from detail page - if kk.get("status"): - status = _SCHIELAND_STATUS_MAP.get(kk["status"], status) - - # Parse kamers: "5 kamers" → 5 - kamers = None - if kk.get("kamers"): - m = re.search(r"(\d+)", kk["kamers"]) - kamers = int(m.group(1)) if m else None - - # Parse slaapkamers: "3" or "3 slaapkamers" → 3 - slaapkamers = slaapkamers_card - if kk.get("slaapkamers"): - m = re.search(r"(\d+)", kk["slaapkamers"]) - slaapkamers = int(m.group(1)) if m else slaapkamers_card - - listings.append(RawListing( - url=detail_url, - source_makelaar="schielandborsboom", - status=status, - adres=adres, - postcode=kk.get("postcode"), - stad=stad, - prijs=prijs, - hero_image_url=hero, - woningtype=kk.get("woningtype"), - bouwjaar=int(kk["bouwjaar"]) if kk.get("bouwjaar") else None, - woonoppervlak=parse_m2(kk.get("woonoppervlak")) or woonoppervlak_card, - 
perceeloppervlak=parse_m2(kk.get("perceeloppervlak")), - kamers=kamers, - slaapkamers=slaapkamers, - energielabel=kk.get("energielabel") or energielabel_card, - )) - if config.APP_ENV == "dev": - break - except Exception as e: - log.warning("schielandborsboom: parse fout: %s", e) - - if len(cards) < 18: - break - page += 1 - - log.info("schielandborsboom: %d listings opgehaald", len(listings)) - return listings - - -# --------------------------------------------------------------------------- -# Van Silfhout & Hogetoorn Wereldmakelaars (Delft) -# --------------------------------------------------------------------------- - -_VANSILFHOUT_BASE = "https://www.vansilfhout.nl" - -_VANSILFHOUT_STATUS_MAP = { - "te koop": "beschikbaar", - "onder bod": "onder_bod", - "verkocht onder voorbehoud": "verkocht", - "verkocht": "verkocht", -} - - -def _vansilfhout_detail(detail_url: str) -> dict: - """Fetch Van Silfhout detail page; extract postcode from JS and specs from shortSpecs.""" - try: - import re as _re - r = __import__("httpx").get( - detail_url, - headers={"User-Agent": config.USER_AGENT}, - timeout=15, - follow_redirects=True, - ) - r.raise_for_status() - html = r.text - from bs4 import BeautifulSoup as _BS - soup = _BS(html, "html.parser") - - # Postcode embedded in JS: objectZipcode': '2624NP' - m = _re.search(r"objectZipcode':\s*'([^']+)'", html) - postcode = m.group(1) if m else None - - # shortSpecs:
  • Label:Value
  • - kv: dict[str, str] = {} - for li in soup.select(".shortSpecs li"): - spans = li.select("span") - if len(spans) >= 2: - label = spans[0].get_text(strip=True).rstrip(":").lower() - value = spans[-1].get_text(strip=True) - kv[label] = value - - return { - "postcode": postcode, - "bouwjaar": kv.get("bouwjaar"), - "woonoppervlak": kv.get("oppervlakte"), - "kamers": kv.get("kamers"), - "slaapkamers": kv.get("slaapkamers"), - } - except Exception as e: - log.warning("vansilfhout: detail fetch fout %s: %s", detail_url, e) - return {} - - -def fetch_vansilfhout() -> list[RawListing]: - """Fetch Van Silfhout woningaanbod (alle listings op één pagina).""" - soup = fetch_soup(f"{_VANSILFHOUT_BASE}/woningaanbod/") - listings = [] - - for card in soup.select("article.row"): - try: - a_tag = card.select_one("a.objectcontainerimg") - if not a_tag or "href" not in a_tag.attrs: - continue - detail_url = a_tag["href"] - if not detail_url.startswith("http"): - detail_url = _VANSILFHOUT_BASE + detail_url - - # Status - status_text = (_text(card, "span.objectstatus") or "").lower() - status = _VANSILFHOUT_STATUS_MAP.get(status_text, "beschikbaar") - - # Address and city - adres = _text(card, "h2.objecttitle") - city_el = card.select("a.straatnaamwoonplaats span") - stad = city_el[-1].get_text(strip=True) if city_el else None - - # Price from shortSpecs strong - prijs = parse_prijs(_text(card, "ul.shortSpecs li strong")) - if prijs and prijs > config.MAX_PRICE: - continue - - # Area and rooms from shortSpecs - woonoppervlak_card = None - kamers_card = None - for li in card.select("ul.shortSpecs li"): - spans = li.select("span") - if len(spans) >= 2: - label = spans[0].get_text(strip=True).lower() - val = spans[-1].get_text(strip=True) - if "oppervlakt" in label: - woonoppervlak_card = parse_m2(val) - elif "kamer" in label: - m = re.search(r"(\d+)", val) - kamers_card = int(m.group(1)) if m else None - - # Hero image: prefer data-lazy-src, fall back to noscript img src - img_tag = 
card.select_one("a.objectcontainerimg img") - hero = None - if img_tag: - hero = (img_tag.get("data-lazy-src") - or img_tag.get("src") or None) - if hero and hero.startswith("data:"): - noscript = card.select_one("noscript img") - hero = noscript["src"] if noscript else None - - kk = _vansilfhout_detail(detail_url) - - # Parse kamers/slaapkamers from detail - kamers = kamers_card - if kk.get("kamers"): - m = re.search(r"(\d+)", kk["kamers"]) - kamers = int(m.group(1)) if m else kamers_card - - slaapkamers = None - if kk.get("slaapkamers"): - m = re.search(r"(\d+)", kk["slaapkamers"]) - slaapkamers = int(m.group(1)) if m else None - - listings.append(RawListing( - url=detail_url, - source_makelaar="vansilfhout", - status=status, - adres=adres, - postcode=kk.get("postcode"), - stad=stad, - prijs=prijs, - hero_image_url=hero, - bouwjaar=int(kk["bouwjaar"]) if kk.get("bouwjaar") else None, - woonoppervlak=parse_m2(kk.get("woonoppervlak")) or woonoppervlak_card, - kamers=kamers, - slaapkamers=slaapkamers, - )) - if config.APP_ENV == "dev": - break - except Exception as e: - log.warning("vansilfhout: parse fout: %s", e) - - log.info("vansilfhout: %d listings opgehaald", len(listings)) - return listings - - -# --------------------------------------------------------------------------- -# V&W Makelaars Delft / ZO Makelaars (Delft) — Realworks CMS -# --------------------------------------------------------------------------- - -def fetch_vwmakelaars() -> list[RawListing]: - return fetch_realworks("https://www.vwmakelaars.nl", "vwmakelaars") - - -def fetch_zomakelaars() -> list[RawListing]: - return fetch_realworks("https://www.zomakelaars.nl", "zomakelaars") - - -# --------------------------------------------------------------------------- -# Roepman Makelaardij NVM (Delft) -# --------------------------------------------------------------------------- -# Realworks CMS maar met div.aanbodEntry i.p.v. li.aanbodEntry. -# Prijs zit in JSON-LD (zelfde structuur als Wassenaar). 
- -_ROEPMAN_BASE = "https://www.roepman.nl" - - -def fetch_roepman() -> list[RawListing]: - import json as _json - listings_path = f"/aanbod/woningaanbod/-{config.MAX_PRICE}/koop" - listings = [] - page = 1 - - while True: - url = f"{_ROEPMAN_BASE}{listings_path}/pagina-{page}/" - soup = fetch_soup(url) - cards = soup.select("div.aanbodEntry") - if not cards: - break - - # Collect status + photo per relative url - status_by_url: dict[str, str] = {} - photo_by_url: dict[str, str] = {} - for card in cards: - a_tag = card.select_one("a.aanbodEntryLink[href]") - if not a_tag: - continue - href = a_tag["href"] - if href in status_by_url: - continue - banner = card.select_one(".objectstatusbanner") - status_text = banner.get_text(strip=True).lower() if banner else "" - status_by_url[href] = _REALWORKS_STATUS_MAP.get(status_text, "beschikbaar") - img = card.select_one("img") - if img: - src = img.get("src", "") - if "geenfotobeschikbaar" not in src: - photo_by_url[href] = src - - # Parse JSON-LD Residence blocks (one per listing) - seen: set[str] = set() - for tag in soup.select('script[type="application/ld+json"]'): - try: - ld = _json.loads(tag.string) - if ld.get("@type") != "Residence": - continue - rel_url = ld.get("url", "") - if not rel_url or rel_url in seen: - continue - seen.add(rel_url) - - detail_url = _ROEPMAN_BASE + rel_url - address = ld.get("address", {}) - postcode = address.get("postalCode", "").replace(" ", "") or None - - price_spec = next( - (a.get("priceSpecification", {}) for a in ld.get("potentialAction", []) - if a.get("priceSpecification")), - {} - ) - prijs = int(price_spec["price"]) if price_spec.get("price") else None - if prijs and prijs > config.MAX_PRICE: - continue - - hero = ld.get("photo") or photo_by_url.get(rel_url) - status = status_by_url.get(rel_url, "beschikbaar") - kk = _realworks_detail(detail_url, "roepman") - - listings.append(RawListing( - url=detail_url, - source_makelaar="roepman", - status=status, - 
adres=address.get("streetAddress") or None, - postcode=postcode, - stad=address.get("addressLocality") or None, - prijs=prijs, - hero_image_url=hero, - woningtype=kk.get("woningtype"), - bouwjaar=int(kk["bouwjaar"]) if kk.get("bouwjaar") else None, - woonoppervlak=parse_m2(kk.get("woonoppervlak")), - perceeloppervlak=parse_m2(kk.get("perceeloppervlak")), - kamers=int(kk["kamers"]) if kk.get("kamers") else None, - slaapkamers=int(kk["slaapkamers"]) if kk.get("slaapkamers") else None, - energielabel=kk.get("energielabel"), - )) - if config.APP_ENV == "dev": - break - except Exception as e: - log.warning("roepman: parse fout: %s", e) - - if len(cards) < 10: - break - page += 1 - - log.info("roepman: %d listings opgehaald", len(listings)) - return listings - - -# --------------------------------------------------------------------------- -# Post Makelaardij (v/h Bayense) — Delft & omgeving -# --------------------------------------------------------------------------- -# Custom Tailwind CSS site; covers Delft, Pijnacker, Rijswijk etc. -# Filter for Delft only. 
- -_POST_BASE = "https://www.postmakelaardij.nl" - -_POST_STATUS_MAP = { - "te koop": "beschikbaar", - "onder bod": "onder_bod", - "verkocht": "verkocht", -} - - -def _post_detail(detail_url: str) -> dict: - """Fetch Post Makelaardij detail page and extract kenmerken.""" - try: - soup = fetch_soup(detail_url) - - # Energielabel from CSS class: energielabel-{letter} - energielabel = None - for el in soup.select('[class]'): - for cls in el.get('class', []): - if cls.startswith('energielabel-') and cls != 'energielabel': - energielabel = cls.replace('energielabel-', '').upper() - break - if energielabel: - break - - # Woonoppervlak, perceeloppervlak, slaapkamers from icon spans - woonoppervlak = None - perceeloppervlak = None - slaapkamers = None - for span in soup.select('span.object-info-icon-text'): - txt = span.get_text(strip=True) - if 'slaapkamer' in txt: - m = re.search(r'(\d+)', txt) - slaapkamers = int(m.group(1)) if m else None - elif 'perceel' in txt: - perceeloppervlak = parse_m2(txt) - elif 'm²' in txt or 'm2' in txt: - woonoppervlak = parse_m2(txt) - - return { - "woonoppervlak": woonoppervlak, - "perceeloppervlak": perceeloppervlak, - "slaapkamers": slaapkamers, - "energielabel": energielabel, - } - except Exception as e: - log.warning("post: detail fetch fout %s: %s", detail_url, e) - return {} - - -def fetch_post() -> list[RawListing]: - """Fetch Post Makelaardij listings; only Delft, only koop.""" - listings = [] - page = 1 - - while True: - url = f"{_POST_BASE}/woningaanbod/koop?page={page}" - soup = fetch_soup(url) - cards = soup.select("article") - if not cards: - break - - for card in cards: - try: - # URL — first link in image slider - a_tag = card.select_one("a[href]") - if not a_tag: - continue - href = a_tag["href"] - detail_url = href if href.startswith("http") else _POST_BASE + href - - # Postcode + city from span.custom-postcode-text - pc_el = card.select_one("span.custom-postcode-text") - if not pc_el: - continue - pc_parts = 
pc_el.get_text(strip=True).split() - if len(pc_parts) < 3: - continue - postcode = pc_parts[0] + pc_parts[1] # "2613BD" - stad = " ".join(pc_parts[2:]) # "Delft" - - # Filter: only Delft - if stad.lower() != "delft": - continue - - # Price — filter early - prijs = parse_prijs(_text(card, "span.price-block")) - if prijs and prijs > config.MAX_PRICE: - continue - - # Status from span.status text - status_text = (_text(card, "span.status") or "").lower() - status = _POST_STATUS_MAP.get(status_text, "beschikbaar") - - # Address - adres = _text(card, "h4.custom-address-text") - - # Hero: first img in article - img = card.select_one("img") - hero = img["src"] if img else None - - kk = _post_detail(detail_url) - - listings.append(RawListing( - url=detail_url, - source_makelaar="post", - status=status, - adres=adres, - postcode=postcode, - stad=stad, - prijs=prijs, - hero_image_url=hero, - woonoppervlak=kk.get("woonoppervlak"), - perceeloppervlak=kk.get("perceeloppervlak"), - slaapkamers=kk.get("slaapkamers"), - energielabel=kk.get("energielabel"), - )) - if config.APP_ENV == "dev": - break - except Exception as e: - log.warning("post: parse fout: %s", e) - - if len(cards) < 12: - break - page += 1 - - log.info("post: %d listings opgehaald", len(listings)) - return listings - - -# --------------------------------------------------------------------------- -# Morris NVM Makelaars (Delft) — Realworks CMS -# --------------------------------------------------------------------------- - -def fetch_morris() -> list[RawListing]: - return fetch_realworks("https://www.morrismakelaardij.nl", "morris") - - -# --------------------------------------------------------------------------- -# Olsthoorn Makelaars Delft (SURE WordPress plugin) -# --------------------------------------------------------------------------- -# Covers Delft, Den Haag, Naaldwijk etc — we filter for Delft only. -# Detail page has no postcode; leave as None. 
- -_OLSTHOORN_BASE = "https://www.olsthoornmakelaars.nl" - -_OLSTHOORN_STATUS_MAP = { - "badge-available": "beschikbaar", - "badge-bid": "onder_bod", - "badge-option": "onder_bod", - "badge-sold": "verkocht", -} - -_OLSTHOORN_DETAIL_STATUS_MAP = { - "beschikbaar": "beschikbaar", - "onder bod": "onder_bod", - "onder optie": "onder_bod", - "verkocht": "verkocht", -} - - -def _olsthoorn_detail(detail_url: str) -> dict: - """Fetch Olsthoorn detail page; extract kenmerken from #kenmerken li pairs.""" - try: - soup = fetch_soup(detail_url) - kv: dict[str, str] = {} - for li in soup.select("#kenmerken li"): - spans = li.select("span") - if len(spans) >= 2: - label = spans[0].get_text(strip=True).lower() - value = spans[1].get_text(strip=True) - kv[label] = value - return { - "status": kv.get("status", "").lower(), - "woningtype": kv.get("soort object") or kv.get("soort woning") or kv.get("soort bouw"), - "bouwjaar": kv.get("bouwjaar"), - "woonoppervlak": kv.get("gebruiksoppervlakte"), - "perceeloppervlak": kv.get("perceeloppervlakte"), - "kamers": kv.get("aantal kamers"), - "slaapkamers": kv.get("aantal slaapkamers"), - "energielabel": kv.get("energielabel"), - } - except Exception as e: - log.warning("olsthoorn: detail fetch fout %s: %s", detail_url, e) - return {} - - -def fetch_olsthoorn() -> list[RawListing]: - """Fetch Olsthoorn Makelaars listings; only Delft, only koop.""" - listings = [] - page = 1 - - while True: - if page == 1: - url = f"{_OLSTHOORN_BASE}/wonen?sure_koop_huur=koop" - else: - url = f"{_OLSTHOORN_BASE}/wonen/page/{page}/?sure_koop_huur=koop" - - soup = fetch_soup(url) - cards = soup.select("a.card-house") - if not cards: - break - - for card in cards: - try: - href = card.get("href", "") - if not href: - continue - detail_url = href if href.startswith("http") else _OLSTHOORN_BASE + href - - # Filter: only Delft - stad_el = card.select_one("h2.card__title") - stad = stad_el.get_text(strip=True) if stad_el else None - if not stad or stad.lower() != 
"delft": - continue - - # Price from bold tag — filter early before detail fetch - prijs_b = card.select_one("b") - prijs = parse_prijs(prijs_b.get_text() if prijs_b else None) - if prijs and prijs > config.MAX_PRICE: - continue - - # Status from badge class on label span - label_span = card.select_one("span.card-house__label") - status = "beschikbaar" - if label_span: - for cls in label_span.get("class", []): - if cls in _OLSTHOORN_STATUS_MAP: - status = _OLSTHOORN_STATUS_MAP[cls] - break - - # Address: second

    under .short--info (collapse internal whitespace) - adres_p = card.select("div.short--info > p") - if adres_p: - adres = " ".join(adres_p[0].get_text().split()) - else: - adres = None - - # Hero image: largest source srcset - src_tag = card.select_one('picture source[media="(min-width:1024px)"]') - hero = src_tag.get("data-srcset") if src_tag else None - if hero and not hero.startswith("http"): - hero = _OLSTHOORN_BASE + hero - - # Woonoppervlak + kamers + energielabel from card data icons - woonoppervlak_card = None - kamers_card = None - energielabel_card = None - for data_div in card.select("div.data"): - inner = data_div.select_one("span.date__inner") - if not inner: - continue - txt = inner.get_text(strip=True) - if data_div.select_one("i.icon-sizes"): - woonoppervlak_card = parse_m2(txt) - elif data_div.select_one("i.icon-door"): - m = re.search(r"(\d+)", txt) - kamers_card = int(m.group(1)) if m else None - elif data_div.select_one("i.icon-energylabel"): - energielabel_card = txt or None - - kk = _olsthoorn_detail(detail_url) - - # Refine status from detail page - detail_status = _OLSTHOORN_DETAIL_STATUS_MAP.get(kk.get("status", ""), "") - if detail_status: - status = detail_status - - listings.append(RawListing( - url=detail_url, - source_makelaar="olsthoorn", - status=status, - adres=adres, - postcode=None, # not exposed by broker - stad=stad, - prijs=prijs, - hero_image_url=hero, - woningtype=kk.get("woningtype"), - bouwjaar=int(kk["bouwjaar"]) if kk.get("bouwjaar") else None, - woonoppervlak=parse_m2(kk.get("woonoppervlak")) or woonoppervlak_card, - perceeloppervlak=parse_m2(kk.get("perceeloppervlak")), - kamers=int(kk["kamers"]) if kk.get("kamers") else kamers_card, - slaapkamers=int(kk["slaapkamers"]) if kk.get("slaapkamers") else None, - energielabel=kk.get("energielabel") or energielabel_card, - )) - if config.APP_ENV == "dev": - break - except Exception as e: - log.warning("olsthoorn: parse fout: %s", e) - - if len(cards) < 15: - break - page += 
1 - - log.info("olsthoorn: %d listings opgehaald", len(listings)) - return listings - - -# --------------------------------------------------------------------------- -# 88 Makelaars (Den Haag) — Custom WordPress theme -# --------------------------------------------------------------------------- -# Cards on /ons-aanbod/page/{N}/; details in div.listing_detail kv pairs. - -_88_BASE = "https://88makelaars.nl" - -_88_STATUS_MAP = { - "te koop": "beschikbaar", - "beschikbaar": "beschikbaar", - "onder bod": "onder_bod", - "onder optie": "onder_bod", - "verkocht onder voorbehoud": "verkocht", - "verkocht": "verkocht", -} - - -def _88makelaars_detail(detail_url: str) -> dict: - """Fetch 88makelaars detail page; extract kenmerken from div.listing_detail kv pairs.""" - try: - soup = fetch_soup(detail_url) - kv: dict[str, str] = {} - for div in soup.select("div.listing_detail"): - txt = div.get_text(strip=True) - if ":" in txt: - label, _, value = txt.partition(":") - kv[label.strip().lower()] = value.strip() - raw_pc = kv.get("postcode") or "" - pc_match = re.search(r"\d{4}\s*[A-Z]{2}", raw_pc.upper()) - postcode = pc_match.group(0).replace(" ", "") if pc_match else None - return { - "postcode": postcode, - "slaapkamers": kv.get("slaapkamers"), - "woonoppervlak": kv.get("woning grootte"), - "energielabel": kv.get("energieklasse"), - "woningtype": kv.get("soort woning"), - } - except Exception as e: - log.warning("88makelaars: detail fetch fout %s: %s", detail_url, e) - return {} - - -def fetch_88makelaars() -> list[RawListing]: - """Fetch 88 Makelaars listings (Den Haag only).""" - listings = [] - page = 1 - - while True: - if page == 1: - url = f"{_88_BASE}/ons-aanbod/" - else: - url = f"{_88_BASE}/ons-aanbod/page/{page}/" - soup = fetch_soup(url) - cards = soup.select("div.property_listing") - if not cards: - break - - for card in cards: - try: - # URL from carousel - a_tag = card.select_one(".property_unit_carousel a[href]") - if not a_tag: - continue - detail_url = 
a_tag["href"] - if not detail_url.startswith("http"): - detail_url = _88_BASE + detail_url - - # City — last link in property_location_image - loc_links = card.select(".property_location_image a") - stad = loc_links[-1].get_text(strip=True) if loc_links else None - if not stad or stad.lower() != "den haag": - continue - - # Price - prijs = parse_prijs(_text(card, ".listing_unit_price_wrapper")) - if prijs and prijs > config.MAX_PRICE: - continue - - # Status - status_text = (_text(card, ".ribbon-inside") or "").lower() - status = _88_STATUS_MAP.get(status_text, "beschikbaar") - - # Address - adres = _text(card, "h4 a") or _text(card, "h4") - - # Surface + rooms - woonoppervlak_card = parse_m2(_text(card, "span.infosize")) - kamers_card = None - rooms_txt = _text(card, "span.inforoom") - if rooms_txt: - m = re.search(r"(\d+)", rooms_txt) - kamers_card = int(m.group(1)) if m else None - - # Hero: first active carousel image - img = card.select_one(".item.active img") - hero = img.get("src") or img.get("data-original") if img else None - - kk = _88makelaars_detail(detail_url) - - listings.append(RawListing( - url=detail_url, - source_makelaar="88makelaars", - status=status, - adres=adres, - postcode=kk.get("postcode"), - stad="Den Haag", - prijs=prijs, - hero_image_url=hero, - woningtype=kk.get("woningtype"), - woonoppervlak=parse_m2(kk.get("woonoppervlak")) or woonoppervlak_card, - kamers=kamers_card, - slaapkamers=int(kk["slaapkamers"]) if kk.get("slaapkamers") else None, - energielabel=kk.get("energielabel"), - )) - if config.APP_ENV == "dev": - break - except Exception as e: - log.warning("88makelaars: parse fout: %s", e) - - if len(cards) < 10: - break - page += 1 - - log.info("88makelaars: %d listings opgehaald", len(listings)) - return listings - - -# --------------------------------------------------------------------------- -# Borgdorff Makelaars (Den Haag / Westland) — SURE WordPress plugin -# 
--------------------------------------------------------------------------- -# Covers Den Haag ('s-gravenhage), Monster, Naaldwijk etc. Filter for Den Haag. -# Same SURE plugin as Schieland Borsboom but uses a.card--house (double dash). -# No postcode on detail page. - -_BORGDORFF_BASE = "https://www.borgdorff.nl" -_BORGDORFF_DEN_HAAG = {"'s-gravenhage", "den haag"} - -_BORGDORFF_BADGE_MAP = { - "badge--info": "beschikbaar", - "badge--warning": "onder_bod", - "badge--danger": "verkocht", -} - - -def _borgdorff_detail(detail_url: str) -> dict: - """Fetch Borgdorff detail page; extract #kenmerken li span pairs.""" - try: - soup = fetch_soup(detail_url) - kv: dict[str, str] = {} - for li in soup.select("#kenmerken li"): - spans = li.select("span") - if len(spans) >= 2: - label = spans[0].get_text(strip=True).lower() - value = spans[1].get_text(strip=True) - kv[label] = value - return { - "status": kv.get("status", "").lower(), - "woningtype": kv.get("soort woonhuis") or kv.get("soort woning") or kv.get("soort bouw"), - "bouwjaar": kv.get("bouwjaar"), - "woonoppervlak": kv.get("gebruiksoppervlakte wonen") or kv.get("gebruiksoppervlakte"), - "perceeloppervlak": kv.get("perceeloppervlakte"), - "slaapkamers": kv.get("aantal slaapkamers"), - "energielabel": kv.get("energielabel"), - } - except Exception as e: - log.warning("borgdorff: detail fetch fout %s: %s", detail_url, e) - return {} - - -def fetch_borgdorff() -> list[RawListing]: - """Fetch Borgdorff listings; only Den Haag / 's-gravenhage, only koop.""" - listings = [] - page = 1 - - while True: - if page == 1: - url = f"{_BORGDORFF_BASE}/wonen?sure_koop_huur=koop" - else: - url = f"{_BORGDORFF_BASE}/wonen/page/{page}/?sure_koop_huur=koop" - - soup = fetch_soup(url) - cards = soup.select("a.card--house") - if not cards: - break - - for card in cards: - try: - href = card.get("href", "") - if not href: - continue - detail_url = href if href.startswith("http") else _BORGDORFF_BASE + href - - # Filter: only Den Haag - 
stad_el = card.select_one("p.lead-two") - stad = stad_el.get_text(strip=True) if stad_el else None - if not stad or stad.lower() not in _BORGDORFF_DEN_HAAG: - continue - - # Price — filter early - prijs = parse_prijs(_text(card, "p.strong")) - if prijs and prijs > config.MAX_PRICE: - continue - - # Status from badge class - label_span = card.select_one("span.card-house__label") - status = "beschikbaar" - if label_span: - for cls in label_span.get("class", []): - if cls in _BORGDORFF_BADGE_MAP: - status = _BORGDORFF_BADGE_MAP[cls] - break - - # Address - adres = _text(card, "h4") - - # Hero: largest source srcset - src_tag = card.select_one('picture source[media="(min-width:1280px)"]') - hero = src_tag.get("srcset") if src_tag else None - if not hero: - img = card.select_one("img[data-src]") - hero = img.get("data-src") if img else None - if hero and not hero.startswith("http"): - hero = _BORGDORFF_BASE + hero - - # Surface + bedrooms from data icons - woonoppervlak_card = None - slaapkamers_card = None - for data_div in card.select("div.data"): - inner = data_div.select_one("p.small") - if not inner: - continue - txt = inner.get_text(strip=True) - if data_div.select_one("i.icon-surface"): - woonoppervlak_card = parse_m2(txt) - elif data_div.select_one("i.icon-bed"): - m = re.search(r"(\d+)", txt) - slaapkamers_card = int(m.group(1)) if m else None - - kk = _borgdorff_detail(detail_url) - - # Refine status from detail page - detail_status_map = { - "beschikbaar": "beschikbaar", - "onder bod": "onder_bod", - "onder optie": "onder_bod", - "verkocht": "verkocht", - } - if kk.get("status"): - status = detail_status_map.get(kk["status"], status) - - listings.append(RawListing( - url=detail_url, - source_makelaar="borgdorff", - status=status, - adres=adres, - postcode=None, # not exposed by broker - stad=stad, - prijs=prijs, - hero_image_url=hero, - woningtype=kk.get("woningtype"), - bouwjaar=int(kk["bouwjaar"]) if kk.get("bouwjaar") else None, - 
woonoppervlak=parse_m2(kk.get("woonoppervlak")) or woonoppervlak_card, - perceeloppervlak=parse_m2(kk.get("perceeloppervlak")), - slaapkamers=int(kk["slaapkamers"]) if kk.get("slaapkamers") else slaapkamers_card, - energielabel=kk.get("energielabel"), - )) - if config.APP_ENV == "dev": - break - except Exception as e: - log.warning("borgdorff: parse fout: %s", e) - - if len(cards) < 15: - break - page += 1 - - log.info("borgdorff: %d listings opgehaald", len(listings)) - return listings - - -# --------------------------------------------------------------------------- -# Van Herk Makelaars (Schiedam) — SURE WordPress plugin (card-house) -# --------------------------------------------------------------------------- -# Listings filtered by city + price in URL; pagination via /page/{N}/. -# Detail page: div.features ul.unstyled li with two (label + value). - -_VANHERK_BASE = "https://www.vanherk.nl" -_VANHERK_LISTINGS = "https://www.vanherk.nl/wonen/aanbod/zoeken/schiedam/200000-300000/" - -_VANHERK_STATUS_MAP = { - "beschikbaar": "beschikbaar", - "onder bod": "onder_bod", - "onder optie": "onder_bod", - "verkocht": "verkocht", -} - - -def _vanherk_detail(detail_url: str) -> dict: - """Fetch Van Herk detail page; extract kenmerken from div.features.""" - try: - soup = fetch_soup(detail_url) - kv: dict[str, str] = {} - for li in soup.select("div.features ul.unstyled li"): - spans = li.select("span") - if len(spans) >= 2: - label = spans[0].get_text(strip=True).lower() - value = spans[1].get_text(strip=True) - kv[label] = value - return { - "status": kv.get("status", "").lower(), - "bouwjaar": kv.get("bouwjaar"), - "woonoppervlak": kv.get("woonoppervlakte"), - "slaapkamers": kv.get("aantal slaapkamers"), - } - except Exception as e: - log.warning("vanherk: detail fetch fout %s: %s", detail_url, e) - return {} - - -def fetch_vanherk() -> list[RawListing]: - """Fetch Van Herk listings; only Schiedam, only koop.""" - listings = [] - page = 1 - - while True: - if page == 1: 
- url = _VANHERK_LISTINGS - else: - url = _VANHERK_LISTINGS + f"page/{page}/" - - soup = fetch_soup(url) - cards = soup.select("a.card-house") - if not cards: - break - - for card in cards: - try: - href = card.get("href", "") - if not href: - continue - detail_url = href if href.startswith("http") else _VANHERK_BASE + href - - # City from lead paragraph - lead = card.select_one("p.lead") - stad = lead.get_text(strip=True) if lead else None - - # Address from h4 (normalize whitespace incl.  ) - h4 = card.select_one("h4") - adres = " ".join(h4.get_text().split()) if h4 else None - - # Price from .subtitle - subtitle = card.select_one("p.subtitle") - prijs = parse_prijs(subtitle.get_text() if subtitle else None) - if prijs and prijs > config.MAX_PRICE: - continue - - # Hero image: largest srcset source - src_tag = card.select_one('picture source[media="(min-width:1280px)"]') - hero = src_tag.get("srcset") if src_tag else None - if hero and not hero.startswith("http"): - hero = _VANHERK_BASE + hero - - # Card data icons: surface, bedrooms, energy label - woonoppervlak_card = None - slaapkamers_card = None - energielabel_card = None - for data_div in card.select("div.data"): - classes = data_div.get("class") or [] - if "d-none" in classes: - continue - if "data-energie" in classes: - inner = data_div.select_one(".date__inner") - energielabel_card = inner.get_text(strip=True) if inner else None - elif data_div.select_one("i.icon-surface"): - inner = data_div.select_one("span.date__inner") - woonoppervlak_card = parse_m2(inner.get_text(strip=True) if inner else None) - elif data_div.select_one("i.icon-bed"): - inner = data_div.select_one("span.date__inner") - txt = inner.get_text(strip=True) if inner else None - m = re.search(r"(\d+)", txt) if txt else None - slaapkamers_card = int(m.group(1)) if m else None - - kk = _vanherk_detail(detail_url) - - status = _VANHERK_STATUS_MAP.get(kk.get("status", ""), "beschikbaar") - - listings.append(RawListing( - url=detail_url, - 
source_makelaar="vanherk", - status=status, - adres=adres, - stad=stad, - prijs=prijs, - hero_image_url=hero, - bouwjaar=int(kk["bouwjaar"]) if kk.get("bouwjaar", "").isdigit() else None, - woonoppervlak=parse_m2(kk.get("woonoppervlak")) or woonoppervlak_card, - slaapkamers=(int(kk["slaapkamers"]) if kk.get("slaapkamers", "").isdigit() else None) or slaapkamers_card, - energielabel=energielabel_card, - )) - if config.APP_ENV == "dev": - break - except Exception as e: - log.warning("vanherk: parse fout: %s", e) - - if len(cards) < 15: - break - page += 1 - - log.info("vanherk: %d listings opgehaald", len(listings)) - return listings - - -# --------------------------------------------------------------------------- -# Van Oord Makelaardij (Delft + Schiedam) — Elementor WordPress -# --------------------------------------------------------------------------- -# Two filtered listing URLs (one per city). Cards are div.e-loop-item. -# Detail page: ul.rw-object-features-list li with label/value spans. 
- -_VANOORD_BASE = "https://www.vanoordmakelaardij.nl" -_VANOORD_LISTINGS = [ - "https://www.vanoordmakelaardij.nl/aanbod/?view=list&plaats=delft&prijs_vanaf=225000&prijs_tot=300000", - "https://www.vanoordmakelaardij.nl/aanbod/?view=list&plaats=schiedam&prijs_vanaf=225000&prijs_tot=300000", -] - -_VANOORD_STATUS_MAP = { - "beschikbaar": "beschikbaar", - "onder bod": "onder_bod", - "onder optie": "onder_bod", - "verkocht": "verkocht", -} - - -def _vanoord_detail(detail_url: str) -> dict: - """Fetch Van Oord detail page; extract kenmerken from rw-object-features-list.""" - try: - soup = fetch_soup(detail_url) - kv: dict[str, str] = {} - for li in soup.select("ul.rw-object-features-list li"): - label_el = li.select_one("span.rw-object-list-label") - value_el = li.select_one("span.rw-object-list-value") - if label_el and value_el: - label = label_el.get_text(strip=True).lower() - value = value_el.get_text(strip=True) - kv[label] = value - return { - "status": kv.get("status", "").lower(), - "bouwjaar": kv.get("bouwjaar"), - "woonoppervlak": kv.get("woonoppervlakte"), - "kamers": kv.get("aantal kamers"), - "slaapkamers": kv.get("slaapkamers"), - "energielabel": kv.get("energieklasse"), - } - except Exception as e: - log.warning("vanoord: detail fetch fout %s: %s", detail_url, e) - return {} - - -def fetch_vanoord() -> list[RawListing]: - """Fetch Van Oord listings; Delft and Schiedam, only koop.""" - seen: set[str] = set() - listings = [] - - for listing_url in _VANOORD_LISTINGS: - soup = fetch_soup(listing_url) - cards = soup.select("div.e-loop-item") - - for card in cards: - try: - # Detail URL from h3 > a - a_tag = card.select_one("h3.elementor-heading-title a[href]") - if not a_tag: - continue - detail_url = a_tag["href"] - if not detail_url.startswith("http"): - detail_url = _VANOORD_BASE + detail_url - if detail_url in seen: - continue - seen.add(detail_url) - - # Status from rw-status-label widget class - status_el = 
card.select_one("[class*='rw-status-label--']") - status = "beschikbaar" - if status_el: - status_text = status_el.get_text(strip=True).lower() - status = _VANOORD_STATUS_MAP.get(status_text, "beschikbaar") - - # City from h4 - h4 = card.select_one("h4.elementor-heading-title") - stad = h4.get_text(strip=True) if h4 else None - - # Address from h3 > a text - adres = " ".join(a_tag.get_text().split()) - - # Price from h3 without child - prijs = None - for h3 in card.select("h3.elementor-heading-title"): - if not h3.select_one("a"): - prijs = parse_prijs(h3.get_text()) - break - if prijs and prijs > config.MAX_PRICE: - continue - - # Card icon list: [0]=surface [1]=rooms [2]=energy - icon_items = card.select("ul.elementor-icon-list-items li span.elementor-icon-list-text") - woonoppervlak_card = parse_m2(icon_items[0].get_text()) if len(icon_items) > 0 else None - kamers_card = None - if len(icon_items) > 1: - m = re.search(r"(\d+)", icon_items[1].get_text()) - kamers_card = int(m.group(1)) if m else None - energielabel_card = icon_items[2].get_text(strip=True) if len(icon_items) > 2 else None - - kk = _vanoord_detail(detail_url) - - detail_status = _VANOORD_STATUS_MAP.get(kk.get("status", ""), "") - if detail_status: - status = detail_status - - listings.append(RawListing( - url=detail_url, - source_makelaar="vanoord", - status=status, - adres=adres, - stad=stad, - prijs=prijs, - bouwjaar=int(kk["bouwjaar"]) if kk.get("bouwjaar", "").isdigit() else None, - woonoppervlak=parse_m2(kk.get("woonoppervlak")) or woonoppervlak_card, - kamers=(int(kk["kamers"]) if kk.get("kamers", "").isdigit() else None) or kamers_card, - slaapkamers=int(kk["slaapkamers"]) if kk.get("slaapkamers", "").isdigit() else None, - energielabel=kk.get("energielabel") or energielabel_card, - )) - if config.APP_ENV == "dev": - break - except Exception as e: - log.warning("vanoord: parse fout: %s", e) - - log.info("vanoord: %d listings opgehaald", len(listings)) - return listings - - -# 
--------------------------------------------------------------------------- -# SCRAPERS — exporteer hier alle actieve SSR adapters -# --------------------------------------------------------------------------- - -SCRAPERS = { - 'ankebodewes': fetch_ankebodewes, - 'woongoed': fetch_woongoed, - 'dewittegarantiemakelaars': fetch_dewittegarantiemakelaars, - 'wassenaar': fetch_wassenaar, - 'dens': fetch_dens, - '3dmakelaars': fetch_3dmakelaars, - 'dupont': fetch_dupont, - 'schielandborsboom': fetch_schielandborsboom, - 'vansilfhout': fetch_vansilfhout, - 'vwmakelaars': fetch_vwmakelaars, - 'roepman': fetch_roepman, - 'zomakelaars': fetch_zomakelaars, - 'post': fetch_post, - 'morris': fetch_morris, - 'olsthoorn': fetch_olsthoorn, - '88makelaars': fetch_88makelaars, - 'borgdorff': fetch_borgdorff, - 'vanherk': fetch_vanherk, - 'vanoord': fetch_vanoord, -} diff --git a/src/adapters/ssr/__init__.py b/src/adapters/ssr/__init__.py new file mode 100644 index 0000000..6f6b717 --- /dev/null +++ b/src/adapters/ssr/__init__.py @@ -0,0 +1,63 @@ +""" +adapters/ssr — HTML/SSR-based makelaars + +Elke scraper is een functie () -> list[RawListing]. +Om een nieuwe makelaar toe te voegen: + 1. Voeg een fetch_* functie toe in het juiste submodule + (realworks.py, sure.py, schiedam.py, denhaag.py, overige.py) + 2. Importeer de functie hier en registreer in SCRAPERS. 
+ +CMS-typen per module: + realworks.py — Realworks CMS (li/div.aanbodEntry + span.kenmerk detail) + sure.py — SURE WordPress plugin (/wonen?sure_koop_huur=koop + #kenmerken) + schiedam.py — Custom Schiedam scrapers (diverse platforms) + denhaag.py — Den Haag scrapers (diverse platforms) + overige.py — Overige / multi-stad (OG Online WP, Elementor) +""" + +from .realworks import ( + fetch_ankebodewes, + fetch_woongoed, + fetch_vwmakelaars, + fetch_zomakelaars, + fetch_morris, + fetch_wassenaar, + fetch_roepman, + fetch_post, +) +from .sure import ( + fetch_schielandborsboom, + fetch_olsthoorn, + fetch_vanherk, + fetch_borgdorff, +) +from .schiedam import ( + fetch_dewittegarantiemakelaars, + fetch_dens, + fetch_3dmakelaars, + fetch_dupont, +) +from .denhaag import fetch_88makelaars +from .overige import fetch_vansilfhout, fetch_vanoord + +SCRAPERS = { + 'ankebodewes': fetch_ankebodewes, + 'woongoed': fetch_woongoed, + 'dewittegarantiemakelaars': fetch_dewittegarantiemakelaars, + 'wassenaar': fetch_wassenaar, + 'dens': fetch_dens, + '3dmakelaars': fetch_3dmakelaars, + 'dupont': fetch_dupont, + 'schielandborsboom': fetch_schielandborsboom, + 'vansilfhout': fetch_vansilfhout, + 'vwmakelaars': fetch_vwmakelaars, + 'roepman': fetch_roepman, + 'zomakelaars': fetch_zomakelaars, + 'post': fetch_post, + 'morris': fetch_morris, + 'olsthoorn': fetch_olsthoorn, + '88makelaars': fetch_88makelaars, + 'borgdorff': fetch_borgdorff, + 'vanherk': fetch_vanherk, + 'vanoord': fetch_vanoord, +} diff --git a/src/adapters/ssr/_shared.py b/src/adapters/ssr/_shared.py new file mode 100644 index 0000000..85a2486 --- /dev/null +++ b/src/adapters/ssr/_shared.py @@ -0,0 +1,79 @@ +"""Shared utilities for all SSR scrapers.""" +import logging +import re +import time + +import httpx +from bs4 import BeautifulSoup + +import config + +log = logging.getLogger("huizenbot.ssr") + + +def fetch_soup(url: str, *, params: dict = None) -> BeautifulSoup: + """GET request → BeautifulSoup. 
Handelt 429 af met Retry-After.""" + for attempt in range(3): + r = httpx.get( + url, + params=params, + headers={"User-Agent": config.USER_AGENT}, + timeout=15, + follow_redirects=True, + ) + if r.status_code == 429: + wait = int(r.headers.get("Retry-After", 60)) + log.warning("429 op %s, wacht %ds", url, wait) + time.sleep(wait) + continue + r.raise_for_status() + return BeautifulSoup(r.text, "html.parser") + + raise RuntimeError(f"Blijvend 429 op {url}") + + +def parse_prijs(text: str | None) -> int | None: + """'€ 325.000 k.k.' → 325000""" + if not text: + return None + digits = re.sub(r"[^\d]", "", text) + return int(digits) if digits else None + + +def parse_m2(text: str | None) -> int | None: + """'87 m²' → 87""" + if not text: + return None + m = re.search(r"(\d+)", text.replace(".", "")) + return int(m.group(1)) if m else None + + +def _text(soup, selector: str) -> str | None: + el = soup.select_one(selector) + return el.get_text(strip=True) if el else None + + +def _src(soup, selector: str) -> str | None: + el = soup.select_one(selector) + if el is None: + return None + return el.get("src") or el.get("data-src") + + +def _extract_postcode(text: str | None) -> str | None: + if not text: + return None + m = re.search(r"\b(\d{4}\s?[A-Z]{2})\b", text) + return m.group(1).replace(" ", "") if m else None + + +def _infer_stad(postcode: str | None) -> str | None: + """Simpele mapping op basis van postcode range — uitbreiden naar wens.""" + if not postcode: + return None + code = int(postcode[:4]) + if 2600 <= code <= 2629: + return "Delft" + if 3100 <= code <= 3135: + return "Schiedam" + return None diff --git a/src/adapters/ssr/denhaag.py b/src/adapters/ssr/denhaag.py new file mode 100644 index 0000000..a4f9e0f --- /dev/null +++ b/src/adapters/ssr/denhaag.py @@ -0,0 +1,138 @@ +""" +Den Haag scrapers (custom platforms). + +Scrapers: 88makelaars +Note: borgdorff also covers Den Haag but uses the SURE CMS → see sure.py. 
+""" +import re + +import config +from huizenbot import RawListing + +from ._shared import fetch_soup, parse_prijs, parse_m2, _text, log + + +# --------------------------------------------------------------------------- +# 88 Makelaars (Den Haag) +# --------------------------------------------------------------------------- + +_88_BASE = "https://88makelaars.nl" + +_88_STATUS_MAP = { + "te koop": "beschikbaar", + "beschikbaar": "beschikbaar", + "onder bod": "onder_bod", + "onder optie": "onder_bod", + "verkocht onder voorbehoud": "verkocht", + "verkocht": "verkocht", +} + + +def _88makelaars_detail(detail_url: str) -> dict: + """Fetch 88makelaars detail page; extract kenmerken from div.listing_detail kv pairs.""" + try: + soup = fetch_soup(detail_url) + kv: dict[str, str] = {} + for div in soup.select("div.listing_detail"): + txt = div.get_text(strip=True) + if ":" in txt: + label, _, value = txt.partition(":") + kv[label.strip().lower()] = value.strip() + raw_pc = kv.get("postcode") or "" + pc_match = re.search(r"\d{4}\s*[A-Z]{2}", raw_pc.upper()) + postcode = pc_match.group(0).replace(" ", "") if pc_match else None + return { + "postcode": postcode, + "slaapkamers": kv.get("slaapkamers"), + "woonoppervlak": kv.get("woning grootte"), + "energielabel": kv.get("energieklasse"), + "woningtype": kv.get("soort woning"), + } + except Exception as e: + log.warning("88makelaars: detail fetch fout %s: %s", detail_url, e) + return {} + + +def fetch_88makelaars() -> list[RawListing]: + """Fetch 88 Makelaars listings (Den Haag only).""" + listings = [] + page = 1 + + while True: + if page == 1: + url = f"{_88_BASE}/ons-aanbod/" + else: + url = f"{_88_BASE}/ons-aanbod/page/{page}/" + soup = fetch_soup(url) + cards = soup.select("div.property_listing") + if not cards: + break + + for card in cards: + try: + # URL from carousel + a_tag = card.select_one(".property_unit_carousel a[href]") + if not a_tag: + continue + detail_url = a_tag["href"] + if not 
detail_url.startswith("http"): + detail_url = _88_BASE + detail_url + + # City — last link in property_location_image + loc_links = card.select(".property_location_image a") + stad = loc_links[-1].get_text(strip=True) if loc_links else None + if not stad or stad.lower() != "den haag": + continue + + # Price + prijs = parse_prijs(_text(card, ".listing_unit_price_wrapper")) + if prijs and prijs > config.MAX_PRICE: + continue + + # Status + status_text = (_text(card, ".ribbon-inside") or "").lower() + status = _88_STATUS_MAP.get(status_text, "beschikbaar") + + # Address + adres = _text(card, "h4 a") or _text(card, "h4") + + # Surface + rooms + woonoppervlak_card = parse_m2(_text(card, "span.infosize")) + kamers_card = None + rooms_txt = _text(card, "span.inforoom") + if rooms_txt: + m = re.search(r"(\d+)", rooms_txt) + kamers_card = int(m.group(1)) if m else None + + # Hero: first active carousel image + img = card.select_one(".item.active img") + hero = img.get("src") or img.get("data-original") if img else None + + kk = _88makelaars_detail(detail_url) + + listings.append(RawListing( + url=detail_url, + source_makelaar="88makelaars", + status=status, + adres=adres, + postcode=kk.get("postcode"), + stad="Den Haag", + prijs=prijs, + hero_image_url=hero, + woningtype=kk.get("woningtype"), + woonoppervlak=parse_m2(kk.get("woonoppervlak")) or woonoppervlak_card, + kamers=kamers_card, + slaapkamers=int(kk["slaapkamers"]) if kk.get("slaapkamers") else None, + energielabel=kk.get("energielabel"), + )) + if config.APP_ENV == "dev": + break + except Exception as e: + log.warning("88makelaars: parse fout: %s", e) + + if len(cards) < 10: + break + page += 1 + + log.info("88makelaars: %d listings opgehaald", len(listings)) + return listings diff --git a/src/adapters/ssr/overige.py b/src/adapters/ssr/overige.py new file mode 100644 index 0000000..57e133f --- /dev/null +++ b/src/adapters/ssr/overige.py @@ -0,0 +1,280 @@ +""" +Overige SSR scrapers (no shared CMS platform, 
multi-city). + +Scrapers: vansilfhout (OG Online WordPress), vanoord (Elementor/custom) +""" +import re + +import config +from huizenbot import RawListing + +from ._shared import fetch_soup, parse_prijs, parse_m2, _text, log + + +# --------------------------------------------------------------------------- +# Van Silfhout & Hogetoorn Wereldmakelaars (Delft) — OG Online WordPress +# --------------------------------------------------------------------------- +# All listings on one page. Postcode embedded in JS; detail has shortSpecs. +# Also serves as base for fetch_vwmakelaars and fetch_zomakelaars which +# happen to use the standard Realworks CMS instead — see realworks.py. + +_VANSILFHOUT_BASE = "https://www.vansilfhout.nl" + +_VANSILFHOUT_STATUS_MAP = { + "te koop": "beschikbaar", + "onder bod": "onder_bod", + "verkocht": "verkocht", +} + + +def _vansilfhout_detail(detail_url: str) -> dict: + """Fetch Van Silfhout detail page; extract postcode from JS and specs from shortSpecs.""" + try: + import httpx + r = httpx.get( + detail_url, + headers={"User-Agent": config.USER_AGENT}, + timeout=15, + follow_redirects=True, + ) + r.raise_for_status() + html = r.text + from bs4 import BeautifulSoup + soup = BeautifulSoup(html, "html.parser") + + # Postcode embedded in JS: objectZipcode': '2624NP' + m = re.search(r"objectZipcode':\s*'([^']+)'", html) + postcode = m.group(1) if m else None + + # shortSpecs:

  • Label:Value
  • + kv: dict[str, str] = {} + for li in soup.select(".shortSpecs li"): + spans = li.select("span") + if len(spans) >= 2: + label = spans[0].get_text(strip=True).rstrip(":").lower() + value = spans[-1].get_text(strip=True) + kv[label] = value + + return { + "postcode": postcode, + "bouwjaar": kv.get("bouwjaar"), + "woonoppervlak": kv.get("oppervlakte"), + "kamers": kv.get("kamers"), + "slaapkamers": kv.get("slaapkamers"), + } + except Exception as e: + log.warning("vansilfhout: detail fetch fout %s: %s", detail_url, e) + return {} + + +def fetch_vansilfhout() -> list[RawListing]: + """Fetch Van Silfhout woningaanbod (alle listings op één pagina).""" + soup = fetch_soup(f"{_VANSILFHOUT_BASE}/woningaanbod/") + listings = [] + + for card in soup.select("article.row"): + try: + a_tag = card.select_one("a.objectcontainerimg") + if not a_tag or "href" not in a_tag.attrs: + continue + detail_url = a_tag["href"] + if not detail_url.startswith("http"): + detail_url = _VANSILFHOUT_BASE + detail_url + + # Status + status_text = (_text(card, "span.objectstatus") or "").lower() + status = _VANSILFHOUT_STATUS_MAP.get(status_text, "beschikbaar") + + # Address and city + adres = _text(card, "h2.objecttitle") + city_el = card.select("a.straatnaamwoonplaats span") + stad = city_el[-1].get_text(strip=True) if city_el else None + + # Price from shortSpecs strong + prijs = parse_prijs(_text(card, "ul.shortSpecs li strong")) + if prijs and prijs > config.MAX_PRICE: + continue + + # Area and rooms from shortSpecs + woonoppervlak_card = None + kamers_card = None + for li in card.select("ul.shortSpecs li"): + spans = li.select("span") + if len(spans) >= 2: + label = spans[0].get_text(strip=True).lower() + val = spans[-1].get_text(strip=True) + if "oppervlakt" in label: + woonoppervlak_card = parse_m2(val) + elif "kamer" in label: + m = re.search(r"(\d+)", val) + kamers_card = int(m.group(1)) if m else None + + # Hero image: prefer data-lazy-src, fall back to noscript img src + img_tag = 
card.select_one("a.objectcontainerimg img") + hero = None + if img_tag: + hero = (img_tag.get("data-lazy-src") + or img_tag.get("src") or None) + if hero and hero.startswith("data:"): + noscript = card.select_one("noscript img") + hero = noscript["src"] if noscript else None + + kk = _vansilfhout_detail(detail_url) + + # Parse kamers/slaapkamers from detail + kamers = kamers_card + if kk.get("kamers"): + m = re.search(r"(\d+)", kk["kamers"]) + kamers = int(m.group(1)) if m else kamers_card + + slaapkamers = None + if kk.get("slaapkamers"): + m = re.search(r"(\d+)", kk["slaapkamers"]) + slaapkamers = int(m.group(1)) if m else None + + listings.append(RawListing( + url=detail_url, + source_makelaar="vansilfhout", + status=status, + adres=adres, + postcode=kk.get("postcode"), + stad=stad, + prijs=prijs, + hero_image_url=hero, + bouwjaar=int(kk["bouwjaar"]) if kk.get("bouwjaar") else None, + woonoppervlak=parse_m2(kk.get("woonoppervlak")) or woonoppervlak_card, + kamers=kamers, + slaapkamers=slaapkamers, + )) + if config.APP_ENV == "dev": + break + except Exception as e: + log.warning("vansilfhout: parse fout: %s", e) + + log.info("vansilfhout: %d listings opgehaald", len(listings)) + return listings + + +# --------------------------------------------------------------------------- +# Van Oord Makelaardij (Delft + Schiedam) — Elementor/custom WordPress +# --------------------------------------------------------------------------- +# Separate listing pages per city; detail page has rw-object-features-list. 
+ +_VANOORD_BASE = "https://www.vanoordmakelaardij.nl" +_VANOORD_LISTINGS = [ + f"https://www.vanoordmakelaardij.nl/aanbod/?_price=0%2C{config.MAX_PRICE}&_city=Delft&_availability=Te+koop", + f"https://www.vanoordmakelaardij.nl/aanbod/?_price=0%2C{config.MAX_PRICE}&_city=Schiedam&_availability=Te+koop", +] + +_VANOORD_STATUS_MAP = { + "te koop": "beschikbaar", + "onder bod": "onder_bod", + "verkocht": "verkocht", +} + + +def _vanoord_detail(detail_url: str) -> dict: + """Fetch Van Oord detail page; extract kenmerken from rw-object-features-list.""" + try: + soup = fetch_soup(detail_url) + kv: dict[str, str] = {} + for li in soup.select("ul.rw-object-features-list li"): + label_el = li.select_one("span.rw-object-list-label") + value_el = li.select_one("span.rw-object-list-value") + if label_el and value_el: + label = label_el.get_text(strip=True).lower() + value = value_el.get_text(strip=True) + kv[label] = value + return { + "status": kv.get("status", "").lower(), + "bouwjaar": kv.get("bouwjaar"), + "woonoppervlak": kv.get("woonoppervlakte"), + "kamers": kv.get("aantal kamers"), + "slaapkamers": kv.get("slaapkamers"), + "energielabel": kv.get("energieklasse"), + } + except Exception as e: + log.warning("vanoord: detail fetch fout %s: %s", detail_url, e) + return {} + + +def fetch_vanoord() -> list[RawListing]: + """Fetch Van Oord listings; Delft and Schiedam, only koop.""" + seen: set[str] = set() + listings = [] + + for listing_url in _VANOORD_LISTINGS: + soup = fetch_soup(listing_url) + cards = soup.select("div.e-loop-item") + + for card in cards: + try: + # Detail URL from h3 > a + a_tag = card.select_one("h3.elementor-heading-title a[href]") + if not a_tag: + continue + detail_url = a_tag["href"] + if not detail_url.startswith("http"): + detail_url = _VANOORD_BASE + detail_url + if detail_url in seen: + continue + seen.add(detail_url) + + # Status from rw-status-label widget class + status_el = card.select_one("[class*='rw-status-label--']") + status = 
"beschikbaar" + if status_el: + status_text = status_el.get_text(strip=True).lower() + status = _VANOORD_STATUS_MAP.get(status_text, "beschikbaar") + + # City from h4 + h4 = card.select_one("h4.elementor-heading-title") + stad = h4.get_text(strip=True) if h4 else None + + # Address from h3 > a text + adres = " ".join(a_tag.get_text().split()) + + # Price from h3 without child + prijs = None + for h3 in card.select("h3.elementor-heading-title"): + if not h3.select_one("a"): + prijs = parse_prijs(h3.get_text()) + break + if prijs and prijs > config.MAX_PRICE: + continue + + # Card icon list: [0]=surface [1]=rooms [2]=energy + icon_items = card.select("ul.elementor-icon-list-items li span.elementor-icon-list-text") + woonoppervlak_card = parse_m2(icon_items[0].get_text()) if len(icon_items) > 0 else None + kamers_card = None + if len(icon_items) > 1: + m = re.search(r"(\d+)", icon_items[1].get_text()) + kamers_card = int(m.group(1)) if m else None + energielabel_card = icon_items[2].get_text(strip=True) if len(icon_items) > 2 else None + + kk = _vanoord_detail(detail_url) + + detail_status = _VANOORD_STATUS_MAP.get(kk.get("status", ""), "") + if detail_status: + status = detail_status + + listings.append(RawListing( + url=detail_url, + source_makelaar="vanoord", + status=status, + adres=adres, + stad=stad, + prijs=prijs, + bouwjaar=int(kk["bouwjaar"]) if kk.get("bouwjaar", "").isdigit() else None, + woonoppervlak=parse_m2(kk.get("woonoppervlak")) or woonoppervlak_card, + kamers=(int(kk["kamers"]) if kk.get("kamers", "").isdigit() else None) or kamers_card, + slaapkamers=int(kk["slaapkamers"]) if kk.get("slaapkamers", "").isdigit() else None, + energielabel=kk.get("energielabel") or energielabel_card, + )) + if config.APP_ENV == "dev": + break + except Exception as e: + log.warning("vanoord: parse fout: %s", e) + + log.info("vanoord: %d listings opgehaald", len(listings)) + return listings diff --git a/src/adapters/ssr/realworks.py b/src/adapters/ssr/realworks.py new 
file mode 100644 index 0000000..843d9f8 --- /dev/null +++ b/src/adapters/ssr/realworks.py @@ -0,0 +1,502 @@ +""" +Realworks CMS scrapers. + +All makelaars here run the Realworks CMS. Listings come from paginated +/aanbod/woningaanbod/-{price}/koop/ pages; detail pages have span.kenmerk +label/value pairs. Some variants (Wassenaar, Roepman) expose listing-level +data via JSON-LD instead of card HTML. + +Scrapers: ankebodewes, woongoed, vwmakelaars, zomakelaars, morris, + wassenaar, roepman, post +""" +import json as _json +import re + +import config +from huizenbot import RawListing + +from ._shared import fetch_soup, parse_prijs, parse_m2, _text, log + + +# --------------------------------------------------------------------------- +# Shared Realworks helpers +# --------------------------------------------------------------------------- + +_REALWORKS_STATUS_MAP = { + "te koop": "beschikbaar", + "nieuw": "beschikbaar", + "onder bod": "onder_bod", + "onder optie": "onder_bod", + "verkocht o.v.": "verkocht", + "verkocht": "verkocht", +} + + +def _realworks_detail(detail_url: str, makelaar: str) -> dict: + """Fetch a Realworks detail page and extract kenmerken. 
Returns empty dict on failure.""" + try: + soup = fetch_soup(detail_url) + + # Build a label→value map from all .kenmerk spans + kv: dict[str, str] = {} + for kenmerk in soup.select("span.kenmerk"): + label_el = kenmerk.select_one("span.kenmerkName") + value_el = kenmerk.select_one("span.kenmerkValue") + if label_el and value_el: + label = label_el.get_text(strip=True).lower() + value = value_el.get_text(strip=True) + kv[label] = value + + return { + "woningtype": kv.get("type woning"), + "bouwjaar": kv.get("bouwjaar"), + "woonoppervlak": kv.get("woonoppervlakte"), + "perceeloppervlak": kv.get("perceeloppervlakte"), + "kamers": kv.get("aantal kamers"), + "slaapkamers": kv.get("aantal slaapkamers"), + "energielabel": kv.get("energieklasse"), + } + except Exception as e: + log.warning("%s: detail fetch fout %s: %s", makelaar, detail_url, e) + return {} + + +def fetch_realworks(base_url: str, makelaar: str) -> list[RawListing]: + """ + Generic fetcher for Realworks CMS brokers. + Paginates via /pagina-{n}/, fetches detail page per listing. 
+ """ + listings_path = f"/aanbod/woningaanbod/-{config.MAX_PRICE}/koop" + listings = [] + page = 1 + + while True: + url = f"{base_url}{listings_path}/pagina-{page}/" + soup = fetch_soup(url) + cards = soup.select("li.aanbodEntry") + if not cards: + break + + for card in cards: + try: + a_tag = card.select_one("a.aanbodEntryLink") + if not a_tag: + continue + listing_url = base_url + a_tag["href"] + + adres = _text(card, ".street-address") + postcode = (_text(card, ".postal-code") or "").replace(" ", "") or None + stad = _text(card, ".locality") + prijs = parse_prijs(_text(card, ".koopprijs .kenmerkValue")) + + status_text = (_text(card, ".objectstatusbanner") or "").lower() + status = _REALWORKS_STATUS_MAP.get(status_text, "beschikbaar") + + img_tag = card.select_one(".hoofdfoto img") + hero = img_tag["src"] if img_tag else None + + kk = _realworks_detail(listing_url, makelaar) + + listings.append(RawListing( + url=listing_url, + source_makelaar=makelaar, + adres=adres, + postcode=postcode, + stad=stad, + prijs=prijs, + status=status, + hero_image_url=hero, + woningtype=kk.get("woningtype"), + bouwjaar=int(kk["bouwjaar"]) if kk.get("bouwjaar") else None, + woonoppervlak=parse_m2(kk.get("woonoppervlak")), + perceeloppervlak=parse_m2(kk.get("perceeloppervlak")), + kamers=int(kk["kamers"]) if kk.get("kamers") else None, + slaapkamers=int(kk["slaapkamers"]) if kk.get("slaapkamers") else None, + energielabel=kk.get("energielabel"), + )) + if config.APP_ENV == "dev": + break + except Exception as e: + log.warning("%s: parse fout: %s", makelaar, e) + + if len(cards) < 10: + break + page += 1 + + log.info("%s: %d listings opgehaald", makelaar, len(listings)) + return listings + + +# --------------------------------------------------------------------------- +# Simple Realworks wrappers (one-liners) +# --------------------------------------------------------------------------- + +def fetch_ankebodewes() -> list[RawListing]: + return 
fetch_realworks("https://www.ankebodewes.nl", "ankebodewes") + + +def fetch_woongoed() -> list[RawListing]: + return fetch_realworks("https://www.woongoedmakelaars.nl", "woongoed") + + +def fetch_vwmakelaars() -> list[RawListing]: + return fetch_realworks("https://www.vwmakelaars.nl", "vwmakelaars") + + +def fetch_zomakelaars() -> list[RawListing]: + return fetch_realworks("https://www.zomakelaars.nl", "zomakelaars") + + +def fetch_morris() -> list[RawListing]: + return fetch_realworks("https://www.morrismakelaardij.nl", "morris") + + +# --------------------------------------------------------------------------- +# Makelaardij Wassenaar (Schiedam) — Realworks CMS, JSON-LD listing page +# --------------------------------------------------------------------------- +# Listings page has JSON-LD (Residence) with url/address/price/photo. +# Detail pages have span.kenmerk with Wassenaar-specific label names. + +_WASSENAAR_BASE = "https://www.makelaardijwassenaar.nl" + +_WASSENAAR_STATUS_MAP = { + "te koop": "beschikbaar", + "nieuw": "beschikbaar", + "onder bod": "onder_bod", + "onder optie": "onder_bod", + "verkocht o.v.": "onder_bod", + "verkocht onder voorbehoud": "onder_bod", + "verkocht": "verkocht", +} + + +def _wassenaar_detail(detail_url: str) -> dict: + """Fetch Realworks detail page; extract kenmerken with Wassenaar-specific labels.""" + try: + soup = fetch_soup(detail_url) + kv: dict[str, str] = {} + for kenmerk in soup.select("span.kenmerk"): + label_el = kenmerk.select_one("span.kenmerkName") + value_el = kenmerk.select_one("span.kenmerkValue") + if label_el and value_el: + kv[label_el.get_text(strip=True).lower()] = value_el.get_text(strip=True) + return { + "woningtype": kv.get("soort object"), + "bouwjaar": kv.get("bouwjaar"), + "woonoppervlak": kv.get("woonoppervlakte"), + "perceeloppervlak": kv.get("perceeloppervlakte"), + "kamers": kv.get("aantal kamers"), + "slaapkamers": kv.get("aantal slaapkamers"), + "energielabel": kv.get("energieklasse"), + } + 
except Exception as e: + log.warning("wassenaar: detail fetch fout %s: %s", detail_url, e) + return {} + + +def fetch_wassenaar() -> list[RawListing]: + soup = fetch_soup(f"{_WASSENAAR_BASE}/aanbod/woningaanbod/-{config.MAX_PRICE}/koop/") + + # First pass: collect status + thumbnail per relative url + # Each listing has two a.aanbodEntryLink with the same href; + # the first has the status banner + photo, the second has address + price. + status_by_url: dict[str, str] = {} + photo_by_url: dict[str, str] = {} + for a in soup.select("a.aanbodEntryLink[href]"): + href = a["href"] + if href in status_by_url: + continue + banner = a.select_one(".objectstatusbanner") + status_text = banner.get_text(strip=True).lower() if banner else "" + status_by_url[href] = _WASSENAAR_STATUS_MAP.get(status_text, "beschikbaar") + img = a.select_one("span.hoofdfoto img") + if img: + src = img.get("src", "") + if "geenfotobeschikbaar" not in src: + photo_by_url[href] = src + + # Second pass: parse JSON-LD blocks (one per listing) + seen: set[str] = set() + listings = [] + for tag in soup.select('script[type="application/ld+json"]'): + try: + ld = _json.loads(tag.string) + if ld.get("@type") != "Residence": + continue + rel_url = ld.get("url", "") + if not rel_url or rel_url in seen: + continue + seen.add(rel_url) + + detail_url = _WASSENAAR_BASE + rel_url + address = ld.get("address", {}) + postcode = address.get("postalCode", "").replace(" ", "") or None + + price_spec = next( + (a.get("priceSpecification", {}) for a in ld.get("potentialAction", []) + if a.get("priceSpecification")), + {} + ) + prijs = int(price_spec["price"]) if price_spec.get("price") else None + if prijs and prijs > config.MAX_PRICE: + continue + + hero = ld.get("photo") or photo_by_url.get(rel_url) + status = status_by_url.get(rel_url, "beschikbaar") + kk = _wassenaar_detail(detail_url) + + listings.append(RawListing( + url=detail_url, + source_makelaar="wassenaar", + status=status, + 
adres=address.get("streetAddress") or None, + postcode=postcode, + stad=address.get("addressLocality") or None, + prijs=prijs, + hero_image_url=hero, + woningtype=kk.get("woningtype"), + bouwjaar=int(kk["bouwjaar"]) if kk.get("bouwjaar") else None, + woonoppervlak=parse_m2(kk.get("woonoppervlak")), + perceeloppervlak=parse_m2(kk.get("perceeloppervlak")), + kamers=int(kk["kamers"]) if kk.get("kamers") else None, + slaapkamers=int(kk["slaapkamers"]) if kk.get("slaapkamers") else None, + energielabel=kk.get("energielabel"), + )) + if config.APP_ENV == "dev": + break + except Exception as e: + log.warning("wassenaar: parse fout: %s", e) + + log.info("wassenaar: %d listings opgehaald", len(listings)) + return listings + + +# --------------------------------------------------------------------------- +# Roepman Makelaardij NVM (Delft) — Realworks CMS, JSON-LD listing page +# --------------------------------------------------------------------------- +# Uses div.aanbodEntry instead of li.aanbodEntry; price from JSON-LD. 
+ +_ROEPMAN_BASE = "https://www.roepman.nl" + + +def fetch_roepman() -> list[RawListing]: + listings_path = f"/aanbod/woningaanbod/-{config.MAX_PRICE}/koop" + listings = [] + page = 1 + + while True: + url = f"{_ROEPMAN_BASE}{listings_path}/pagina-{page}/" + soup = fetch_soup(url) + cards = soup.select("div.aanbodEntry") + if not cards: + break + + # Collect status + photo per relative url + status_by_url: dict[str, str] = {} + photo_by_url: dict[str, str] = {} + for card in cards: + a_tag = card.select_one("a.aanbodEntryLink[href]") + if not a_tag: + continue + href = a_tag["href"] + if href in status_by_url: + continue + banner = card.select_one(".objectstatusbanner") + status_text = banner.get_text(strip=True).lower() if banner else "" + status_by_url[href] = _REALWORKS_STATUS_MAP.get(status_text, "beschikbaar") + img = card.select_one("img") + if img: + src = img.get("src", "") + if "geenfotobeschikbaar" not in src: + photo_by_url[href] = src + + # Parse JSON-LD Residence blocks (one per listing) + seen: set[str] = set() + for tag in soup.select('script[type="application/ld+json"]'): + try: + ld = _json.loads(tag.string) + if ld.get("@type") != "Residence": + continue + rel_url = ld.get("url", "") + if not rel_url or rel_url in seen: + continue + seen.add(rel_url) + + detail_url = _ROEPMAN_BASE + rel_url + address = ld.get("address", {}) + postcode = address.get("postalCode", "").replace(" ", "") or None + + price_spec = next( + (a.get("priceSpecification", {}) for a in ld.get("potentialAction", []) + if a.get("priceSpecification")), + {} + ) + prijs = int(price_spec["price"]) if price_spec.get("price") else None + if prijs and prijs > config.MAX_PRICE: + continue + + hero = ld.get("photo") or photo_by_url.get(rel_url) + status = status_by_url.get(rel_url, "beschikbaar") + kk = _realworks_detail(detail_url, "roepman") + + listings.append(RawListing( + url=detail_url, + source_makelaar="roepman", + status=status, + adres=address.get("streetAddress") or None, + 
postcode=postcode, + stad=address.get("addressLocality") or None, + prijs=prijs, + hero_image_url=hero, + woningtype=kk.get("woningtype"), + bouwjaar=int(kk["bouwjaar"]) if kk.get("bouwjaar") else None, + woonoppervlak=parse_m2(kk.get("woonoppervlak")), + perceeloppervlak=parse_m2(kk.get("perceeloppervlak")), + kamers=int(kk["kamers"]) if kk.get("kamers") else None, + slaapkamers=int(kk["slaapkamers"]) if kk.get("slaapkamers") else None, + energielabel=kk.get("energielabel"), + )) + if config.APP_ENV == "dev": + break + except Exception as e: + log.warning("roepman: parse fout: %s", e) + + if len(cards) < 10: + break + page += 1 + + log.info("roepman: %d listings opgehaald", len(listings)) + return listings + + +# --------------------------------------------------------------------------- +# Post Makelaardij (Delft) — Realworks CMS, custom detail parser +# --------------------------------------------------------------------------- + +_POST_BASE = "https://www.postmakelaardij.nl" + +_POST_STATUS_MAP = { + "te koop": "beschikbaar", + "onder bod": "onder_bod", + "verkocht": "verkocht", +} + + +def _post_detail(detail_url: str) -> dict: + """Fetch Post Makelaardij detail page and extract kenmerken.""" + try: + soup = fetch_soup(detail_url) + + # Energielabel from CSS class: energielabel-{letter} + energielabel = None + for el in soup.select('[class]'): + for cls in el.get('class', []): + if cls.startswith('energielabel-') and cls != 'energielabel': + energielabel = cls.replace('energielabel-', '').upper() + break + if energielabel: + break + + # Woonoppervlak, perceeloppervlak, slaapkamers from icon spans + woonoppervlak = None + perceeloppervlak = None + slaapkamers = None + for span in soup.select('span.object-info-icon-text'): + txt = span.get_text(strip=True) + if 'slaapkamer' in txt: + m = re.search(r'(\d+)', txt) + slaapkamers = int(m.group(1)) if m else None + elif 'perceel' in txt: + perceeloppervlak = parse_m2(txt) + elif 'm²' in txt or 'm2' in txt: + 
woonoppervlak = parse_m2(txt) + + return { + "woonoppervlak": woonoppervlak, + "perceeloppervlak": perceeloppervlak, + "slaapkamers": slaapkamers, + "energielabel": energielabel, + } + except Exception as e: + log.warning("post: detail fetch fout %s: %s", detail_url, e) + return {} + + +def fetch_post() -> list[RawListing]: + """Fetch Post Makelaardij listings; only Delft, only koop.""" + listings = [] + page = 1 + + while True: + url = f"{_POST_BASE}/woningaanbod/koop?page={page}" + soup = fetch_soup(url) + cards = soup.select("article") + if not cards: + break + + for card in cards: + try: + # URL — first link in image slider + a_tag = card.select_one("a[href]") + if not a_tag: + continue + href = a_tag["href"] + detail_url = href if href.startswith("http") else _POST_BASE + href + + # Postcode + city from span.custom-postcode-text + pc_el = card.select_one("span.custom-postcode-text") + if not pc_el: + continue + pc_parts = pc_el.get_text(strip=True).split() + if len(pc_parts) < 3: + continue + postcode = pc_parts[0] + pc_parts[1] # "2613BD" + stad = " ".join(pc_parts[2:]) # "Delft" + + # Filter: only Delft + if stad.lower() != "delft": + continue + + # Price — filter early + prijs = parse_prijs(_text(card, "span.price-block")) + if prijs and prijs > config.MAX_PRICE: + continue + + # Status from span.status text + status_text = (_text(card, "span.status") or "").lower() + status = _POST_STATUS_MAP.get(status_text, "beschikbaar") + + # Address + adres = _text(card, "h4.custom-address-text") + + # Hero: first img in article + img = card.select_one("img") + hero = img["src"] if img else None + + kk = _post_detail(detail_url) + + listings.append(RawListing( + url=detail_url, + source_makelaar="post", + status=status, + adres=adres, + postcode=postcode, + stad=stad, + prijs=prijs, + hero_image_url=hero, + woonoppervlak=kk.get("woonoppervlak"), + perceeloppervlak=kk.get("perceeloppervlak"), + slaapkamers=kk.get("slaapkamers"), + energielabel=kk.get("energielabel"), + 
)) + if config.APP_ENV == "dev": + break + except Exception as e: + log.warning("post: parse fout: %s", e) + + if len(cards) < 12: + break + page += 1 + + log.info("post: %d listings opgehaald", len(listings)) + return listings diff --git a/src/adapters/ssr/schiedam.py b/src/adapters/ssr/schiedam.py new file mode 100644 index 0000000..0fdbeb7 --- /dev/null +++ b/src/adapters/ssr/schiedam.py @@ -0,0 +1,542 @@ +""" +Custom Schiedam scrapers (no shared CMS platform). + +Each makelaar here uses a bespoke site structure that required its own parser. + +Scrapers: dewittegarantiemakelaars (JSON-LD), dens, 3dmakelaars, dupont +""" +import re + +import config +from huizenbot import RawListing + +from ._shared import ( + fetch_soup, parse_prijs, parse_m2, _text, + _extract_postcode, _infer_stad, log, +) + + +# --------------------------------------------------------------------------- +# De Witte Garantiemakelaars (Schiedam) +# --------------------------------------------------------------------------- +# Listing cards have a pill badge for status. All detail data comes from +# JSON-LD (schema.org BuyAction/Offer) on the detail page. 
+ +_DEWITTE_BASE = "https://dewittegarantiemakelaars.nl" + +_DEWITTE_PILL_MAP = { + "bg-fun-green": "beschikbaar", + "bg-sold": "verkocht", +} + +_DEWITTE_TYPE_MAP = { + "Apartment": "appartement", + "House": "woning", + "SingleFamilyResidence": "woning", + "Residence": "woning", +} + + +def _dewitte_jsonld(detail_url: str) -> dict: + """Fetch detail page and return parsed JSON-LD dict, or {} on failure.""" + import json + try: + soup = fetch_soup(detail_url) + tag = soup.select_one('script[type="application/ld+json"]') + if not tag: + log.warning("dewitte: geen JSON-LD op %s", detail_url) + return {} + return json.loads(tag.string) + except Exception as e: + log.warning("dewitte: JSON-LD fout %s: %s", detail_url, e) + return {} + + +def fetch_dewittegarantiemakelaars() -> list[RawListing]: + listings = [] + page = 1 + + while True: + url = ( + f"{_DEWITTE_BASE}/woningaanbod" + f"?buy_rent=buy&buy_price=1-{config.MAX_PRICE}&page={page}" + ) + soup = fetch_soup(url) + cards = soup.select("div.card.card--property") + if not cards: + break + + for card in cards: + try: + a_tag = card.select_one("a.card__anchor") + if not a_tag: + continue + detail_url = a_tag["href"] + if not detail_url.startswith("http"): + detail_url = _DEWITTE_BASE + detail_url + + pill = card.select_one("span.pill") + pill_classes = pill.get("class", []) if pill else [] + status_key = next( + (c for c in pill_classes if c.startswith("bg-")), None + ) + status = _DEWITTE_PILL_MAP.get(status_key, "onder_bod") + + ld = _dewitte_jsonld(detail_url) + if not ld: + continue + + offered = ld.get("itemOffered", {}) + address = offered.get("address", {}) + floor_size = offered.get("floorSize", {}) + + postcode = address.get("postalCode", "").replace(" ", "") or None + stad = address.get("addressLocality") or None + adres = address.get("streetAddress") or None + + prijs = ld.get("price") + if prijs and int(prijs) > config.MAX_PRICE: + continue + + woningtype = _DEWITTE_TYPE_MAP.get(offered.get("@type", "")) 
+ woonoppervlak = int(floor_size["value"]) if floor_size.get("value") else None + kamers = offered.get("numberOfRooms") + bouwjaar = offered.get("yearBuilt") + + # Full-res image from JSON-LD, fall back to card thumbnail + hero = ld.get("image") + if not hero: + img = card.select_one("picture img") + hero = img["src"] if img else None + + listings.append(RawListing( + url=detail_url, + source_makelaar="dewittegarantiemakelaars", + status=status, + adres=adres, + postcode=postcode, + stad=stad, + prijs=int(prijs) if prijs else None, + woningtype=woningtype, + woonoppervlak=woonoppervlak, + kamers=int(kamers) if kamers else None, + bouwjaar=int(bouwjaar) if bouwjaar else None, + hero_image_url=hero, + )) + if config.APP_ENV == "dev": + break + + except Exception as e: + log.warning("dewitte: parse fout: %s", e) + + if len(cards) < 10: + break + page += 1 + + log.info("dewittegarantiemakelaars: %d listings opgehaald", len(listings)) + return listings + + +# --------------------------------------------------------------------------- +# D&S Makelaars (Schiedam) +# --------------------------------------------------------------------------- + +_DS_BASE = "https://www.densmakelaars.nl" + +_DS_STATUS_MAP = { + "onder bod": "onder_bod", + "te koop": "beschikbaar", + "nieuw": "beschikbaar", + "beschikbaar": "beschikbaar", + "verkocht": "verkocht", +} + + +def _ds_detail(detail_url: str, html_text: str = None) -> dict: + """Fetch D&S detail page and extract all kenmerken from
    <dt>/<dd>
    pairs and postcode from maps URL.""" + try: + # If html_text not provided, fetch it + if html_text is None: + import httpx + r = httpx.get( + detail_url, + headers={"User-Agent": config.USER_AGENT}, + timeout=15, + follow_redirects=True, + ) + html_text = r.text + + from bs4 import BeautifulSoup + soup = BeautifulSoup(html_text, "html.parser") + + # Parse
    <dt>/<dd>
    pairs into a label → value map + kv: dict[str, str] = {} + dts = soup.select("dt") + dds = soup.select("dd") + + for dt, dd in zip(dts, dds): + label = dt.get_text(strip=True).lower() + value = dd.get_text(strip=True) + kv[label] = value + + # Extract postcode from Google Maps URL in iframe src + # Pattern: q=...POSTCODE...,CITY where POSTCODE is 4 digits + 2 letters + postcode = None + m = re.search(r'q=.+?,(\d{4})\s+([A-Z]{2}),', html_text) + if m: + postcode = f"{m.group(1)}{m.group(2)}" + + return { + "status": kv.get("status", "beschikbaar").lower(), + "woningtype": kv.get("soort woning"), + "bouwjaar": kv.get("bouwjaar"), + "woonoppervlak": kv.get("woonoppervlakte"), + "kamers": kv.get("aantal kamers"), + "slaapkamers": kv.get("aantal slaapkamers"), + "energielabel": kv.get("energielabel"), + "postcode": postcode, + } + except Exception as e: + log.warning("dens: detail fetch fout %s: %s", detail_url, e) + return {} + + +def fetch_dens() -> list[RawListing]: + """Fetch D&S Makelaars listings with full detail pages.""" + listings = [] + page = 1 + + while True: + url = f"{_DS_BASE}/aanbod/koopwoningen?page={page}" + soup = fetch_soup(url) + cards = soup.select(".col-12.col-md-4.object-wrapper") + if not cards: + break + + for card in cards: + try: + # Extract URL + a_tag = card.select_one("a.property") + if not a_tag or "href" not in a_tag.attrs: + continue + detail_url = a_tag["href"] + if not detail_url.startswith("http"): + detail_url = _DS_BASE + detail_url + + # Extract listing page data + status_label = _text(card, "span.label") or "beschikbaar" + status_label = status_label.strip().lower() + status = _DS_STATUS_MAP.get(status_label, "beschikbaar") + + adres = _text(card, "h3") + stad = _text(card, "h4") + prijs_text = _text(card, "div.price") + prijs = parse_prijs(prijs_text) + + # Extract area and rooms from footer + footer_spans = card.select("div.footer span") + woonoppervlak = None + kamers = None + for span in footer_spans: + text = 
span.get_text(strip=True) + if "m²" in text: + woonoppervlak = parse_m2(text) + elif "kamers" in text.lower(): + m = re.search(r"(\d+)", text) + if m: + kamers = int(m.group(1)) + + # Extract hero image + img_tag = card.select_one("img") + hero = img_tag["src"] if img_tag else None + + # Fetch and parse detail page + detail_data = _ds_detail(detail_url) + + # Use postcode from detail data (extracted from Google Maps URL) + postcode = detail_data.get("postcode") + + # Determine status from detail page if available + if detail_data.get("status"): + status = _DS_STATUS_MAP.get(detail_data["status"], status) + + listings.append(RawListing( + url=detail_url, + source_makelaar="dens", + adres=adres, + postcode=postcode, + stad=stad or _infer_stad(postcode), + prijs=prijs, + status=status, + hero_image_url=hero, + woningtype=detail_data.get("woningtype"), + bouwjaar=int(detail_data["bouwjaar"]) if detail_data.get("bouwjaar") else None, + woonoppervlak=parse_m2(detail_data.get("woonoppervlak")) or woonoppervlak, + kamers=int(detail_data["kamers"]) if detail_data.get("kamers") else kamers, + slaapkamers=int(detail_data["slaapkamers"]) if detail_data.get("slaapkamers") else None, + energielabel=detail_data.get("energielabel"), + )) + if config.APP_ENV == "dev": + break + except Exception as e: + log.warning("dens: parse fout: %s", e) + + if len(cards) < 10: + break + page += 1 + + log.info("dens: %d listings opgehaald", len(listings)) + return listings + + +# --------------------------------------------------------------------------- +# 3D Makelaars (Schiedam/Vlaardingen) +# --------------------------------------------------------------------------- + +_3D_BASE = "https://3dmakelaars.nl" + + +def _3dmakelaars_detail(detail_url: str) -> dict: + """Fetch 3dmakelaars detail page and extract structured info block.""" + try: + soup = fetch_soup(detail_url) + + # Parse structured info block: span (label) + p (value) pairs + kv: dict[str, str] = {} + for li in 
soup.select("div.tl-adiltional-inforamtion ul.tl-adiltional-listed li"): + label_el = li.select_one("span") + value_el = li.select_one("p") + if label_el and value_el: + label = label_el.get_text(strip=True).lower() + value = value_el.get_text(strip=True) + kv[label] = value + + # Extract postcode from first description paragraph + postcode = None + p_tag = soup.select_one(".omschrijving > p:nth-child(1)") + if p_tag: + text = p_tag.get_text() + postcode = _extract_postcode(text) + + return { + "kamers": int(kv["aantal kamers"].split()[0]) if "aantal kamers" in kv else None, + "slaapkamers": int(kv["aantal slaapkamers"].split()[0]) if "aantal slaapkamers" in kv else None, + "bouwjaar": int(kv["bouwjaar"]) if "bouwjaar" in kv else None, + "woningtype": kv.get("bouwvorm"), + "woonoppervlak": parse_m2(kv.get("oppervlakte")), + "postcode": postcode, + } + except Exception as e: + log.warning("3dmakelaars: detail fetch fout %s: %s", detail_url, e) + return {} + + +def fetch_3dmakelaars() -> list[RawListing]: + """Fetch 3D Makelaars listings with pagination.""" + listings = [] + page = 1 + + while True: + url = ( + f"{_3D_BASE}/woningen-te-koop-in-schiedam-en-vlaardingen" + f"?kamers=&oppervlakte=&woonplaats=&video=&prijs=3&page={page}" + ) + soup = fetch_soup(url) + cards = soup.select("div.tl-properties-item") + if not cards: + break + + for card in cards: + try: + # Extract detail URL from onclick attribute + onclick = card.get("onclick", "") + detail_url = None + if "window.location" in onclick: + m = re.search(r"window\.location\s*=\s*['\"]([^'\"]+)['\"]", onclick) + if m: + detail_url = _3D_BASE + m.group(1) + + if not detail_url: + continue + + # Extract listing-level info + adres = _text(card, "h3.price") + prijs_text = _text(card, "span.address") + prijs = parse_prijs(prijs_text) + + # Extract rooms and area from meta list + kamers = None + woonoppervlak = None + for li in card.select("ul.tl-meta-listed > li"): + text = li.get_text(strip=True) + if "kamers" in 
text.lower(): + m = re.search(r"(\d+)", text) + if m: + kamers = int(m.group(1)) + elif "m²" in text or "m2" in text: + woonoppervlak = parse_m2(text) + + # Extract image + img_tag = card.select_one("img") + hero = img_tag["src"] if img_tag else None + if hero and not hero.startswith("http"): + hero = _3D_BASE + hero + + # Fetch detail page for full info + detail_data = _3dmakelaars_detail(detail_url) + + # Postcode from detail page, fallback to extraction from address + postcode = detail_data.get("postcode") + if not postcode and adres: + postcode = _extract_postcode(adres) + + listings.append(RawListing( + url=detail_url, + source_makelaar="3dmakelaars", + adres=adres, + postcode=postcode, + stad=_infer_stad(postcode), + prijs=prijs, + woningtype=detail_data.get("woningtype"), + bouwjaar=detail_data.get("bouwjaar"), + woonoppervlak=woonoppervlak or detail_data.get("woonoppervlak"), + kamers=kamers or detail_data.get("kamers"), + slaapkamers=detail_data.get("slaapkamers"), + hero_image_url=hero, + )) + if config.APP_ENV == "dev": + break + except Exception as e: + log.warning("3dmakelaars: parse fout: %s", e) + + if len(cards) < 7: + break + page += 1 + + log.info("3dmakelaars: %d listings opgehaald", len(listings)) + return listings + + +# --------------------------------------------------------------------------- +# Dupont ERA Makelaars (Schiedam/Rotterdam) +# --------------------------------------------------------------------------- + +_DUPONT_BASE = "https://www.dupont.nl" + +_DUPONT_STATUS_MAP = { + "te koop": "beschikbaar", + "nieuw": "beschikbaar", + "onder bod": "onder_bod", + "verkocht onder voorbehoud": "onder_bod", + "verkocht": "verkocht", +} + + +def _dupont_detail(detail_url: str) -> dict: + """Fetch Dupont detail page and extract kenmerken from dt/dd pairs.""" + try: + soup = fetch_soup(detail_url) + + # Parse dt/dd pairs into label → value map + kv: dict[str, str] = {} + dts = soup.select("dt") + dds = soup.select("dd") + + for dt, dd in zip(dts, 
dds): + label = dt.get_text(strip=True).lower() + value = dd.get_text(strip=True) + kv[label] = value + + # Extract postcode from small tag (format: "NNNN AA CITY") + postcode = None + small_tag = soup.select_one("section div.container-fluid small") + if small_tag: + postcode = _extract_postcode(small_tag.get_text()) + + return { + "postcode": postcode, + "woningtype": kv.get("soort woning"), + "bouwjaar": kv.get("bouwjaar"), + "woonoppervlak": kv.get("woonoppervlakte"), + "kamers": kv.get("aantal kamers"), + "slaapkamers": kv.get("aantal slaapkamers"), + "energielabel": kv.get("energielabel"), + } + except Exception as e: + log.warning("dupont: detail fetch fout %s: %s", detail_url, e) + return {} + + +def fetch_dupont() -> list[RawListing]: + """Fetch Dupont ERA Makelaars listings with pagination and detail pages.""" + listings = [] + page = 1 + + while True: + url = f"{_DUPONT_BASE}/aanbod/koopwoningen?page={page}" + soup = fetch_soup(url) + cards = soup.select("article.object") + if not cards: + break + + for card in cards: + try: + # Extract URL + a_tag = card.select_one("a[href]") + if not a_tag or "href" not in a_tag.attrs: + continue + detail_url = a_tag["href"] + if not detail_url.startswith("http"): + detail_url = _DUPONT_BASE + detail_url + + # Extract listing-level data + adres = _text(card, "h3") + stad = _text(card, "h4") + prijs_text = _text(card, "div.price") + prijs = parse_prijs(prijs_text) + + # Extract status from label + status_label = _text(card, "div.label") or "beschikbaar" + status_label = status_label.strip().lower() + status = _DUPONT_STATUS_MAP.get(status_label, "beschikbaar") + + # Extract image + img_tag = card.select_one("img.img-responsive") + hero = img_tag["src"] if img_tag else None + if hero and not hero.startswith("http"): + hero = _DUPONT_BASE + hero + + # Fetch detail page for full data + detail_data = _dupont_detail(detail_url) + + # Use postcode from detail if available + postcode = detail_data.get("postcode") + + 
listings.append(RawListing( + url=detail_url, + source_makelaar="dupont", + adres=adres, + postcode=postcode, + stad=stad or _infer_stad(postcode), + prijs=prijs, + status=status, + hero_image_url=hero, + woningtype=detail_data.get("woningtype"), + bouwjaar=int(detail_data["bouwjaar"]) if detail_data.get("bouwjaar") else None, + woonoppervlak=parse_m2(detail_data.get("woonoppervlak")), + kamers=int(detail_data["kamers"]) if detail_data.get("kamers") else None, + slaapkamers=int(detail_data["slaapkamers"]) if detail_data.get("slaapkamers") else None, + energielabel=detail_data.get("energielabel"), + )) + if config.APP_ENV == "dev": + break + + except Exception as e: + log.warning("dupont: parse fout: %s", e) + + if len(cards) < 10: + break + page += 1 + + log.info("dupont: %d listings opgehaald", len(listings)) + return listings diff --git a/src/adapters/ssr/sure.py b/src/adapters/ssr/sure.py new file mode 100644 index 0000000..be2a021 --- /dev/null +++ b/src/adapters/ssr/sure.py @@ -0,0 +1,630 @@ +""" +SURE WordPress plugin scrapers. + +All makelaars here use the SURE real estate plugin for WordPress. Listings +are at /wonen?sure_koop_huur=koop with pagination via /wonen/page/{N}/. +Cards use class a.card-house or div.card.card--house. +Detail pages have a #kenmerken section with label/value pairs. 
# ---------------------------------------------------------------------------
# Schieland Borsboom NVM Makelaars (Rotterdam, active in Schiedam)
# ---------------------------------------------------------------------------

_SCHIELAND_BASE = "https://www.schielandborsboom.nl"

# CSS class on the card thumbnail -> normalized status.
_SCHIELAND_STATUS_MAP = {
    "sure-status-available": "beschikbaar",
    "sure-status-under_bid": "onder_bod",
    "sure-status-sold": "verkocht",
}

# Lower-cased "status" kenmerk text on the detail page -> normalized status.
# The labels mirror the detail status maps of the other SURE scrapers in
# this module.
# NOTE(review): confirm the exact wording against a live detail page.
_SCHIELAND_DETAIL_STATUS_MAP = {
    "beschikbaar": "beschikbaar",
    "onder bod": "onder_bod",
    "onder optie": "onder_bod",
    "verkocht": "verkocht",
}


def _schieland_int(value: str | None) -> int | None:
    """Return the first run of digits in *value* as an int, or None.

    Accepts plain numbers ("1975") as well as labelled ones ("5 kamers").
    Returns None for empty/None input or text without digits, so a single
    odd kenmerk value no longer raises and discards the whole listing.
    """
    if not value:
        return None
    m = re.search(r"\d+", value)
    return int(m.group()) if m else None


def _schieland_detail(detail_url: str) -> dict:
    """Fetch a Schieland Borsboom detail page and extract its kenmerken.

    Returns a dict with the keys ``postcode``, ``status``, ``woningtype``,
    ``bouwjaar``, ``woonoppervlak``, ``perceeloppervlak``, ``kamers``,
    ``slaapkamers`` and ``energielabel``. Values are raw strings from the
    page (``status`` lower-cased, ``""`` when absent) or None when the
    kenmerk is missing. Any exception is logged and ``{}`` is returned, so
    the caller can fall back to the data found on the overview card.
    """
    try:
        soup = fetch_soup(detail_url)

        # Postcode from house__status p (e.g. "3117 DP Schiedam")
        postcode_el = soup.select_one("div.house__status p")
        postcode = _extract_postcode(postcode_el.get_text()) if postcode_el else None

        # Parse #kenmerken section:
        #   <li><strong>label</strong><span>value</span></li>
        kv: dict[str, str] = {}
        kenmerken = soup.select_one("#kenmerken")
        if kenmerken:
            for li in kenmerken.select("li"):
                label_el = li.select_one("strong")
                value_el = li.select_one("span")
                if label_el and value_el:
                    # Strip nested links (e.g. "Hypotheek berekenen") so
                    # their text does not end up in the value.
                    for a in value_el.select("a"):
                        a.decompose()
                    kv[label_el.get_text(strip=True).lower()] = value_el.get_text(strip=True)

        return {
            "postcode": postcode,
            "status": kv.get("status", "").lower(),
            "woningtype": kv.get("soort bouw"),
            "bouwjaar": kv.get("bouwjaar"),
            "woonoppervlak": kv.get("woonoppervlakte"),
            "perceeloppervlak": kv.get("perceeloppervlakte"),
            "kamers": kv.get("aantal kamers"),
            "slaapkamers": kv.get("aantal slaapkamers"),
            "energielabel": kv.get("energielabel"),
        }
    except Exception as e:
        # Deliberate best-effort: a broken detail page must not stop the run.
        log.warning("schielandborsboom: detail fetch fout %s: %s", detail_url, e)
        return {}


def fetch_schielandborsboom() -> list[RawListing]:
    """Fetch Schieland Borsboom NVM listings (koop only, Schiedam).

    Walks the paginated overview, keeps cards whose city is Schiedam and
    whose price does not exceed ``config.MAX_PRICE``, and enriches each card
    with the kenmerken of its detail page. Paging stops at an empty page or
    a page with fewer than 18 cards. When ``config.APP_ENV == "dev"`` the
    card loop of a page stops after the first appended listing.

    Errors while parsing a single card are logged and that card is skipped;
    exceptions from fetching an overview page are not caught here.
    """
    listings = []
    page = 1

    while True:
        if page == 1:
            url = f"{_SCHIELAND_BASE}/wonen?sure_koop_huur=koop"
        else:
            url = f"{_SCHIELAND_BASE}/wonen/page/{page}/?sure_koop_huur=koop"

        soup = fetch_soup(url)
        cards = soup.select("div.card.card--house")
        if not cards:
            break

        for card in cards:
            try:
                a_tag = card.select_one("a.card__anchor")
                if not a_tag or "href" not in a_tag.attrs:
                    continue
                detail_url = a_tag["href"]
                if not detail_url.startswith("http"):
                    detail_url = _SCHIELAND_BASE + detail_url

                # Filter: only Schiedam
                stad_el = card.select_one("p.house-place")
                stad = stad_el.get_text(strip=True) if stad_el else None
                if not stad or stad.lower() != "schiedam":
                    continue

                # Status from the first class on the thumb that is not the
                # base "card-house__thumb" class.
                thumb = card.select_one("div.card-house__thumb")
                status_classes = thumb.get("class", []) if thumb else []
                status_text = next(
                    (c for c in status_classes if c != "card-house__thumb"), "beschikbaar"
                ).lower()
                status = _SCHIELAND_STATUS_MAP.get(status_text, "beschikbaar")

                # Price — filter before the (expensive) detail fetch
                prijs = parse_prijs(_text(card, "p.price"))
                if prijs and prijs > config.MAX_PRICE:
                    continue

                adres = _text(card, "h4.house-street")

                # Hero image from picture source (medium size). Use .get so a
                # <source> without srcset falls back to <img> instead of
                # raising KeyError and dropping the card.
                src_tag = card.select_one('picture source[media="(min-width:100px)"]')
                hero = src_tag.get("srcset") if src_tag else None
                if not hero:
                    img = card.select_one("img")
                    hero = img.get("src") if img else None
                if hero and not hero.startswith("http"):
                    hero = _SCHIELAND_BASE + hero

                # Data icons on card: surface, bedrooms, energy label
                woonoppervlak_card = None
                slaapkamers_card = None
                energielabel_card = None
                for data_div in card.select("div.data"):
                    txt = data_div.get_text(strip=True)
                    if data_div.select_one("i.icon-surface"):
                        woonoppervlak_card = parse_m2(txt)
                    elif data_div.select_one("i.icon-bedrooms"):
                        slaapkamers_card = _schieland_int(txt)
                    elif data_div.select_one("i.icon-label"):
                        energielabel_card = txt.strip() or None

                # Fetch detail page for full kenmerken ({} on failure)
                kk = _schieland_detail(detail_url)

                # Refine status from the detail page. The kenmerk holds
                # human-readable text, so it needs the detail map; the
                # class-keyed card map is kept as a secondary lookup for
                # backward compatibility.
                detail_status = kk.get("status")
                if detail_status:
                    status = _SCHIELAND_DETAIL_STATUS_MAP.get(
                        detail_status,
                        _SCHIELAND_STATUS_MAP.get(detail_status, status),
                    )

                # "5 kamers" -> 5
                kamers = _schieland_int(kk.get("kamers"))

                # "3" or "3 slaapkamers" -> 3; otherwise keep the card value
                slaapkamers = _schieland_int(kk.get("slaapkamers"))
                if slaapkamers is None:
                    slaapkamers = slaapkamers_card

                listings.append(RawListing(
                    url=detail_url,
                    source_makelaar="schielandborsboom",
                    status=status,
                    adres=adres,
                    postcode=kk.get("postcode"),
                    stad=stad,
                    prijs=prijs,
                    hero_image_url=hero,
                    woningtype=kk.get("woningtype"),
                    bouwjaar=_schieland_int(kk.get("bouwjaar")),
                    woonoppervlak=parse_m2(kk.get("woonoppervlak")) or woonoppervlak_card,
                    perceeloppervlak=parse_m2(kk.get("perceeloppervlak")),
                    kamers=kamers,
                    slaapkamers=slaapkamers,
                    energielabel=kk.get("energielabel") or energielabel_card,
                ))
                if config.APP_ENV == "dev":
                    break
            except Exception as e:
                log.warning("schielandborsboom: parse fout: %s", e)

        # A short page is taken to be the last page.
        if len(cards) < 18:
            break
        page += 1

    log.info("schielandborsboom: %d listings opgehaald", len(listings))
    return listings
# ---------------------------------------------------------------------------
# Olsthoorn Makelaars Delft (SURE WordPress plugin)
# ---------------------------------------------------------------------------
# Covers Delft, Den Haag, Naaldwijk etc — we filter for Delft only.
# Detail page has no postcode; leave as None.

_OLSTHOORN_BASE = "https://www.olsthoornmakelaars.nl"

# Badge class on the card label -> normalized status.
_OLSTHOORN_STATUS_MAP = {
    "badge-available": "beschikbaar",
    "badge-bid": "onder_bod",
    "badge-option": "onder_bod",
    "badge-sold": "verkocht",
}

# Lower-cased "status" kenmerk text on the detail page -> normalized status.
_OLSTHOORN_DETAIL_STATUS_MAP = {
    "beschikbaar": "beschikbaar",
    "onder bod": "onder_bod",
    "onder optie": "onder_bod",
    "verkocht": "verkocht",
}


def _olsthoorn_int(value: str | None) -> int | None:
    """Return the first run of digits in *value* as an int, or None.

    Tolerates labelled values such as "5 kamers"; returns None for
    empty/None input or text without digits instead of raising.
    """
    if not value:
        return None
    m = re.search(r"\d+", value)
    return int(m.group()) if m else None


def _olsthoorn_detail(detail_url: str) -> dict:
    """Fetch an Olsthoorn detail page and extract its kenmerken.

    Each ``#kenmerken li`` with at least two ``<span>`` children is read as
    a (label, value) pair. Returns a dict with the keys ``status``,
    ``woningtype``, ``bouwjaar``, ``woonoppervlak``, ``perceeloppervlak``,
    ``kamers``, ``slaapkamers`` and ``energielabel``; values are raw page
    strings (``status`` lower-cased, ``""`` when absent) or None when the
    kenmerk is missing. Any exception is logged and ``{}`` is returned.
    """
    try:
        soup = fetch_soup(detail_url)
        kv: dict[str, str] = {}
        for li in soup.select("#kenmerken li"):
            spans = li.select("span")
            if len(spans) >= 2:
                label = spans[0].get_text(strip=True).lower()
                value = spans[1].get_text(strip=True)
                kv[label] = value
        return {
            "status": kv.get("status", "").lower(),
            # The label for the property type varies; take the first present.
            "woningtype": kv.get("soort object") or kv.get("soort woning") or kv.get("soort bouw"),
            "bouwjaar": kv.get("bouwjaar"),
            "woonoppervlak": kv.get("gebruiksoppervlakte"),
            "perceeloppervlak": kv.get("perceeloppervlakte"),
            "kamers": kv.get("aantal kamers"),
            "slaapkamers": kv.get("aantal slaapkamers"),
            "energielabel": kv.get("energielabel"),
        }
    except Exception as e:
        # Deliberate best-effort: a broken detail page must not stop the run.
        log.warning("olsthoorn: detail fetch fout %s: %s", detail_url, e)
        return {}


def fetch_olsthoorn() -> list[RawListing]:
    """Fetch Olsthoorn Makelaars listings; only Delft, only koop.

    Walks the paginated overview, keeps cards whose title is "Delft" and
    whose price does not exceed ``config.MAX_PRICE``, and enriches each card
    with the kenmerken of its detail page. Paging stops at an empty page or
    a page with fewer than 15 cards. When ``config.APP_ENV == "dev"`` the
    card loop of a page stops after the first appended listing.

    Errors while parsing a single card are logged and that card is skipped;
    exceptions from fetching an overview page are not caught here.
    """
    listings = []
    page = 1

    while True:
        if page == 1:
            url = f"{_OLSTHOORN_BASE}/wonen?sure_koop_huur=koop"
        else:
            url = f"{_OLSTHOORN_BASE}/wonen/page/{page}/?sure_koop_huur=koop"

        soup = fetch_soup(url)
        cards = soup.select("a.card-house")
        if not cards:
            break

        for card in cards:
            try:
                href = card.get("href", "")
                if not href:
                    continue
                detail_url = href if href.startswith("http") else _OLSTHOORN_BASE + href

                # Filter: only Delft
                stad_el = card.select_one("h2.card__title")
                stad = stad_el.get_text(strip=True) if stad_el else None
                if not stad or stad.lower() != "delft":
                    continue

                # Price from bold tag — filter early before detail fetch
                prijs_b = card.select_one("b")
                prijs = parse_prijs(prijs_b.get_text() if prijs_b else None)
                if prijs and prijs > config.MAX_PRICE:
                    continue

                # Status from badge class on label span
                label_span = card.select_one("span.card-house__label")
                status = "beschikbaar"
                if label_span:
                    for cls in label_span.get("class", []):
                        if cls in _OLSTHOORN_STATUS_MAP:
                            status = _OLSTHOORN_STATUS_MAP[cls]
                            break

                # Address: first direct <p> child of div.short--info, with
                # internal whitespace collapsed.
                # NOTE(review): confirm the first match is the address
                # paragraph on the live card markup.
                adres_p = card.select("div.short--info > p")
                if adres_p:
                    adres = " ".join(adres_p[0].get_text().split())
                else:
                    adres = None

                # Hero image: largest source, lazy-loaded via data-srcset
                src_tag = card.select_one('picture source[media="(min-width:1024px)"]')
                hero = src_tag.get("data-srcset") if src_tag else None
                if hero and not hero.startswith("http"):
                    hero = _OLSTHOORN_BASE + hero

                # Woonoppervlak + kamers + energielabel from card data icons
                woonoppervlak_card = None
                kamers_card = None
                energielabel_card = None
                for data_div in card.select("div.data"):
                    inner = data_div.select_one("span.date__inner")
                    if not inner:
                        continue
                    txt = inner.get_text(strip=True)
                    if data_div.select_one("i.icon-sizes"):
                        woonoppervlak_card = parse_m2(txt)
                    elif data_div.select_one("i.icon-door"):
                        kamers_card = _olsthoorn_int(txt)
                    elif data_div.select_one("i.icon-energylabel"):
                        energielabel_card = txt or None

                # Detail kenmerken ({} on failure)
                kk = _olsthoorn_detail(detail_url)

                # Refine status from detail page; unknown text keeps the
                # status derived from the card badge.
                detail_status = _OLSTHOORN_DETAIL_STATUS_MAP.get(kk.get("status", ""), "")
                if detail_status:
                    status = detail_status

                # Tolerant number parsing: a value like "5 kamers" used to
                # raise in int() and discard the listing.
                kamers = _olsthoorn_int(kk.get("kamers"))
                if kamers is None:
                    kamers = kamers_card

                listings.append(RawListing(
                    url=detail_url,
                    source_makelaar="olsthoorn",
                    status=status,
                    adres=adres,
                    postcode=None,  # not exposed by broker
                    stad=stad,
                    prijs=prijs,
                    hero_image_url=hero,
                    woningtype=kk.get("woningtype"),
                    bouwjaar=_olsthoorn_int(kk.get("bouwjaar")),
                    woonoppervlak=parse_m2(kk.get("woonoppervlak")) or woonoppervlak_card,
                    perceeloppervlak=parse_m2(kk.get("perceeloppervlak")),
                    kamers=kamers,
                    slaapkamers=_olsthoorn_int(kk.get("slaapkamers")),
                    energielabel=kk.get("energielabel") or energielabel_card,
                ))
                if config.APP_ENV == "dev":
                    break
            except Exception as e:
                log.warning("olsthoorn: parse fout: %s", e)

        # A short page is taken to be the last page.
        if len(cards) < 15:
            break
        page += 1

    log.info("olsthoorn: %d listings opgehaald", len(listings))
    return listings
# ---------------------------------------------------------------------------
# Van Herk Makelaars (Schiedam) — SURE WordPress plugin (card-house)
# ---------------------------------------------------------------------------
# Listings filtered by city + price in URL; pagination via /page/{N}/.
# Detail page: div.features ul.unstyled li with two <span> (label + value).

_VANHERK_BASE = "https://www.vanherk.nl"
_VANHERK_LISTINGS = "https://www.vanherk.nl/wonen/aanbod/zoeken/schiedam/200000-300000/"

# Lower-cased "status" feature text on the detail page -> normalized status.
_VANHERK_STATUS_MAP = {
    "beschikbaar": "beschikbaar",
    "onder bod": "onder_bod",
    "onder optie": "onder_bod",
    "verkocht": "verkocht",
}


def _vanherk_int(value: str | None) -> int | None:
    """Return ``int(value)`` when *value* is an all-decimal string, else None.

    None-safe: ``_vanherk_detail`` always sets its keys, using None for a
    missing feature, so ``kk.get(key, "")`` can still yield None. Uses
    ``isdecimal`` because ``isdigit`` also accepts characters (for example
    superscript digits) that ``int()`` rejects.
    """
    if value and value.isdecimal():
        return int(value)
    return None


def _vanherk_detail(detail_url: str) -> dict:
    """Fetch a Van Herk detail page and extract features from div.features.

    Each ``div.features ul.unstyled li`` with at least two ``<span>``
    children is read as a (label, value) pair. Returns a dict with the keys
    ``status`` (lower-cased, ``""`` when absent), ``bouwjaar``,
    ``woonoppervlak`` and ``slaapkamers``; the latter three are raw page
    strings or None when missing. Any exception is logged and ``{}`` is
    returned.
    """
    try:
        soup = fetch_soup(detail_url)
        kv: dict[str, str] = {}
        for li in soup.select("div.features ul.unstyled li"):
            spans = li.select("span")
            if len(spans) >= 2:
                label = spans[0].get_text(strip=True).lower()
                value = spans[1].get_text(strip=True)
                kv[label] = value
        return {
            "status": kv.get("status", "").lower(),
            "bouwjaar": kv.get("bouwjaar"),
            "woonoppervlak": kv.get("woonoppervlakte"),
            "slaapkamers": kv.get("aantal slaapkamers"),
        }
    except Exception as e:
        # Deliberate best-effort: a broken detail page must not stop the run.
        log.warning("vanherk: detail fetch fout %s: %s", detail_url, e)
        return {}


def fetch_vanherk() -> list[RawListing]:
    """Fetch Van Herk listings; only Schiedam, only koop.

    The city and price range are selected by ``_VANHERK_LISTINGS`` itself;
    cards priced above ``config.MAX_PRICE`` are additionally skipped. Each
    card is enriched with the features of its detail page. Paging stops at
    an empty page or a page with fewer than 15 cards. When
    ``config.APP_ENV == "dev"`` the card loop of a page stops after the
    first appended listing.

    Errors while parsing a single card are logged and that card is skipped;
    exceptions from fetching an overview page are not caught here.
    """
    listings = []
    page = 1

    while True:
        if page == 1:
            url = _VANHERK_LISTINGS
        else:
            url = _VANHERK_LISTINGS + f"page/{page}/"

        soup = fetch_soup(url)
        cards = soup.select("a.card-house")
        if not cards:
            break

        for card in cards:
            try:
                href = card.get("href", "")
                if not href:
                    continue
                detail_url = href if href.startswith("http") else _VANHERK_BASE + href

                # City from lead paragraph
                lead = card.select_one("p.lead")
                stad = lead.get_text(strip=True) if lead else None

                # Address from h4 (normalize whitespace incl. &nbsp;)
                h4 = card.select_one("h4")
                adres = " ".join(h4.get_text().split()) if h4 else None

                # Price from .subtitle — filter before the detail fetch
                subtitle = card.select_one("p.subtitle")
                prijs = parse_prijs(subtitle.get_text() if subtitle else None)
                if prijs and prijs > config.MAX_PRICE:
                    continue

                # Hero image: largest srcset source
                src_tag = card.select_one('picture source[media="(min-width:1280px)"]')
                hero = src_tag.get("srcset") if src_tag else None
                if hero and not hero.startswith("http"):
                    hero = _VANHERK_BASE + hero

                # Card data icons: surface, bedrooms, energy label
                woonoppervlak_card = None
                slaapkamers_card = None
                energielabel_card = None
                for data_div in card.select("div.data"):
                    classes = data_div.get("class") or []
                    # Entries carrying the "d-none" class are skipped.
                    if "d-none" in classes:
                        continue
                    if "data-energie" in classes:
                        inner = data_div.select_one(".date__inner")
                        energielabel_card = inner.get_text(strip=True) if inner else None
                    elif data_div.select_one("i.icon-surface"):
                        inner = data_div.select_one("span.date__inner")
                        woonoppervlak_card = parse_m2(inner.get_text(strip=True) if inner else None)
                    elif data_div.select_one("i.icon-bed"):
                        inner = data_div.select_one("span.date__inner")
                        txt = inner.get_text(strip=True) if inner else None
                        m = re.search(r"(\d+)", txt) if txt else None
                        slaapkamers_card = int(m.group(1)) if m else None

                # Detail features ({} on failure)
                kk = _vanherk_detail(detail_url)

                status = _VANHERK_STATUS_MAP.get(kk.get("status", ""), "beschikbaar")

                listings.append(RawListing(
                    url=detail_url,
                    source_makelaar="vanherk",
                    status=status,
                    adres=adres,
                    stad=stad,
                    prijs=prijs,
                    hero_image_url=hero,
                    # _vanherk_int is None-safe; calling .isdigit() on a
                    # None detail value used to raise AttributeError and
                    # drop the listing.
                    bouwjaar=_vanherk_int(kk.get("bouwjaar")),
                    woonoppervlak=parse_m2(kk.get("woonoppervlak")) or woonoppervlak_card,
                    slaapkamers=_vanherk_int(kk.get("slaapkamers")) or slaapkamers_card,
                    energielabel=energielabel_card,
                ))
                if config.APP_ENV == "dev":
                    break
            except Exception as e:
                log.warning("vanherk: parse fout: %s", e)

        # A short page is taken to be the last page.
        if len(cards) < 15:
            break
        page += 1

    log.info("vanherk: %d listings opgehaald", len(listings))
    return listings
# ---------------------------------------------------------------------------
# Borgdorff Makelaars (Den Haag / Westland) — SURE WordPress plugin
# ---------------------------------------------------------------------------
# Covers Den Haag ('s-gravenhage), Monster, Naaldwijk etc. Filter for Den Haag.
# Same SURE plugin as Schieland Borsboom but uses a.card--house (double dash).
# No postcode on detail page.

_BORGDORFF_BASE = "https://www.borgdorff.nl"
# Lower-cased city names that count as Den Haag.
_BORGDORFF_DEN_HAAG = {"'s-gravenhage", "den haag"}

# Badge class on the card label -> normalized status.
_BORGDORFF_BADGE_MAP = {
    "badge--info": "beschikbaar",
    "badge--warning": "onder_bod",
    "badge--danger": "verkocht",
}

# Lower-cased "status" kenmerk text on the detail page -> normalized status.
_BORGDORFF_DETAIL_STATUS_MAP = {
    "beschikbaar": "beschikbaar",
    "onder bod": "onder_bod",
    "onder optie": "onder_bod",
    "verkocht": "verkocht",
}


def _borgdorff_int(value: str | None) -> int | None:
    """Return the first run of digits in *value* as an int, or None.

    Tolerates labelled values such as "3 slaapkamers"; returns None for
    empty/None input or text without digits instead of raising.
    """
    if not value:
        return None
    m = re.search(r"\d+", value)
    return int(m.group()) if m else None


def _borgdorff_detail(detail_url: str) -> dict:
    """Fetch a Borgdorff detail page and extract #kenmerken li span pairs.

    Each ``#kenmerken li`` with at least two ``<span>`` children is read as
    a (label, value) pair. Returns a dict with the keys ``status``,
    ``woningtype``, ``bouwjaar``, ``woonoppervlak``, ``perceeloppervlak``,
    ``slaapkamers`` and ``energielabel``; values are raw page strings
    (``status`` lower-cased, ``""`` when absent) or None when the kenmerk is
    missing. Any exception is logged and ``{}`` is returned.
    """
    try:
        soup = fetch_soup(detail_url)
        kv: dict[str, str] = {}
        for li in soup.select("#kenmerken li"):
            spans = li.select("span")
            if len(spans) >= 2:
                label = spans[0].get_text(strip=True).lower()
                value = spans[1].get_text(strip=True)
                kv[label] = value
        return {
            "status": kv.get("status", "").lower(),
            # The label for the property type varies; take the first present.
            "woningtype": kv.get("soort woonhuis") or kv.get("soort woning") or kv.get("soort bouw"),
            "bouwjaar": kv.get("bouwjaar"),
            "woonoppervlak": kv.get("gebruiksoppervlakte wonen") or kv.get("gebruiksoppervlakte"),
            "perceeloppervlak": kv.get("perceeloppervlakte"),
            "slaapkamers": kv.get("aantal slaapkamers"),
            "energielabel": kv.get("energielabel"),
        }
    except Exception as e:
        # Deliberate best-effort: a broken detail page must not stop the run.
        log.warning("borgdorff: detail fetch fout %s: %s", detail_url, e)
        return {}


def fetch_borgdorff() -> list[RawListing]:
    """Fetch Borgdorff listings; only Den Haag / 's-gravenhage, only koop.

    Walks the paginated overview, keeps cards whose city is in
    ``_BORGDORFF_DEN_HAAG`` and whose price does not exceed
    ``config.MAX_PRICE``, and enriches each card with the kenmerken of its
    detail page. Paging stops at an empty page or a page with fewer than 15
    cards. When ``config.APP_ENV == "dev"`` the card loop of a page stops
    after the first appended listing.

    Errors while parsing a single card are logged and that card is skipped;
    exceptions from fetching an overview page are not caught here.
    """
    listings = []
    page = 1

    while True:
        if page == 1:
            url = f"{_BORGDORFF_BASE}/wonen?sure_koop_huur=koop"
        else:
            url = f"{_BORGDORFF_BASE}/wonen/page/{page}/?sure_koop_huur=koop"

        soup = fetch_soup(url)
        cards = soup.select("a.card--house")
        if not cards:
            break

        for card in cards:
            try:
                href = card.get("href", "")
                if not href:
                    continue
                detail_url = href if href.startswith("http") else _BORGDORFF_BASE + href

                # Filter: only Den Haag
                stad_el = card.select_one("p.lead-two")
                stad = stad_el.get_text(strip=True) if stad_el else None
                if not stad or stad.lower() not in _BORGDORFF_DEN_HAAG:
                    continue

                # Price — filter early, before the detail fetch
                prijs = parse_prijs(_text(card, "p.strong"))
                if prijs and prijs > config.MAX_PRICE:
                    continue

                # Status from badge class
                label_span = card.select_one("span.card-house__label")
                status = "beschikbaar"
                if label_span:
                    for cls in label_span.get("class", []):
                        if cls in _BORGDORFF_BADGE_MAP:
                            status = _BORGDORFF_BADGE_MAP[cls]
                            break

                # Address
                adres = _text(card, "h4")

                # Hero: largest source srcset, else the lazy-loaded <img>
                src_tag = card.select_one('picture source[media="(min-width:1280px)"]')
                hero = src_tag.get("srcset") if src_tag else None
                if not hero:
                    img = card.select_one("img[data-src]")
                    hero = img.get("data-src") if img else None
                if hero and not hero.startswith("http"):
                    hero = _BORGDORFF_BASE + hero

                # Surface + bedrooms from data icons
                woonoppervlak_card = None
                slaapkamers_card = None
                for data_div in card.select("div.data"):
                    inner = data_div.select_one("p.small")
                    if not inner:
                        continue
                    txt = inner.get_text(strip=True)
                    if data_div.select_one("i.icon-surface"):
                        woonoppervlak_card = parse_m2(txt)
                    elif data_div.select_one("i.icon-bed"):
                        slaapkamers_card = _borgdorff_int(txt)

                # Detail kenmerken ({} on failure)
                kk = _borgdorff_detail(detail_url)

                # Refine status from detail page; unknown text keeps the
                # status derived from the card badge.
                if kk.get("status"):
                    status = _BORGDORFF_DETAIL_STATUS_MAP.get(kk["status"], status)

                # Tolerant number parsing: a non-numeric value used to raise
                # in int() and discard the listing; now the card value is
                # used when the detail value cannot be parsed.
                slaapkamers = _borgdorff_int(kk.get("slaapkamers"))
                if slaapkamers is None:
                    slaapkamers = slaapkamers_card

                listings.append(RawListing(
                    url=detail_url,
                    source_makelaar="borgdorff",
                    status=status,
                    adres=adres,
                    postcode=None,  # not exposed by broker
                    stad=stad,
                    prijs=prijs,
                    hero_image_url=hero,
                    woningtype=kk.get("woningtype"),
                    bouwjaar=_borgdorff_int(kk.get("bouwjaar")),
                    woonoppervlak=parse_m2(kk.get("woonoppervlak")) or woonoppervlak_card,
                    perceeloppervlak=parse_m2(kk.get("perceeloppervlak")),
                    slaapkamers=slaapkamers,
                    energielabel=kk.get("energielabel"),
                ))
                if config.APP_ENV == "dev":
                    break
            except Exception as e:
                log.warning("borgdorff: parse fout: %s", e)

        # A short page is taken to be the last page.
        if len(cards) < 15:
            break
        page += 1

    log.info("borgdorff: %d listings opgehaald", len(listings))
    return listings
+ """, (now, listing.status, lid)) conn.commit() return is_new @@ -391,11 +404,13 @@ def run(scrapers: dict[str,Scraper], db_path: str) -> None: travel = {} try: lid = listing_id(listing.url) - is_existing = conn.execute( - "SELECT id FROM woningen WHERE id = ?", (lid,) - ).fetchone() is not None + row = conn.execute( + "SELECT fiets_mark FROM woningen WHERE id = ?", (lid,) + ).fetchone() + is_existing = row is not None + needs_travel = not is_existing or row[0] is None - if not is_existing: + if needs_travel: travel = bereken_reistijden(listing.postcode, listing.stad) is_new = upsert(conn, listing, travel) diff --git a/tests/cache.py b/tests/cache.py index 01157ba..7190bb9 100644 --- a/tests/cache.py +++ b/tests/cache.py @@ -22,10 +22,10 @@ def _key(url: str, params: dict[str,str] | None) -> str: def _patch(): import adapters.api as api_mod - import adapters.ssr as ssr_mod + import adapters.ssr._shared as ssr_shared _orig_fetch_json = api_mod.fetch_json - _orig_fetch_soup = ssr_mod.fetch_soup + _orig_fetch_soup = ssr_shared.fetch_soup def cached_fetch_json(url, *, params: dict[str,str]|None=None, headers=None): path = CACHE_DIR / (_key(url, params) + ".json") @@ -46,7 +46,15 @@ def _patch(): return result api_mod.fetch_json = cached_fetch_json - ssr_mod.fetch_soup = cached_fetch_soup + # fetch_soup is imported directly in each submodule via `from ._shared import fetch_soup`, + # so we must patch the name in every submodule that uses it. + import adapters.ssr.realworks as _rw + import adapters.ssr.sure as _sure + import adapters.ssr.schiedam as _sch + import adapters.ssr.denhaag as _dh + import adapters.ssr.overige as _ov + for _mod in [ssr_shared, _rw, _sure, _sch, _dh, _ov]: + _mod.fetch_soup = cached_fetch_soup print("[cache] fetch_json and fetch_soup patched")