From 17b35d1997c8b400e9101044cc645b3c24e6c336 Mon Sep 17 00:00:00 2001 From: Mark Kalsbeek Date: Fri, 3 Apr 2026 15:49:42 +0200 Subject: [PATCH] add some more makelaars, and some more infra --- .env.example | 2 - add_scraper_context.md | 358 +++++++++++++++++++++++++++++++++++++++++ autoscraper.py | 290 +++++++++++++++++++++++++++++++++ bsprettify.py | 3 + makelaars.md | 82 ++++++---- new_scraper_prompt.md | 36 +++++ src/adapters/api.py | 78 ++++++++- src/adapters/ssr.py | 145 +++++++++++++---- tests/test_adapters.py | 4 +- 9 files changed, 928 insertions(+), 70 deletions(-) create mode 100644 add_scraper_context.md create mode 100644 autoscraper.py create mode 100644 bsprettify.py create mode 100644 new_scraper_prompt.md diff --git a/.env.example b/.env.example index 1b28ae7..4fcd567 100644 --- a/.env.example +++ b/.env.example @@ -1,5 +1,3 @@ -NAVITIA_API_KEY= - HA_WEBHOOK_URL= SMTP_HOST= diff --git a/add_scraper_context.md b/add_scraper_context.md new file mode 100644 index 0000000..cbd879c --- /dev/null +++ b/add_scraper_context.md @@ -0,0 +1,358 @@ +# Huizenbot — Agent Context for Adding Routes + +## Project Overview + +**Huizenbot** is a periodic scraper of real estate broker websites in Delft and Schiedam (Netherlands). It: +- Fetches property listings from broker websites +- Saves new ones to SQLite with `RawListing` schema +- Calculates travel times (bike + public transit) to two work locations +- Sends push notifications via Home Assistant webhook (with email fallback) + +**Your role:** You will add new broker routes (scrapers) to the `adapters/` directory. A human will: +1. Select a broker from the list +2. Help you investigate the broker's website +3. For API-based brokers: develop curl requests to test +4. For HTML scrapers: develop parsing logic using BeautifulSoup +5. Run `tests/test_adapters.py` to validate +6. 
Merge your code snippets into the codebase + +--- + +## Key Schema: RawListing + +**Location:** `src/huizenbot.py` (lines 29–52) + +This is the data model you must populate. All fields except `url` are optional: + +```python +@dataclass +class RawListing: + url: str # REQUIRED — the listing URL + + source_makelaar: str = "" # Name of the broker (e.g., "bjornd", "vdaal") + datum_aanmelding: str | None = None # ISO 8601 date if available + status: str = "beschikbaar" # enum: beschikbaar | onder_bod | verkocht + + # Location + adres: str | None = None # Street address (e.g., "Binnenwatersloot 3") + postcode: str | None = None # Dutch postcode (e.g., "2611CA") + stad: str | None = None # City (e.g., "Delft") + + # Property details + prijs: int | None = None # Price in euros (integer, no float) + woningtype: str | None = None # Type (e.g., "appartement", "tussenwoning") + woonoppervlak: int | None = None # Living space in m² + perceeloppervlak: int | None = None # Plot size in m² (NULL for apartments) + kamers: int | None = None # Number of rooms + slaapkamers: int | None = None # Number of bedrooms + bouwjaar: int | None = None # Build year + energielabel: str | None = None # Energy label (e.g., "A", "B") + + # Media + hero_image_url: str | None = None # Main photo URL + + # Extra data (broker-specific fields) + extra: dict[str, Any] = field(default_factory=dict) # Arbitrary JSON data +``` + +**DB Upsert:** The listing is inserted on first run (with `id = sha256(url)`) and updated only on `last_seen` / `status` on subsequent runs. Travel times are calculated only on first insert. + +--- + +## Adapter Structure + +Adapters live in `src/adapters/` and are organized by type: + +### Two Adapter Types + +#### 1. **API-based** (`src/adapters/api.py`) +For brokers with REST/JSON endpoints. 
+ +**Pattern:** +```python +def fetch_bjornd() -> list[RawListing]: + data = fetch_json("https://...", params={...}, headers={...}) + listings = [] + for item in data: + # Filter / validate + if item.get("status") in _SKIP: + continue + if item.get("price") > config.MAX_PRICE: + continue + + listings.append(RawListing( + url=item["url"], + source_makelaar="bjornd", + adres=item.get("address"), + postcode=item.get("zipcode"), + # ... etc + )) + + log.info("bjornd: %d listings", len(listings)) + return listings +``` + +**Helpers available:** +- `fetch_json(url, *, params=None, headers=None)` — GET with User-Agent, timeout, Retry-After handling +- Built-in logging via `log = logging.getLogger("huizenbot.api")` + +#### 2. **SSR/HTML-based** (`src/adapters/ssr.py`) +For brokers with server-side rendered HTML. + +**Pattern:** +```python +def fetch_vdaal() -> list[RawListing]: + soup = fetch_soup("https://vdaalmakelaardij.nl/aanbod") + listings = [] + + for card in soup.select(".property-card"): + try: + url = card.select_one("a[href]")["href"] + if not url.startswith("http"): + url = VDAAL_BASE + url + + adres = _text(card, ".address-selector") + postcode = _extract_postcode(adres) + prijs = parse_prijs(_text(card, ".price")) + + listings.append(RawListing( + url=url, + source_makelaar="vdaal", + adres=adres, + postcode=postcode, + stad=_infer_stad(postcode), + prijs=prijs, + # ... etc + )) + except Exception as e: + log.warning("Parse error: %s", e) + + log.info("vdaal: %d listings", len(listings)) + return listings +``` + +**Helpers available:** +- `fetch_soup(url, *, params=None)` — GET with BeautifulSoup, Retry-After handling +- `parse_prijs(text)` — Extract price from strings like "€ 325.000 k.k." 
→ 325000 +- `parse_m2(text)` — Extract area from "87 m²" → 87 +- `_text(soup, selector)` — Get inner text from element +- `_src(soup, selector)` — Get src or data-src attribute +- `_extract_postcode(text)` — Regex postcode from any text +- `_infer_stad(postcode)` — Simple lookup: 2600–2629 → Delft, 3100–3135 → Schiedam + +--- + +## Registration + +Both `api.py` and `ssr.py` have a `SCRAPERS` dict at the bottom: + +```python +# api.py +SCRAPERS = { + 'bjornd': fetch_bjornd, + 'your_broker': fetch_your_broker, # ← Add here +} + +# ssr.py +SCRAPERS = { + 'bjornd_demo': fetch_bjornd_demo, + 'your_broker': fetch_your_broker, # ← Add here +} +``` + +The `src/adapters/__init__.py` merges both dicts, so the runner picks up all registered adapters automatically. + +--- + +## Testing Workflow + +### 1. Understand the Website +The human will help you: +- Identify the broker's API endpoint (or the HTML structure) +- Check for a `robots.txt` or rate limit headers +- Write exploratory curl requests (for APIs) or BeautifulSoup inspections + +### 2. Develop & Test Locally +- Add your scraper function to the appropriate file (`api.py` or `ssr.py`) +- Register it in the `SCRAPERS` dict +- The human updates `tests/test_adapters.py` to point to your adapter: + ```python + ADAPTER = SCRAPERS['your_broker_name'] + ``` +- Run the test: + ```bash + cd tests && python test_adapters.py + ``` +- The test prints listings in a simple format so you can validate output + +### 3. Merge Code +Once validated, the human will **copy your inline code snippets** into the main codebase. You produce **easily pasteable functions**, not entire files. 
+ +--- + +## Config & Constants + +**Location:** `src/config.py` + +Key values you may reference: +- `MAX_PRICE = 300_000` — Price filter (your scraper can skip listings above this) +- `USER_AGENT = "Huizenbot/1.0 (+mark@kalsbeek.dev) persoonlijk gebruik"` — Used in all HTTP headers +- `MARK_WERK_POSTCODE`, `MICHELLE_WERK_POSTCODE` — Work postcodes for travel time calculation + +Secrets (API keys, webhook URLs) are **environment variables**, not in config. + +--- + +## CMS Detection Tool + +Before investigating a broker's HTML manually, ask the human in the loop to run `autoscraper.py` from the project root: +```bash +python autoscraper.py listings <listing-page-url> +python autoscraper.py details <detail-page-url> +``` + +If the broker uses a known CMS, the tool prints the exact code to add — no further investigation needed. Currently detected CMSes: + +- **Realworks** → prints a ready-to-paste `fetch_realworks(...)` one-liner for `ssr.py` + +If the CMS is unknown, the tool prints structural diagnostics (card selectors, field patterns, pagination) to guide manual adapter development. + +## Important Notes + +### Status Mapping +Brokers use different status strings. Always map to one of: +- `"beschikbaar"` — Available for sale +- `"onder_bod"` — Under offer +- `"verkocht"` — Sold + +Example from api.py: +```python +_STATUS_MAP = { + "available": "beschikbaar", + "under_bid": "onder_bod", + "sold": "verkocht", +} +status = _STATUS_MAP.get(item.get("status"), "beschikbaar") +``` + +### Postcode Extraction +Always aim for the **Dutch postcode format** (4 digits + 2 letters, e.g., `"2611CA"`). The travel time calculation depends on it. If a broker only provides the address string, use `_extract_postcode(address)`. + +### Price Handling +Prices are **integers** (euros), never floats. Use `parse_prijs()` for HTML. + +### Image URLs +Store the hero/main image URL in `hero_image_url`. This appears in Home Assistant notifications. 
+ +### Extra Data +If a broker provides extra fields that don't fit the schema (e.g., balcony, garden, orientation), store them in the `extra` dict: +```python +listings.append(RawListing( + url=..., + ... + extra={ + "balcony": item.get("has_balcony"), + "garden": item.get("has_garden"), + "custom_field": item.get("something_else"), + } +)) +``` + +The database stores this as JSON in the `extra` column. + +### Error Handling +- Wrap individual listing parsing in try/except to continue on one bad listing +- Log parse warnings, not errors (brokers' HTML changes) +- Let HTTP errors bubble up (the runner catches them at the adapter level) + +### Rate Limiting & Ethics +- Both `fetch_json()` and `fetch_soup()` handle 429 Retry-After automatically +- Nominatim (geocoding) has a 1 req/s limiter built into `huizenbot.py` +- Never spawn parallel requests without the human's approval +- Always use the `USER_AGENT` header (includes contact info for respectful scraping) + +--- + +## Example: Adding "Van Daal" (API-based) + +### Scenario +The human finds that Van Daal (vandaalmakelaardij.nl) has a JSON API at: +``` +https://api.vandaal.nl/listings?city=delft&status=available +``` + +### Your Code (add to api.py) + +```python +# Van Daal +# -------- +_VANDAAL_BASE = "https://www.vandaalmakelaardij.nl" +_VANDAAL_API = "https://api.vandaal.nl/listings" + +_VANDAAL_STATUS_MAP = { + "available": "beschikbaar", + "under_offer": "onder_bod", + "sold": "verkocht", +} + +def fetch_vandaal() -> list[RawListing]: + listings = [] + for city in ["delft", "schiedam"]: + data = fetch_json( + _VANDAAL_API, + params={"city": city, "status": "available"} + ) + + for item in data.get("listings", []): + if item.get("price", 0) > config.MAX_PRICE: + continue + + listings.append(RawListing( + url=item["url"], + source_makelaar="vandaal", + adres=item.get("address"), + postcode=item.get("postcode"), + stad=item.get("city"), + prijs=item.get("price"), + woningtype=item.get("type"), + 
woonoppervlak=item.get("living_area"), + slaapkamers=item.get("bedrooms"), + hero_image_url=item.get("image_url"), + )) + + log.info("vandaal: %d listings", len(listings)) + return listings +``` + +### Register in SCRAPERS (in api.py) +```python +SCRAPERS = { + 'bjornd': fetch_bjornd, + 'vandaal': fetch_vandaal, # ← Add this +} +``` + +### Test +Human updates `test_adapters.py`: +```python +ADAPTER = SCRAPERS['vandaal'] +``` + +Then runs: +```bash +cd tests && python test_adapters.py +``` + +If all looks good, the human copies the `fetch_vandaal()` function into the real `api.py` and adds it to `SCRAPERS`. + +--- + +## Summary + +1. **You receive** an adapter request + investigation results (API endpoint or HTML structure) +2. **You write** a clean, self-contained scraper function that returns `list[RawListing]` +3. **You register** it in the appropriate `SCRAPERS` dict +4. **The human tests** it with `test_adapters.py` and validates output +5. **The human merges** your code into the production files + +Keep code simple, use the provided helpers, populate `RawListing` fields as best you can, and always set `source_makelaar` and `url` correctly. 
diff --git a/autoscraper.py b/autoscraper.py new file mode 100644 index 0000000..ee5d6c3 --- /dev/null +++ b/autoscraper.py @@ -0,0 +1,290 @@ +#!/usr/bin/env python3 +""" +autoscraper.py — detect CMS and extract patterns from broker pages + +Usage: + python autoscraper.py listings <url> — detect CMS + card structure + python autoscraper.py details <url> — detect CMS + kenmerk patterns +""" + +import re +import sys + +import httpx +from bs4 import BeautifulSoup, Tag + +UA = "Huizenbot/1.0 (+mark@kalsbeek.dev) persoonlijk gebruik" + +# --------------------------------------------------------------------------- +# CMS fingerprints +# --------------------------------------------------------------------------- + +# Each entry: dict with keys "name", "listings", "details", "hint" +# "listings"/"details" are lists of (selector, min_count) signals — all must match +CMS_FINGERPRINTS = [ + { + "name": "Realworks", + "listings": [("li.aanbodEntry", 1), ("span.kenmerkValue", 1)], + "details": [("span.kenmerkName", 3), ("span.kenmerkValue", 3)], + "hint": "fetch_realworks('{base_url}', '{makelaar}')", + }, +] + +# --------------------------------------------------------------------------- +# Candidate card selectors (tried in order for unknown CMS) +# --------------------------------------------------------------------------- + +CARD_CANDIDATES = [ + "li.aanbodEntry", + "article", + "li[class*=object]", + "li[class*=woning]", + "li[class*=listing]", + "div[class*=object-item]", + "div[class*=property-item]", + "div[class*=aanbod]", + ".listing-item", +] + +# --------------------------------------------------------------------------- +# Regex patterns for field detection +# --------------------------------------------------------------------------- + +RE_POSTCODE = re.compile(r"\b\d{4}\s?[A-Z]{2}\b") +RE_PRICE = re.compile(r"€\s*[\d.,]+") +RE_M2 = re.compile(r"\d+\s*m[²2]") +RE_PAGE_URL = re.compile(r"pagina[-/]?\d+|[?&]p(?:age)?=\d+|/\d+/?$") + + +# 
--------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def fetch(url: str) -> BeautifulSoup: + r = httpx.get(url, headers={"User-Agent": UA}, timeout=15, follow_redirects=True) + r.raise_for_status() + return BeautifulSoup(r.text, "html.parser") + + +def _selector_path(el: Tag) -> str: + """Short CSS-like path: up to two nearest ancestors (html/body skipped) then the element, each as tag.class1.class2""" + parts = [] + for ancestor in el.parents: # yields the nearest ancestor first + if ancestor.name in (None, "[document]", "html", "body"): + continue + cls = ".".join(ancestor.get("class", [])) + parts.insert(0, f"{ancestor.name}.{cls}" if cls else ancestor.name) # prepend to keep document order + if len(parts) >= 2: + break + cls = ".".join(el.get("class", [])) + parts.append(f"{el.name}.{cls}" if cls else el.name) + return " > ".join(parts) + + +def _detect_cms(soup: BeautifulSoup, mode: str) -> dict | None: + key = "listings" if mode == "listings" else "details" + for cms in CMS_FINGERPRINTS: + if all(len(soup.select(sel)) >= n for sel, n in cms[key]): + return cms + return None + + +def _find_cards(soup: BeautifulSoup) -> tuple[list, str | None]: + for sel in CARD_CANDIDATES: + found = soup.select(sel) + if len(found) >= 2: + return found, sel + # fallback: find the most repeated element class + from collections import Counter + class_counts: Counter = Counter() + for el in soup.find_all(True): + cls = tuple(el.get("class", [])) + if cls: + class_counts[cls] += 1 + if class_counts: + top_cls, count = class_counts.most_common(1)[0] + if count >= 2: + sel = "." 
+ ".".join(top_cls) + return soup.select(sel), f"{sel} (auto-detected, count={count})" + return [], None + + +def _pattern_hits(soup: BeautifulSoup, pattern: re.Pattern, label: str): + hits = [] + for el in soup.find_all(string=pattern): + parent = el.parent + if parent: + hits.append((parent.get_text(strip=True)[:80], _selector_path(parent))) + if hits: + print(f"\n [{label}] — {len(hits)} hit(s)") + for text, path in hits[:4]: + print(f" {path}") + print(f" → {text!r}") + + +# --------------------------------------------------------------------------- +# Commands +# --------------------------------------------------------------------------- + +def cmd_listings(url: str): + print(f"Fetching: {url}\n") + soup = fetch(url) + base_url = "/".join(url.split("/")[:3]) + + cms = _detect_cms(soup, "listings") + + if cms: + print(f"✓ CMS DETECTED: {cms['name']}") + hint = cms["hint"].format(base_url=base_url, makelaar="<makelaar>") # visible placeholder for the human to fill in + print(f"\n Add to ssr.py:\n") + print(f" def fetch_<makelaar>() -> list[RawListing]:") + print(f" return {hint}\n") + print(f" Register in SCRAPERS dict:") + print(f" '<makelaar>': fetch_<makelaar>,") + return + + print("✗ CMS unknown — structural diagnostics:\n") + + # Cards + cards, matched_sel = _find_cards(soup) + print(f"=== CARDS ({matched_sel or 'none found'}: {len(cards)}) ===") + if cards: + print("\n--- FIRST CARD ---") + print(cards[0].prettify()[:2500]) + print("\n--- CHILD ELEMENTS & CLASSES ---") + for el in cards[0].find_all(True): + cls = el.get("class") + text = el.get_text(strip=True)[:50] + if cls: + print(f" <{el.name}> .{' .'.join(cls)} {text!r}") + + # Pattern hits in cards area (or full page if no cards) + search_area = cards[0] if cards else soup + print("\n=== FIELD PATTERNS ===") + _pattern_hits(search_area, RE_POSTCODE, "postcode") + _pattern_hits(search_area, RE_PRICE, "prijs") + _pattern_hits(search_area, RE_M2, "m²") + + # Pagination + print("\n=== PAGINATION ===") + page_links = soup.find_all("a", href=RE_PAGE_URL) + if page_links: + seen = set() + for 
a in page_links: + href = a.get("href", "") + if href not in seen: + seen.add(href) + print(f" {href!r} — {a.get_text(strip=True)!r}") + else: + print(" No pagination links found") + + +def cmd_details(url: str): + print(f"Fetching: {url}\n") + soup = fetch(url) + + cms = _detect_cms(soup, "details") + + if cms: + print(f"✓ CMS DETECTED: {cms['name']}") + print("\n _realworks_detail() will extract:") + kv: dict[str, str] = {} + for kenmerk in soup.select("span.kenmerk"): + label_el = kenmerk.select_one("span.kenmerkName") + value_el = kenmerk.select_one("span.kenmerkValue") + if label_el and value_el: + label = label_el.get_text(strip=True).lower() + value = value_el.get_text(strip=True) + kv[label] = value + + target_fields = { + "type woning": "woningtype", + "bouwjaar": "bouwjaar", + "woonoppervlakte": "woonoppervlak", + "perceeloppervlakte": "perceeloppervlak", + "aantal kamers": "kamers", + "aantal slaapkamers": "slaapkamers", + "energieklasse": "energielabel", + } + for key, field in target_fields.items(): + val = kv.get(key, "NOT FOUND") + status = "✓" if key in kv else "✗" + print(f" {status} {field:<20} ← {key!r}: {val!r}") + return + + print("✗ CMS unknown — structural diagnostics:\n") + + # Address + print("=== ADDRESS ===") + for tag in ["h1", "h2"]: + for el in soup.select(tag): + t = el.get_text(strip=True) + if t: + print(f" <{tag}> {t!r}") + + # Key-value patterns + print("\n=== KEY-VALUE STRUCTURES ===") + kv_selectors = [ + ("dl", "dt", "dd"), + ("table", "th", "td"), + (".kenmerk", ".kenmerkName", ".kenmerkValue"), + (".spec", ".spec-label", ".spec-value"), + (".feature", ".feature-label", ".feature-value"), + ] + found_any = False + for container_sel, label_sel, value_sel in kv_selectors: + pairs = [] + for container in soup.select(container_sel)[:50]: + label_el = container.select_one(label_sel) + value_el = container.select_one(value_sel) + if label_el and value_el: + l = label_el.get_text(strip=True) + v = value_el.get_text(strip=True) + if l 
and v: + pairs.append((l, v)) + if pairs: + found_any = True + print(f"\n [{container_sel} > {label_sel} / {value_sel}] — {len(pairs)} pairs") + for l, v in pairs[:10]: + print(f" {l:<30} {v}") + + if not found_any: + print(" No key-value structures detected") + + # Field pattern hits + print("\n=== FIELD PATTERNS ===") + _pattern_hits(soup, RE_POSTCODE, "postcode") + _pattern_hits(soup, RE_PRICE, "prijs") + _pattern_hits(soup, RE_M2, "m²") + + # Images + print("\n=== IMAGES (first 5) ===") + for img in soup.select("img")[:5]: + src = img.get("src") or img.get("data-src") + alt = img.get("alt", "") + print(f" {src} [{alt}]") + + +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + +def main(): + if len(sys.argv) < 3: + print(__doc__) + sys.exit(1) + + cmd = sys.argv[1] + url = sys.argv[2] + + if cmd == "listings": + cmd_listings(url) + elif cmd == "details": + cmd_details(url) + else: + print(f"Unknown command: {cmd}") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/bsprettify.py b/bsprettify.py new file mode 100644 index 0000000..048c093 --- /dev/null +++ b/bsprettify.py @@ -0,0 +1,3 @@ +import sys +from bs4 import BeautifulSoup +print(BeautifulSoup(sys.stdin.read(), 'html.parser').prettify()) diff --git a/makelaars.md b/makelaars.md index 522eaa3..e6f565b 100644 --- a/makelaars.md +++ b/makelaars.md @@ -2,37 +2,57 @@ ## Delft -| Naam | Website | Adres | -|------|---------|-------| -| Van Silfhout & Hogetoorn Wereldmakelaars | vansilfhout.nl | Ireneboulevard 2 | -| Van Daal Makelaardij | vandaalmakelaardij.nl | Voldersgracht 33 | -| Björnd Makelaardij | bjornd.nl | Oude Delft 103 | -| Hof van Delft Makelaardij | hofvandelftmakelaardij.nl | Wateringsevest 26 | -| V&W Makelaars Delft | vwmakelaars.nl | Coenderstraat 31 | -| Roepman Makelaardij NVM | roepman.nl | Molslaan 43 | -| ZO makelaars | zomakelaars.nl | Van Foreestweg 4 
| -| Marloes Makelaars | — | Maerten Trompstraat 28 | -| Makelaarskantoor J.E. Mouthaan | — | Julianalaan 43 | -| Olsthoorn Makelaars Delft | olsthoornmakelaars.nl | Noordeinde 51 | -| Post Makelaardij (v/h Bayense) | postmakelaardij.nl | Spoorsingel 1a | -| Morris NVM Makelaars | morrismakelaardij.nl | — | -| Prinsenstad Makelaardij | — | — | -| Oude Delft Makelaardij | — | — | -| Dijksman Woningmakelaars | — | — | -| CORPOwonen | — | — | +| Done | Naam | Website | Adres | +|------|------|---------|-------| +| [ ] | Van Silfhout & Hogetoorn Wereldmakelaars | vansilfhout.nl | Ireneboulevard 2 | +| [ ] | Van Daal Makelaardij | vandaalmakelaardij.nl | Voldersgracht 33 | +| [x] | Björnd Makelaardij | bjornd.nl | Oude Delft 103 | +| [ ] | Hof van Delft Makelaardij | hofvandelftmakelaardij.nl | Wateringsevest 26 | +| [ ] | V&W Makelaars Delft | vwmakelaars.nl | Coenderstraat 31 | +| [ ] | Roepman Makelaardij NVM | roepman.nl | Molslaan 43 | +| [ ] | ZO makelaars | zomakelaars.nl | Van Foreestweg 4 | +| [ ] | Marloes Makelaars | — | Maerten Trompstraat 28 | +| [ ] | Makelaarskantoor J.E. 
Mouthaan | — | Julianalaan 43 | +| [ ] | Olsthoorn Makelaars Delft | olsthoornmakelaars.nl | Noordeinde 51 | +| [ ] | Post Makelaardij (v/h Bayense) | postmakelaardij.nl | Spoorsingel 1a | +| [ ] | Morris NVM Makelaars | morrismakelaardij.nl | — | +| [ ] | Prinsenstad Makelaardij | — | — | +| [ ] | Oude Delft Makelaardij | — | — | +| [ ] | Dijksman Woningmakelaars | — | — | +| [ ] | CORPOwonen | — | — | ## Schiedam -| Naam | Website | Adres | -|------|---------|-------| -| Anke Bodewes Makelaardij | ankebodewes.nl | Hargplein 118 | -| Woongoed Makelaars Schiedam | woongoedmakelaars.nl | Oranjestraat 93 | -| Ooms Makelaars Schiedam | ooms.com | Gerrit Verboonstraat 2 | -| De Witte Garantiemakelaars | dewittegarantiemakelaars.nl | Philippusweg 2 | -| Makelaardij Wassenaar | makelaardijwassenaar.nl | Gerrit Verboonstraat 12 | -| 3D Makelaars | 3dmakelaars.nl | Gerrit Verboonstraat 17 | -| Dupont Makelaars | dupont.nl | Rotterdamsedijk 437 | -| D&S Makelaardij | densmakelaars.nl | Land van Belofte 50 | -| Moerman & De Jong Makelaars | moerman-dejong.nl | Lange Kerkstraat 80B | -| Hagestein Makelaardij | — | Degerfors 54 | -| Schieland Borsboom NVM Makelaars | schielandborsboom.nl | (Rotterdam, actief in Schiedam) | +| Done | Naam | Website | Adres | +|------|------|---------|-------| +| [x] | Anke Bodewes Makelaardij | ankebodewes.nl | Hargplein 118 | +| [x] | Woongoed Makelaars Schiedam | woongoedmakelaars.nl | Oranjestraat 93 | +| [x] | Ooms Makelaars Schiedam | ooms.com | Gerrit Verboonstraat 2 | +| [ ] | De Witte Garantiemakelaars | dewittegarantiemakelaars.nl | Philippusweg 2 | +| [ ] | Makelaardij Wassenaar | makelaardijwassenaar.nl | Gerrit Verboonstraat 12 | +| [ ] | 3D Makelaars | 3dmakelaars.nl | Gerrit Verboonstraat 17 | +| [ ] | Dupont Makelaars | dupont.nl | Rotterdamsedijk 437 | +| [ ] | D&S Makelaardij | densmakelaars.nl | Land van Belofte 50 | +| [ ] | Moerman & De Jong Makelaars | moerman-dejong.nl | Lange Kerkstraat 80B | +| [ ] | Hagestein 
Makelaardij | — | Degerfors 54 | +| [ ] | Schieland Borsboom NVM Makelaars | schielandborsboom.nl | (Rotterdam, actief in Schiedam) | + + +## Leiden + +| Done | Naam | Website | Adres | +|------|------|---------|-------| +| [ ] | RE/MAX Makelaarsgilde | makelaars-in-leiden.nl | Levendaal 73-75 | +| [ ] | Hypodomus Leiden | hypodomusleiden.nl | Haarlemmerstraat 268 | +| [ ] | Alpina Leiden (v/h De Leeuw) | advies.alpina.nl | Molenwerf 4 | +| [ ] | Fides makelaars (ERA/NVM) | fidesmakelaarsleiden.nl | Lammenschansweg 76 | +| [ ] | Werk Makelaardij | werkmakelaardij.nl | Stevenshof (Leiden) | +| [ ] | Kerkvliet Makelaars | kerkvlietmakelaars.nl | Hoge Rijndijk 271A | +| [ ] | Kompas Makelaars & Taxateurs | kompasmakelaardij.nl | Maresingel 75-76 | +| [ ] | Hoekstra en Van Eck Leiden | hoekstraenvaneck.nl | Schipholweg 55-75 | +| [ ] | DOEN NVM Makelaars | doenmakelaars.com | Doezastraat 30 | +| [ ] | Oudshoorn Makelaardij | oudshoornmakelaardij.nl | — | +| [ ] | April Makelaars Leiden | aprilmakelaars.nl | Haagweg 55 | +| [ ] | Emil NVM Makelaars | emilmakelaars.nl | — | +| [ ] | Goedhart Makelaars | — | Oude Singel 14 | +| [ ] | Graal Makelaardij & Taxaties | — | Rapenburg 5 | diff --git a/new_scraper_prompt.md b/new_scraper_prompt.md new file mode 100644 index 0000000..b7db7d8 --- /dev/null +++ b/new_scraper_prompt.md @@ -0,0 +1,36 @@ +# SSR +Check out the add_scraper_context.md, let's add a new scraper. + +**Broker:** [name] +**Website:** [base url] +**Listing page URL:** [url with any price/city filters applied] +**Detail page kenmerken:** yes / no + +**Listing page HTML** (one card): +[paste] + +**Detail page dump:** [attached / n.a.] + +**Pagination:** [e.g. 10 per page, pagina-N in URL / no pagination] + +**Notes:** [auth, JS rendering, price filter in URL, etc.] + + +# API + +Check out the add_scraper_context.md, let's add a new scraper. 
+ +**Broker:** [name] +**Website:** [base url] +**API endpoint:** [full url] +**Auth:** [none / header: X-Foo: bar / query param] + +**Example curl:** +[paste] + +**Example response (one item):** +[paste] + +**Pagination:** [e.g. page param / offset / single response] + +**Notes:** [price filter, city filter, status field values, etc.] diff --git a/src/adapters/api.py b/src/adapters/api.py index dd65f76..274e69c 100644 --- a/src/adapters/api.py +++ b/src/adapters/api.py @@ -106,11 +106,87 @@ def fetch_bjornd() -> list[RawListing]: log.info("bjornd: %d koopwoningen opgehaald", len(listings)) return listings - +# --------------------------------------------------------------------------- +# Ooms +# --------------------------------------------------------------------------- + +_OOMS_BASE = "https://ooms.com" +_OOMS_CITIES = {"Delft", "Schiedam", "Rotterdam", "Leiden", "Voorburg", "Pijnacker"} +_OOMS_SKIP_STATUS = {"verhuurd", "verhuurd onder voorbehoud"} +_OOMS_STATUS_MAP = { + "beschikbaar": "beschikbaar", + "onder bod": "onder_bod", + "onder optie": "onder_bod", + "verkocht": "verkocht", + "verkocht onder voorbehoud":"verkocht", +} + + +def fetch_ooms() -> list[RawListing]: + data = fetch_json(f"{_OOMS_BASE}/api/properties/available.json") + listings = [] + + for item in data.get("objects", []): + if item.get("buy_or_rent") != "buy": + continue + if item.get("place") not in _OOMS_CITIES: + continue + if (item.get("buy_price") or 0) > config.MAX_PRICE: # a .get() default does not cover an explicit null + continue + + status_raw = (item.get("availability_status") or "").lower() # skip set and status map use lowercase keys + if status_raw in _OOMS_SKIP_STATUS: + continue + + hnr = item.get("house_number") or "" + add = item.get("house_number_addition") or "" + adres = f"{item.get('street_name') or ''} {hnr}{(' ' + add) if add else ''}".strip() + + main_images = item.get("realworks_main_images") or item.get("realworks_images") or [] + hero = None + if main_images: + sizes = main_images[0].get("sizes") or [] + best = max(sizes, key=lambda s: s.get("width", 0), default=None) + if best and best.get("imageUrl"): 
+ hero = _OOMS_BASE + best["imageUrl"] + + perceel = item.get("parcel_surface") or None + if perceel == 0: + perceel = None + + listings.append(RawListing( + url=item["url"], + source_makelaar="ooms", + datum_aanmelding=(item.get("publish_date") or "")[:10] or None, + status=_OOMS_STATUS_MAP.get(status_raw, "beschikbaar"), + adres=adres or None, + postcode=(item.get("zip_code") or "").replace(" ", "") or None, + stad=item.get("place") or None, + prijs=item.get("buy_price") or None, + woningtype=item.get("appartment_characteristic") or item.get("residential_building_type") or None, + woonoppervlak=item.get("usable_area_living_function") or None, + perceeloppervlak=perceel, + kamers=item.get("amount_of_rooms") or None, + slaapkamers=item.get("amount_of_bedrooms") or None, + hero_image_url=hero, + extra={ + "office": (item.get("office") or {}).get("name"), + "locations": item.get("locations"), + "garden_types": item.get("garden_types"), + "lat": item.get("lat"), + "lng": item.get("lng"), + "object_code": item.get("object_code"), + }, + )) + + log.info("ooms: %d listings opgehaald", len(listings)) + return listings + # --------------------------------------------------------------------------- # SCRAPERS — exporteer hier alle actieve API adapters # --------------------------------------------------------------------------- SCRAPERS = { 'bjornd': fetch_bjornd, + 'ooms': fetch_ooms, } diff --git a/src/adapters/ssr.py b/src/adapters/ssr.py index 565bdf3..463b078 100644 --- a/src/adapters/ssr.py +++ b/src/adapters/ssr.py @@ -65,51 +65,127 @@ def parse_m2(text: str | None) -> int | None: # --------------------------------------------------------------------------- -# Björn & Dries adapter (bjornd.nl) +# Realworks CMS (shared) # --------------------------------------------------------------------------- -# TODO: vul de echte CSS selectors in na inspectie van de pagina. -# Dit is een structureel sjabloon — de selectors zijn placeholders. 
-BJORND_BASE = "https://www.bjornd.nl" -BJORND_AANBOD = f"{BJORND_BASE}/aanbod" +_REALWORKS_STATUS_MAP = { + "te koop": "beschikbaar", + "nieuw": "beschikbaar", + "onder bod": "onder_bod", + "onder optie": "onder_bod", + "verkocht o.v.": "verkocht", + "verkocht": "verkocht", +} -def fetch_bjornd_demo() -> list[RawListing]: - soup = fetch_soup(BJORND_AANBOD) +def _realworks_detail(detail_url: str, makelaar: str) -> dict: + """Fetch a Realworks detail page and extract kenmerken. Returns empty dict on failure.""" + try: + soup = fetch_soup(detail_url) + + # Build a label→value map from all .kenmerk spans + kv: dict[str, str] = {} + for kenmerk in soup.select("span.kenmerk"): + label_el = kenmerk.select_one("span.kenmerkName") + value_el = kenmerk.select_one("span.kenmerkValue") + if label_el and value_el: + label = label_el.get_text(strip=True).lower() + value = value_el.get_text(strip=True) + kv[label] = value + + return { + "woningtype": kv.get("type woning"), + "bouwjaar": kv.get("bouwjaar"), + "woonoppervlak": kv.get("woonoppervlakte"), + "perceeloppervlak": kv.get("perceeloppervlakte"), + "kamers": kv.get("aantal kamers"), + "slaapkamers": kv.get("aantal slaapkamers"), + "energielabel": kv.get("energieklasse"), + } + except Exception as e: + log.warning("%s: detail fetch fout %s: %s", makelaar, detail_url, e) + return {} + + +def fetch_realworks(base_url: str, makelaar: str) -> list[RawListing]: + """ + Generic fetcher for Realworks CMS brokers. + Paginates via /pagina-{n}/, fetches detail page per listing. 
+ """ + listings_path = f"/aanbod/woningaanbod/-{config.MAX_PRICE}/koop" listings = [] + page = 1 - # Pas de selector aan op de echte HTML structuur - for card in soup.select(".property-card"): # ← aanpassen - try: - a_tag = card.select_one("a[href]") - if not a_tag: - continue - url = a_tag["href"] - if not url.startswith("http"): - url = BJORND_BASE + url + while True: + url = f"{base_url}{listings_path}/pagina-{page}/" + soup = fetch_soup(url) + cards = soup.select("li.aanbodEntry") + if not cards: + break - adres = _text(card, ".property-address") # ← aanpassen - postcode = _extract_postcode(_text(card, ".property-location")) - prijs = parse_prijs(_text(card, ".property-price")) - opp = parse_m2(_text(card, ".property-area")) - img = _src(card, "img") + for card in cards: + try: + a_tag = card.select_one("a.aanbodEntryLink") + if not a_tag: + continue + listing_url = base_url + a_tag["href"] - listings.append(RawListing( - url=url, - source_makelaar="bjornd", - adres=adres, - postcode=postcode, - stad=_infer_stad(postcode), - prijs=prijs, - woonoppervlak=opp, - hero_image_url=img, - )) - except Exception as e: - log.warning("Fout bij parsen bjornd card: %s", e) + adres = _text(card, ".street-address") + postcode = (_text(card, ".postal-code") or "").replace(" ", "") or None + stad = _text(card, ".locality") + prijs = parse_prijs(_text(card, ".koopprijs .kenmerkValue")) + status_text = (_text(card, ".objectstatusbanner") or "").lower() + status = _REALWORKS_STATUS_MAP.get(status_text, "beschikbaar") + + img_tag = card.select_one(".hoofdfoto img") + hero = img_tag["src"] if img_tag else None + + kk = _realworks_detail(listing_url, makelaar) + + listings.append(RawListing( + url=listing_url, + source_makelaar=makelaar, + adres=adres, + postcode=postcode, + stad=stad, + prijs=prijs, + status=status, + hero_image_url=hero, + woningtype=kk.get("woningtype"), + bouwjaar=int(kk["bouwjaar"]) if kk.get("bouwjaar") else None, + 
woonoppervlak=parse_m2(kk.get("woonoppervlak")), + perceeloppervlak=parse_m2(kk.get("perceeloppervlak")), + kamers=int(kk["kamers"]) if kk.get("kamers") else None, + slaapkamers=int(kk["slaapkamers"]) if kk.get("slaapkamers") else None, + energielabel=kk.get("energielabel"), + )) + except Exception as e: + log.warning("%s: parse fout: %s", makelaar, e) + + if len(cards) < 10: + break + page += 1 + + log.info("%s: %d listings opgehaald", makelaar, len(listings)) return listings +# --------------------------------------------------------------------------- +# Anke Bodewes Makelaardij +# --------------------------------------------------------------------------- + +def fetch_ankebodewes() -> list[RawListing]: + return fetch_realworks("https://www.ankebodewes.nl", "ankebodewes") + + +# --------------------------------------------------------------------------- +# Woongoed Makelaars Schiedam +# --------------------------------------------------------------------------- + +def fetch_woongoed() -> list[RawListing]: + return fetch_realworks("https://www.woongoedmakelaars.nl", "woongoed") + # --------------------------------------------------------------------------- # SSR helper utils # --------------------------------------------------------------------------- @@ -150,5 +226,6 @@ def _infer_stad(postcode: str | None) -> str | None: # --------------------------------------------------------------------------- SCRAPERS = { - 'bjornd_demo': fetch_bjornd_demo, + 'ankebodewes': fetch_ankebodewes, + 'woongoed': fetch_woongoed, } diff --git a/tests/test_adapters.py b/tests/test_adapters.py index 6b96359..8e7036c 100644 --- a/tests/test_adapters.py +++ b/tests/test_adapters.py @@ -7,11 +7,11 @@ from adapters import SCRAPERS # --- change this to test a different adapter --- -ADAPTER = SCRAPERS['bjornd'] +ADAPTER = SCRAPERS['ooms'] if __name__ == "__main__": print(f"Testing adapter: {ADAPTER.__name__}") listings = ADAPTER() print(f"Got {len(listings)} listings\n") for l in 
listings: - print(f" {l.adres}, {l.stad} — €{l.prijs} — {l.url}") + print(f" {l.adres}, {l.postcode}, {l.stad} — €{l.prijs} — {l.kamers} rooms — {l.url}")