add some more makelaars, and some more infra

2026-04-03 15:49:42 +02:00
parent 26d9d936f4
commit 17b35d1997
9 changed files with 928 additions and 70 deletions
--- a/autoscraper.py
+++ b/autoscraper.py
@@ -0,0 +1,290 @@
+#!/usr/bin/env python3
+"""
+autoscraper.py — detect CMS and extract patterns from broker pages
+
+Usage:
+    python autoscraper.py listings <url>   — detect CMS + card structure
+    python autoscraper.py details <url>    — detect CMS + kenmerk patterns
+"""
+
+import re
+import sys
+
+import httpx
+from bs4 import BeautifulSoup, Tag
+
+# User-Agent header value sent by fetch() with each request.
+UA = "Huizenbot/1.0 (+mark@kalsbeek.dev) persoonlijk gebruik"
+
+# ---------------------------------------------------------------------------
+# CMS fingerprints
+# ---------------------------------------------------------------------------
+
+# Each entry is a dict with the keys:
+#   name     — CMS name printed when the fingerprint matches
+#   listings — signals checked against a listings page
+#   details  — signals checked against a detail page
+#   hint     — adapter-call template with {base_url} and {makelaar} placeholders
+# Signals are lists of (selector, min_count) tuples — all must match.
+CMS_FINGERPRINTS = [
+    {
+        "name": "Realworks",
+        "listings": [("li.aanbodEntry", 1), ("span.kenmerkValue", 1)],
+        "details":  [("span.kenmerkName", 3), ("span.kenmerkValue", 3)],
+        "hint": "fetch_realworks('{base_url}', '{makelaar}')",
+    },
+]
+
+# ---------------------------------------------------------------------------
+# Candidate card selectors (tried in order for unknown CMS)
+# ---------------------------------------------------------------------------
+
+# _find_cards() walks this list top to bottom and stops at the first selector
+# that matches at least two elements, so the order decides which one wins.
+CARD_CANDIDATES = [
+    "li.aanbodEntry",
+    "article",
+    "li[class*=object]",
+    "li[class*=woning]",
+    "li[class*=listing]",
+    "div[class*=object-item]",
+    "div[class*=property-item]",
+    "div[class*=aanbod]",
+    ".listing-item",
+]
+
+# ---------------------------------------------------------------------------
+# Regex patterns for field detection
+# ---------------------------------------------------------------------------
+
+# Four digits, an optional whitespace character, two capital letters ("1234 AB").
+RE_POSTCODE = re.compile(r"\b\d{4}\s?[A-Z]{2}\b")
+# Euro sign, optional whitespace, then a run of digits, dots and commas.
+RE_PRICE    = re.compile(r"€\s*[\d.,]+")
+# A number followed by "m²" or "m2", with optional whitespace in between.
+RE_M2       = re.compile(r"\d+\s*m[²2]")
+# URL fragments that look like pagination: "pagina-2" / "pagina/2" / "pagina2",
+# a "p=" or "page=" query parameter, or a URL ending in "/<number>".
+RE_PAGE_URL = re.compile(r"pagina[-/]?\d+|[?&]p(?:age)?=\d+|/\d+/?$")
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def fetch(url: str) -> BeautifulSoup:
+    """Download *url* and return the response body parsed with ``html.parser``.
+
+    The request is sent with the module-level ``UA`` string as User-Agent,
+    follows redirects and uses a timeout of 15. ``raise_for_status()`` is
+    called on the response before any parsing happens, so an error status
+    surfaces as an exception instead of being parsed as a page.
+    """
+    request_headers = {"User-Agent": UA}
+    response = httpx.get(
+        url,
+        headers=request_headers,
+        timeout=15,
+        follow_redirects=True,
+    )
+    response.raise_for_status()
+    markup = response.text
+    return BeautifulSoup(markup, "html.parser")
+
+
+def _selector_path(el: Tag) -> str:
+    """Return a short CSS-like path for *el*, each step as ``tag.class1.class2``.
+
+    The path is made of up to two of the element's *nearest* ancestors followed
+    by the element itself, joined with ``" > "``. The document root, ``html``
+    and ``body`` are skipped because they say nothing about where the element
+    sits on the page.
+    """
+
+    def _describe(node: Tag) -> str:
+        # Tag name plus dot-joined classes, or just the tag name without classes.
+        classes = ".".join(node.get("class", []))
+        return f"{node.name}.{classes}" if classes else node.name
+
+    nearest: list[str] = []
+    # ``el.parents`` yields the closest ancestor first, so stopping after two
+    # keeps the ancestors that directly surround the element rather than the
+    # page-level wrappers just below <body>.
+    for ancestor in el.parents:
+        if ancestor.name in (None, "[document]", "html", "body"):
+            continue
+        nearest.append(_describe(ancestor))
+        if len(nearest) == 2:
+            break
+    nearest.reverse()  # outermost first, the way a selector is read
+    return " > ".join([*nearest, _describe(el)])
+
+
+def _detect_cms(soup: BeautifulSoup, mode: str) -> dict | None:
+    """Return the first ``CMS_FINGERPRINTS`` entry whose signals all match.
+
+    ``mode == "listings"`` checks each entry's ``"listings"`` signals; any
+    other value checks the ``"details"`` signals. A signal matches when its
+    selector finds at least ``min_count`` elements in *soup*. Returns ``None``
+    when no entry matches.
+    """
+    signal_key = "details"
+    if mode == "listings":
+        signal_key = "listings"
+
+    for fingerprint in CMS_FINGERPRINTS:
+        satisfied = True
+        for selector, minimum in fingerprint[signal_key]:
+            if len(soup.select(selector)) < minimum:
+                # One missing signal rules this CMS out; skip the rest.
+                satisfied = False
+                break
+        if satisfied:
+            return fingerprint
+    return None
+
+
+def _find_cards(soup: BeautifulSoup) -> tuple[list, str | None]:
+    """Locate the repeated "card" elements on a listings page.
+
+    Returns ``(elements, label)``:
+
+    * If one of ``CARD_CANDIDATES`` matches at least two elements, the matches
+      of the first such selector and the selector itself.
+    * Otherwise the largest group of elements sharing the exact same class
+      list (if it has at least two members), labelled
+      ``".a.b (auto-detected, count=N)"``.
+    * ``([], None)`` when neither approach finds anything.
+    """
+    for sel in CARD_CANDIDATES:
+        found = soup.select(sel)
+        if len(found) >= 2:
+            return found, sel
+
+    # Fallback: group elements by their class list and take the biggest group.
+    # The grouped elements are returned directly instead of re-querying with a
+    # selector built from the class names: class names may contain characters
+    # such as ":" or "/" that are not valid in an unescaped CSS selector, and a
+    # re-query could also match more elements than were counted.
+    groups: dict[tuple, list] = {}
+    for el in soup.find_all(True):
+        cls = tuple(el.get("class", []))
+        if cls:
+            groups.setdefault(cls, []).append(el)
+
+    if groups:
+        # max() keeps the first group on ties, i.e. the one seen first in the page.
+        top_cls, members = max(groups.items(), key=lambda item: len(item[1]))
+        if len(members) >= 2:
+            sel = "." + ".".join(top_cls)  # display label only, never executed
+            return members, f"{sel} (auto-detected, count={len(members)})"
+    return [], None
+
+
+def _pattern_hits(soup: BeautifulSoup, pattern: re.Pattern, label: str):
+    """Print where text matching *pattern* occurs inside *soup*.
+
+    For every matching text node the parent's text (cut to 80 characters) and
+    the parent's selector path are collected. Nothing is printed when there
+    are no hits; otherwise a header with the total count is printed, followed
+    by at most the first four hits.
+    """
+    matches = [
+        (holder.get_text(strip=True)[:80], _selector_path(holder))
+        for node in soup.find_all(string=pattern)
+        if (holder := node.parent)
+    ]
+    if not matches:
+        return
+
+    print(f"\n  [{label}] — {len(matches)} hit(s)")
+    for snippet, location in matches[:4]:
+        print(f"    {location}")
+        print(f"    → {snippet!r}")
+
+
+# ---------------------------------------------------------------------------
+# Commands
+# ---------------------------------------------------------------------------
+
+def cmd_listings(url: str):
+    """Inspect a listings page and print how it could be scraped.
+
+    When a known CMS is detected, a ready-to-paste adapter snippet is printed
+    and the function returns. Otherwise structural diagnostics are printed:
+    the detected card elements, field-pattern hits inside the first card (or
+    the whole page when no cards were found) and pagination links.
+    """
+    print(f"Fetching: {url}\n")
+    page = fetch(url)
+    # The first three "/"-separated pieces of the URL, i.e. "scheme://host".
+    origin = "/".join(url.split("/")[:3])
+
+    detected = _detect_cms(page, "listings")
+    if detected:
+        print(f"✓ CMS DETECTED: {detected['name']}")
+        adapter_call = detected["hint"].format(base_url=origin, makelaar="<name>")
+        print("\n  Add to ssr.py:\n")
+        print("    def fetch_<name>() -> list[RawListing]:")
+        print(f"        return {adapter_call}\n")
+        print("  Register in SCRAPERS dict:")
+        print("    '<name>': fetch_<name>,")
+        return
+
+    print("✗ CMS unknown — structural diagnostics:\n")
+
+    # Cards
+    cards, card_selector = _find_cards(page)
+    selector_label = card_selector if card_selector else "none found"
+    print(f"=== CARDS ({selector_label}: {len(cards)}) ===")
+
+    # Field patterns are searched in the first card, or in the whole page
+    # when no cards were found.
+    scope = page
+    if cards:
+        first_card = cards[0]
+        scope = first_card
+        print("\n--- FIRST CARD ---")
+        print(first_card.prettify()[:2500])
+        print("\n--- CHILD ELEMENTS & CLASSES ---")
+        for child in first_card.find_all(True):
+            classes = child.get("class")
+            if not classes:
+                continue
+            preview = child.get_text(strip=True)[:50]
+            print(f"  <{child.name}> .{' .'.join(classes)}   {preview!r}")
+
+    print("\n=== FIELD PATTERNS ===")
+    for regex, field_label in (
+        (RE_POSTCODE, "postcode"),
+        (RE_PRICE, "prijs"),
+        (RE_M2, "m²"),
+    ):
+        _pattern_hits(scope, regex, field_label)
+
+    # Pagination
+    print("\n=== PAGINATION ===")
+    pager_links = page.find_all("a", href=RE_PAGE_URL)
+    if not pager_links:
+        print("  No pagination links found")
+        return
+
+    printed = set()
+    for anchor in pager_links:
+        target = anchor.get("href", "")
+        if target in printed:
+            continue  # each distinct href is listed once
+        printed.add(target)
+        print(f"  {target!r}  — {anchor.get_text(strip=True)!r}")
+
+
+def cmd_details(url: str):
+    """Inspect a detail page and print which property fields can be extracted.
+
+    When a known CMS is detected, the ``span.kenmerk`` name/value pairs are
+    read and each target field is reported as found or missing; the function
+    then returns. Otherwise structural diagnostics are printed: headings,
+    key-value structures, field-pattern hits and the first five images.
+    """
+    print(f"Fetching: {url}\n")
+    soup = fetch(url)
+
+    cms = _detect_cms(soup, "details")
+
+    if cms:
+        print(f"✓ CMS DETECTED: {cms['name']}")
+        # NOTE(review): this branch is Realworks-specific (message and
+        # selectors) although it runs for whichever CMS was detected.
+        print("\n  _realworks_detail() will extract:")
+        kv: dict[str, str] = {}
+        for kenmerk in soup.select("span.kenmerk"):
+            label_el = kenmerk.select_one("span.kenmerkName")
+            value_el = kenmerk.select_one("span.kenmerkValue")
+            if label_el and value_el:
+                label = label_el.get_text(strip=True).lower()
+                value = value_el.get_text(strip=True)
+                kv[label] = value
+
+        # Lower-cased page label -> field name shown in the report.
+        target_fields = {
+            "type woning":        "woningtype",
+            "bouwjaar":           "bouwjaar",
+            "woonoppervlakte":    "woonoppervlak",
+            "perceeloppervlakte": "perceeloppervlak",
+            "aantal kamers":      "kamers",
+            "aantal slaapkamers": "slaapkamers",
+            "energieklasse":      "energielabel",
+        }
+        for key, field in target_fields.items():
+            val = kv.get(key, "NOT FOUND")
+            status = "✓" if key in kv else "✗"
+            print(f"    {status} {field:<20} ← {key!r}: {val!r}")
+        return
+
+    print("✗ CMS unknown — structural diagnostics:\n")
+
+    # Address
+    print("=== ADDRESS ===")
+    for tag in ["h1", "h2"]:
+        for el in soup.select(tag):
+            t = el.get_text(strip=True)
+            if t:
+                print(f"  <{tag}> {t!r}")
+
+    # Key-value patterns
+    print("\n=== KEY-VALUE STRUCTURES ===")
+    kv_selectors = [
+        ("dl", "dt", "dd"),
+        ("table", "th", "td"),
+        (".kenmerk", ".kenmerkName", ".kenmerkValue"),
+        (".spec", ".spec-label", ".spec-value"),
+        (".feature", ".feature-label", ".feature-value"),
+    ]
+    found_any = False
+    for container_sel, label_sel, value_sel in kv_selectors:
+        pairs = []
+        for container in soup.select(container_sel)[:50]:
+            # Pair up every label with the value at the same position. One
+            # <dl> or <table> usually holds many rows, so looking only at the
+            # first label/value would report a single pair per container.
+            label_els = container.select(label_sel)
+            value_els = container.select(value_sel)
+            for label_el, value_el in zip(label_els, value_els):
+                label_text = label_el.get_text(strip=True)
+                value_text = value_el.get_text(strip=True)
+                if label_text and value_text:
+                    pairs.append((label_text, value_text))
+        if pairs:
+            found_any = True
+            print(f"\n  [{container_sel} > {label_sel} / {value_sel}] — {len(pairs)} pairs")
+            for label_text, value_text in pairs[:10]:
+                print(f"    {label_text:<30} {value_text}")
+
+    if not found_any:
+        print("  No key-value structures detected")
+
+    # Field pattern hits
+    print("\n=== FIELD PATTERNS ===")
+    _pattern_hits(soup, RE_POSTCODE, "postcode")
+    _pattern_hits(soup, RE_PRICE,    "prijs")
+    _pattern_hits(soup, RE_M2,       "m²")
+
+    # Images
+    print("\n=== IMAGES (first 5) ===")
+    for img in soup.select("img")[:5]:
+        # Fall back to data-src when src is missing or empty.
+        src = img.get("src") or img.get("data-src")
+        alt = img.get("alt", "")
+        print(f"  {src}  [{alt}]")
+
+
+# ---------------------------------------------------------------------------
+# Entry point
+# ---------------------------------------------------------------------------
+
+def main():
+    """Command-line entry point: ``<listings|details> <url>``.
+
+    Prints the module docstring and exits with status 1 when fewer than two
+    arguments are given; prints an error and exits with status 1 when the
+    command is not recognised. Arguments after the URL are ignored.
+    """
+    args = sys.argv[1:]
+    if len(args) < 2:
+        print(__doc__)
+        sys.exit(1)
+
+    cmd, url = args[0], args[1]
+    match cmd:
+        case "listings":
+            cmd_listings(url)
+        case "details":
+            cmd_details(url)
+        case _:
+            print(f"Unknown command: {cmd}")
+            sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()