317 lines
11 KiB
Python
317 lines
11 KiB
Python
#!/usr/bin/env python3
"""
autoscraper.py — detect CMS and extract patterns from broker pages

Usage:
python autoscraper.py listings <url> — detect CMS + card structure
python autoscraper.py details <url> — detect CMS + kenmerk patterns
"""
|
|
|
|
import re
|
|
import sys
|
|
import json
|
|
|
|
import httpx
|
|
from bs4 import BeautifulSoup, Tag
|
|
|
|
# User-Agent header value sent by fetch() with each request.
UA = "Huizenbot/1.0 (+mark@kalsbeek.dev) persoonlijk gebruik"
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# CMS fingerprints
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Each fingerprint is a dict with these keys:
#   "name"     - label printed when the CMS is recognised
#   "listings" - signals checked against a listings page
#   "details"  - signals checked against a detail page
#   "hint"     - str.format template (fields: base_url, makelaar) for the
#                adapter call suggested to the user
# A signal is a (css_selector, min_count) tuple; a fingerprint matches only
# when every signal for the requested page type is satisfied.
CMS_FINGERPRINTS = [
    dict(
        name="Realworks",
        listings=[("li.aanbodEntry", 1), ("span.kenmerkValue", 1)],
        details=[("span.kenmerkName", 3), ("span.kenmerkValue", 3)],
        hint="fetch_realworks('{base_url}', '{makelaar}')",
    ),
]
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Candidate card selectors (tried in order for unknown CMS)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Selectors are tried top to bottom; the first one matching at least two
# elements wins, so order expresses priority.
CARD_CANDIDATES = [
    # Same card selector as the Realworks listings fingerprint.
    "li.aanbodEntry",
    # Generic semantic container.
    "article",
    # List items whose class attribute contains a listing-like word.
    "li[class*=object]",
    "li[class*=woning]",
    "li[class*=listing]",
    # Div-based cards, again matched on a class substring.
    "div[class*=object-item]",
    "div[class*=property-item]",
    "div[class*=aanbod]",
    # Exact class match.
    ".listing-item",
]
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Regex patterns for field detection
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Four digits, an optional single whitespace character, two uppercase letters.
RE_POSTCODE: re.Pattern[str] = re.compile(r"\b\d{4}\s?[A-Z]{2}\b")

# Euro sign followed by a run of digits, dots and commas.
RE_PRICE: re.Pattern[str] = re.compile(r"€\s*[\d.,]+")

# A number followed by "m²" or "m2", with optional whitespace in between.
RE_M2: re.Pattern[str] = re.compile(r"\d+\s*m[²2]")

# Pagination-looking hrefs: "pagina-2" / "pagina/2" / "pagina2",
# "?p=2" / "&page=2", or a path that ends in a purely numeric segment.
RE_PAGE_URL: re.Pattern[str] = re.compile(
    r"pagina[-/]?\d+|[?&]p(?:age)?=\d+|/\d+/?$"
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def fetch(url: str) -> BeautifulSoup:
    """Download *url* and return its body parsed with ``html.parser``.

    The request carries the module's ``UA`` string as User-Agent, follows
    redirects and uses a 15 second timeout. ``raise_for_status()`` is
    called on the response before the body is parsed.
    """
    response = httpx.get(
        url,
        headers={"User-Agent": UA},
        timeout=15,
        follow_redirects=True,
    )
    response.raise_for_status()
    markup = response.text
    return BeautifulSoup(markup, "html.parser")
|
|
|
|
|
|
def _selector_path(el: Tag) -> str:
    """Return a short CSS-like path: the two nearest ancestors plus *el*.

    Every segment is rendered as ``tag.class1.class2`` (just ``tag`` when
    the element has no classes) and segments are joined with " > ".
    The document root, ``<html>`` and ``<body>`` are never included.

    The ancestors are collected nearest-first so that the path describes
    the element's immediate surroundings. Collecting from the outermost
    ancestor inwards would, for deeply nested elements, yield top-level
    wrappers that are not actually adjacent to *el*.
    """
    skipped_names = (None, "[document]", "html", "body")

    def _segment(node) -> str:
        # "tag.cls1.cls2", or plain "tag" when there is no class attribute.
        classes = ".".join(node.get("class", []))
        return f"{node.name}.{classes}" if classes else node.name

    nearest = []
    # Walked here from the direct parent outwards.
    for ancestor in el.parents:
        if ancestor.name in skipped_names:
            continue
        nearest.append(_segment(ancestor))
        if len(nearest) == 2:
            break

    # Flip to outermost-first reading order, then add the element itself.
    nearest.reverse()
    nearest.append(_segment(el))
    return " > ".join(nearest)
|
|
|
|
|
|
def _detect_cms(soup: BeautifulSoup, mode: str) -> dict | None:
    """Return the first entry of ``CMS_FINGERPRINTS`` that matches *soup*.

    *mode* selects which signal list is checked: ``"listings"`` uses the
    entry's "listings" signals, any other value uses its "details" signals.
    An entry matches when every ``(selector, min_count)`` signal finds at
    least ``min_count`` elements. Returns ``None`` when nothing matches.
    """
    signal_key = "details" if mode != "listings" else "listings"
    for fingerprint in CMS_FINGERPRINTS:
        for selector, min_count in fingerprint[signal_key]:
            if len(soup.select(selector)) < min_count:
                # One failed signal rules this fingerprint out.
                break
        else:
            return fingerprint
    return None
|
|
|
|
|
|
def _find_cards(soup: BeautifulSoup) -> tuple[list, str | None]:
    """Locate the repeated "card" elements of a listings page.

    Returns ``(elements, description)``:

    * the matches of the first ``CARD_CANDIDATES`` selector that finds at
      least two elements, together with that selector; otherwise
    * the elements carrying the most frequently repeated class
      combination (when it occurs at least twice), together with a
      ``".cls1.cls2 (auto-detected, count=N)"`` description; otherwise
    * ``([], None)``.
    """
    for sel in CARD_CANDIDATES:
        found = soup.select(sel)
        if len(found) >= 2:
            return found, sel

    # Fallback: find the most repeated class combination.
    from collections import Counter

    elements = soup.find_all(True)
    class_counts: Counter = Counter()
    for el in elements:
        cls = tuple(el.get("class", []))
        if cls:
            class_counts[cls] += 1

    if class_counts:
        top_cls, count = class_counts.most_common(1)[0]
        if count >= 2:
            sel = "." + ".".join(top_cls)
            # Select by class membership directly instead of feeding `sel`
            # to soup.select(): class names such as "md:flex", "w-1/2" or
            # "2col" are not valid in an unescaped CSS selector and would
            # make the selector parser raise. For valid names this yields
            # the same elements (all wanted classes present, document order).
            wanted = set(top_cls)
            cards = [
                el for el in elements if wanted.issubset(el.get("class", []))
            ]
            return cards, f"{sel} (auto-detected, count={count})"

    return [], None
|
|
|
|
|
|
def _pattern_hits(soup: BeautifulSoup, pattern: re.Pattern, label: str):
    """Print where text matching *pattern* occurs inside *soup*.

    For every matching text node that has a parent, the parent's stripped
    text (cut to 80 characters) and its ``_selector_path`` are recorded.
    If anything was recorded, a header with *label* and the total number
    of hits is printed, followed by at most the first four hits. Nothing
    is printed when there are no hits.
    """
    matches = [
        (node.parent.get_text(strip=True)[:80], _selector_path(node.parent))
        for node in soup.find_all(string=pattern)
        if node.parent
    ]
    if not matches:
        return

    print(f"\n [{label}] — {len(matches)} hit(s)")
    for snippet, path in matches[:4]:
        print(f" {path}")
        print(f" → {snippet!r}")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Commands
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def cmd_listings(url: str):
    """Fetch a listings page and print an adapter hint or diagnostics.

    When a known CMS is detected, a ready-to-paste adapter snippet is
    printed and the function returns. Otherwise it prints structural
    diagnostics: the detected card elements (markup and classes of the
    first card), field-pattern hits inside the first card (or the whole
    page when no cards were found), and pagination-looking links.
    """
    print(f"Fetching: {url}\n")
    soup = fetch(url)
    # The first three "/"-separated pieces, e.g. "https:", "" and the host.
    base_url = "/".join(url.split("/", 3)[:3])

    cms = _detect_cms(soup, "listings")
    if cms:
        print(f"✓ CMS DETECTED: {cms['name']}")
        adapter_call = cms["hint"].format(base_url=base_url, makelaar="<name>")
        snippet_lines = (
            "\n Add to ssr.py:\n",
            " def fetch_<name>() -> list[RawListing]:",
            f" return {adapter_call}\n",
            " Register in SCRAPERS dict:",
            " '<name>': fetch_<name>,",
        )
        for line in snippet_lines:
            print(line)
        return

    print("✗ CMS unknown — structural diagnostics:\n")

    # Cards
    cards, matched_sel = _find_cards(soup)
    print(f"=== CARDS ({matched_sel or 'none found'}: {len(cards)}) ===")
    if cards:
        first_card = cards[0]
        print("\n--- FIRST CARD ---")
        print(first_card.prettify()[:2500])
        print("\n--- CHILD ELEMENTS & CLASSES ---")
        for child in first_card.find_all(True):
            classes = child.get("class")
            if not classes:
                continue
            preview = child.get_text(strip=True)[:50]
            print(f" <{child.name}> .{' .'.join(classes)} {preview!r}")
        # Field patterns are searched inside the first card only.
        search_area = first_card
    else:
        # No cards detected: fall back to scanning the full page.
        search_area = soup

    print("\n=== FIELD PATTERNS ===")
    for pattern, label in (
        (RE_POSTCODE, "postcode"),
        (RE_PRICE, "prijs"),
        (RE_M2, "m²"),
    ):
        _pattern_hits(search_area, pattern, label)

    # Pagination
    print("\n=== PAGINATION ===")
    page_links = soup.find_all("a", href=RE_PAGE_URL)
    if not page_links:
        print(" No pagination links found")
        return

    # Keep only the first link per distinct href, in page order.
    first_link_by_href = {}
    for link in page_links:
        first_link_by_href.setdefault(link.get("href", ""), link)
    for href, link in first_link_by_href.items():
        print(f" {href!r} — {link.get_text(strip=True)!r}")
|
|
|
|
|
|
def _jsonld_nodes(payload):
    """Yield every dict node of a decoded JSON-LD document.

    Top-level arrays and ``@graph`` containers are flattened; values that
    are neither dicts nor lists are skipped.
    """
    if isinstance(payload, list):
        for item in payload:
            yield from _jsonld_nodes(item)
    elif isinstance(payload, dict):
        graph = payload.get("@graph")
        if isinstance(graph, list):
            yield from _jsonld_nodes(graph)
        else:
            yield payload


def _as_dict(value) -> dict:
    """Return *value* if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def cmd_details(url: str):
    """Fetch a detail page and print the extractable fields or diagnostics.

    When a known CMS is detected, the ``span.kenmerk`` name/value pairs
    are collected and each target field is reported as found or missing,
    after which the function returns. Otherwise it prints structural
    diagnostics: h1/h2 headings, common key-value structures,
    field-pattern hits, the first five images and the schema.org fields
    found in JSON-LD script tags.
    """
    print(f"Fetching: {url}\n")
    soup = fetch(url)

    cms = _detect_cms(soup, "details")

    if cms:
        print(f"✓ CMS DETECTED: {cms['name']}")
        print("\n _realworks_detail() will extract:")
        kv: dict[str, str] = {}
        for kenmerk in soup.select("span.kenmerk"):
            label_el = kenmerk.select_one("span.kenmerkName")
            value_el = kenmerk.select_one("span.kenmerkValue")
            if label_el and value_el:
                # Labels are lower-cased to match the target_fields keys.
                label = label_el.get_text(strip=True).lower()
                kv[label] = value_el.get_text(strip=True)

        # Page label -> name of the field it is reported under.
        target_fields = {
            "type woning": "woningtype",
            "bouwjaar": "bouwjaar",
            "woonoppervlakte": "woonoppervlak",
            "perceeloppervlakte": "perceeloppervlak",
            "aantal kamers": "kamers",
            "aantal slaapkamers": "slaapkamers",
            "energieklasse": "energielabel",
        }
        for key, field in target_fields.items():
            val = kv.get(key, "NOT FOUND")
            status = "✓" if key in kv else "✗"
            print(f" {status} {field:<20} ← {key!r}: {val!r}")
        return

    print("✗ CMS unknown — structural diagnostics:\n")

    # Address
    print("=== ADDRESS ===")
    for tag in ["h1", "h2"]:
        for el in soup.select(tag):
            t = el.get_text(strip=True)
            if t:
                print(f" <{tag}> {t!r}")

    # Key-value patterns: (container, label, value) selector triples.
    print("\n=== KEY-VALUE STRUCTURES ===")
    kv_selectors = [
        ("dl", "dt", "dd"),
        ("table", "th", "td"),
        (".kenmerk", ".kenmerkName", ".kenmerkValue"),
        (".spec", ".spec-label", ".spec-value"),
        (".feature", ".feature-label", ".feature-value"),
    ]
    found_any = False
    for container_sel, label_sel, value_sel in kv_selectors:
        pairs = []
        # Only the first 50 containers per selector are inspected.
        for container in soup.select(container_sel)[:50]:
            label_el = container.select_one(label_sel)
            value_el = container.select_one(value_sel)
            if label_el and value_el:
                label = label_el.get_text(strip=True)
                value = value_el.get_text(strip=True)
                if label and value:
                    pairs.append((label, value))
        if pairs:
            found_any = True
            print(f"\n [{container_sel} > {label_sel} / {value_sel}] — {len(pairs)} pairs")
            for label, value in pairs[:10]:
                print(f" {label:<30} {value}")

    if not found_any:
        print(" No key-value structures detected")

    # Field pattern hits
    print("\n=== FIELD PATTERNS ===")
    _pattern_hits(soup, RE_POSTCODE, "postcode")
    _pattern_hits(soup, RE_PRICE, "prijs")
    _pattern_hits(soup, RE_M2, "m²")

    # Images
    print("\n=== IMAGES (first 5) ===")
    for img in soup.select("img")[:5]:
        src = img.get("src") or img.get("data-src")
        alt = img.get("alt", "")
        print(f" {src} [{alt}]")

    # JSON-LD
    print("\n=== JSON-LD (schema.org) ===")
    for tag in soup.select('script[type="application/ld+json"]'):
        try:
            payload = json.loads(tag.string)
        except (TypeError, ValueError) as e:
            # TypeError: tag.string is None; ValueError covers JSONDecodeError.
            print(f" parse fout: {e}")
            continue

        # A script may hold one node, an array of nodes or an @graph
        # container; report each dict node instead of failing on .get().
        for ld in _jsonld_nodes(payload):
            offered = _as_dict(ld.get("itemOffered"))
            address = _as_dict(offered.get("address"))
            floor_size = _as_dict(offered.get("floorSize"))
            fields = {
                "woningtype": offered.get("@type"),
                "adres": address.get("streetAddress"),
                "postcode": address.get("postalCode"),
                "stad": address.get("addressLocality"),
                "prijs": ld.get("price"),
                "woonoppervlak": floor_size.get("value"),
                "kamers": offered.get("numberOfRooms"),
                "bouwjaar": offered.get("yearBuilt"),
                "availability": ld.get("availability"),
                "image": ld.get("image"),
            }
            for k, v in fields.items():
                mark = "✓" if v is not None else "✗"
                print(f" {mark} {k:<16} {v!r}")
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Entry point
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def main():
|
|
if len(sys.argv) < 3:
|
|
print(__doc__)
|
|
sys.exit(1)
|
|
|
|
cmd = sys.argv[1]
|
|
url = sys.argv[2]
|
|
|
|
if cmd == "listings":
|
|
cmd_listings(url)
|
|
elif cmd == "details":
|
|
cmd_details(url)
|
|
else:
|
|
print(f"Unknown command: {cmd}")
|
|
sys.exit(1)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|