first setup, travel works, bjornd api works

This commit is contained in:
2026-04-03 13:50:28 +02:00
commit 26d9d936f4
19 changed files with 1152 additions and 0 deletions

154
src/adapters/ssr.py Normal file
View File

@@ -0,0 +1,154 @@
"""
adapters/ssr.py — HTML/SSR-based makelaars
Elke scraper is een functie () -> list[RawListing].
Voeg nieuwe toe onderaan en registreer in SCRAPERS.
"""
import logging
import re
import time
import httpx
from bs4 import BeautifulSoup
import config
from huizenbot import RawListing
log = logging.getLogger("huizenbot.ssr")
# ---------------------------------------------------------------------------
# Gedeelde HTTP helper
# ---------------------------------------------------------------------------
def fetch_soup(url: str, *, params: dict | None = None) -> BeautifulSoup:
    """Fetch *url* with a GET request and parse the body as HTML.

    A 429 response is retried (three attempts in total) after waiting for the
    delay announced in the ``Retry-After`` header.

    Args:
        url: Address to request.
        params: Optional query-string parameters forwarded to ``httpx.get``.

    Returns:
        The response text parsed with the ``html.parser`` backend.

    Raises:
        RuntimeError: When every attempt is answered with 429.
        Exception: Whatever ``r.raise_for_status()`` raises for other
            error statuses, and any transport error from ``httpx.get``.
    """
    max_attempts = 3
    default_wait = 60
    for attempt in range(max_attempts):
        r = httpx.get(
            url,
            params=params,
            headers={"User-Agent": config.USER_AGENT},
            timeout=15,
            follow_redirects=True,
        )
        if r.status_code != 429:
            r.raise_for_status()
            return BeautifulSoup(r.text, "html.parser")
        if attempt == max_attempts - 1:
            # Last attempt: sleeping would only delay the error below.
            break
        try:
            # Retry-After may also be an HTTP-date, which int() rejects;
            # fall back to the default delay in that case. Clamp at zero
            # because time.sleep() refuses negative values.
            wait = max(0, int(r.headers.get("Retry-After", default_wait)))
        except ValueError:
            wait = default_wait
        log.warning("429 op %s, wacht %ds", url, wait)
        time.sleep(wait)
    raise RuntimeError(f"Blijvend 429 op {url}")
# ---------------------------------------------------------------------------
# Parse helpers
# ---------------------------------------------------------------------------
def parse_prijs(text: str | None) -> int | None:
"""'€ 325.000 k.k.' → 325000"""
if not text:
return None
digits = re.sub(r"[^\d]", "", text)
return int(digits) if digits else None
def parse_m2(text: str | None) -> int | None:
"""'87 m²' → 87"""
if not text:
return None
m = re.search(r"(\d+)", text.replace(".", ""))
return int(m.group(1)) if m else None
# ---------------------------------------------------------------------------
# Björn & Dries adapter (bjornd.nl)
# ---------------------------------------------------------------------------
# TODO: fill in the real CSS selectors after inspecting the page.
# This is a structural template — the selectors are placeholders.
BJORND_BASE = "https://www.bjornd.nl"
BJORND_AANBOD = BJORND_BASE + "/aanbod"
def fetch_bjornd_demo() -> list[RawListing]:
    """Scrape the bjornd.nl listing overview into RawListing objects.

    Fetches ``BJORND_AANBOD`` and builds one listing per ``.property-card``
    element. Cards without a link are skipped. A card that fails to parse is
    logged and skipped, so one bad card does not abort the whole run.

    Returns:
        The listings that could be parsed (possibly empty).

    Raises:
        Whatever ``fetch_soup`` raises when the overview page cannot be
        fetched; per-card errors are not propagated.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urljoin

    soup = fetch_soup(BJORND_AANBOD)
    listings = []
    # Adjust the selectors below to the real HTML structure (placeholders).
    for card in soup.select(".property-card"):
        try:
            a_tag = card.select_one("a[href]")
            if not a_tag:
                continue
            # urljoin leaves absolute URLs alone and also resolves
            # protocol-relative ("//host/x") and slash-less relative hrefs,
            # which plain string concatenation would turn into broken URLs.
            url = urljoin(BJORND_AANBOD, a_tag["href"])
            adres = _text(card, ".property-address")
            postcode = _extract_postcode(_text(card, ".property-location"))
            prijs = parse_prijs(_text(card, ".property-price"))
            opp = parse_m2(_text(card, ".property-area"))
            img = _src(card, "img")
            if img:
                # Image sources may be relative as well.
                img = urljoin(BJORND_AANBOD, img)
            listings.append(RawListing(
                url=url,
                source_makelaar="bjornd",
                adres=adres,
                postcode=postcode,
                stad=_infer_stad(postcode),
                prijs=prijs,
                woonoppervlak=opp,
                hero_image_url=img,
            ))
        except Exception as e:
            # Deliberate best-effort: log the card and carry on.
            log.warning("Fout bij parsen bjornd card: %s", e)
    return listings
# ---------------------------------------------------------------------------
# SSR helper utils
# ---------------------------------------------------------------------------
def _text(soup, selector: str) -> str | None:
el = soup.select_one(selector)
return el.get_text(strip=True) if el else None
def _src(soup, selector: str) -> str | None:
el = soup.select_one(selector)
if el is None:
return None
return el.get("src") or el.get("data-src")
def _extract_postcode(text: str | None) -> str | None:
if not text:
return None
m = re.search(r"\b(\d{4}\s?[A-Z]{2})\b", text)
return m.group(1).replace(" ", "") if m else None
def _infer_stad(postcode: str | None) -> str | None:
"""Simpele mapping op basis van postcode range — uitbreiden naar wens."""
if not postcode:
return None
code = int(postcode[:4])
if 2600 <= code <= 2629:
return "Delft"
if 3100 <= code <= 3135:
return "Schiedam"
return None
# ---------------------------------------------------------------------------
# SCRAPERS — exporteer hier alle actieve SSR adapters
# ---------------------------------------------------------------------------
# Registry of active SSR adapters: name -> zero-argument scraper function.
SCRAPERS = {"bjornd_demo": fetch_bjornd_demo}