first setup, travel works, bjornd api works
This commit is contained in:
154
src/adapters/ssr.py
Normal file
154
src/adapters/ssr.py
Normal file
@@ -0,0 +1,154 @@
|
||||
"""
|
||||
adapters/ssr.py — HTML/SSR-based makelaars
|
||||
|
||||
Elke scraper is een functie () -> list[RawListing].
|
||||
Voeg nieuwe toe onderaan en registreer in SCRAPERS.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
import config
|
||||
from huizenbot import RawListing
|
||||
|
||||
log = logging.getLogger("huizenbot.ssr")
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Gedeelde HTTP helper
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _retry_after_seconds(value: str | None, default: int = 60) -> int:
    """Translate a ``Retry-After`` header value into a wait time in seconds.

    The header may carry either a number of seconds or an HTTP-date.
    A missing or unparseable value falls back to *default*. The result is
    never negative, so it is always safe to hand to ``time.sleep``.
    """
    if value is None:
        return default
    try:
        return max(0, int(value))
    except ValueError:
        pass  # not a plain number of seconds; try the HTTP-date form below

    # Function-scope imports keep this helper self-contained.
    from datetime import datetime, timezone
    from email.utils import parsedate_to_datetime

    try:
        when = parsedate_to_datetime(value)
    except (TypeError, ValueError):
        return default
    if when.tzinfo is None:
        # A date without usable zone information is treated as UTC.
        when = when.replace(tzinfo=timezone.utc)
    remaining = (when - datetime.now(timezone.utc)).total_seconds()
    return max(0, int(remaining))


def fetch_soup(url: str, *, params: dict | None = None) -> BeautifulSoup:
    """GET *url* and return the response body parsed as BeautifulSoup.

    A 429 response is retried after waiting for the period announced in the
    ``Retry-After`` header (60 seconds when the header is absent or
    unreadable). At most three requests are made in total.

    Args:
        url: Address to fetch.
        params: Optional query-string parameters passed to ``httpx.get``.

    Returns:
        The response text parsed with the ``html.parser`` backend.

    Raises:
        RuntimeError: if all three attempts were answered with 429.
        Exception: whatever ``raise_for_status()`` raises for any other
            unsuccessful response.
    """
    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        r = httpx.get(
            url,
            params=params,
            headers={"User-Agent": config.USER_AGENT},
            timeout=15,
            follow_redirects=True,
        )
        if r.status_code != 429:
            r.raise_for_status()
            return BeautifulSoup(r.text, "html.parser")

        if attempt == max_attempts:
            # No further request follows, so waiting would only delay the error.
            break

        wait = _retry_after_seconds(r.headers.get("Retry-After"))
        log.warning("429 op %s, wacht %ds", url, wait)
        time.sleep(wait)

    raise RuntimeError(f"Blijvend 429 op {url}")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Parse helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def parse_prijs(text: str | None) -> int | None:
|
||||
"""'€ 325.000 k.k.' → 325000"""
|
||||
if not text:
|
||||
return None
|
||||
digits = re.sub(r"[^\d]", "", text)
|
||||
return int(digits) if digits else None
|
||||
|
||||
|
||||
def parse_m2(text: str | None) -> int | None:
|
||||
"""'87 m²' → 87"""
|
||||
if not text:
|
||||
return None
|
||||
m = re.search(r"(\d+)", text.replace(".", ""))
|
||||
return int(m.group(1)) if m else None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Björn & Dries adapter (bjornd.nl)
|
||||
# ---------------------------------------------------------------------------
|
||||
# TODO: vul de echte CSS selectors in na inspectie van de pagina.
|
||||
# Dit is een structureel sjabloon — de selectors zijn placeholders.
|
||||
|
||||
# Site root; relative links found on the site are resolved against it.
BJORND_BASE = "https://www.bjornd.nl"
# Overview page that fetch_bjornd_demo() downloads and scans for cards.
BJORND_AANBOD = BJORND_BASE + "/aanbod"
|
||||
|
||||
|
||||
def fetch_bjornd_demo() -> list[RawListing]:
    """Scrape the Björn & Dries overview page into ``RawListing`` objects.

    This is a structural template: the CSS selectors below are placeholders
    and must be adjusted to the real HTML of the page.

    Cards without a link are skipped. A card that raises while being parsed
    is logged and skipped, so one malformed card does not abort the run.

    Returns:
        One ``RawListing`` per successfully parsed card (possibly empty).
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    soup = fetch_soup(BJORND_AANBOD)
    listings = []

    # Adjust the selector to the real HTML structure.
    for card in soup.select(".property-card"):  # ← adjust
        try:
            a_tag = card.select_one("a[href]")
            if not a_tag:
                continue

            # Resolve against the page URL so that absolute, root-relative
            # ("/x"), document-relative ("x") and protocol-relative
            # ("//host/x") hrefs all produce a valid absolute URL.
            url = urljoin(BJORND_AANBOD, a_tag["href"])

            adres = _text(card, ".property-address")  # ← adjust
            postcode = _extract_postcode(_text(card, ".property-location"))
            prijs = parse_prijs(_text(card, ".property-price"))
            opp = parse_m2(_text(card, ".property-area"))
            img = _src(card, "img")

            listings.append(RawListing(
                url=url,
                source_makelaar="bjornd",
                adres=adres,
                postcode=postcode,
                stad=_infer_stad(postcode),
                prijs=prijs,
                woonoppervlak=opp,
                hero_image_url=img,
            ))
        except Exception as e:
            # Deliberate best-effort: keep going with the remaining cards.
            log.warning("Fout bij parsen bjornd card: %s", e)

    return listings
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# SSR helper utils
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _text(soup, selector: str) -> str | None:
|
||||
el = soup.select_one(selector)
|
||||
return el.get_text(strip=True) if el else None
|
||||
|
||||
|
||||
def _src(soup, selector: str) -> str | None:
|
||||
el = soup.select_one(selector)
|
||||
if el is None:
|
||||
return None
|
||||
return el.get("src") or el.get("data-src")
|
||||
|
||||
|
||||
def _extract_postcode(text: str | None) -> str | None:
|
||||
if not text:
|
||||
return None
|
||||
m = re.search(r"\b(\d{4}\s?[A-Z]{2})\b", text)
|
||||
return m.group(1).replace(" ", "") if m else None
|
||||
|
||||
|
||||
def _infer_stad(postcode: str | None) -> str | None:
|
||||
"""Simpele mapping op basis van postcode range — uitbreiden naar wens."""
|
||||
if not postcode:
|
||||
return None
|
||||
code = int(postcode[:4])
|
||||
if 2600 <= code <= 2629:
|
||||
return "Delft"
|
||||
if 3100 <= code <= 3135:
|
||||
return "Schiedam"
|
||||
return None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# SCRAPERS — exporteer hier alle actieve SSR adapters
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Registry of active SSR adapters: adapter name -> zero-argument scraper.
SCRAPERS = {"bjornd_demo": fetch_bjornd_demo}
|
||||
Reference in New Issue
Block a user