From 26d9d936f4ed1a03feb8f378a8ebb053a9ecce3e Mon Sep 17 00:00:00 2001 From: Mark Kalsbeek Date: Fri, 3 Apr 2026 13:50:28 +0200 Subject: [PATCH] first setup, travel works, bjornd api works --- .env.example | 13 ++ .envrc | 1 + .gitignore | 7 + huizenbot-spec.md | 167 +++++++++++++++++ makelaars.md | 38 ++++ shell.nix | 20 +++ src/__init__.py | 0 src/adapters/__init__.py | 6 + src/adapters/api.py | 116 ++++++++++++ src/adapters/ssr.py | 154 ++++++++++++++++ src/config.py | 25 +++ src/huizenbot.py | 374 +++++++++++++++++++++++++++++++++++++++ src/main.py | 16 ++ src/nine292.py | 95 ++++++++++ tests/cache.py | 53 ++++++ tests/test_adapters.py | 17 ++ tests/test_email.py | 26 +++ tests/test_ha.py | 0 tests/test_travel.py | 24 +++ 19 files changed, 1152 insertions(+) create mode 100644 .env.example create mode 100644 .envrc create mode 100644 .gitignore create mode 100644 huizenbot-spec.md create mode 100644 makelaars.md create mode 100644 shell.nix create mode 100644 src/__init__.py create mode 100644 src/adapters/__init__.py create mode 100644 src/adapters/api.py create mode 100644 src/adapters/ssr.py create mode 100644 src/config.py create mode 100644 src/huizenbot.py create mode 100644 src/main.py create mode 100644 src/nine292.py create mode 100644 tests/cache.py create mode 100644 tests/test_adapters.py create mode 100644 tests/test_email.py create mode 100644 tests/test_ha.py create mode 100644 tests/test_travel.py diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..1b28ae7 --- /dev/null +++ b/.env.example @@ -0,0 +1,13 @@ +NAVITIA_API_KEY= + +HA_WEBHOOK_URL= + +SMTP_HOST= +SMTP_PORT=587 +SMTP_FROM= +SMTP_TO= +SMTP_USER= +SMTP_PASSWORD= + +DB_PATH=/data/huizenbot.db + diff --git a/.envrc b/.envrc new file mode 100644 index 0000000..1d953f4 --- /dev/null +++ b/.envrc @@ -0,0 +1 @@ +use nix diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c2f9b05 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +.env +.direnv/ + 
+*.dump +**/__pycache__/ + +tests/cache/ diff --git a/huizenbot-spec.md b/huizenbot-spec.md new file mode 100644 index 0000000..cdd1932 --- /dev/null +++ b/huizenbot-spec.md @@ -0,0 +1,167 @@ +# Huizenbot — Project Spec + +## Doel + +Periodiek scrapen van makelaarswebsites in Delft en Schiedam, nieuwe woningen opslaan in SQLite, en pushnotificaties sturen via Home Assistant (+ optioneel email). Draait als één Docker container op homelab met cron. + +## Projectstructuur + +``` +huizenbot/ +├── db.py # schema, migraties, upsert helpers +├── notify.py # HA webhook + email +├── travel.py # OSRM (fiets) + Navitia (OV) clients +├── base.py # AbstractScraper, orchestratie, run loop +├── adapters/ # één file per makelaar +│ ├── bjornd.py +│ └── ... +├── main.py # entry point (aangeroepen door cron) +├── config.py # locaties, credentials, endpoints +└── Dockerfile +``` + +Nieuwe makelaar toevoegen = nieuwe file in `adapters/` die `AbstractScraper` implementeert. De runner pikt hem automatisch op via de base class registry. + +## Database + +SQLite, één file gemount als Docker volume. + +```sql +CREATE TABLE woningen ( + id TEXT PRIMARY KEY, -- sha256(url) + url TEXT UNIQUE NOT NULL, + source_makelaar TEXT NOT NULL, + first_seen TEXT NOT NULL, -- ISO8601 + last_seen TEXT NOT NULL, -- ISO8601, geüpdatet elke run + datum_aanmelding TEXT, -- datum van makelaar zelf, indien beschikbaar + + status TEXT NOT NULL DEFAULT 'beschikbaar', + -- enum: beschikbaar | onder_bod | verkocht + + -- locatie + adres TEXT, + postcode TEXT, + stad TEXT, + + -- woning + prijs INTEGER, -- euros, geen float + woningtype TEXT, -- appartement | tussenwoning | hoekwoning | vrijstaand | ... 
+ woonoppervlak INTEGER, -- m2 + perceeloppervlak INTEGER, -- m2, NULL voor appartementen + kamers INTEGER, + slaapkamers INTEGER, + bouwjaar INTEGER, + energielabel TEXT, + + -- media + hero_image_url TEXT, + + -- reistijd in minuten (berekend bij first_seen, niet opnieuw) + fiets_persoon1 INTEGER, + fiets_persoon2 INTEGER, + ov_persoon1 INTEGER, + ov_persoon2 INTEGER, + + -- makelaar-specifieke velden die niet in het schema passen + extra TEXT -- JSON (sqlite 3.38+ json_extract() werkt hierop) +); +``` + +Upsert strategie: `INSERT OR IGNORE` op `id` voor nieuwe woningen, daarna `UPDATE last_seen` en `status` op elke run. Reistijd wordt alleen berekend bij `first_seen`. + +## Scraper architectuur + +### AbstractScraper (base.py) + +Elke adapter erft hiervan en implementeert één methode: + +```python +def fetch_listings(self) -> list[RawListing]: + ... +``` + +`RawListing` is een dataclass met exact de velden uit het schema (allemaal optioneel behalve `url`). De base class regelt: + +- deduplicatie / upsert naar DB +- reistijdberekening aanroepen voor nieuwe woningen +- notificatie triggeren voor nieuwe woningen +- logging + +### Twee adapter-smaken + +**API-based** (JSON response, bijv. makelaars met een interne REST API): +- Doet een `httpx` request, parsed JSON direct naar `RawListing` + +**SSR-based** (HTML scraping): +- Doet een `httpx` request met nette `User-Agent` header +- Parsed HTML via `BeautifulSoup` + +Beide smaken zijn gewone subclasses — geen aparte base per smaak, het verschil zit alleen in de implementatie van `fetch_listings`. + +## Reistijd (travel.py) + +Twee backends, beiden aangeroepen bij `first_seen` van een woning. + +**Fiets — OSRM** (publieke instance, geen key nodig): +``` +http://router.project-osrm.org/route/v1/cycling/{lon},{lat};{lon},{lat}?overview=false +``` + +**OV — Navitia.io** (gratis tier, API key nodig): +``` +https://api.navitia.io/v1/coverage/nl/journeys?from=...&to=...&datetime=... 
+``` +Voor OV wordt een vaste reistijd op een doordeweekse ochtend (bijv. 08:30) gebruikt als referentie — niet real-time. + +Invoer zijn twee postcodes uit `config.py` (werklocatie persoon 1 en 2). Postcode → coördinaten via de Nominatim geocoder (OSM, geen key nodig, respecteer 1 req/s limiet). + +## Notificaties (notify.py) + +### Home Assistant +Webhook call naar HA met een payload die de notification service aanroept. Bevat: +- adres + stad +- prijs +- status +- hero image URL (als `image` in de notification) +- reistijden persoon 1 en 2 (fiets + OV) +- directe link naar de listing + +### Email (optioneel, fallback) +Plain HTML mail via SMTP met dezelfde info. Handig als HA buiten bereik is. + +## Configuratie (config.py) + +```python +PERSOON1_WERK_POSTCODE = "2600AA" +PERSOON2_WERK_POSTCODE = "3000AA" + +HA_WEBHOOK_URL = "https://ha.jouwdomain.nl/api/webhook/huizenbot" + +SMTP_HOST = "..." +SMTP_FROM = "..." +SMTP_TO = "..." + +USER_AGENT = "Huizenbot/1.0 (+jouw@email.nl)" + +SCRAPE_INTERVAL_HOURS = 3 # alleen informatief, cron regelt de scheduling +``` + +Secrets (API keys etc.) via environment variables, niet in config.py. + +## Docker & cron + +Één container, cron runt `main.py` elke 3 uur tussen 08:00 en 20:00: + +```cron +0 8,11,14,17,20 * * * python /app/main.py >> /var/log/huizenbot.log 2>&1 +``` + +SQLite DB en logfile als named volumes gemount. 
+ +## Ethische scraping + +- `User-Agent` met contactinfo (zie config) +- Één request tegelijk per domein +- Respecteer `Retry-After` bij 429-responses +- Geen nachtelijke runs +- Alleen persoonlijk gebruik diff --git a/makelaars.md b/makelaars.md new file mode 100644 index 0000000..522eaa3 --- /dev/null +++ b/makelaars.md @@ -0,0 +1,38 @@ +# Verkoopmakelaars Delft & Schiedam + +## Delft + +| Naam | Website | Adres | +|------|---------|-------| +| Van Silfhout & Hogetoorn Wereldmakelaars | vansilfhout.nl | Ireneboulevard 2 | +| Van Daal Makelaardij | vandaalmakelaardij.nl | Voldersgracht 33 | +| Björnd Makelaardij | bjornd.nl | Oude Delft 103 | +| Hof van Delft Makelaardij | hofvandelftmakelaardij.nl | Wateringsevest 26 | +| V&W Makelaars Delft | vwmakelaars.nl | Coenderstraat 31 | +| Roepman Makelaardij NVM | roepman.nl | Molslaan 43 | +| ZO makelaars | zomakelaars.nl | Van Foreestweg 4 | +| Marloes Makelaars | — | Maerten Trompstraat 28 | +| Makelaarskantoor J.E. Mouthaan | — | Julianalaan 43 | +| Olsthoorn Makelaars Delft | olsthoornmakelaars.nl | Noordeinde 51 | +| Post Makelaardij (v/h Bayense) | postmakelaardij.nl | Spoorsingel 1a | +| Morris NVM Makelaars | morrismakelaardij.nl | — | +| Prinsenstad Makelaardij | — | — | +| Oude Delft Makelaardij | — | — | +| Dijksman Woningmakelaars | — | — | +| CORPOwonen | — | — | + +## Schiedam + +| Naam | Website | Adres | +|------|---------|-------| +| Anke Bodewes Makelaardij | ankebodewes.nl | Hargplein 118 | +| Woongoed Makelaars Schiedam | woongoedmakelaars.nl | Oranjestraat 93 | +| Ooms Makelaars Schiedam | ooms.com | Gerrit Verboonstraat 2 | +| De Witte Garantiemakelaars | dewittegarantiemakelaars.nl | Philippusweg 2 | +| Makelaardij Wassenaar | makelaardijwassenaar.nl | Gerrit Verboonstraat 12 | +| 3D Makelaars | 3dmakelaars.nl | Gerrit Verboonstraat 17 | +| Dupont Makelaars | dupont.nl | Rotterdamsedijk 437 | +| D&S Makelaardij | densmakelaars.nl | Land van Belofte 50 | +| Moerman & De Jong Makelaars | 
moerman-dejong.nl | Lange Kerkstraat 80B | +| Hagestein Makelaardij | — | Degerfors 54 | +| Schieland Borsboom NVM Makelaars | schielandborsboom.nl | (Rotterdam, actief in Schiedam) | diff --git a/shell.nix b/shell.nix new file mode 100644 index 0000000..91fd9a9 --- /dev/null +++ b/shell.nix @@ -0,0 +1,20 @@ +{ pkgs ? import {} }: + +pkgs.mkShell { + packages = [ + (pkgs.python3.withPackages (ps: with ps; [ + httpx + beautifulsoup4 + lxml + ])) + ]; + + shellHook = '' + if [ -f .env ]; then + set -a + source .env + set +a + echo ".env geladen" + fi + ''; +} diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/adapters/__init__.py b/src/adapters/__init__.py new file mode 100644 index 0000000..cad50c2 --- /dev/null +++ b/src/adapters/__init__.py @@ -0,0 +1,6 @@ +from os import wait +from typing import Callable +from adapters.api import SCRAPERS as _API +from adapters.ssr import SCRAPERS as _SSR + +SCRAPERS: dict[str,Callable] = _API | _SSR diff --git a/src/adapters/api.py b/src/adapters/api.py new file mode 100644 index 0000000..dd65f76 --- /dev/null +++ b/src/adapters/api.py @@ -0,0 +1,116 @@ +""" +adapters/api.py — JSON/API-based makelaars + +Elke scraper is een functie () -> list[RawListing]. +Voeg nieuwe toe onderaan en registreer in SCRAPERS. +""" + +import json +import logging +import time + +import httpx + +import config +from huizenbot import RawListing + +log = logging.getLogger("huizenbot.api") + +# --------------------------------------------------------------------------- +# Gedeelde HTTP helper +# --------------------------------------------------------------------------- + +def fetch_json(url: str, *, params: dict = None, headers: dict = None) -> dict | list: + """ + GET request met User-Agent, timeout en Retry-After afhandeling. + Raises httpx.HTTPError bij aanhoudende fouten. 
+ """ + hdrs = {"User-Agent": config.USER_AGENT} + if headers: + hdrs.update(headers) + + for attempt in range(3): + r = httpx.get(url, params=params, headers=hdrs, timeout=15) + if r.status_code == 429: + wait = int(r.headers.get("Retry-After", 60)) + log.warning("429 op %s, wacht %ds", url, wait) + time.sleep(wait) + continue + r.raise_for_status() + return r.json() + + raise RuntimeError(f"Blijvend 429 op {url}") + + +# --------------------------------------------------------------------------- +# Bjornd +# --------------------------------------------------------------------------- + +_BJORND_BASE = "https://www.bjornd.nl" +_BJORND_SKIP = {"rented", "rented_ur"} + +_STATUS_MAP = { + "available": "beschikbaar", + "under_bid": "onder_bod", + "under_option": "onder_bod", + "sold": "verkocht", + "sold_ur": "verkocht", +} + + +def fetch_bjornd() -> list[RawListing]: + data = fetch_json( + f"{_BJORND_BASE}/nl/realtime-listings/consumer", + headers={"X-Requested-With": "XMLHttpRequest"}, + ) + + listings = [] + for item in data: + if not item.get("isSales"): + continue + if item.get("statusOrig") in _BJORND_SKIP: + continue + if item.get('salesPrice')>config.MAX_PRICE: + continue + + + listings.append(RawListing( + url=_BJORND_BASE + item["url"], + source_makelaar="bjornd", + status=_STATUS_MAP.get(item.get("statusOrig", ""), "beschikbaar"), + adres=item.get("address") or None, + postcode=item.get("zipcode") or None, + stad=item.get("city") or None, + prijs=item.get("salesPrice") or None, + woningtype=item.get("type") or None, + woonoppervlak=item.get("livingSurface") or None, + perceeloppervlak=item.get("plotSurface") or None, + kamers=item.get("rooms") or None, + slaapkamers=item.get("bedrooms") or None, + hero_image_url=item.get("photo") or None, + extra=json.dumps({ + "balcony": item.get("balcony"), + "garden": item.get("garden"), + "mainType": item.get("mainType"), + "buildType": item.get("buildType"), + "district": item.get("district"), + "lat": item.get("lat"), 
+ "lng": item.get("lng"), + "isFurnished": item.get("isFurnished"), + "hasOpenHouse": item.get("hasOpenHouse"), + "description": item.get("description"), + "photos": item.get("photos"), + }, ensure_ascii=False), + )) + + log.info("bjornd: %d koopwoningen opgehaald", len(listings)) + return listings + + +# --------------------------------------------------------------------------- +# SCRAPERS — exporteer hier alle actieve API adapters +# --------------------------------------------------------------------------- + +SCRAPERS = { + 'bjornd': fetch_bjornd, +} diff --git a/src/adapters/ssr.py b/src/adapters/ssr.py new file mode 100644 index 0000000..565bdf3 --- /dev/null +++ b/src/adapters/ssr.py @@ -0,0 +1,154 @@ +""" +adapters/ssr.py — HTML/SSR-based makelaars + +Elke scraper is een functie () -> list[RawListing]. +Voeg nieuwe toe onderaan en registreer in SCRAPERS. +""" + +import logging +import re +import time + +import httpx +from bs4 import BeautifulSoup + +import config +from huizenbot import RawListing + +log = logging.getLogger("huizenbot.ssr") + +# --------------------------------------------------------------------------- +# Gedeelde HTTP helper +# --------------------------------------------------------------------------- + +def fetch_soup(url: str, *, params: dict = None) -> BeautifulSoup: + """ + GET request → BeautifulSoup. Handelt 429 af met Retry-After. 
+ """ + for attempt in range(3): + r = httpx.get( + url, + params=params, + headers={"User-Agent": config.USER_AGENT}, + timeout=15, + follow_redirects=True, + ) + if r.status_code == 429: + wait = int(r.headers.get("Retry-After", 60)) + log.warning("429 op %s, wacht %ds", url, wait) + time.sleep(wait) + continue + r.raise_for_status() + return BeautifulSoup(r.text, "html.parser") + + raise RuntimeError(f"Blijvend 429 op {url}") + + +# --------------------------------------------------------------------------- +# Parse helpers +# --------------------------------------------------------------------------- + +def parse_prijs(text: str | None) -> int | None: + """'€ 325.000 k.k.' → 325000""" + if not text: + return None + digits = re.sub(r"[^\d]", "", text) + return int(digits) if digits else None + + +def parse_m2(text: str | None) -> int | None: + """'87 m²' → 87""" + if not text: + return None + m = re.search(r"(\d+)", text.replace(".", "")) + return int(m.group(1)) if m else None + + +# --------------------------------------------------------------------------- +# Björn & Dries adapter (bjornd.nl) +# --------------------------------------------------------------------------- +# TODO: vul de echte CSS selectors in na inspectie van de pagina. +# Dit is een structureel sjabloon — de selectors zijn placeholders. 
+ +BJORND_BASE = "https://www.bjornd.nl" +BJORND_AANBOD = f"{BJORND_BASE}/aanbod" + + +def fetch_bjornd_demo() -> list[RawListing]: + soup = fetch_soup(BJORND_AANBOD) + listings = [] + + # Pas de selector aan op de echte HTML structuur + for card in soup.select(".property-card"): # ← aanpassen + try: + a_tag = card.select_one("a[href]") + if not a_tag: + continue + url = a_tag["href"] + if not url.startswith("http"): + url = BJORND_BASE + url + + adres = _text(card, ".property-address") # ← aanpassen + postcode = _extract_postcode(_text(card, ".property-location")) + prijs = parse_prijs(_text(card, ".property-price")) + opp = parse_m2(_text(card, ".property-area")) + img = _src(card, "img") + + listings.append(RawListing( + url=url, + source_makelaar="bjornd", + adres=adres, + postcode=postcode, + stad=_infer_stad(postcode), + prijs=prijs, + woonoppervlak=opp, + hero_image_url=img, + )) + except Exception as e: + log.warning("Fout bij parsen bjornd card: %s", e) + + return listings + + +# --------------------------------------------------------------------------- +# SSR helper utils +# --------------------------------------------------------------------------- + +def _text(soup, selector: str) -> str | None: + el = soup.select_one(selector) + return el.get_text(strip=True) if el else None + + +def _src(soup, selector: str) -> str | None: + el = soup.select_one(selector) + if el is None: + return None + return el.get("src") or el.get("data-src") + + +def _extract_postcode(text: str | None) -> str | None: + if not text: + return None + m = re.search(r"\b(\d{4}\s?[A-Z]{2})\b", text) + return m.group(1).replace(" ", "") if m else None + + +def _infer_stad(postcode: str | None) -> str | None: + """Simpele mapping op basis van postcode range — uitbreiden naar wens.""" + if not postcode: + return None + code = int(postcode[:4]) + if 2600 <= code <= 2629: + return "Delft" + if 3100 <= code <= 3135: + return "Schiedam" + return None + + +# 
--------------------------------------------------------------------------- +# SCRAPERS — exporteer hier alle actieve SSR adapters +# --------------------------------------------------------------------------- + +SCRAPERS = { + 'bjornd_demo': fetch_bjornd_demo, +} diff --git a/src/config.py b/src/config.py new file mode 100644 index 0000000..e5dd292 --- /dev/null +++ b/src/config.py @@ -0,0 +1,25 @@ +""" +config.py — vul aan met je eigen waarden. Secrets via environment variables. +""" +import os + +MARK_WERK_POSTCODE = "2629HG" +MICHELLE_WERK_POSTCODE = "3133AV" +MARK_WERK_9292 = "delft/"+MARK_WERK_POSTCODE +MICHELLE_WERK_9292 = "vlaardingen/"+MICHELLE_WERK_POSTCODE + +HA_WEBHOOK_URL = os.environ.get("HA_WEBHOOK_URL", "") + +SMTP_HOST = os.environ.get("SMTP_HOST", "") +SMTP_PORT = int(os.environ.get("SMTP_PORT", "587")) +SMTP_FROM = os.environ.get("SMTP_FROM", "") +SMTP_TO = os.environ.get("SMTP_TO", "") +SMTP_USER = os.environ.get("SMTP_USER", "") + +USER_AGENT = "Huizenbot/1.0 (+mark@kalsbeek.dev) persoonlijk gebruik" + +DB_PATH = os.environ.get("DB_PATH", "/data/huizenbot.db") + +FIETS_SNELHEID_FACTOR = 1.27 + +MAX_PRICE = 300_000 diff --git a/src/huizenbot.py b/src/huizenbot.py new file mode 100644 index 0000000..7bdf66d --- /dev/null +++ b/src/huizenbot.py @@ -0,0 +1,374 @@ +""" +huizenbot.py — models, db, travel, notify, orchestration +""" + +import hashlib +import json +import logging +import os +import smtplib +import sqlite3 +import time +from dataclasses import dataclass, field +from datetime import datetime, date +from email.mime.multipart import MIMEMultipart +from email.mime.text import MIMEText +from typing import Callable, Any + +import httpx + +import config +from nine292 import ov_minuten_9292 + +log = logging.getLogger("huizenbot") + +# --------------------------------------------------------------------------- +# Model +# --------------------------------------------------------------------------- + +@dataclass +class RawListing: + url: str # 
required + + source_makelaar: str = "" + datum_aanmelding: str | None = None + status: str = "beschikbaar" # beschikbaar | onder_bod | verkocht + + adres: str | None = None + postcode: str | None = None + stad: str | None = None + + prijs: int | None = None + woningtype: str | None = None + woonoppervlak: int | None = None + perceeloppervlak: int | None = None + kamers: int | None = None + slaapkamers: int | None = None + bouwjaar: int | None = None + energielabel: str | None = None + + hero_image_url: str | None = None + + extra: dict[str, Any] = field(default_factory=dict) + + +def listing_id(url: str) -> str: + return hashlib.sha256(url.encode()).hexdigest() + + +# --------------------------------------------------------------------------- +# Database +# --------------------------------------------------------------------------- + +SCHEMA = """ +CREATE TABLE IF NOT EXISTS woningen ( + id TEXT PRIMARY KEY, + url TEXT UNIQUE NOT NULL, + source_makelaar TEXT NOT NULL, + first_seen TEXT NOT NULL, + last_seen TEXT NOT NULL, + datum_aanmelding TEXT, + + status TEXT NOT NULL DEFAULT 'beschikbaar', + + adres TEXT, + postcode TEXT, + stad TEXT, + + prijs INTEGER, + woningtype TEXT, + woonoppervlak INTEGER, + perceeloppervlak INTEGER, + kamers INTEGER, + slaapkamers INTEGER, + bouwjaar INTEGER, + energielabel TEXT, + + hero_image_url TEXT, + + fiets_mark INTEGER, + fiets_michelle INTEGER, + ov_mark INTEGER, + ov_michelle INTEGER, + + extra TEXT +); +""" + + +def get_db(path: str) -> sqlite3.Connection: + conn = sqlite3.connect(path) + conn.row_factory = sqlite3.Row + conn.execute("PRAGMA journal_mode=WAL") + conn.executescript(SCHEMA) + return conn + + +def upsert(conn: sqlite3.Connection, listing: RawListing, travel: dict[str,int]) -> bool: + """ + Insert new listing or update last_seen + status on existing. + Returns True if this was a new listing. 
+ """ + now = datetime.utcnow().isoformat() + lid = listing_id(listing.url) + + row = conn.execute("SELECT id FROM woningen WHERE id = ?", (lid,)).fetchone() + is_new = row is None + + if is_new: + _cursor = conn.execute(""" + INSERT INTO woningen ( + id, url, source_makelaar, first_seen, last_seen, datum_aanmelding, + status, adres, postcode, stad, + prijs, woningtype, woonoppervlak, perceeloppervlak, + kamers, slaapkamers, bouwjaar, energielabel, + hero_image_url, + fiets_mark, fiets_michelle, ov_mark, ov_michelle, + extra + ) VALUES ( + :id, :url, :source_makelaar, :first_seen, :last_seen, :datum_aanmelding, + :status, :adres, :postcode, :stad, + :prijs, :woningtype, :woonoppervlak, :perceeloppervlak, + :kamers, :slaapkamers, :bouwjaar, :energielabel, + :hero_image_url, + :fiets_mark, :fiets_michelle, :ov_mark, :ov_michelle, + :extra + ) + """, { + "id": lid, + "url": listing.url, + "source_makelaar": listing.source_makelaar, + "first_seen": now, + "last_seen": now, + "datum_aanmelding": listing.datum_aanmelding, + "status": listing.status, + "adres": listing.adres, + "postcode": listing.postcode, + "stad": listing.stad, + "prijs": listing.prijs, + "woningtype": listing.woningtype, + "woonoppervlak": listing.woonoppervlak, + "perceeloppervlak": listing.perceeloppervlak, + "kamers": listing.kamers, + "slaapkamers": listing.slaapkamers, + "bouwjaar": listing.bouwjaar, + "energielabel": listing.energielabel, + "hero_image_url": listing.hero_image_url, + "fiets_mark": travel.get("fiets_mark"), + "fiets_michelle": travel.get("fiets_michelle"), + "ov_mark": travel.get("ov_mark"), + "ov_michelle": travel.get("ov_michelle"), + "extra": json.dumps(listing.extra) if listing.extra else None, + }) + else: + _cursor = conn.execute(""" + UPDATE woningen SET last_seen = ?, status = ? WHERE id = ? 
+ """, (now, listing.status, lid)) + + conn.commit() + return is_new + + +# --------------------------------------------------------------------------- +# Travel +# --------------------------------------------------------------------------- + +_geocode_cache: dict[str, tuple[float, float]] = {} + + +def geocode(postcode: str) -> tuple[float, float] | None: + """Postcode → (lat, lon) via Nominatim. Respects 1 req/s.""" + if postcode in _geocode_cache: + return _geocode_cache[postcode] + + time.sleep(1) # Nominatim rate limit + try: + r = httpx.get( + "https://nominatim.openstreetmap.org/search", + params={"q": postcode + ", Netherlands", "format": "json", "limit": 1}, + headers={"User-Agent": config.USER_AGENT}, + timeout=10, + ) + _response = r.raise_for_status() + results = r.json() + if not results: + log.warning("Geocode geen resultaat voor %s", postcode) + return None + lat, lon = float(results[0]["lat"]), float(results[0]["lon"]) + _geocode_cache[postcode] = (lat, lon) + return lat, lon + except Exception as e: + log.error("Geocode fout voor %s: %s", postcode, e) + return None + +def fiets_minuten(origin: tuple[float, float], dest: tuple[float, float]) -> int | None: + """Reistijd fiets in minuten via OSRM (routing.openstreetmap.de).""" + try: + olat, olon = origin + dlat, dlon = dest + url = ( + f"https://routing.openstreetmap.de/routed-bike/route/v1/driving/" + f"{olon},{olat};{dlon},{dlat}?overview=false" + ) + r = httpx.get(url, timeout=10) + r.raise_for_status() + data = r.json() + seconds = data["routes"][0]["duration"] + return round(seconds / 60 / config.FIETS_SNELHEID_FACTOR) + except Exception as e: + log.error("OSRM fout: %s", e) + return None + +def ov_minuten(from_loc: str, to_loc: str) -> int | None: + """Reistijd OV in minuten via 9292, vaste ochtendspits referentie.""" + return ov_minuten_9292(from_loc, to_loc) + + +def _next_weekday_morning() -> str: + """Geeft eerstvolgende doordeweekse dag om 08:30 als Navitia datetime string.""" + from 
datetime import timedelta + d = date.today() + d += timedelta(days=1) + while d.weekday() >= 5: # 5=zaterdag, 6=zondag + d += timedelta(days=1) + return d.strftime("%Y%m%dT083000") + + +def bereken_reistijden(postcode: str | None) -> dict[str, int]: + """Bereken alle reistijden voor een woning postcode. Geeft lege dict bij falen.""" + if not postcode: + return {} + + woning_coords = geocode(postcode) + if not woning_coords: + return {} + + werk1 = geocode(config.MARK_WERK_POSTCODE) + werk2 = geocode(config.MICHELLE_WERK_POSTCODE) + + result = {} + if werk1: + result["fiets_mark"] = fiets_minuten(woning_coords, werk1) + result["ov_mark"] = ov_minuten(woning_coords, werk1) + if werk2: + result["fiets_michelle"] = fiets_minuten(woning_coords, werk2) + result["ov_michelle"] = ov_minuten(woning_coords, werk2) + + return result + + +# --------------------------------------------------------------------------- +# Notify +# --------------------------------------------------------------------------- + +def notify_ha(listing: RawListing, travel: dict[str,int]) -> None: + """Stuur webhook naar Home Assistant.""" + if not config.HA_WEBHOOK_URL: + return + + payload = { + "adres": listing.adres, + "stad": listing.stad, + "prijs": listing.prijs, + "status": listing.status, + "url": listing.url, + "image": listing.hero_image_url, + "fiets_mark": travel.get("fiets_mark"), + "fiets_michelle": travel.get("fiets_michelle"), + "ov_mark": travel.get("ov_mark"), + "ov_michelle": travel.get("ov_michelle"), + } + + try: + r = httpx.post(config.HA_WEBHOOK_URL, json=payload, timeout=10) + r.raise_for_status() + log.info("HA notificatie verstuurd voor %s", listing.adres) + except Exception as e: + log.error("HA webhook fout: %s", e) + notify_email(listing, travel) # fallback + + +def notify_email(listing: RawListing, travel: dict[str,int]) -> None: + """Stuur HTML email als fallback.""" + if not config.SMTP_HOST: + return + + subject = f"Nieuwe woning: {listing.adres}, {listing.stad} — 
€{listing.prijs:,}" + + html = f""" + +

{listing.adres}, {listing.stad}

+

Prijs: €{listing.prijs:,}

+

Status: {listing.status}

+

Fiets P1: {travel.get('fiets_mark')} min   + OV P1: {travel.get('ov_mark')} min

+

Fiets P2: {travel.get('fiets_michelle')} min   + OV P2: {travel.get('ov_michelle')} min

+ {"" if listing.hero_image_url else ""} +

Bekijk listing

+ + """ + + msg = MIMEMultipart("alternative") + msg["Subject"] = subject + msg["From"] = config.SMTP_FROM + msg["To"] = config.SMTP_TO + msg.attach(MIMEText(html, "html")) + + try: + with smtplib.SMTP(config.SMTP_HOST, config.SMTP_PORT) as s: + if config.SMTP_USER: + s.starttls() + s.login(config.SMTP_USER, os.environ.get("SMTP_PASSWORD", "")) + s.send_message(msg) + log.info("Email verstuurd voor %s", listing.adres) + except Exception as e: + log.error("Email fout: %s", e) + + +# --------------------------------------------------------------------------- +# Orchestration +# --------------------------------------------------------------------------- + +Scraper = Callable[[], list[RawListing]] + + +def run(scrapers: list[Scraper], db_path: str) -> None: + conn = get_db(db_path) + total_new = 0 + + for scraper in scrapers: + name = scraper.__name__ + log.info("Scraper starten: %s", name) + try: + listings = scraper() + except Exception as e: + log.error("Scraper %s gefaald: %s", name, e) + continue + + log.info("Scraper %s: %d listings opgehaald", name, len(listings)) + + for listing in listings: + travel = {} + try: + # Check of het een nieuwe woning is vóór upsert + lid = listing_id(listing.url) + is_existing = conn.execute( + "SELECT id FROM woningen WHERE id = ?", (lid,) + ).fetchone() is not None + + if not is_existing: + travel = bereken_reistijden(listing.postcode) + + is_new = upsert(conn, listing, travel) + + if is_new: + total_new += 1 + log.info("Nieuwe woning: %s (%s)", listing.adres, listing.url) + notify_ha(listing, travel) + + except Exception as e: + log.error("Fout bij verwerken %s: %s", listing.url, e) + + log.info("Run klaar. 
%d nieuwe woningen gevonden.", total_new) + conn.close() diff --git a/src/main.py b/src/main.py new file mode 100644 index 0000000..8a09d55 --- /dev/null +++ b/src/main.py @@ -0,0 +1,16 @@ +import logging +import sys + +import config +from adapters import SCRAPERS +from huizenbot import run + +logging.basicConfig( + stream=sys.stdout, + level=logging.INFO, + format="%(asctime)s %(levelname)s %(name)s — %(message)s", + datefmt="%Y-%m-%dT%H:%M:%S", +) + +if __name__ == "__main__": + run(SCRAPERS, config.DB_PATH) diff --git a/src/nine292.py b/src/nine292.py new file mode 100644 index 0000000..d1dba06 --- /dev/null +++ b/src/nine292.py @@ -0,0 +1,95 @@ +"""9292 public transport travel time via their web API.""" +import hashlib +import hmac +import logging +import time +import urllib.parse +from datetime import date, timedelta + +import httpx + +log = logging.getLogger(__name__) + +_BASE_URL = "https://web-api.9292.nl" +_HMAC_SECRET = "ZVWm_Qytmq.Bo-guenFtRfUPi_vMFq4yrdDA6RYZAijNi4qocHmq6oZ" +_USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64; rv:149.0) Gecko/20100101 Firefox/149.0" + + +def _encode_params(params: dict) -> str: + """Replicate 9292's Ye() param serializer: standard urlencode with + for spaces.""" + return urllib.parse.urlencode(params).replace("%20", "+") + + +def _sign(url_path: str, params: dict) -> tuple[str, str]: + """Return (x-request-time, x-validation-token) for a request.""" + ts = str(int(time.time() * 1000)) + qs = _encode_params(params) + full = f"{url_path}{'?' 
+ qs if qs else ''}" + message = f"{ts}{full}{_USER_AGENT}" + token = hmac.new(_HMAC_SECRET.encode(), message.encode(), hashlib.sha256).hexdigest() + return ts, token + + +def _next_weekday_morning() -> str: + """First upcoming weekday at 08:30, as ISO 8601 for 9292.""" + d = date.today() + timedelta(days=1) + while d.weekday() >= 5: + d += timedelta(days=1) + return d.strftime("%Y-%m-%dT08:30:00.000Z") + + +def ov_minuten_9292(from_loc: str, to_loc: str) -> int | None: + """ + Travel time in minutes via 9292. + + Locations are 9292-style strings, e.g.: + "delft/2629hg" + "station-amsterdam-centraal" + "amsterdam/1011ab" + """ + url_path = "/api/v1/plans" + params = { + "from": from_loc.lower(), + "to": to_loc.lower(), + "requestType": "Departure", + "dateTime": _next_weekday_morning(), + "planWithAccessibility": "false", + "extraInterchangeTime": "0", + "firstMileLessWalking": "false", + "lastMileLessWalking": "false", + "firstMileModality": "Walking", + "lastMileModality": "Walking", + "previewsBefore": "0", + "previewsAfter": "3", + } + ts, token = _sign(url_path, params) + headers = { + "User-Agent": _USER_AGENT, + "Accept": "application/json, text/plain, */*", + "Accept-Language": "nl", + "x-origin": "Plan", + "x-request-time": ts, + "x-validation-token": token, + "Origin": "https://9292.nl", + "Referer": "https://9292.nl/", + } + try: + r = httpx.get( + _BASE_URL + url_path, + params=params, + headers=headers, + timeout=15, + ) + r.raise_for_status() + previews = r.json().get("previews", []) + if not previews: + log.warning("9292 geen reisadvies voor %s → %s", from_loc, to_loc) + return None + # Take the shortest non-cancelled journey + valid = [p for p in previews if not p.get("cancelled")] + if not valid: + valid = previews + return min(p["durationInMinutes"] for p in valid) + except Exception as e: + log.error("9292 fout voor %s → %s: %s", from_loc, to_loc, e) + return None diff --git a/tests/cache.py b/tests/cache.py new file mode 100644 index 
# All cache files live next to this module, in tests/cache/.
CACHE_DIR = Path(__file__).parent / "cache"
CACHE_DIR.mkdir(exist_ok=True)


def _key(url: str, params: dict[str, str] | None) -> str:
    """Return the SHA-256 hex digest that names the cache file for a request.

    Only *url* and *params* take part in the key; ``None`` and ``{}``
    params hash to the same value.
    """
    request = {"url": url, "params": params if params else {}}
    canonical = json.dumps(request, sort_keys=True)
    return hashlib.sha256(canonical.encode()).hexdigest()


def _patch():
    """Swap ``adapters.api.fetch_json`` and ``adapters.ssr.fetch_soup`` for cached versions.

    Each wrapper looks in ``CACHE_DIR`` for a file named after
    ``_key(url, params)``. On a miss it calls the original function and
    stores the result: JSON for ``fetch_json``, pickle for ``fetch_soup``.
    """
    import adapters.api as api_mod
    import adapters.ssr as ssr_mod

    fetch_json_uncached = api_mod.fetch_json
    fetch_soup_uncached = ssr_mod.fetch_soup

    def cached_fetch_json(url, *, params: dict[str, str] | None = None, headers=None):
        # headers are forwarded on a miss but are not part of the cache key.
        cache_file = CACHE_DIR / f"{_key(url, params)}.json"
        if not cache_file.exists():
            fresh = fetch_json_uncached(url, params=params, headers=headers)
            cache_file.write_text(json.dumps(fresh))
            return fresh
        print(f"[cache hit] {url}")
        return json.loads(cache_file.read_text())

    def cached_fetch_soup(url, *, params=None):
        cache_file = CACHE_DIR / f"{_key(url, params)}.pickle"
        if not cache_file.exists():
            fresh = fetch_soup_uncached(url, params=params)
            cache_file.write_bytes(pickle.dumps(fresh))
            return fresh
        print(f"[cache hit] {url}")
        # Unpickling executes arbitrary code; only load cache files that
        # were written by this module.
        return pickle.loads(cache_file.read_bytes())

    api_mod.fetch_json = cached_fetch_json
    ssr_mod.fetch_soup = cached_fetch_soup
    print("[cache] fetch_json and fetch_soup patched")


# Patch at import time, so importing this module is all a test file needs to do.
_patch()
"""Manual smoke test for geocoding and the bike / public-transport travel times.

Run as a script; it performs live lookups and prints the results.
"""
import sys
from pathlib import Path

# Resolve src/ from this file's location, so the script works from any
# working directory (a bare "../src" only works when started inside tests/).
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "src"))

from huizenbot import fiets_minuten, ov_minuten, geocode  # noqa: E402
from config import MARK_WERK_9292, MARK_WERK_POSTCODE  # noqa: E402

# --- change these to test different postcodes ---
POSTCODE_FROM = "3028GE"
POSTCODE_FROM_9292 = "rotterdam/" + POSTCODE_FROM

if __name__ == "__main__":
    print("=== Geocode ===")
    origin = geocode(POSTCODE_FROM)
    dest = geocode(MARK_WERK_POSTCODE)
    print(f" {POSTCODE_FROM} → {origin}")
    print(f" {MARK_WERK_POSTCODE} → {dest}")
    # The bike lookup below needs both geocoded points.
    if not origin or not dest:
        print("Geocode mislukt, stop.")
        sys.exit(1)

    print("\n=== Fiets (OSRM) ===")
    print(f" {fiets_minuten(origin, dest)} minuten")

    print("\n=== OV (9292) ===")
    print(f" {ov_minuten(POSTCODE_FROM_9292, MARK_WERK_9292)} minuten")