add some more makelaars, and some more infra
This commit is contained in:
@@ -1,5 +1,3 @@
|
|||||||
NAVITIA_API_KEY=
|
|
||||||
|
|
||||||
HA_WEBHOOK_URL=
|
HA_WEBHOOK_URL=
|
||||||
|
|
||||||
SMTP_HOST=
|
SMTP_HOST=
|
||||||
|
|||||||
358
add_scraper_context.md
Normal file
358
add_scraper_context.md
Normal file
@@ -0,0 +1,358 @@
|
|||||||
|
# Huizenbot — Agent Context for Adding Routes
|
||||||
|
|
||||||
|
## Project Overview
|
||||||
|
|
||||||
|
**Huizenbot** is a periodic scraper of real estate broker websites in Delft and Schiedam (Netherlands). It:
|
||||||
|
- Fetches property listings from broker websites
|
||||||
|
- Saves new ones to SQLite with `RawListing` schema
|
||||||
|
- Calculates travel times (bike + public transit) to two work locations
|
||||||
|
- Sends push notifications via Home Assistant webhook (with email fallback)
|
||||||
|
|
||||||
|
**Your role:** You will add new broker routes (scrapers) to the `adapters/` directory. A human will:
|
||||||
|
1. Select a broker from the list
|
||||||
|
2. Help you investigate the broker's website
|
||||||
|
3. For API-based brokers: develop curl requests to test
|
||||||
|
4. For HTML scrapers: develop parsing logic using BeautifulSoup
|
||||||
|
5. Run `tests/test_adapters.py` to validate
|
||||||
|
6. Merge your code snippets into the codebase
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Key Schema: RawListing
|
||||||
|
|
||||||
|
**Location:** `src/huizenbot.py` (lines 29–52)
|
||||||
|
|
||||||
|
This is the data model you must populate. All fields except `url` are optional:
|
||||||
|
|
||||||
|
```python
|
||||||
|
@dataclass
|
||||||
|
class RawListing:
|
||||||
|
url: str # REQUIRED — the listing URL
|
||||||
|
|
||||||
|
source_makelaar: str = "" # Name of the broker (e.g., "bjornd", "vdaal")
|
||||||
|
datum_aanmelding: str | None = None # ISO 8601 date if available
|
||||||
|
status: str = "beschikbaar" # enum: beschikbaar | onder_bod | verkocht
|
||||||
|
|
||||||
|
# Location
|
||||||
|
adres: str | None = None # Street address (e.g., "Binnenwatersloot 3")
|
||||||
|
postcode: str | None = None # Dutch postcode (e.g., "2611CA")
|
||||||
|
stad: str | None = None # City (e.g., "Delft")
|
||||||
|
|
||||||
|
# Property details
|
||||||
|
prijs: int | None = None # Price in euros (integer, no float)
|
||||||
|
woningtype: str | None = None # Type (e.g., "appartement", "tussenwoning")
|
||||||
|
woonoppervlak: int | None = None # Living space in m²
|
||||||
|
perceeloppervlak: int | None = None # Plot size in m² (NULL for apartments)
|
||||||
|
kamers: int | None = None # Number of rooms
|
||||||
|
slaapkamers: int | None = None # Number of bedrooms
|
||||||
|
bouwjaar: int | None = None # Build year
|
||||||
|
energielabel: str | None = None # Energy label (e.g., "A", "B")
|
||||||
|
|
||||||
|
# Media
|
||||||
|
hero_image_url: str | None = None # Main photo URL
|
||||||
|
|
||||||
|
# Extra data (broker-specific fields)
|
||||||
|
extra: dict[str, Any] = field(default_factory=dict) # Arbitrary JSON data
|
||||||
|
```
|
||||||
|
|
||||||
|
**DB Upsert:** A listing is inserted the first time it is seen (with `id = sha256(url)`); on subsequent runs only `last_seen` and `status` are updated. Travel times are calculated only on first insert.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Adapter Structure
|
||||||
|
|
||||||
|
Adapters live in `src/adapters/` and are organized by type:
|
||||||
|
|
||||||
|
### Two Adapter Types
|
||||||
|
|
||||||
|
#### 1. **API-based** (`src/adapters/api.py`)
|
||||||
|
For brokers with REST/JSON endpoints.
|
||||||
|
|
||||||
|
**Pattern:**
|
||||||
|
```python
|
||||||
|
def fetch_bjornd() -> list[RawListing]:
|
||||||
|
data = fetch_json("https://...", params={...}, headers={...})
|
||||||
|
listings = []
|
||||||
|
for item in data:
|
||||||
|
# Filter / validate
|
||||||
|
if item.get("status") in _SKIP:
|
||||||
|
continue
|
||||||
|
if item.get("price") > config.MAX_PRICE:
|
||||||
|
continue
|
||||||
|
|
||||||
|
listings.append(RawListing(
|
||||||
|
url=item["url"],
|
||||||
|
source_makelaar="bjornd",
|
||||||
|
adres=item.get("address"),
|
||||||
|
postcode=item.get("zipcode"),
|
||||||
|
# ... etc
|
||||||
|
))
|
||||||
|
|
||||||
|
log.info("bjornd: %d listings", len(listings))
|
||||||
|
return listings
|
||||||
|
```
|
||||||
|
|
||||||
|
**Helpers available:**
|
||||||
|
- `fetch_json(url, *, params=None, headers=None)` — GET with User-Agent, timeout, Retry-After handling
|
||||||
|
- Built-in logging via `log = logging.getLogger("huizenbot.api")`
|
||||||
|
|
||||||
|
#### 2. **SSR/HTML-based** (`src/adapters/ssr.py`)
|
||||||
|
For brokers with server-side rendered HTML.
|
||||||
|
|
||||||
|
**Pattern:**
|
||||||
|
```python
|
||||||
|
def fetch_vdaal() -> list[RawListing]:
|
||||||
|
soup = fetch_soup("https://vdaalmakelaardij.nl/aanbod")
|
||||||
|
listings = []
|
||||||
|
|
||||||
|
for card in soup.select(".property-card"):
|
||||||
|
try:
|
||||||
|
url = card.select_one("a[href]")["href"]
|
||||||
|
if not url.startswith("http"):
|
||||||
|
url = VDAAL_BASE + url
|
||||||
|
|
||||||
|
adres = _text(card, ".address-selector")
|
||||||
|
postcode = _extract_postcode(adres)
|
||||||
|
prijs = parse_prijs(_text(card, ".price"))
|
||||||
|
|
||||||
|
listings.append(RawListing(
|
||||||
|
url=url,
|
||||||
|
source_makelaar="vdaal",
|
||||||
|
adres=adres,
|
||||||
|
postcode=postcode,
|
||||||
|
stad=_infer_stad(postcode),
|
||||||
|
prijs=prijs,
|
||||||
|
# ... etc
|
||||||
|
))
|
||||||
|
except Exception as e:
|
||||||
|
log.warning("Parse error: %s", e)
|
||||||
|
|
||||||
|
log.info("vdaal: %d listings", len(listings))
|
||||||
|
return listings
|
||||||
|
```
|
||||||
|
|
||||||
|
**Helpers available:**
|
||||||
|
- `fetch_soup(url, *, params=None)` — GET with BeautifulSoup, Retry-After handling
|
||||||
|
- `parse_prijs(text)` — Extract price from strings like "€ 325.000 k.k." → 325000
|
||||||
|
- `parse_m2(text)` — Extract area from "87 m²" → 87
|
||||||
|
- `_text(soup, selector)` — Get inner text from element
|
||||||
|
- `_src(soup, selector)` — Get src or data-src attribute
|
||||||
|
- `_extract_postcode(text)` — Regex postcode from any text
|
||||||
|
- `_infer_stad(postcode)` — Simple lookup: 2600–2629 → Delft, 3100–3135 → Schiedam
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Registration
|
||||||
|
|
||||||
|
Both `api.py` and `ssr.py` have a `SCRAPERS` dict at the bottom:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# api.py
|
||||||
|
SCRAPERS = {
|
||||||
|
'bjornd': fetch_bjornd,
|
||||||
|
'your_broker': fetch_your_broker, # ← Add here
|
||||||
|
}
|
||||||
|
|
||||||
|
# ssr.py
|
||||||
|
SCRAPERS = {
|
||||||
|
'bjornd_demo': fetch_bjornd_demo,
|
||||||
|
'your_broker': fetch_your_broker, # ← Add here
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
The `src/adapters/__init__.py` merges both dicts, so the runner picks up all registered adapters automatically.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Testing Workflow
|
||||||
|
|
||||||
|
### 1. Understand the Website
|
||||||
|
The human will help you:
|
||||||
|
- Identify the broker's API endpoint (or the HTML structure)
|
||||||
|
- Check for a `robots.txt` or rate limit headers
|
||||||
|
- Write exploratory curl requests (for APIs) or BeautifulSoup inspections
|
||||||
|
|
||||||
|
### 2. Develop & Test Locally
|
||||||
|
- Add your scraper function to the appropriate file (`api.py` or `ssr.py`)
|
||||||
|
- Register it in the `SCRAPERS` dict
|
||||||
|
- The human updates `tests/test_adapters.py` to point to your adapter:
|
||||||
|
```python
|
||||||
|
ADAPTER = SCRAPERS['your_broker_name']
|
||||||
|
```
|
||||||
|
- Run the test:
|
||||||
|
```bash
|
||||||
|
cd tests && python test_adapters.py
|
||||||
|
```
|
||||||
|
- The test prints listings in a simple format so you can validate output
|
||||||
|
|
||||||
|
### 3. Merge Code
|
||||||
|
Once validated, the human will **copy your inline code snippets** into the main codebase. You produce **easily pasteable functions**, not entire files.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Config & Constants
|
||||||
|
|
||||||
|
**Location:** `src/config.py`
|
||||||
|
|
||||||
|
Key values you may reference:
|
||||||
|
- `MAX_PRICE = 300_000` — Price filter (your scraper can skip listings above this)
|
||||||
|
- `USER_AGENT = "Huizenbot/1.0 (+mark@kalsbeek.dev) persoonlijk gebruik"` — Used in all HTTP headers
|
||||||
|
- `MARK_WERK_POSTCODE`, `MICHELLE_WERK_POSTCODE` — Work postcodes for travel time calculation
|
||||||
|
|
||||||
|
Secrets (API keys, webhook URLs) are **environment variables**, not in config.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## CMS Detection Tool
|
||||||
|
|
||||||
|
Before investigating a broker's HTML manually, ask the human in the loop to run `autoscraper.py` from the project root:
|
||||||
|
```bash
|
||||||
|
python autoscraper.py listings <listings-url>
|
||||||
|
python autoscraper.py details <detail-page-url>
|
||||||
|
```
|
||||||
|
|
||||||
|
If the broker uses a known CMS, the tool prints the exact code to add — no further investigation needed. Currently detected CMSes:
|
||||||
|
|
||||||
|
- **Realworks** → prints a ready-to-paste `fetch_realworks(...)` one-liner for `ssr.py`
|
||||||
|
|
||||||
|
If the CMS is unknown, the tool prints structural diagnostics (card selectors, field patterns, pagination) to guide manual adapter development.
|
||||||
|
|
||||||
|
## Important Notes
|
||||||
|
|
||||||
|
### Status Mapping
|
||||||
|
Brokers use different status strings. Always map to one of:
|
||||||
|
- `"beschikbaar"` — Available for sale
|
||||||
|
- `"onder_bod"` — Under offer
|
||||||
|
- `"verkocht"` — Sold
|
||||||
|
|
||||||
|
Example from api.py:
|
||||||
|
```python
|
||||||
|
_STATUS_MAP = {
|
||||||
|
"available": "beschikbaar",
|
||||||
|
"under_bid": "onder_bod",
|
||||||
|
"sold": "verkocht",
|
||||||
|
}
|
||||||
|
status = _STATUS_MAP.get(item.get("status"), "beschikbaar")
|
||||||
|
```
|
||||||
|
|
||||||
|
### Postcode Extraction
|
||||||
|
Always aim for the **Dutch postcode format** (4 digits + 2 letters, e.g., `"2611CA"`). The travel time calculation depends on it. If a broker only provides the address string, use `_extract_postcode(address)`.
|
||||||
|
|
||||||
|
### Price Handling
|
||||||
|
Prices are **integers** (euros), never floats. Use `parse_prijs()` for HTML.
|
||||||
|
|
||||||
|
### Image URLs
|
||||||
|
Store the hero/main image URL in `hero_image_url`. This appears in Home Assistant notifications.
|
||||||
|
|
||||||
|
### Extra Data
|
||||||
|
If a broker provides extra fields that don't fit the schema (e.g., balcony, garden, orientation), store them in the `extra` dict:
|
||||||
|
```python
|
||||||
|
listings.append(RawListing(
|
||||||
|
url=...,
|
||||||
|
...
|
||||||
|
extra={
|
||||||
|
"balcony": item.get("has_balcony"),
|
||||||
|
"garden": item.get("has_garden"),
|
||||||
|
"custom_field": item.get("something_else"),
|
||||||
|
}
|
||||||
|
))
|
||||||
|
```
|
||||||
|
|
||||||
|
The database stores this as JSON in the `extra` column.
|
||||||
|
|
||||||
|
### Error Handling
|
||||||
|
- Wrap individual listing parsing in try/except to continue on one bad listing
|
||||||
|
- Log parse failures as warnings, not errors (broker HTML changes frequently, so occasional failures are expected)
|
||||||
|
- Let HTTP errors bubble up (the runner catches them at the adapter level)
|
||||||
|
|
||||||
|
### Rate Limiting & Ethics
|
||||||
|
- Both `fetch_json()` and `fetch_soup()` handle 429 Retry-After automatically
|
||||||
|
- Nominatim (geocoding) has a 1 req/s limiter built into `huizenbot.py`
|
||||||
|
- Never spawn parallel requests without the human's approval
|
||||||
|
- Always use the `USER_AGENT` header (includes contact info for respectful scraping)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Example: Adding "Van Daal" (API-based)
|
||||||
|
|
||||||
|
### Scenario
|
||||||
|
The human finds that Van Daal (vandaalmakelaardij.nl) has a JSON API at:
|
||||||
|
```
|
||||||
|
https://api.vandaal.nl/listings?city=delft&status=available
|
||||||
|
```
|
||||||
|
|
||||||
|
### Your Code (add to api.py)
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Van Daal
|
||||||
|
# --------
|
||||||
|
_VANDAAL_BASE = "https://www.vandaalmakelaardij.nl"
|
||||||
|
_VANDAAL_API = "https://api.vandaal.nl/listings"
|
||||||
|
|
||||||
|
_VANDAAL_STATUS_MAP = {
|
||||||
|
"available": "beschikbaar",
|
||||||
|
"under_offer": "onder_bod",
|
||||||
|
"sold": "verkocht",
|
||||||
|
}
|
||||||
|
|
||||||
|
def fetch_vandaal() -> list[RawListing]:
|
||||||
|
listings = []
|
||||||
|
for city in ["delft", "schiedam"]:
|
||||||
|
data = fetch_json(
|
||||||
|
_VANDAAL_API,
|
||||||
|
params={"city": city, "status": "available"}
|
||||||
|
)
|
||||||
|
|
||||||
|
for item in data.get("listings", []):
|
||||||
|
if item.get("price", 0) > config.MAX_PRICE:
|
||||||
|
continue
|
||||||
|
|
||||||
|
listings.append(RawListing(
|
||||||
|
url=item["url"],
|
||||||
|
source_makelaar="vandaal",
|
||||||
|
adres=item.get("address"),
|
||||||
|
postcode=item.get("postcode"),
|
||||||
|
stad=item.get("city"),
|
||||||
|
prijs=item.get("price"),
|
||||||
|
woningtype=item.get("type"),
|
||||||
|
woonoppervlak=item.get("living_area"),
|
||||||
|
slaapkamers=item.get("bedrooms"),
|
||||||
|
hero_image_url=item.get("image_url"),
|
||||||
|
))
|
||||||
|
|
||||||
|
log.info("vandaal: %d listings", len(listings))
|
||||||
|
return listings
|
||||||
|
```
|
||||||
|
|
||||||
|
### Register in SCRAPERS (in api.py)
|
||||||
|
```python
|
||||||
|
SCRAPERS = {
|
||||||
|
'bjornd': fetch_bjornd,
|
||||||
|
'vandaal': fetch_vandaal, # ← Add this
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Test
|
||||||
|
Human updates `test_adapters.py`:
|
||||||
|
```python
|
||||||
|
ADAPTER = SCRAPERS['vandaal']
|
||||||
|
```
|
||||||
|
|
||||||
|
Then runs:
|
||||||
|
```bash
|
||||||
|
cd tests && python test_adapters.py
|
||||||
|
```
|
||||||
|
|
||||||
|
If all looks good, the human copies the `fetch_vandaal()` function into the real `api.py` and adds it to `SCRAPERS`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
1. **You receive** an adapter request + investigation results (API endpoint or HTML structure)
|
||||||
|
2. **You write** a clean, self-contained scraper function that returns `list[RawListing]`
|
||||||
|
3. **You register** it in the appropriate `SCRAPERS` dict
|
||||||
|
4. **The human tests** it with `test_adapters.py` and validates output
|
||||||
|
5. **The human merges** your code into the production files
|
||||||
|
|
||||||
|
Keep code simple, use the provided helpers, populate `RawListing` fields as best you can, and always set `source_makelaar` and `url` correctly.
|
||||||
290
autoscraper.py
Normal file
290
autoscraper.py
Normal file
@@ -0,0 +1,290 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
autoscraper.py — detect CMS and extract patterns from broker pages
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python autoscraper.py listings <url> — detect CMS + card structure
|
||||||
|
python autoscraper.py details <url> — detect CMS + kenmerk patterns
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
from bs4 import BeautifulSoup, Tag
|
||||||
|
|
||||||
|
UA = "Huizenbot/1.0 (+mark@kalsbeek.dev) persoonlijk gebruik"
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# CMS fingerprints
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Each entry: (name, listings_signal, details_signal, adapter_hint)
|
||||||
|
# signals are (selector, min_count) tuples — all must match
|
||||||
|
CMS_FINGERPRINTS = [
|
||||||
|
{
|
||||||
|
"name": "Realworks",
|
||||||
|
"listings": [("li.aanbodEntry", 1), ("span.kenmerkValue", 1)],
|
||||||
|
"details": [("span.kenmerkName", 3), ("span.kenmerkValue", 3)],
|
||||||
|
"hint": "fetch_realworks('{base_url}', '{makelaar}')",
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Candidate card selectors (tried in order for unknown CMS)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
CARD_CANDIDATES = [
|
||||||
|
"li.aanbodEntry",
|
||||||
|
"article",
|
||||||
|
"li[class*=object]",
|
||||||
|
"li[class*=woning]",
|
||||||
|
"li[class*=listing]",
|
||||||
|
"div[class*=object-item]",
|
||||||
|
"div[class*=property-item]",
|
||||||
|
"div[class*=aanbod]",
|
||||||
|
".listing-item",
|
||||||
|
]
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Regex patterns for field detection
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
RE_POSTCODE = re.compile(r"\b\d{4}\s?[A-Z]{2}\b")
|
||||||
|
RE_PRICE = re.compile(r"€\s*[\d.,]+")
|
||||||
|
RE_M2 = re.compile(r"\d+\s*m[²2]")
|
||||||
|
RE_PAGE_URL = re.compile(r"pagina[-/]?\d+|[?&]p(?:age)?=\d+|/\d+/?$")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def fetch(url: str) -> BeautifulSoup:
    """Download *url* and return the response body parsed with ``html.parser``.

    The request sends the module-level ``UA`` string as User-Agent, follows
    redirects and uses a 15 second timeout. ``raise_for_status()`` is called
    on the response before parsing, so HTTP error responses propagate to the
    caller as exceptions.
    """
    request_headers = {"User-Agent": UA}
    response = httpx.get(
        url,
        headers=request_headers,
        timeout=15,
        follow_redirects=True,
    )
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")
|
||||||
|
|
||||||
|
|
||||||
|
def _selector_path(el: Tag) -> str:
    """Return a short CSS-like path for *el*: its two nearest ancestors plus itself.

    Each step is rendered as ``tag.class1.class2`` (or just ``tag`` when the
    element has no classes) and steps are joined with `` > ``. The document
    root, ``<html>`` and ``<body>`` are never included.

    The ancestors are walked nearest-first so that the steps shown really are
    the element's closest containers; walking from the document root and
    stopping after three entries would report distant ancestors for deeply
    nested elements.
    """

    def _label(node) -> str:
        # "tag.cls1.cls2", or bare "tag" when there is no class attribute.
        classes = ".".join(node.get("class", []))
        return f"{node.name}.{classes}" if classes else node.name

    nearest = []
    for ancestor in el.parents:  # yields parent, grandparent, ... outward
        if ancestor.name in (None, "[document]", "html", "body"):
            continue
        nearest.append(_label(ancestor))
        if len(nearest) == 2:
            break

    # `nearest` is innermost-first; flip it so the path reads outer -> inner.
    return " > ".join([*reversed(nearest), _label(el)])
|
||||||
|
|
||||||
|
|
||||||
|
def _detect_cms(soup: BeautifulSoup, mode: str) -> dict | None:
    """Return the first entry of ``CMS_FINGERPRINTS`` that matches *soup*.

    *mode* selects which signal list of each fingerprint is checked:
    ``"listings"`` uses the ``"listings"`` signals, any other value uses the
    ``"details"`` signals. A fingerprint matches when every
    ``(selector, min_count)`` signal finds at least ``min_count`` elements.
    Returns ``None`` when no fingerprint matches.
    """
    signal_key = "listings" if mode == "listings" else "details"

    def _matches(fingerprint: dict) -> bool:
        for selector, min_count in fingerprint[signal_key]:
            if len(soup.select(selector)) < min_count:
                return False
        return True

    return next((fp for fp in CMS_FINGERPRINTS if _matches(fp)), None)
|
||||||
|
|
||||||
|
|
||||||
|
def _find_cards(soup: BeautifulSoup) -> tuple[list, str | None]:
    """Locate the repeated "card" elements on a listings page.

    Each selector in ``CARD_CANDIDATES`` is tried in order and the first one
    matching at least two elements wins. If none does, the elements sharing
    the most frequent ``class`` attribute value are used instead.

    Returns:
        ``(elements, label)``. *label* is the winning candidate selector, or
        for the fallback a display string with an "auto-detected" suffix.
        ``([], None)`` is returned when nothing on the page repeats.
    """
    for sel in CARD_CANDIDATES:
        found = soup.select(sel)
        if len(found) >= 2:
            return found, sel

    # Fallback: group elements by their exact class list and take the largest
    # group. The grouped elements are returned directly rather than
    # re-querying with a selector assembled from the raw class names: utility
    # classes such as "md:flex" or "w-1/2" are not valid in an unescaped CSS
    # selector and would make soup.select() raise.
    by_class: dict[tuple, list] = {}
    for el in soup.find_all(True):
        cls = tuple(el.get("class", []))
        if cls:
            by_class.setdefault(cls, []).append(el)

    if by_class:
        # max() keeps the first-seen group on ties, like Counter.most_common(1).
        top_cls, members = max(by_class.items(), key=lambda kv: len(kv[1]))
        if len(members) >= 2:
            sel = "." + ".".join(top_cls)  # used as a display label only
            return members, f"{sel} (auto-detected, count={len(members)})"
    return [], None
|
||||||
|
|
||||||
|
|
||||||
|
def _pattern_hits(soup: BeautifulSoup, pattern: re.Pattern, label: str):
    """Print where text matching *pattern* occurs inside *soup*.

    Every text node matching *pattern* contributes one hit made of its
    parent's text (first 80 characters) and the parent's selector path.
    When there is at least one hit, a header with the total count is printed
    followed by the first four hits; otherwise nothing is printed.
    """
    hits = [
        (node.parent.get_text(strip=True)[:80], _selector_path(node.parent))
        for node in soup.find_all(string=pattern)
        if node.parent
    ]
    if not hits:
        return

    print(f"\n [{label}] — {len(hits)} hit(s)")
    for snippet, path in hits[:4]:
        print(f" {path}")
        print(f" → {snippet!r}")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Commands
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def cmd_listings(url: str):
    """Inspect a listings (overview) page and print adapter guidance.

    If a known CMS fingerprint matches, a ready-to-paste adapter snippet is
    printed and the function returns. Otherwise structural diagnostics are
    printed: the detected card elements (first card markup and its classed
    children), postcode / price / m² pattern hits, and pagination links.
    """
    print(f"Fetching: {url}\n")
    soup = fetch(url)
    # First three "/"-separated pieces of the URL, i.e. "scheme://host".
    base_url = "/".join(url.split("/")[:3])

    cms = _detect_cms(soup, "listings")
    if cms:
        print(f"✓ CMS DETECTED: {cms['name']}")
        hint = cms["hint"].format(base_url=base_url, makelaar="<name>")
        print("\n Add to ssr.py:\n")
        print(" def fetch_<name>() -> list[RawListing]:")
        print(f" return {hint}\n")
        print(" Register in SCRAPERS dict:")
        print(" '<name>': fetch_<name>,")
        return

    print("✗ CMS unknown — structural diagnostics:\n")

    # Cards
    cards, matched_sel = _find_cards(soup)
    print(f"=== CARDS ({matched_sel or 'none found'}: {len(cards)}) ===")
    if cards:
        first_card = cards[0]
        print("\n--- FIRST CARD ---")
        print(first_card.prettify()[:2500])
        print("\n--- CHILD ELEMENTS & CLASSES ---")
        for child in first_card.find_all(True):
            classes = child.get("class")
            if not classes:
                continue
            preview = child.get_text(strip=True)[:50]
            print(f" <{child.name}> .{' .'.join(classes)} {preview!r}")

    # Pattern hits: restricted to the first card when one was found,
    # otherwise the whole page is searched.
    if cards:
        search_area = cards[0]
    else:
        search_area = soup
    print("\n=== FIELD PATTERNS ===")
    for regex, field_label in (
        (RE_POSTCODE, "postcode"),
        (RE_PRICE, "prijs"),
        (RE_M2, "m²"),
    ):
        _pattern_hits(search_area, regex, field_label)

    # Pagination
    print("\n=== PAGINATION ===")
    page_links = soup.find_all("a", href=RE_PAGE_URL)
    if not page_links:
        print(" No pagination links found")
        return

    printed = set()
    for link in page_links:
        target = link.get("href", "")
        if target in printed:
            continue  # each distinct href is listed once
        printed.add(target)
        print(f" {target!r} — {link.get_text(strip=True)!r}")
|
||||||
|
|
||||||
|
|
||||||
|
def cmd_details(url: str):
    """Inspect a detail page and print which listing fields can be extracted.

    If a known CMS fingerprint matches, the ``span.kenmerk`` name/value pairs
    are collected and each target field is reported as found or not found.
    Otherwise structural diagnostics are printed: headings (address
    candidates), key-value structures, postcode / price / m² pattern hits and
    the first five images.
    """
    print(f"Fetching: {url}\n")
    soup = fetch(url)

    cms = _detect_cms(soup, "details")

    if cms:
        print(f"✓ CMS DETECTED: {cms['name']}")
        print("\n _realworks_detail() will extract:")
        kv: dict[str, str] = {}
        for kenmerk in soup.select("span.kenmerk"):
            label_el = kenmerk.select_one("span.kenmerkName")
            value_el = kenmerk.select_one("span.kenmerkValue")
            if label_el and value_el:
                # Labels are lower-cased so they can be looked up below.
                kv[label_el.get_text(strip=True).lower()] = value_el.get_text(strip=True)

        # page label (lower-case) -> name of the field it would populate
        target_fields = {
            "type woning": "woningtype",
            "bouwjaar": "bouwjaar",
            "woonoppervlakte": "woonoppervlak",
            "perceeloppervlakte": "perceeloppervlak",
            "aantal kamers": "kamers",
            "aantal slaapkamers": "slaapkamers",
            "energieklasse": "energielabel",
        }
        for key, attr in target_fields.items():
            val = kv.get(key, "NOT FOUND")
            status = "✓" if key in kv else "✗"
            print(f" {status} {attr:<20} ← {key!r}: {val!r}")
        return

    print("✗ CMS unknown — structural diagnostics:\n")

    # Address
    print("=== ADDRESS ===")
    for tag in ["h1", "h2"]:
        for el in soup.select(tag):
            t = el.get_text(strip=True)
            if t:
                print(f" <{tag}> {t!r}")

    # Key-value patterns
    print("\n=== KEY-VALUE STRUCTURES ===")
    kv_selectors = [
        ("dl", "dt", "dd"),
        ("table", "th", "td"),
        (".kenmerk", ".kenmerkName", ".kenmerkValue"),
        (".spec", ".spec-label", ".spec-value"),
        (".feature", ".feature-label", ".feature-value"),
    ]
    found_any = False
    for container_sel, label_sel, value_sel in kv_selectors:
        pairs = []
        for container in soup.select(container_sel)[:50]:
            # Pair up every label with the value at the same position. A
            # single <dl> or <table> normally holds many label/value pairs;
            # looking only at the first one would hide almost all of them.
            label_els = container.select(label_sel)
            value_els = container.select(value_sel)
            for label_el, value_el in zip(label_els, value_els):
                label = label_el.get_text(strip=True)
                value = value_el.get_text(strip=True)
                if label and value:
                    pairs.append((label, value))
        if pairs:
            found_any = True
            print(f"\n [{container_sel} > {label_sel} / {value_sel}] — {len(pairs)} pairs")
            for label, value in pairs[:10]:
                print(f" {label:<30} {value}")

    if not found_any:
        print(" No key-value structures detected")

    # Field pattern hits
    print("\n=== FIELD PATTERNS ===")
    _pattern_hits(soup, RE_POSTCODE, "postcode")
    _pattern_hits(soup, RE_PRICE, "prijs")
    _pattern_hits(soup, RE_M2, "m²")

    # Images
    print("\n=== IMAGES (first 5) ===")
    for img in soup.select("img")[:5]:
        # Fall back to data-src when src is missing or empty.
        src = img.get("src") or img.get("data-src")
        alt = img.get("alt", "")
        print(f" {src} [{alt}]")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Entry point
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point: ``autoscraper.py <listings|details> <url>``.

    Prints the module docstring and exits with status 1 when fewer than two
    arguments are given; prints an error and exits with status 1 for an
    unrecognised command. Otherwise runs the matching ``cmd_*`` function.
    """
    if len(sys.argv) < 3:
        print(__doc__)
        sys.exit(1)

    cmd, url = sys.argv[1], sys.argv[2]

    handlers = {
        "listings": cmd_listings,
        "details": cmd_details,
    }
    handler = handlers.get(cmd)
    if handler is None:
        print(f"Unknown command: {cmd}")
        sys.exit(1)
    handler(url)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
3
bsprettify.py
Normal file
3
bsprettify.py
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
import sys
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
# Read an HTML document from stdin and write its pretty-printed form to stdout.
raw_html = sys.stdin.read()
parsed = BeautifulSoup(raw_html, 'html.parser')
print(parsed.prettify())
|
||||||
82
makelaars.md
82
makelaars.md
@@ -2,37 +2,57 @@
|
|||||||
|
|
||||||
## Delft
|
## Delft
|
||||||
|
|
||||||
| Naam | Website | Adres |
|
| Done | Naam | Website | Adres |
|
||||||
|------|---------|-------|
|
|------|------|---------|-------|
|
||||||
| Van Silfhout & Hogetoorn Wereldmakelaars | vansilfhout.nl | Ireneboulevard 2 |
|
| [ ] | Van Silfhout & Hogetoorn Wereldmakelaars | vansilfhout.nl | Ireneboulevard 2 |
|
||||||
| Van Daal Makelaardij | vandaalmakelaardij.nl | Voldersgracht 33 |
|
| [ ] | Van Daal Makelaardij | vandaalmakelaardij.nl | Voldersgracht 33 |
|
||||||
| Björnd Makelaardij | bjornd.nl | Oude Delft 103 |
|
| [x] | Björnd Makelaardij | bjornd.nl | Oude Delft 103 |
|
||||||
| Hof van Delft Makelaardij | hofvandelftmakelaardij.nl | Wateringsevest 26 |
|
| [ ] | Hof van Delft Makelaardij | hofvandelftmakelaardij.nl | Wateringsevest 26 |
|
||||||
| V&W Makelaars Delft | vwmakelaars.nl | Coenderstraat 31 |
|
| [ ] | V&W Makelaars Delft | vwmakelaars.nl | Coenderstraat 31 |
|
||||||
| Roepman Makelaardij NVM | roepman.nl | Molslaan 43 |
|
| [ ] | Roepman Makelaardij NVM | roepman.nl | Molslaan 43 |
|
||||||
| ZO makelaars | zomakelaars.nl | Van Foreestweg 4 |
|
| [ ] | ZO makelaars | zomakelaars.nl | Van Foreestweg 4 |
|
||||||
| Marloes Makelaars | — | Maerten Trompstraat 28 |
|
| [ ] | Marloes Makelaars | — | Maerten Trompstraat 28 |
|
||||||
| Makelaarskantoor J.E. Mouthaan | — | Julianalaan 43 |
|
| [ ] | Makelaarskantoor J.E. Mouthaan | — | Julianalaan 43 |
|
||||||
| Olsthoorn Makelaars Delft | olsthoornmakelaars.nl | Noordeinde 51 |
|
| [ ] | Olsthoorn Makelaars Delft | olsthoornmakelaars.nl | Noordeinde 51 |
|
||||||
| Post Makelaardij (v/h Bayense) | postmakelaardij.nl | Spoorsingel 1a |
|
| [ ] | Post Makelaardij (v/h Bayense) | postmakelaardij.nl | Spoorsingel 1a |
|
||||||
| Morris NVM Makelaars | morrismakelaardij.nl | — |
|
| [ ] | Morris NVM Makelaars | morrismakelaardij.nl | — |
|
||||||
| Prinsenstad Makelaardij | — | — |
|
| [ ] | Prinsenstad Makelaardij | — | — |
|
||||||
| Oude Delft Makelaardij | — | — |
|
| [ ] | Oude Delft Makelaardij | — | — |
|
||||||
| Dijksman Woningmakelaars | — | — |
|
| [ ] | Dijksman Woningmakelaars | — | — |
|
||||||
| CORPOwonen | — | — |
|
| [ ] | CORPOwonen | — | — |
|
||||||
|
|
||||||
## Schiedam
|
## Schiedam
|
||||||
|
|
||||||
| Naam | Website | Adres |
|
| Done | Naam | Website | Adres |
|
||||||
|------|---------|-------|
|
|------|------|---------|-------|
|
||||||
| Anke Bodewes Makelaardij | ankebodewes.nl | Hargplein 118 |
|
| [x] | Anke Bodewes Makelaardij | ankebodewes.nl | Hargplein 118 |
|
||||||
| Woongoed Makelaars Schiedam | woongoedmakelaars.nl | Oranjestraat 93 |
|
| [x] | Woongoed Makelaars Schiedam | woongoedmakelaars.nl | Oranjestraat 93 |
|
||||||
| Ooms Makelaars Schiedam | ooms.com | Gerrit Verboonstraat 2 |
|
| [x] | Ooms Makelaars Schiedam | ooms.com | Gerrit Verboonstraat 2 |
|
||||||
| De Witte Garantiemakelaars | dewittegarantiemakelaars.nl | Philippusweg 2 |
|
| [ ] | De Witte Garantiemakelaars | dewittegarantiemakelaars.nl | Philippusweg 2 |
|
||||||
| Makelaardij Wassenaar | makelaardijwassenaar.nl | Gerrit Verboonstraat 12 |
|
| [ ] | Makelaardij Wassenaar | makelaardijwassenaar.nl | Gerrit Verboonstraat 12 |
|
||||||
| 3D Makelaars | 3dmakelaars.nl | Gerrit Verboonstraat 17 |
|
| [ ] | 3D Makelaars | 3dmakelaars.nl | Gerrit Verboonstraat 17 |
|
||||||
| Dupont Makelaars | dupont.nl | Rotterdamsedijk 437 |
|
| [ ] | Dupont Makelaars | dupont.nl | Rotterdamsedijk 437 |
|
||||||
| D&S Makelaardij | densmakelaars.nl | Land van Belofte 50 |
|
| [ ] | D&S Makelaardij | densmakelaars.nl | Land van Belofte 50 |
|
||||||
| Moerman & De Jong Makelaars | moerman-dejong.nl | Lange Kerkstraat 80B |
|
| [ ] | Moerman & De Jong Makelaars | moerman-dejong.nl | Lange Kerkstraat 80B |
|
||||||
| Hagestein Makelaardij | — | Degerfors 54 |
|
| [ ] | Hagestein Makelaardij | — | Degerfors 54 |
|
||||||
| Schieland Borsboom NVM Makelaars | schielandborsboom.nl | (Rotterdam, actief in Schiedam) |
|
| [ ] | Schieland Borsboom NVM Makelaars | schielandborsboom.nl | (Rotterdam, actief in Schiedam) |
|
||||||
|
|
||||||
|
|
||||||
|
## Leiden
|
||||||
|
|
||||||
|
| Done | Naam | Website | Adres |
|
||||||
|
|------|------|---------|-------|
|
||||||
|
| [ ] | RE/MAX Makelaarsgilde | makelaars-in-leiden.nl | Levendaal 73-75 |
|
||||||
|
| [ ] | Hypodomus Leiden | hypodomusleiden.nl | Haarlemmerstraat 268 |
|
||||||
|
| [ ] | Alpina Leiden (v/h De Leeuw) | advies.alpina.nl | Molenwerf 4 |
|
||||||
|
| [ ] | Fides makelaars (ERA/NVM) | fidesmakelaarsleiden.nl | Lammenschansweg 76 |
|
||||||
|
| [ ] | Werk Makelaardij | werkmakelaardij.nl | Stevenshof (Leiden) |
|
||||||
|
| [ ] | Kerkvliet Makelaars | kerkvlietmakelaars.nl | Hoge Rijndijk 271A |
|
||||||
|
| [ ] | Kompas Makelaars & Taxateurs | kompasmakelaardij.nl | Maresingel 75-76 |
|
||||||
|
| [ ] | Hoekstra en Van Eck Leiden | hoekstraenvaneck.nl | Schipholweg 55-75 |
|
||||||
|
| [ ] | DOEN NVM Makelaars | doenmakelaars.com | Doezastraat 30 |
|
||||||
|
| [ ] | Oudshoorn Makelaardij | oudshoornmakelaardij.nl | — |
|
||||||
|
| [ ] | April Makelaars Leiden | aprilmakelaars.nl | Haagweg 55 |
|
||||||
|
| [ ] | Emil NVM Makelaars | emilmakelaars.nl | — |
|
||||||
|
| [ ] | Goedhart Makelaars | — | Oude Singel 14 |
|
||||||
|
| [ ] | Graal Makelaardij & Taxaties | — | Rapenburg 5 |
|
||||||
|
|||||||
36
new_scraper_prompt.md
Normal file
36
new_scraper_prompt.md
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
# SSR
|
||||||
|
Check out the add_scraper_context.md, let's add a new scraper.
|
||||||
|
|
||||||
|
**Broker:** [name]
|
||||||
|
**Website:** [base url]
|
||||||
|
**Listing page URL:** [url with any price/city filters applied]
|
||||||
|
**Detail page kenmerken:** yes / no
|
||||||
|
|
||||||
|
**Listing page HTML** (one card):
|
||||||
|
[paste]
|
||||||
|
|
||||||
|
**Detail page dump:** [attached / n.a.]
|
||||||
|
|
||||||
|
**Pagination:** [e.g. 10 per page, pagina-N in URL / no pagination]
|
||||||
|
|
||||||
|
**Notes:** [auth, JS rendering, price filter in URL, etc.]
|
||||||
|
|
||||||
|
|
||||||
|
# API
|
||||||
|
|
||||||
|
Check out the add_scraper_context.md, let's add a new scraper.
|
||||||
|
|
||||||
|
**Broker:** [name]
|
||||||
|
**Website:** [base url]
|
||||||
|
**API endpoint:** [full url]
|
||||||
|
**Auth:** [none / header: X-Foo: bar / query param]
|
||||||
|
|
||||||
|
**Example curl:**
|
||||||
|
[paste]
|
||||||
|
|
||||||
|
**Example response (one item):**
|
||||||
|
[paste]
|
||||||
|
|
||||||
|
**Pagination:** [e.g. page param / offset / single response]
|
||||||
|
|
||||||
|
**Notes:** [price filter, city filter, status field values, etc.]
|
||||||
@@ -106,11 +106,87 @@ def fetch_bjornd() -> list[RawListing]:
|
|||||||
log.info("bjornd: %d koopwoningen opgehaald", len(listings))
|
log.info("bjornd: %d koopwoningen opgehaald", len(listings))
|
||||||
return listings
|
return listings
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Ooms
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
_OOMS_BASE = "https://ooms.com"
|
||||||
|
_OOMS_CITIES = {"Delft", "Schiedam", "Rotterdam", "Leiden", "Voorburg", "Pijnacker"}
|
||||||
|
_OOMS_SKIP_STATUS = {"verhuurd", "verhuurd onder voorbehoud"}
|
||||||
|
_OOMS_STATUS_MAP = {
|
||||||
|
"beschikbaar": "beschikbaar",
|
||||||
|
"onder bod": "onder_bod",
|
||||||
|
"onder optie": "onder_bod",
|
||||||
|
"verkocht": "verkocht",
|
||||||
|
"verkocht onder voorbehoud":"verkocht",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_ooms() -> list[RawListing]:
|
||||||
|
data = fetch_json(f"{_OOMS_BASE}/api/properties/available.json")
|
||||||
|
listings = []
|
||||||
|
|
||||||
|
for item in data.get("objects", []):
|
||||||
|
if item.get("buy_or_rent") != "buy":
|
||||||
|
continue
|
||||||
|
if item.get("place") not in _OOMS_CITIES:
|
||||||
|
continue
|
||||||
|
if item.get("buy_price", 0) > config.MAX_PRICE:
|
||||||
|
continue
|
||||||
|
|
||||||
|
status_raw = item.get("availability_status", "")
|
||||||
|
if status_raw in _OOMS_SKIP_STATUS:
|
||||||
|
continue
|
||||||
|
|
||||||
|
hnr = item.get("house_number", "")
|
||||||
|
add = item.get("house_number_addition") or ""
|
||||||
|
adres = f"{item.get('street_name', '')} {hnr}{(' ' + add) if add else ''}".strip()
|
||||||
|
|
||||||
|
main_images = item.get("realworks_main_images") or item.get("realworks_images") or []
|
||||||
|
hero = None
|
||||||
|
if main_images:
|
||||||
|
sizes = main_images[0].get("sizes") or []
|
||||||
|
best = max(sizes, key=lambda s: s.get("width", 0), default=None)
|
||||||
|
if best:
|
||||||
|
hero = _OOMS_BASE + best["imageUrl"]
|
||||||
|
|
||||||
|
perceel = item.get("parcel_surface") or None
|
||||||
|
if perceel == 0:
|
||||||
|
perceel = None
|
||||||
|
|
||||||
|
listings.append(RawListing(
|
||||||
|
url=item["url"],
|
||||||
|
source_makelaar="ooms",
|
||||||
|
datum_aanmelding=item.get("publish_date", "")[:10] or None,
|
||||||
|
status=_OOMS_STATUS_MAP.get(status_raw, "beschikbaar"),
|
||||||
|
adres=adres or None,
|
||||||
|
postcode=(item.get("zip_code") or "").replace(" ", "") or None,
|
||||||
|
stad=item.get("place") or None,
|
||||||
|
prijs=item.get("buy_price") or None,
|
||||||
|
woningtype=item.get("appartment_characteristic") or item.get("residential_building_type") or None,
|
||||||
|
woonoppervlak=item.get("usable_area_living_function") or None,
|
||||||
|
perceeloppervlak=perceel,
|
||||||
|
kamers=item.get("amount_of_rooms") or None,
|
||||||
|
slaapkamers=item.get("amount_of_bedrooms") or None,
|
||||||
|
hero_image_url=hero,
|
||||||
|
extra={
|
||||||
|
"office": item.get("office", {}).get("name"),
|
||||||
|
"locations": item.get("locations"),
|
||||||
|
"garden_types": item.get("garden_types"),
|
||||||
|
"lat": item.get("lat"),
|
||||||
|
"lng": item.get("lng"),
|
||||||
|
"object_code": item.get("object_code"),
|
||||||
|
},
|
||||||
|
))
|
||||||
|
|
||||||
|
log.info("ooms: %d listings opgehaald", len(listings))
|
||||||
|
return listings
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# SCRAPERS — exporteer hier alle actieve API adapters
|
# SCRAPERS — exporteer hier alle actieve API adapters
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
SCRAPERS = {
|
SCRAPERS = {
|
||||||
'bjornd': fetch_bjornd,
|
'bjornd': fetch_bjornd,
|
||||||
|
'ooms': fetch_ooms,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -65,51 +65,127 @@ def parse_m2(text: str | None) -> int | None:
|
|||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Björn & Dries adapter (bjornd.nl)
|
# Realworks CMS (shared)
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# TODO: vul de echte CSS selectors in na inspectie van de pagina.
|
|
||||||
# Dit is een structureel sjabloon — de selectors zijn placeholders.
|
|
||||||
|
|
||||||
BJORND_BASE = "https://www.bjornd.nl"
|
_REALWORKS_STATUS_MAP = {
|
||||||
BJORND_AANBOD = f"{BJORND_BASE}/aanbod"
|
"te koop": "beschikbaar",
|
||||||
|
"nieuw": "beschikbaar",
|
||||||
|
"onder bod": "onder_bod",
|
||||||
|
"onder optie": "onder_bod",
|
||||||
|
"verkocht o.v.": "verkocht",
|
||||||
|
"verkocht": "verkocht",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def fetch_bjornd_demo() -> list[RawListing]:
|
def _realworks_detail(detail_url: str, makelaar: str) -> dict:
    """Scrape the kenmerken (property characteristics) from a Realworks detail page.

    Every ``span.kenmerk`` that has both a ``span.kenmerkName`` and a
    ``span.kenmerkValue`` child contributes one lowercased-label -> value pair.
    The result holds the raw text (or ``None``) for a fixed set of fields.
    Any exception is logged as a warning for ``makelaar`` and an empty dict is
    returned instead.
    """
    # Result key -> lowercased label as it appears on the page.
    wanted = {
        "woningtype": "type woning",
        "bouwjaar": "bouwjaar",
        "woonoppervlak": "woonoppervlakte",
        "perceeloppervlak": "perceeloppervlakte",
        "kamers": "aantal kamers",
        "slaapkamers": "aantal slaapkamers",
        "energielabel": "energieklasse",
    }
    try:
        page = fetch_soup(detail_url)

        by_label: dict[str, str] = {}
        for row in page.select("span.kenmerk"):
            name_tag = row.select_one("span.kenmerkName")
            value_tag = row.select_one("span.kenmerkValue")
            if not (name_tag and value_tag):
                continue
            name = name_tag.get_text(strip=True).lower()
            text = value_tag.get_text(strip=True)
            by_label[name] = text

        return {field: by_label.get(label) for field, label in wanted.items()}
    except Exception as exc:
        log.warning("%s: detail fetch fout %s: %s", makelaar, detail_url, exc)
        return {}
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_realworks(base_url: str, makelaar: str) -> list[RawListing]:
|
||||||
|
"""
|
||||||
|
Generic fetcher for Realworks CMS brokers.
|
||||||
|
Paginates via /pagina-{n}/, fetches detail page per listing.
|
||||||
|
"""
|
||||||
|
listings_path = f"/aanbod/woningaanbod/-{config.MAX_PRICE}/koop"
|
||||||
listings = []
|
listings = []
|
||||||
|
page = 1
|
||||||
|
|
||||||
# Pas de selector aan op de echte HTML structuur
|
while True:
|
||||||
for card in soup.select(".property-card"): # ← aanpassen
|
url = f"{base_url}{listings_path}/pagina-{page}/"
|
||||||
try:
|
soup = fetch_soup(url)
|
||||||
a_tag = card.select_one("a[href]")
|
cards = soup.select("li.aanbodEntry")
|
||||||
if not a_tag:
|
if not cards:
|
||||||
continue
|
break
|
||||||
url = a_tag["href"]
|
|
||||||
if not url.startswith("http"):
|
|
||||||
url = BJORND_BASE + url
|
|
||||||
|
|
||||||
adres = _text(card, ".property-address") # ← aanpassen
|
for card in cards:
|
||||||
postcode = _extract_postcode(_text(card, ".property-location"))
|
try:
|
||||||
prijs = parse_prijs(_text(card, ".property-price"))
|
a_tag = card.select_one("a.aanbodEntryLink")
|
||||||
opp = parse_m2(_text(card, ".property-area"))
|
if not a_tag:
|
||||||
img = _src(card, "img")
|
continue
|
||||||
|
listing_url = base_url + a_tag["href"]
|
||||||
|
|
||||||
listings.append(RawListing(
|
adres = _text(card, ".street-address")
|
||||||
url=url,
|
postcode = (_text(card, ".postal-code") or "").replace(" ", "") or None
|
||||||
source_makelaar="bjornd",
|
stad = _text(card, ".locality")
|
||||||
adres=adres,
|
prijs = parse_prijs(_text(card, ".koopprijs .kenmerkValue"))
|
||||||
postcode=postcode,
|
|
||||||
stad=_infer_stad(postcode),
|
|
||||||
prijs=prijs,
|
|
||||||
woonoppervlak=opp,
|
|
||||||
hero_image_url=img,
|
|
||||||
))
|
|
||||||
except Exception as e:
|
|
||||||
log.warning("Fout bij parsen bjornd card: %s", e)
|
|
||||||
|
|
||||||
|
status_text = (_text(card, ".objectstatusbanner") or "").lower()
|
||||||
|
status = _REALWORKS_STATUS_MAP.get(status_text, "beschikbaar")
|
||||||
|
|
||||||
|
img_tag = card.select_one(".hoofdfoto img")
|
||||||
|
hero = img_tag["src"] if img_tag else None
|
||||||
|
|
||||||
|
kk = _realworks_detail(listing_url, makelaar)
|
||||||
|
|
||||||
|
listings.append(RawListing(
|
||||||
|
url=listing_url,
|
||||||
|
source_makelaar=makelaar,
|
||||||
|
adres=adres,
|
||||||
|
postcode=postcode,
|
||||||
|
stad=stad,
|
||||||
|
prijs=prijs,
|
||||||
|
status=status,
|
||||||
|
hero_image_url=hero,
|
||||||
|
woningtype=kk.get("woningtype"),
|
||||||
|
bouwjaar=int(kk["bouwjaar"]) if kk.get("bouwjaar") else None,
|
||||||
|
woonoppervlak=parse_m2(kk.get("woonoppervlak")),
|
||||||
|
perceeloppervlak=parse_m2(kk.get("perceeloppervlak")),
|
||||||
|
kamers=int(kk["kamers"]) if kk.get("kamers") else None,
|
||||||
|
slaapkamers=int(kk["slaapkamers"]) if kk.get("slaapkamers") else None,
|
||||||
|
energielabel=kk.get("energielabel"),
|
||||||
|
))
|
||||||
|
except Exception as e:
|
||||||
|
log.warning("%s: parse fout: %s", makelaar, e)
|
||||||
|
|
||||||
|
if len(cards) < 10:
|
||||||
|
break
|
||||||
|
page += 1
|
||||||
|
|
||||||
|
log.info("%s: %d listings opgehaald", makelaar, len(listings))
|
||||||
return listings
|
return listings
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Anke Bodewes Makelaardij
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def fetch_ankebodewes() -> list[RawListing]:
    """Fetch the Anke Bodewes listings through the shared Realworks fetcher."""
    return fetch_realworks(
        base_url="https://www.ankebodewes.nl",
        makelaar="ankebodewes",
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Woongoed Makelaars Schiedam
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def fetch_woongoed() -> list[RawListing]:
    """Fetch the Woongoed Makelaars listings through the shared Realworks fetcher."""
    return fetch_realworks(
        base_url="https://www.woongoedmakelaars.nl",
        makelaar="woongoed",
    )
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# SSR helper utils
|
# SSR helper utils
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -150,5 +226,6 @@ def _infer_stad(postcode: str | None) -> str | None:
|
|||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
SCRAPERS = {
|
SCRAPERS = {
|
||||||
'bjornd_demo': fetch_bjornd_demo,
|
'ankebodewes': fetch_ankebodewes,
|
||||||
|
'woongoed': fetch_woongoed,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -7,11 +7,11 @@ from adapters import SCRAPERS
|
|||||||
|
|
||||||
|
|
||||||
# --- change this to test a different adapter ---
|
# --- change this to test a different adapter ---
|
||||||
ADAPTER = SCRAPERS['bjornd']
|
ADAPTER = SCRAPERS['ooms']
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
print(f"Testing adapter: {ADAPTER.__name__}")
|
print(f"Testing adapter: {ADAPTER.__name__}")
|
||||||
listings = ADAPTER()
|
listings = ADAPTER()
|
||||||
print(f"Got {len(listings)} listings\n")
|
print(f"Got {len(listings)} listings\n")
|
||||||
for l in listings:
|
for l in listings:
|
||||||
print(f" {l.adres}, {l.stad} — €{l.prijs} — {l.url}")
|
print(f" {l.adres}, {l.postcode}, {l.stad} — €{l.prijs} — {l.kamers} rooms — {l.url}")
|
||||||
|
|||||||
Reference in New Issue
Block a user