ever onwards

This commit is contained in:
2026-04-03 16:58:57 +02:00
parent 918042d27e
commit b35025b9cb
4 changed files with 132 additions and 2 deletions

View File

@@ -219,6 +219,8 @@ If the CMS is unknown, the tool prints structural diagnostics (card selectors, f
## Important Notes ## Important Notes
Don't treat detail pages as optional; we always want all the info!
### Status Mapping ### Status Mapping
Brokers use different status strings. Always map to one of: Brokers use different status strings. Always map to one of:
- `"beschikbaar"` — Available for sale - `"beschikbaar"` — Available for sale
@@ -270,6 +272,7 @@ The database stores this as JSON in the `extra` column.
- Nominatim (geocoding) has a 1 req/s limiter built into `huizenbot.py` - Nominatim (geocoding) has a 1 req/s limiter built into `huizenbot.py`
- Never spawn parallel requests without the human's approval - Never spawn parallel requests without the human's approval
- Always use the `USER_AGENT` header (includes contact info for respectful scraping) - Always use the `USER_AGENT` header (includes contact info for respectful scraping)
- Don't keep curling the same endpoint: pipe the response to a `<name makelaar>.dump` file once and then `rg` through it to find what you need. You can also pipe it through `bsprettify.py` and `rg` the prettified output.
--- ---

View File

@@ -30,7 +30,7 @@
| [x] | Ooms Makelaars Schiedam | ooms.com | Gerrit Verboonstraat 2 | | [x] | Ooms Makelaars Schiedam | ooms.com | Gerrit Verboonstraat 2 |
| [x] | De Witte Garantiemakelaars | dewittegarantiemakelaars.nl | Philippusweg 2 | | [x] | De Witte Garantiemakelaars | dewittegarantiemakelaars.nl | Philippusweg 2 |
| [x] | Makelaardij Wassenaar | makelaardijwassenaar.nl | Gerrit Verboonstraat 12 | | [x] | Makelaardij Wassenaar | makelaardijwassenaar.nl | Gerrit Verboonstraat 12 |
| [ ] | 3D Makelaars | 3dmakelaars.nl | Gerrit Verboonstraat 17 | | [x] | 3D Makelaars | 3dmakelaars.nl | Gerrit Verboonstraat 17 |
| [ ] | Dupont Makelaars | dupont.nl | Rotterdamsedijk 437 | | [ ] | Dupont Makelaars | dupont.nl | Rotterdamsedijk 437 |
| [x] | D&S Makelaardij | densmakelaars.nl | Land van Belofte 50 | | [x] | D&S Makelaardij | densmakelaars.nl | Land van Belofte 50 |
| [ ] | Moerman & De Jong Makelaars | moerman-dejong.nl | Lange Kerkstraat 80B | | [ ] | Moerman & De Jong Makelaars | moerman-dejong.nl | Lange Kerkstraat 80B |

View File

@@ -608,6 +608,132 @@ def fetch_dens() -> list[RawListing]:
return listings return listings
# ---------------------------------------------------------------------------
# 3D Makelaars (Schiedam/Vlaardingen)
# ---------------------------------------------------------------------------
# Site root of 3D Makelaars; prefixed to the relative detail-page and image
# paths found on the listing cards.
_3D_BASE = "https://3dmakelaars.nl"
def _3d_first_int(value):
    """Return the first run of digits in *value* as an int, or None.

    Accepts None, empty strings and text without digits, so that a single
    malformed field on a detail page does not abort parsing of the others.
    """
    if not value:
        return None
    match = re.search(r"\d+", value)
    return int(match.group()) if match else None


def _3dmakelaars_detail(detail_url: str) -> dict:
    """Fetch a 3dmakelaars detail page and extract its structured info block.

    Args:
        detail_url: URL of the listing's detail page.

    Returns:
        A dict with the keys ``kamers``, ``slaapkamers``, ``bouwjaar``,
        ``woningtype``, ``woonoppervlak`` and ``postcode``. A value is None
        when the page does not provide it (or, for the integer fields, when
        it contains no digits). An empty dict is returned when fetching or
        parsing the page fails; the failure is logged as a warning.
    """
    try:
        soup = fetch_soup(detail_url)

        # The info block is a list of <li> items, each holding a <span>
        # (label) and a <p> (value). Labels are lower-cased for lookup.
        # NOTE(review): the misspelled class names presumably mirror the
        # site's own markup - check the live HTML before "correcting" them.
        kv: dict[str, str] = {}
        for li in soup.select("div.tl-adiltional-inforamtion ul.tl-adiltional-listed li"):
            label_el = li.select_one("span")
            value_el = li.select_one("p")
            if label_el and value_el:
                label = label_el.get_text(strip=True).lower()
                kv[label] = value_el.get_text(strip=True)

        # The postcode is taken from the first paragraph of the description.
        postcode = None
        p_tag = soup.select_one(".omschrijving > p:nth-child(1)")
        if p_tag:
            postcode = _extract_postcode(p_tag.get_text())

        return {
            # Integer fields are parsed leniently: an unexpected value yields
            # None for that field instead of discarding the whole result.
            "kamers": _3d_first_int(kv.get("aantal kamers")),
            "slaapkamers": _3d_first_int(kv.get("aantal slaapkamers")),
            "bouwjaar": _3d_first_int(kv.get("bouwjaar")),
            "woningtype": kv.get("bouwvorm"),
            "woonoppervlak": parse_m2(kv.get("oppervlakte")),
            "postcode": postcode,
        }
    except Exception as e:
        # Best-effort: the caller treats a missing detail page as "no extra data".
        log.warning("3dmakelaars: detail fetch fout %s: %s", detail_url, e)
        return {}
def fetch_3dmakelaars() -> list[RawListing]:
    """Fetch 3D Makelaars listings with pagination.

    Walks the paginated overview, reads the basic fields from every listing
    card and fetches each listing's detail page for the remaining fields.
    Paging stops at the first page without cards or with fewer cards than a
    full page. A card that cannot be parsed is logged and skipped.

    Returns:
        One ``RawListing`` per card that had a detail URL and parsed cleanly.
    """
    # A page with fewer cards than this is treated as the last page.
    page_size = 7

    listings = []
    page = 1
    while True:
        url = (
            f"{_3D_BASE}/woningen-te-koop-in-schiedam-en-vlaardingen"
            f"?kamers=&oppervlakte=&woonplaats=&video=&prijs=3&page={page}"
        )
        soup = fetch_soup(url)
        cards = soup.select("div.tl-properties-item")
        if not cards:
            break

        for card in cards:
            try:
                # The card has no link; the detail URL sits in its onclick
                # handler as window.location = '<path>'.
                onclick = card.get("onclick", "")
                m = re.search(r"window\.location\s*=\s*['\"]([^'\"]+)['\"]", onclick)
                if not m:
                    continue
                href = m.group(1)
                # Only prefix the site root for relative paths, as is done
                # for the image URL below.
                detail_url = href if href.startswith("http") else _3D_BASE + href

                # NOTE(review): the selectors look swapped (address read from
                # h3.price, price from span.address). Presumably this mirrors
                # the site's markup - confirm against the live HTML.
                adres = _text(card, "h3.price")
                prijs_text = _text(card, "span.address")
                prijs = parse_prijs(prijs_text)

                # Rooms and living area come from the card's meta list.
                # NOTE(review): "kamers" also matches "slaapkamers"; verify
                # that the meta list never contains a bedroom count.
                kamers = None
                woonoppervlak = None
                for li in card.select("ul.tl-meta-listed > li"):
                    text = li.get_text(strip=True)
                    lowered = text.lower()
                    if "kamers" in lowered:
                        m = re.search(r"(\d+)", text)
                        if m:
                            kamers = int(m.group(1))
                    elif "m²" in lowered or "m2" in lowered:
                        # Only items that mention an area unit are parsed, so
                        # unrelated items cannot overwrite the value.
                        woonoppervlak = parse_m2(text)

                # An <img> without a src attribute yields no image instead of
                # discarding the listing.
                img_tag = card.select_one("img")
                hero = img_tag.get("src") if img_tag else None
                if hero and not hero.startswith("http"):
                    hero = _3D_BASE + hero

                # Fetch detail page for full info
                detail_data = _3dmakelaars_detail(detail_url)

                # Postcode from detail page, fallback to extraction from address
                postcode = detail_data.get("postcode")
                if not postcode and adres:
                    postcode = _extract_postcode(adres)

                listings.append(RawListing(
                    url=detail_url,
                    source_makelaar="3dmakelaars",
                    adres=adres,
                    postcode=postcode,
                    stad=_infer_stad(postcode),
                    prijs=prijs,
                    woningtype=detail_data.get("woningtype"),
                    bouwjaar=detail_data.get("bouwjaar"),
                    # Card values win; the detail page fills the gaps.
                    woonoppervlak=woonoppervlak or detail_data.get("woonoppervlak"),
                    kamers=kamers or detail_data.get("kamers"),
                    slaapkamers=detail_data.get("slaapkamers"),
                    hero_image_url=hero,
                ))
            except Exception as e:
                # Best-effort per card: one broken card must not stop the run.
                log.warning("3dmakelaars: parse fout: %s", e)

        if len(cards) < page_size:
            break
        page += 1

    log.info("3dmakelaars: %d listings opgehaald", len(listings))
    return listings
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# SCRAPERS — exporteer hier alle actieve SSR adapters # SCRAPERS — exporteer hier alle actieve SSR adapters
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@@ -618,4 +744,5 @@ SCRAPERS = {
'dewittegarantiemakelaars': fetch_dewittegarantiemakelaars, 'dewittegarantiemakelaars': fetch_dewittegarantiemakelaars,
'wassenaar': fetch_wassenaar, 'wassenaar': fetch_wassenaar,
'dens': fetch_dens, 'dens': fetch_dens,
'3dmakelaars': fetch_3dmakelaars,
} }

View File

@@ -16,7 +16,7 @@ logging.basicConfig(
) )
# --- change this to test a different adapter --- # --- change this to test a different adapter ---
ADAPTER = SCRAPERS['wassenaar'] ADAPTER = SCRAPERS['3dmakelaars']
if __name__ == "__main__": if __name__ == "__main__":
print(f"Testing adapter: {ADAPTER.__name__}") print(f"Testing adapter: {ADAPTER.__name__}")