From b35025b9cb91dc4566f5b99fbf17489fa0bc30ca Mon Sep 17 00:00:00 2001 From: Mark Kalsbeek Date: Fri, 3 Apr 2026 16:58:57 +0200 Subject: [PATCH] ever onwards --- add_scraper_context.md | 3 + makelaars.md | 2 +- src/adapters/ssr.py | 127 +++++++++++++++++++++++++++++++++++++++++ tests/test_adapters.py | 2 +- 4 files changed, 132 insertions(+), 2 deletions(-) diff --git a/add_scraper_context.md b/add_scraper_context.md index cbd879c..f2eaab5 100644 --- a/add_scraper_context.md +++ b/add_scraper_context.md @@ -219,6 +219,8 @@ If the CMS is unknown, the tool prints structural diagnostics (card selectors, f ## Important Notes +Don't treat detail pages as optional; we always want all the info! + ### Status Mapping Brokers use different status strings. Always map to one of: - `"beschikbaar"` — Available for sale @@ -270,6 +272,7 @@ The database stores this as JSON in the `extra` column. - Nominatim (geocoding) has a 1 req/s limiter built into `huizenbot.py` - Never spawn parallel requests without the human's approval - Always use the `USER_AGENT` header (includes contact info for respectful scraping) +- Don't keep curling the same endpoint; pipe the response to a `.dump` file once and then `rg` through it to find what you need. You can also pipe it through `bsprettify.py` first and then `rg` the prettified output. 
---
diff --git a/makelaars.md b/makelaars.md
index 80e3840..66323f8 100644
--- a/makelaars.md
+++ b/makelaars.md
@@ -30,7 +30,7 @@
 | [x] | Ooms Makelaars Schiedam | ooms.com | Gerrit Verboonstraat 2 |
 | [x] | De Witte Garantiemakelaars | dewittegarantiemakelaars.nl | Philippusweg 2 |
 | [x] | Makelaardij Wassenaar | makelaardijwassenaar.nl | Gerrit Verboonstraat 12 |
-| [ ] | 3D Makelaars | 3dmakelaars.nl | Gerrit Verboonstraat 17 |
+| [x] | 3D Makelaars | 3dmakelaars.nl | Gerrit Verboonstraat 17 |
 | [ ] | Dupont Makelaars | dupont.nl | Rotterdamsedijk 437 |
 | [x] | D&S Makelaardij | densmakelaars.nl | Land van Belofte 50 |
 | [ ] | Moerman & De Jong Makelaars | moerman-dejong.nl | Lange Kerkstraat 80B |
diff --git a/src/adapters/ssr.py b/src/adapters/ssr.py
index a6b7844..366df42 100644
--- a/src/adapters/ssr.py
+++ b/src/adapters/ssr.py
@@ -608,6 +608,169 @@ def fetch_dens() -> list[RawListing]:
     return listings
 
 
+# ---------------------------------------------------------------------------
+# 3D Makelaars (Schiedam/Vlaardingen)
+# ---------------------------------------------------------------------------
+
+_3D_BASE = "https://3dmakelaars.nl"
+# A page with fewer cards than this is treated as the last one.
+_3D_PAGE_SIZE = 7
+
+
+def _3d_abs_url(url: str) -> str:
+    """Return *url* as-is when it is absolute, else prefixed with the site base."""
+    return url if url.startswith("http") else _3D_BASE + url
+
+
+def _3d_first_int(value):
+    """Return the first run of digits in *value* as an int, or None if there is none."""
+    m = re.search(r"\d+", value or "")
+    return int(m.group()) if m else None
+
+
+def _3dmakelaars_detail(detail_url: str) -> dict:
+    """Fetch a 3dmakelaars detail page and extract its structured info block.
+
+    Returns a dict with the keys ``kamers``, ``slaapkamers``, ``bouwjaar``,
+    ``woningtype``, ``woonoppervlak`` and ``postcode``; a missing or unparseable
+    field is ``None``. Returns ``{}`` (after logging a warning) when fetching or
+    parsing the page raises.
+    """
+    try:
+        soup = fetch_soup(detail_url)
+
+        # Structured info block: span (label) + p (value) pairs.
+        # NOTE(review): the class names look misspelled; presumably they mirror
+        # the site's markup - check the page before "fixing" them.
+        kv: dict[str, str] = {}
+        for li in soup.select("div.tl-adiltional-inforamtion ul.tl-adiltional-listed li"):
+            label_el = li.select_one("span")
+            value_el = li.select_one("p")
+            if label_el and value_el:
+                kv[label_el.get_text(strip=True).lower()] = value_el.get_text(strip=True)
+
+        # Postcode is taken from the first description paragraph.
+        p_tag = soup.select_one(".omschrijving > p:nth-child(1)")
+        postcode = _extract_postcode(p_tag.get_text()) if p_tag else None
+
+        oppervlakte = kv.get("oppervlakte")
+        return {
+            # Numbers are parsed leniently so that one odd value (e.g. "ca. 1930")
+            # cannot raise and make the except below drop every other field.
+            "kamers": _3d_first_int(kv.get("aantal kamers")),
+            "slaapkamers": _3d_first_int(kv.get("aantal slaapkamers")),
+            "bouwjaar": _3d_first_int(kv.get("bouwjaar")),
+            "woningtype": kv.get("bouwvorm"),
+            "woonoppervlak": parse_m2(oppervlakte) if oppervlakte else None,
+            "postcode": postcode,
+        }
+    except Exception as e:
+        log.warning("3dmakelaars: detail fetch fout %s: %s", detail_url, e)
+        return {}
+
+
+def fetch_3dmakelaars() -> list[RawListing]:
+    """Fetch 3D Makelaars listings with pagination.
+
+    Requests overview pages until one is empty, is shorter than a full page, or
+    contains no detail URL that was not seen on an earlier page, and fetches
+    the detail page of every new card. A card that fails to parse is logged
+    and skipped; a failing overview request propagates to the caller.
+    """
+    listings = []
+    seen_urls: set[str] = set()
+    page = 1
+
+    while True:
+        url = (
+            f"{_3D_BASE}/woningen-te-koop-in-schiedam-en-vlaardingen"
+            f"?kamers=&oppervlakte=&woonplaats=&video=&prijs=3&page={page}"
+        )
+        soup = fetch_soup(url)
+        cards = soup.select("div.tl-properties-item")
+        if not cards:
+            break
+
+        new_on_page = 0
+        for card in cards:
+            try:
+                # The detail URL is only present inside the onclick handler.
+                m = re.search(
+                    r"window\.location\s*=\s*['\"]([^'\"]+)['\"]",
+                    card.get("onclick", ""),
+                )
+                if not m:
+                    continue
+                detail_url = _3d_abs_url(m.group(1))
+                if detail_url in seen_urls:
+                    # Already handled; avoids duplicate rows and refetching.
+                    continue
+                seen_urls.add(detail_url)
+                new_on_page += 1
+
+                # NOTE(review): address is read from "h3.price" and the price
+                # from "span.address"; this looks swapped - confirm against the
+                # live markup.
+                adres = _text(card, "h3.price")
+                prijs = parse_prijs(_text(card, "span.address"))
+
+                # Rooms and living area from the card's meta list.
+                kamers = None
+                woonoppervlak = None
+                for li in card.select("ul.tl-meta-listed > li"):
+                    text = li.get_text(strip=True)
+                    lowered = text.lower()
+                    if "slaapkamer" in lowered:
+                        # "slaapkamers" contains "kamers"; a bedroom count must
+                        # not be stored as the total number of rooms.
+                        continue
+                    if "kamers" in lowered:
+                        count = _3d_first_int(text)
+                        if count is not None:
+                            kamers = count
+                    elif "m²" in text or "m2" in text:
+                        woonoppervlak = parse_m2(text)
+
+                # .get(): an img tag without src must not discard the listing.
+                img_tag = card.select_one("img")
+                hero = img_tag.get("src") if img_tag else None
+                if hero:
+                    hero = _3d_abs_url(hero)
+
+                detail_data = _3dmakelaars_detail(detail_url)
+
+                # Postcode from the detail page, else try the card's address.
+                postcode = detail_data.get("postcode")
+                if not postcode and adres:
+                    postcode = _extract_postcode(adres)
+
+                listings.append(RawListing(
+                    url=detail_url,
+                    source_makelaar="3dmakelaars",
+                    adres=adres,
+                    postcode=postcode,
+                    stad=_infer_stad(postcode),
+                    prijs=prijs,
+                    woningtype=detail_data.get("woningtype"),
+                    bouwjaar=detail_data.get("bouwjaar"),
+                    woonoppervlak=woonoppervlak or detail_data.get("woonoppervlak"),
+                    kamers=kamers or detail_data.get("kamers"),
+                    slaapkamers=detail_data.get("slaapkamers"),
+                    hero_image_url=hero,
+                ))
+            except Exception as e:
+                log.warning("3dmakelaars: parse fout: %s", e)
+
+        # Stop on the last (short) page, or when a page brought nothing new so
+        # a site that repeats its last page cannot keep the loop running.
+        if len(cards) < _3D_PAGE_SIZE or not new_on_page:
+            break
+        page += 1
+
+    log.info("3dmakelaars: %d listings opgehaald", len(listings))
+    return listings
+
+
 # ---------------------------------------------------------------------------
 # SCRAPERS — exporteer hier alle actieve SSR adapters
 # ---------------------------------------------------------------------------
@@ -618,4 +781,5 @@ SCRAPERS = {
     'dewittegarantiemakelaars': fetch_dewittegarantiemakelaars,
     'wassenaar': fetch_wassenaar,
     'dens': fetch_dens,
+    '3dmakelaars': fetch_3dmakelaars,
 }
diff --git a/tests/test_adapters.py b/tests/test_adapters.py
index db16e63..ed6cb67 100644
--- a/tests/test_adapters.py
+++ b/tests/test_adapters.py
@@ -16,7 +16,7 @@ logging.basicConfig(
 )
 
 # --- change this to test a different adapter ---
-ADAPTER = SCRAPERS['wassenaar']
+ADAPTER = SCRAPERS['3dmakelaars']
 
 if __name__ == "__main__":
     print(f"Testing adapter: {ADAPTER.__name__}")