""" Custom Schiedam scrapers (no shared CMS platform). Each makelaar here uses a bespoke site structure that required its own parser. Scrapers: dewittegarantiemakelaars (JSON-LD), dens, 3dmakelaars, dupont """ import re import config from huizenbot import RawListing from ._shared import ( fetch_soup, parse_prijs, parse_m2, _text, _extract_postcode, _infer_stad, log, ) # --------------------------------------------------------------------------- # De Witte Garantiemakelaars (Schiedam) # --------------------------------------------------------------------------- # Listing cards have a pill badge for status. All detail data comes from # JSON-LD (schema.org BuyAction/Offer) on the detail page. _DEWITTE_BASE = "https://dewittegarantiemakelaars.nl" _DEWITTE_PILL_MAP = { "bg-fun-green": "beschikbaar", "bg-sold": "verkocht", } _DEWITTE_TYPE_MAP = { "Apartment": "appartement", "House": "woning", "SingleFamilyResidence": "woning", "Residence": "woning", } def _dewitte_jsonld(detail_url: str) -> dict: """Fetch detail page and return parsed JSON-LD dict, or {} on failure.""" import json try: soup = fetch_soup(detail_url) tag = soup.select_one('script[type="application/ld+json"]') if not tag: log.warning("dewitte: geen JSON-LD op %s", detail_url) return {} return json.loads(tag.string) except Exception as e: log.warning("dewitte: JSON-LD fout %s: %s", detail_url, e) return {} def fetch_dewittegarantiemakelaars() -> list[RawListing]: listings = [] page = 1 while True: url = ( f"{_DEWITTE_BASE}/woningaanbod" f"?buy_rent=buy&buy_price=1-{config.MAX_PRICE}&page={page}" ) soup = fetch_soup(url) cards = soup.select("div.card.card--property") if not cards: break for card in cards: try: a_tag = card.select_one("a.card__anchor") if not a_tag: continue detail_url = a_tag["href"] if not detail_url.startswith("http"): detail_url = _DEWITTE_BASE + detail_url pill = card.select_one("span.pill") pill_classes = pill.get("class", []) if pill else [] status_key = next( (c for c in pill_classes if c.startswith("bg-")), None ) status = _DEWITTE_PILL_MAP.get(status_key, "onder_bod") ld = _dewitte_jsonld(detail_url) if not ld: continue offered = ld.get("itemOffered", {}) address = offered.get("address", {}) floor_size = offered.get("floorSize", {}) postcode = address.get("postalCode", "").replace(" ", "") or None stad = address.get("addressLocality") or None adres = address.get("streetAddress") or None prijs = ld.get("price") if prijs and int(prijs) > config.MAX_PRICE: continue woningtype = _DEWITTE_TYPE_MAP.get(offered.get("@type", "")) woonoppervlak = int(floor_size["value"]) if floor_size.get("value") else None kamers = offered.get("numberOfRooms") bouwjaar = offered.get("yearBuilt") # Full-res image from JSON-LD, fall back to card thumbnail hero = ld.get("image") if not hero: img = card.select_one("picture img") hero = img["src"] if img else None listings.append(RawListing( url=detail_url, source_makelaar="dewittegarantiemakelaars", status=status, adres=adres, postcode=postcode, stad=stad, prijs=int(prijs) if prijs else None, woningtype=woningtype, woonoppervlak=woonoppervlak, kamers=int(kamers) if kamers else None, bouwjaar=int(bouwjaar) if bouwjaar else None, hero_image_url=hero, )) if config.APP_ENV == "dev": break except Exception as e: log.warning("dewitte: parse fout: %s", e) if len(cards) < 10: break page += 1 log.info("dewittegarantiemakelaars: %d listings opgehaald", len(listings)) return listings # --------------------------------------------------------------------------- # D&S Makelaars (Schiedam) # --------------------------------------------------------------------------- _DS_BASE = "https://www.densmakelaars.nl" _DS_STATUS_MAP = { "onder bod": "onder_bod", "te koop": "beschikbaar", "nieuw": "beschikbaar", "beschikbaar": "beschikbaar", "verkocht": "verkocht", } def _ds_detail(detail_url: str, html_text: str = None) -> dict: """Fetch D&S detail page and extract all kenmerken from
/
pairs and postcode from maps URL.""" try: # If html_text not provided, fetch it if html_text is None: import httpx r = httpx.get( detail_url, headers={"User-Agent": config.USER_AGENT}, timeout=15, follow_redirects=True, ) html_text = r.text from bs4 import BeautifulSoup soup = BeautifulSoup(html_text, "html.parser") # Parse
/
pairs into a label → value map kv: dict[str, str] = {} dts = soup.select("dt") dds = soup.select("dd") for dt, dd in zip(dts, dds): label = dt.get_text(strip=True).lower() value = dd.get_text(strip=True) kv[label] = value # Extract postcode from Google Maps URL in iframe src # Pattern: q=...POSTCODE...,CITY where POSTCODE is 4 digits + 2 letters postcode = None m = re.search(r'q=.+?,(\d{4})\s+([A-Z]{2}),', html_text) if m: postcode = f"{m.group(1)}{m.group(2)}" return { "status": kv.get("status", "beschikbaar").lower(), "woningtype": kv.get("soort woning"), "bouwjaar": kv.get("bouwjaar"), "woonoppervlak": kv.get("woonoppervlakte"), "kamers": kv.get("aantal kamers"), "slaapkamers": kv.get("aantal slaapkamers"), "energielabel": kv.get("energielabel"), "postcode": postcode, } except Exception as e: log.warning("dens: detail fetch fout %s: %s", detail_url, e) return {} def fetch_dens() -> list[RawListing]: """Fetch D&S Makelaars listings with full detail pages.""" listings = [] page = 1 while True: url = f"{_DS_BASE}/aanbod/koopwoningen?page={page}" soup = fetch_soup(url) cards = soup.select(".col-12.col-md-4.object-wrapper") if not cards: break for card in cards: try: # Extract URL a_tag = card.select_one("a.property") if not a_tag or "href" not in a_tag.attrs: continue detail_url = a_tag["href"] if not detail_url.startswith("http"): detail_url = _DS_BASE + detail_url # Extract listing page data status_label = _text(card, "span.label") or "beschikbaar" status_label = status_label.strip().lower() status = _DS_STATUS_MAP.get(status_label, "beschikbaar") adres = _text(card, "h3") stad = _text(card, "h4") prijs_text = _text(card, "div.price") prijs = parse_prijs(prijs_text) # Extract area and rooms from footer footer_spans = card.select("div.footer span") woonoppervlak = None kamers = None for span in footer_spans: text = span.get_text(strip=True) if "m²" in text: woonoppervlak = parse_m2(text) elif "kamers" in text.lower(): m = re.search(r"(\d+)", text) if m: kamers = int(m.group(1)) # Extract hero image img_tag = card.select_one("img") hero = img_tag["src"] if img_tag else None # Fetch and parse detail page detail_data = _ds_detail(detail_url) # Use postcode from detail data (extracted from Google Maps URL) postcode = detail_data.get("postcode") # Determine status from detail page if available if detail_data.get("status"): status = _DS_STATUS_MAP.get(detail_data["status"], status) listings.append(RawListing( url=detail_url, source_makelaar="dens", adres=adres, postcode=postcode, stad=stad or _infer_stad(postcode), prijs=prijs, status=status, hero_image_url=hero, woningtype=detail_data.get("woningtype"), bouwjaar=int(detail_data["bouwjaar"]) if detail_data.get("bouwjaar") else None, woonoppervlak=parse_m2(detail_data.get("woonoppervlak")) or woonoppervlak, kamers=int(detail_data["kamers"]) if detail_data.get("kamers") else kamers, slaapkamers=int(detail_data["slaapkamers"]) if detail_data.get("slaapkamers") else None, energielabel=detail_data.get("energielabel"), )) if config.APP_ENV == "dev": break except Exception as e: log.warning("dens: parse fout: %s", e) if len(cards) < 10: break page += 1 log.info("dens: %d listings opgehaald", len(listings)) return listings # --------------------------------------------------------------------------- # 3D Makelaars (Schiedam/Vlaardingen) # --------------------------------------------------------------------------- _3D_BASE = "https://3dmakelaars.nl" def _3dmakelaars_detail(detail_url: str) -> dict: """Fetch 3dmakelaars detail page and extract structured info block.""" try: soup = fetch_soup(detail_url) # Parse structured info block: span (label) + p (value) pairs kv: dict[str, str] = {} for li in soup.select("div.tl-adiltional-inforamtion ul.tl-adiltional-listed li"): label_el = li.select_one("span") value_el = li.select_one("p") if label_el and value_el: label = label_el.get_text(strip=True).lower() value = value_el.get_text(strip=True) kv[label] = value # Extract postcode from first description paragraph postcode = None p_tag = soup.select_one(".omschrijving > p:nth-child(1)") if p_tag: text = p_tag.get_text() postcode = _extract_postcode(text) return { "kamers": int(kv["aantal kamers"].split()[0]) if "aantal kamers" in kv else None, "slaapkamers": int(kv["aantal slaapkamers"].split()[0]) if "aantal slaapkamers" in kv else None, "bouwjaar": int(kv["bouwjaar"]) if "bouwjaar" in kv else None, "woningtype": kv.get("bouwvorm"), "woonoppervlak": parse_m2(kv.get("oppervlakte")), "postcode": postcode, } except Exception as e: log.warning("3dmakelaars: detail fetch fout %s: %s", detail_url, e) return {} def fetch_3dmakelaars() -> list[RawListing]: """Fetch 3D Makelaars listings with pagination.""" listings = [] page = 1 while True: url = ( f"{_3D_BASE}/woningen-te-koop-in-schiedam-en-vlaardingen" f"?kamers=&oppervlakte=&woonplaats=&video=&prijs=3&page={page}" ) soup = fetch_soup(url) cards = soup.select("div.tl-properties-item") if not cards: break for card in cards: try: # Extract detail URL from onclick attribute onclick = card.get("onclick", "") detail_url = None if "window.location" in onclick: m = re.search(r"window\.location\s*=\s*['\"]([^'\"]+)['\"]", onclick) if m: detail_url = _3D_BASE + m.group(1) if not detail_url: continue # Extract listing-level info adres = _text(card, "h3.price") prijs_text = _text(card, "span.address") prijs = parse_prijs(prijs_text) # Extract rooms and area from meta list kamers = None woonoppervlak = None for li in card.select("ul.tl-meta-listed > li"): text = li.get_text(strip=True) if "kamers" in text.lower(): m = re.search(r"(\d+)", text) if m: kamers = int(m.group(1)) elif "m²" in text or "m2" in text: woonoppervlak = parse_m2(text) # Extract image img_tag = card.select_one("img") hero = img_tag["src"] if img_tag else None if hero and not hero.startswith("http"): hero = _3D_BASE + hero # Fetch detail page for full info detail_data = _3dmakelaars_detail(detail_url) # Postcode from detail page, fallback to extraction from address postcode = detail_data.get("postcode") if not postcode and adres: postcode = _extract_postcode(adres) listings.append(RawListing( url=detail_url, source_makelaar="3dmakelaars", adres=adres, postcode=postcode, stad=_infer_stad(postcode), prijs=prijs, woningtype=detail_data.get("woningtype"), bouwjaar=detail_data.get("bouwjaar"), woonoppervlak=woonoppervlak or detail_data.get("woonoppervlak"), kamers=kamers or detail_data.get("kamers"), slaapkamers=detail_data.get("slaapkamers"), hero_image_url=hero, )) if config.APP_ENV == "dev": break except Exception as e: log.warning("3dmakelaars: parse fout: %s", e) if len(cards) < 7: break page += 1 log.info("3dmakelaars: %d listings opgehaald", len(listings)) return listings # --------------------------------------------------------------------------- # Dupont ERA Makelaars (Schiedam/Rotterdam) # --------------------------------------------------------------------------- _DUPONT_BASE = "https://www.dupont.nl" _DUPONT_STATUS_MAP = { "te koop": "beschikbaar", "nieuw": "beschikbaar", "onder bod": "onder_bod", "verkocht onder voorbehoud": "onder_bod", "verkocht": "verkocht", } def _dupont_detail(detail_url: str) -> dict: """Fetch Dupont detail page and extract kenmerken from dt/dd pairs.""" try: soup = fetch_soup(detail_url) # Parse dt/dd pairs into label → value map kv: dict[str, str] = {} dts = soup.select("dt") dds = soup.select("dd") for dt, dd in zip(dts, dds): label = dt.get_text(strip=True).lower() value = dd.get_text(strip=True) kv[label] = value # Extract postcode from small tag (format: "NNNN AA CITY") postcode = None small_tag = soup.select_one("section div.container-fluid small") if small_tag: postcode = _extract_postcode(small_tag.get_text()) return { "postcode": postcode, "woningtype": kv.get("soort woning"), "bouwjaar": kv.get("bouwjaar"), "woonoppervlak": kv.get("woonoppervlakte"), "kamers": kv.get("aantal kamers"), "slaapkamers": kv.get("aantal slaapkamers"), "energielabel": kv.get("energielabel"), } except Exception as e: log.warning("dupont: detail fetch fout %s: %s", detail_url, e) return {} def fetch_dupont() -> list[RawListing]: """Fetch Dupont ERA Makelaars listings with pagination and detail pages.""" listings = [] page = 1 while True: url = f"{_DUPONT_BASE}/aanbod/koopwoningen?page={page}" soup = fetch_soup(url) cards = soup.select("article.object") if not cards: break for card in cards: try: # Extract URL a_tag = card.select_one("a[href]") if not a_tag or "href" not in a_tag.attrs: continue detail_url = a_tag["href"] if not detail_url.startswith("http"): detail_url = _DUPONT_BASE + detail_url # Extract listing-level data adres = _text(card, "h3") stad = _text(card, "h4") prijs_text = _text(card, "div.price") prijs = parse_prijs(prijs_text) # Extract status from label status_label = _text(card, "div.label") or "beschikbaar" status_label = status_label.strip().lower() status = _DUPONT_STATUS_MAP.get(status_label, "beschikbaar") # Extract image img_tag = card.select_one("img.img-responsive") hero = img_tag["src"] if img_tag else None if hero and not hero.startswith("http"): hero = _DUPONT_BASE + hero # Fetch detail page for full data detail_data = _dupont_detail(detail_url) # Use postcode from detail if available postcode = detail_data.get("postcode") listings.append(RawListing( url=detail_url, source_makelaar="dupont", adres=adres, postcode=postcode, stad=stad or _infer_stad(postcode), prijs=prijs, status=status, hero_image_url=hero, woningtype=detail_data.get("woningtype"), bouwjaar=int(detail_data["bouwjaar"]) if detail_data.get("bouwjaar") else None, woonoppervlak=parse_m2(detail_data.get("woonoppervlak")), kamers=int(detail_data["kamers"]) if detail_data.get("kamers") else None, slaapkamers=int(detail_data["slaapkamers"]) if detail_data.get("slaapkamers") else None, energielabel=detail_data.get("energielabel"), )) if config.APP_ENV == "dev": break except Exception as e: log.warning("dupont: parse fout: %s", e) if len(cards) < 10: break page += 1 log.info("dupont: %d listings opgehaald", len(listings)) return listings