diff --git a/autoscraper.py b/autoscraper.py index ee5d6c3..1988649 100644 --- a/autoscraper.py +++ b/autoscraper.py @@ -9,6 +9,7 @@ Usage: import re import sys +import json import httpx from bs4 import BeautifulSoup, Tag @@ -264,6 +265,31 @@ def cmd_details(url: str): alt = img.get("alt", "") print(f" {src} [{alt}]") + # JSON-LD + print("\n=== JSON-LD (schema.org) ===") + for tag in soup.select('script[type="application/ld+json"]'): + try: + ld = json.loads(tag.string) + offered = ld.get("itemOffered", {}) + address = offered.get("address", {}) + floor_size = offered.get("floorSize", {}) + fields = { + "woningtype": offered.get("@type"), + "adres": address.get("streetAddress"), + "postcode": address.get("postalCode"), + "stad": address.get("addressLocality"), + "prijs": ld.get("price"), + "woonoppervlak": floor_size.get("value"), + "kamers": offered.get("numberOfRooms"), + "bouwjaar": offered.get("yearBuilt"), + "availability": ld.get("availability"), + "image": ld.get("image"), + } + for k, v in fields.items(): + mark = "✓" if v is not None else "✗" + print(f" {mark} {k:<16} {v!r}") + except Exception as e: + print(f" parse fout: {e}") # --------------------------------------------------------------------------- # Entry point diff --git a/makelaars.md b/makelaars.md index e6f565b..a1e5ca0 100644 --- a/makelaars.md +++ b/makelaars.md @@ -28,7 +28,7 @@ | [x] | Anke Bodewes Makelaardij | ankebodewes.nl | Hargplein 118 | | [x] | Woongoed Makelaars Schiedam | woongoedmakelaars.nl | Oranjestraat 93 | | [x] | Ooms Makelaars Schiedam | ooms.com | Gerrit Verboonstraat 2 | -| [ ] | De Witte Garantiemakelaars | dewittegarantiemakelaars.nl | Philippusweg 2 | +| [x] | De Witte Garantiemakelaars | dewittegarantiemakelaars.nl | Philippusweg 2 | | [ ] | Makelaardij Wassenaar | makelaardijwassenaar.nl | Gerrit Verboonstraat 12 | | [ ] | 3D Makelaars | 3dmakelaars.nl | Gerrit Verboonstraat 17 | | [ ] | Dupont Makelaars | dupont.nl | Rotterdamsedijk 437 | diff --git 
# File: src/adapters/ssr.py -- post-image of the diff hunks, reformatted.


def fetch_woongoed() -> list[RawListing]:
    """Fetch the Woongoed Makelaars listings through ``fetch_realworks``."""
    return fetch_realworks("https://www.woongoedmakelaars.nl", "woongoed")


# ---------------------------------------------------------------------------
# De Witte Garantiemakelaars
# ---------------------------------------------------------------------------

_DEWITTE_BASE = "https://dewittegarantiemakelaars.nl"

# A full overview page holds this many cards; a shorter page is the last one.
_DEWITTE_PAGE_SIZE = 10

# Hard stop for pagination so a site that ignores ``page`` cannot loop forever.
_DEWITTE_MAX_PAGES = 100

# ``bg-*`` CSS class on the card's status pill -> listing status.
# Any other (or missing) pill falls back to "onder_bod" in the scraper.
_DEWITTE_PILL_MAP = {
    "bg-fun-green": "beschikbaar",
    "bg-sold": "verkocht",
}

# schema.org ``@type`` of ``itemOffered`` -> woningtype.
_DEWITTE_TYPE_MAP = {
    "Apartment": "appartement",
    "House": "woning",
    "SingleFamilyResidence": "woning",
    "Residence": "woning",
}


def _dewitte_dict(value) -> dict:
    """Return *value* if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _dewitte_int(value) -> int | None:
    """Convert a JSON-LD scalar to ``int``.

    Accepts ints, floats and numeric strings (including decimal strings such
    as ``"350000.00"``). Returns ``None`` for missing, unparseable or zero
    values, so a bad field never discards the whole listing.
    """
    if value is None or isinstance(value, bool):
        return None
    try:
        number = int(float(str(value).strip()))
    except (ValueError, OverflowError):
        return None
    # Zero is treated as "not provided", like the falsy checks it replaces.
    return number or None


def _dewitte_image_url(image) -> str | None:
    """Extract one URL from a JSON-LD ``image`` value (str, list or dict)."""
    if isinstance(image, list):
        image = image[0] if image else None
    if isinstance(image, dict):
        image = image.get("url") or image.get("contentUrl")
    return image if isinstance(image, str) and image else None


def _dewitte_jsonld(detail_url: str) -> dict:
    """Fetch a detail page and return its parsed JSON-LD dict, or {} on failure.

    All ``application/ld+json`` scripts are inspected. Top-level lists and
    ``@graph`` containers are flattened. The first node carrying
    ``itemOffered`` wins; if none has it, the first dict node found is
    returned. The result is always a dict.
    """
    import json

    try:
        soup = fetch_soup(detail_url)
        fallback: dict = {}
        for tag in soup.select('script[type="application/ld+json"]'):
            raw = tag.string
            if not raw or not raw.strip():
                continue
            try:
                data = json.loads(raw)
            except json.JSONDecodeError as e:
                log.warning("dewitte: JSON-LD fout %s: %s", detail_url, e)
                continue

            if isinstance(data, dict) and isinstance(data.get("@graph"), list):
                nodes = data["@graph"]
            elif isinstance(data, list):
                nodes = data
            else:
                nodes = [data]

            for node in nodes:
                if not isinstance(node, dict):
                    continue
                if "itemOffered" in node:
                    return node
                if not fallback:
                    fallback = node

        if not fallback:
            log.warning("dewitte: geen JSON-LD op %s", detail_url)
        return fallback
    except Exception as e:
        # Best effort: a broken detail page must not abort the whole scrape.
        log.warning("dewitte: JSON-LD fout %s: %s", detail_url, e)
        return {}


def _dewitte_listing(card, detail_url: str) -> RawListing | None:
    """Build a RawListing from an overview card plus its detail-page JSON-LD.

    Returns ``None`` when the detail page has no usable JSON-LD or the price
    exceeds ``config.MAX_PRICE``.
    """
    pill = card.select_one("span.pill")
    pill_classes = pill.get("class", []) if pill else []
    status_key = next((c for c in pill_classes if c.startswith("bg-")), None)
    status = _DEWITTE_PILL_MAP.get(status_key, "onder_bod")

    ld = _dewitte_jsonld(detail_url)
    if not ld:
        return None

    # Any of these may be absent, null or a plain string in the wild.
    offered = _dewitte_dict(ld.get("itemOffered"))
    address = _dewitte_dict(offered.get("address"))
    floor_size = _dewitte_dict(offered.get("floorSize"))

    prijs = _dewitte_int(ld.get("price"))
    if prijs is not None and prijs > config.MAX_PRICE:
        return None

    # ``@type`` may be a single name or a list of names.
    raw_type = offered.get("@type")
    type_names = raw_type if isinstance(raw_type, list) else [raw_type]
    woningtype = next(
        (
            _DEWITTE_TYPE_MAP[name]
            for name in type_names
            if isinstance(name, str) and name in _DEWITTE_TYPE_MAP
        ),
        None,
    )

    # Full-res image from JSON-LD, fall back to the card thumbnail.
    hero = _dewitte_image_url(ld.get("image"))
    if not hero:
        img = card.select_one("picture img")
        hero = (img.get("src") or None) if img else None

    return RawListing(
        url=detail_url,
        source_makelaar="dewittegarantiemakelaars",
        status=status,
        adres=address.get("streetAddress") or None,
        postcode=str(address.get("postalCode") or "").replace(" ", "") or None,
        stad=address.get("addressLocality") or None,
        prijs=prijs,
        woningtype=woningtype,
        woonoppervlak=_dewitte_int(floor_size.get("value")),
        kamers=_dewitte_int(offered.get("numberOfRooms")),
        bouwjaar=_dewitte_int(offered.get("yearBuilt")),
        hero_image_url=hero,
    )


def fetch_dewittegarantiemakelaars() -> list[RawListing]:
    """Scrape all for-sale listings up to ``config.MAX_PRICE``.

    Walks the paginated overview, follows every card to its detail page and
    reads the fields from the JSON-LD there. Pagination stops on an empty
    page, a short page, a page with only already-seen URLs, or after
    ``_DEWITTE_MAX_PAGES`` pages. A failure on one card is logged and skipped.
    """
    from urllib.parse import urljoin

    listings: list[RawListing] = []
    seen_urls: set[str] = set()

    for page in range(1, _DEWITTE_MAX_PAGES + 1):
        url = (
            f"{_DEWITTE_BASE}/woningaanbod"
            f"?buy_rent=buy&buy_price=1-{config.MAX_PRICE}&page={page}"
        )
        soup = fetch_soup(url)
        cards = soup.select("div.card.card--property")
        if not cards:
            break

        new_on_page = 0
        for card in cards:
            try:
                a_tag = card.select_one("a.card__anchor")
                href = a_tag.get("href") if a_tag else None
                if not href:
                    continue
                # urljoin handles absolute, root-relative and relative hrefs.
                detail_url = urljoin(_DEWITTE_BASE + "/", href)
                if detail_url in seen_urls:
                    continue
                seen_urls.add(detail_url)
                new_on_page += 1

                listing = _dewitte_listing(card, detail_url)
            except Exception as e:
                log.warning("dewitte: parse fout: %s", e)
                continue
            if listing is not None:
                listings.append(listing)

        # Only repeats means the site is re-serving an earlier page.
        if new_on_page == 0 or len(cards) < _DEWITTE_PAGE_SIZE:
            break
    else:
        log.warning(
            "dewitte: paginalimiet (%d) bereikt, gestopt", _DEWITTE_MAX_PAGES
        )

    log.info("dewittegarantiemakelaars: %d listings opgehaald", len(listings))
    return listings


# ---------------------------------------------------------------------------
# SSR helper utils
# ---------------------------------------------------------------------------
# (The helper definitions between these two hunks are outside the visible
# diff and are not reproduced here.)

SCRAPERS = {
    'ankebodewes': fetch_ankebodewes,
    'woongoed': fetch_woongoed,
    'dewittegarantiemakelaars': fetch_dewittegarantiemakelaars,
}