update autoscraper, add another makelaar
This commit is contained in:
@@ -9,6 +9,7 @@ Usage:
|
||||
|
||||
import re
|
||||
import sys
|
||||
import json
|
||||
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
@@ -264,6 +265,31 @@ def cmd_details(url: str):
|
||||
alt = img.get("alt", "")
|
||||
print(f" {src} [{alt}]")
|
||||
|
||||
# JSON-LD
|
||||
print("\n=== JSON-LD (schema.org) ===")
|
||||
for tag in soup.select('script[type="application/ld+json"]'):
|
||||
try:
|
||||
ld = json.loads(tag.string)
|
||||
offered = ld.get("itemOffered", {})
|
||||
address = offered.get("address", {})
|
||||
floor_size = offered.get("floorSize", {})
|
||||
fields = {
|
||||
"woningtype": offered.get("@type"),
|
||||
"adres": address.get("streetAddress"),
|
||||
"postcode": address.get("postalCode"),
|
||||
"stad": address.get("addressLocality"),
|
||||
"prijs": ld.get("price"),
|
||||
"woonoppervlak": floor_size.get("value"),
|
||||
"kamers": offered.get("numberOfRooms"),
|
||||
"bouwjaar": offered.get("yearBuilt"),
|
||||
"availability": ld.get("availability"),
|
||||
"image": ld.get("image"),
|
||||
}
|
||||
for k, v in fields.items():
|
||||
mark = "✓" if v is not None else "✗"
|
||||
print(f" {mark} {k:<16} {v!r}")
|
||||
except Exception as e:
|
||||
print(f" parse fout: {e}")
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Entry point
|
||||
|
||||
@@ -28,7 +28,7 @@
|
||||
| [x] | Anke Bodewes Makelaardij | ankebodewes.nl | Hargplein 118 |
|
||||
| [x] | Woongoed Makelaars Schiedam | woongoedmakelaars.nl | Oranjestraat 93 |
|
||||
| [x] | Ooms Makelaars Schiedam | ooms.com | Gerrit Verboonstraat 2 |
|
||||
| [ ] | De Witte Garantiemakelaars | dewittegarantiemakelaars.nl | Philippusweg 2 |
|
||||
| [x] | De Witte Garantiemakelaars | dewittegarantiemakelaars.nl | Philippusweg 2 |
|
||||
| [ ] | Makelaardij Wassenaar | makelaardijwassenaar.nl | Gerrit Verboonstraat 12 |
|
||||
| [ ] | 3D Makelaars | 3dmakelaars.nl | Gerrit Verboonstraat 17 |
|
||||
| [ ] | Dupont Makelaars | dupont.nl | Rotterdamsedijk 437 |
|
||||
|
||||
@@ -186,6 +186,123 @@ def fetch_ankebodewes() -> list[RawListing]:
|
||||
def fetch_woongoed() -> list[RawListing]:
    """Return the Woongoed Makelaars listings.

    Delegates entirely to ``fetch_realworks``, handing it the site root and
    the ``"woongoed"`` key (presumably the source label stored on each
    listing — confirm against ``fetch_realworks``).
    """
    site_root = "https://www.woongoedmakelaars.nl"
    source_key = "woongoed"
    return fetch_realworks(site_root, source_key)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# De Witte Garantiemakelaars
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Site root; the overview URL and relative detail links are built from this.
_DEWITTE_BASE = "https://dewittegarantiemakelaars.nl"

# ``bg-*`` CSS class found on a card's status pill -> listing status.
# Classes that are not listed here are treated as "onder_bod" by the fetcher.
_DEWITTE_PILL_MAP = {
    "bg-fun-green": "beschikbaar",
    "bg-sold": "verkocht",
}

# JSON-LD ``@type`` of the offered item -> woningtype.
# Every non-apartment residence type collapses to the generic "woning".
_DEWITTE_TYPE_MAP = {
    "Apartment": "appartement",
    **dict.fromkeys(("House", "SingleFamilyResidence", "Residence"), "woning"),
}
|
||||
|
||||
|
||||
def _dewitte_jsonld(detail_url: str) -> dict:
    """Fetch a detail page and return its first JSON-LD object.

    Args:
        detail_url: Absolute URL of the listing's detail page.

    Returns:
        The parsed JSON-LD as a dict. ``{}`` is returned (after logging a
        warning) when the page cannot be fetched, has no non-empty
        ``application/ld+json`` script, the script is not valid JSON, or the
        payload contains no JSON object.
    """
    import json

    try:
        soup = fetch_soup(detail_url)
        tag = soup.select_one('script[type="application/ld+json"]')
        # ``tag.string`` is None/empty when the script element has no single
        # text child; treat that the same as a missing tag instead of letting
        # json.loads fail with an opaque TypeError.
        if not tag or not tag.string:
            log.warning("dewitte: geen JSON-LD op %s", detail_url)
            return {}
        data = json.loads(tag.string)
    except Exception as e:
        # Best effort: one broken detail page must not abort the whole scrape.
        log.warning("dewitte: JSON-LD fout %s: %s", detail_url, e)
        return {}

    # JSON-LD may legally be a top-level array; callers expect a single dict,
    # so use the first object in the array.
    if isinstance(data, list):
        data = next((item for item in data if isinstance(item, dict)), None)
    if not isinstance(data, dict):
        log.warning("dewitte: onverwacht JSON-LD formaat op %s", detail_url)
        return {}
    return data
|
||||
|
||||
|
||||
def fetch_dewittegarantiemakelaars() -> list[RawListing]:
    """Scrape the for-sale listings of De Witte Garantiemakelaars.

    Walks the paginated ``/woningaanbod`` overview (filtered on
    ``config.MAX_PRICE``), reads the status from each card's pill and takes
    all other fields from the JSON-LD on the card's detail page.

    Returns:
        One ``RawListing`` per card whose detail page yielded JSON-LD and
        whose price does not exceed ``config.MAX_PRICE``. Optional fields
        that are missing or unparsable are ``None``; they no longer cause
        the listing to be dropped.

    Raises:
        Whatever ``fetch_soup`` raises for an overview page; only per-card
        failures are caught and logged.
    """
    from urllib.parse import urljoin

    listings = []
    seen_urls = set()
    page = 1

    while True:
        url = (
            f"{_DEWITTE_BASE}/woningaanbod"
            f"?buy_rent=buy&buy_price=1-{config.MAX_PRICE}&page={page}"
        )
        soup = fetch_soup(url)
        cards = soup.select("div.card.card--property")
        if not cards:
            break

        new_on_page = 0
        for card in cards:
            detail_url = None
            try:
                a_tag = card.select_one("a.card__anchor")
                href = a_tag.get("href") if a_tag else None
                if not href:
                    continue
                # urljoin keeps absolute links as-is and also resolves
                # relative links that lack a leading slash.
                detail_url = urljoin(_DEWITTE_BASE + "/", href)
                if detail_url in seen_urls:
                    continue
                seen_urls.add(detail_url)
                new_on_page += 1

                pill = card.select_one("span.pill")
                pill_classes = pill.get("class", []) if pill else []
                status_key = next(
                    (c for c in pill_classes if c.startswith("bg-")), None
                )
                # Any pill colour that is not mapped counts as "onder_bod".
                status = _DEWITTE_PILL_MAP.get(status_key, "onder_bod")

                ld = _dewitte_jsonld(detail_url)
                if not ld or not isinstance(ld, dict):
                    continue

                # JSON ``null`` or a scalar in place of an object would break
                # the chained ``.get`` calls, so coerce to a dict first.
                offered = _dewitte_dict(ld.get("itemOffered"))
                address = _dewitte_dict(offered.get("address"))
                floor_size = _dewitte_dict(offered.get("floorSize"))

                postcode = (
                    str(address.get("postalCode") or "").replace(" ", "") or None
                )
                stad = address.get("addressLocality") or None
                adres = address.get("streetAddress") or None

                prijs = _dewitte_int(ld.get("price"))
                if prijs is not None and prijs > config.MAX_PRICE:
                    continue

                woningtype = _DEWITTE_TYPE_MAP.get(offered.get("@type", ""))

                # Full-res image from JSON-LD, fall back to card thumbnail
                hero = _dewitte_image_url(ld.get("image"))
                if not hero:
                    img = card.select_one("picture img")
                    hero = img.get("src") if img else None

                listings.append(RawListing(
                    url=detail_url,
                    source_makelaar="dewittegarantiemakelaars",
                    status=status,
                    adres=adres,
                    postcode=postcode,
                    stad=stad,
                    prijs=prijs,
                    woningtype=woningtype,
                    woonoppervlak=_dewitte_int(floor_size.get("value")),
                    kamers=_dewitte_int(offered.get("numberOfRooms")),
                    bouwjaar=_dewitte_int(offered.get("yearBuilt")),
                    hero_image_url=hero,
                ))
            except Exception as e:
                # Best effort per card: log which listing failed and move on.
                log.warning("dewitte: parse fout %s: %s", detail_url, e)

        # A short page is the last page (assumes 10 cards per full page —
        # TODO confirm). A page without any new link means the site is
        # repeating itself; stop instead of looping forever.
        if len(cards) < 10 or new_on_page == 0:
            break
        page += 1

    log.info("dewittegarantiemakelaars: %d listings opgehaald", len(listings))
    return listings


def _dewitte_dict(value) -> dict:
    """Return *value* if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _dewitte_int(value) -> int | None:
    """Convert a JSON-LD number-ish value to int, or None if not possible.

    Falsy input maps to None. Strings with at most two decimals (such as
    "350000.00") are truncated to their integer part; anything else that
    ``int()`` rejects yields None, so an ambiguous value like "350.000" is
    never silently read as 350.
    """
    if not value:
        return None
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        pass
    whole, dot, fraction = str(value).strip().partition(".")
    if dot and whole.isdecimal() and fraction.isdecimal() and len(fraction) <= 2:
        return int(whole)
    return None


def _dewitte_image_url(image) -> str | None:
    """Reduce a JSON-LD ``image`` value (URL, ImageObject or list) to one URL."""
    if isinstance(image, list):
        image = image[0] if image else None
    if isinstance(image, dict):
        image = image.get("url") or image.get("contentUrl")
    return image if isinstance(image, str) and image else None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# SSR helper utils
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -228,4 +345,5 @@ def _infer_stad(postcode: str | None) -> str | None:
|
||||
# Registry of scraper entry points: scraper name -> zero-argument fetch function.
SCRAPERS = {
    "ankebodewes": fetch_ankebodewes,
    "woongoed": fetch_woongoed,
    "dewittegarantiemakelaars": fetch_dewittegarantiemakelaars,
}
|
||||
|
||||
Reference in New Issue
Block a user