update autoscraper, add another makelaar
This commit is contained in:
@@ -9,6 +9,7 @@ Usage:
|
|||||||
|
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
import json
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
from bs4 import BeautifulSoup, Tag
|
from bs4 import BeautifulSoup, Tag
|
||||||
@@ -264,6 +265,31 @@ def cmd_details(url: str):
|
|||||||
alt = img.get("alt", "")
|
alt = img.get("alt", "")
|
||||||
print(f" {src} [{alt}]")
|
print(f" {src} [{alt}]")
|
||||||
|
|
||||||
|
# JSON-LD
|
||||||
|
print("\n=== JSON-LD (schema.org) ===")
|
||||||
|
for tag in soup.select('script[type="application/ld+json"]'):
|
||||||
|
try:
|
||||||
|
ld = json.loads(tag.string)
|
||||||
|
offered = ld.get("itemOffered", {})
|
||||||
|
address = offered.get("address", {})
|
||||||
|
floor_size = offered.get("floorSize", {})
|
||||||
|
fields = {
|
||||||
|
"woningtype": offered.get("@type"),
|
||||||
|
"adres": address.get("streetAddress"),
|
||||||
|
"postcode": address.get("postalCode"),
|
||||||
|
"stad": address.get("addressLocality"),
|
||||||
|
"prijs": ld.get("price"),
|
||||||
|
"woonoppervlak": floor_size.get("value"),
|
||||||
|
"kamers": offered.get("numberOfRooms"),
|
||||||
|
"bouwjaar": offered.get("yearBuilt"),
|
||||||
|
"availability": ld.get("availability"),
|
||||||
|
"image": ld.get("image"),
|
||||||
|
}
|
||||||
|
for k, v in fields.items():
|
||||||
|
mark = "✓" if v is not None else "✗"
|
||||||
|
print(f" {mark} {k:<16} {v!r}")
|
||||||
|
except Exception as e:
|
||||||
|
print(f" parse fout: {e}")
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Entry point
|
# Entry point
|
||||||
|
|||||||
@@ -28,7 +28,7 @@
|
|||||||
| [x] | Anke Bodewes Makelaardij | ankebodewes.nl | Hargplein 118 |
|
| [x] | Anke Bodewes Makelaardij | ankebodewes.nl | Hargplein 118 |
|
||||||
| [x] | Woongoed Makelaars Schiedam | woongoedmakelaars.nl | Oranjestraat 93 |
|
| [x] | Woongoed Makelaars Schiedam | woongoedmakelaars.nl | Oranjestraat 93 |
|
||||||
| [x] | Ooms Makelaars Schiedam | ooms.com | Gerrit Verboonstraat 2 |
|
| [x] | Ooms Makelaars Schiedam | ooms.com | Gerrit Verboonstraat 2 |
|
||||||
| [ ] | De Witte Garantiemakelaars | dewittegarantiemakelaars.nl | Philippusweg 2 |
|
| [x] | De Witte Garantiemakelaars | dewittegarantiemakelaars.nl | Philippusweg 2 |
|
||||||
| [ ] | Makelaardij Wassenaar | makelaardijwassenaar.nl | Gerrit Verboonstraat 12 |
|
| [ ] | Makelaardij Wassenaar | makelaardijwassenaar.nl | Gerrit Verboonstraat 12 |
|
||||||
| [ ] | 3D Makelaars | 3dmakelaars.nl | Gerrit Verboonstraat 17 |
|
| [ ] | 3D Makelaars | 3dmakelaars.nl | Gerrit Verboonstraat 17 |
|
||||||
| [ ] | Dupont Makelaars | dupont.nl | Rotterdamsedijk 437 |
|
| [ ] | Dupont Makelaars | dupont.nl | Rotterdamsedijk 437 |
|
||||||
|
|||||||
@@ -186,6 +186,123 @@ def fetch_ankebodewes() -> list[RawListing]:
|
|||||||
def fetch_woongoed() -> list[RawListing]:
    """Return the listings of woongoedmakelaars.nl by delegating to ``fetch_realworks``."""
    site_root = "https://www.woongoedmakelaars.nl"
    source_name = "woongoed"
    return fetch_realworks(site_root, source_name)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# De Witte Garantiemakelaars
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Site origin; used to build the overview URL and to resolve relative detail links.
_DEWITTE_BASE = "https://dewittegarantiemakelaars.nl"

# CSS background class on a card's status pill -> listing status.
_DEWITTE_PILL_MAP = {"bg-fun-green": "beschikbaar", "bg-sold": "verkocht"}

# JSON-LD "@type" of the offered item -> woningtype.
_DEWITTE_TYPE_MAP = {
    "Apartment": "appartement",
    **dict.fromkeys(("House", "SingleFamilyResidence", "Residence"), "woning"),
}
|
||||||
|
|
||||||
|
|
||||||
|
def _dewitte_jsonld(detail_url: str) -> dict:
    """Fetch a detail page and return its parsed JSON-LD object, or {} on failure.

    All ``<script type="application/ld+json">`` tags on the page are parsed.
    A top-level JSON array and an ``@graph`` list are flattened into their
    member objects. The first object carrying an ``itemOffered`` key is
    returned; if none has one, the first object found is returned.

    Args:
        detail_url: Absolute URL of the listing detail page.

    Returns:
        A dict, never a list or None. ``{}`` when the page cannot be fetched,
        has no JSON-LD script, or contains no parseable JSON object.
    """
    import json

    try:
        soup = fetch_soup(detail_url)
        tags = soup.select('script[type="application/ld+json"]')
        if not tags:
            log.warning("dewitte: geen JSON-LD op %s", detail_url)
            return {}

        nodes = []
        for tag in tags:
            # .string is None for an empty script or one with several children;
            # passing that to json.loads would raise TypeError.
            raw = tag.string
            if not raw or not raw.strip():
                continue
            try:
                payload = json.loads(raw)
            except ValueError as e:
                # One malformed script must not hide a valid one further down.
                log.warning("dewitte: JSON-LD fout %s: %s", detail_url, e)
                continue
            for item in payload if isinstance(payload, list) else [payload]:
                if not isinstance(item, dict):
                    continue
                nodes.append(item)
                graph = item.get("@graph")
                if isinstance(graph, list):
                    nodes.extend(n for n in graph if isinstance(n, dict))

        for node in nodes:
            if "itemOffered" in node:
                return node
        if nodes:
            return nodes[0]

        log.warning("dewitte: geen JSON-LD op %s", detail_url)
        return {}
    except Exception as e:
        # Best-effort helper: any fetch/parse problem degrades to "no data".
        log.warning("dewitte: JSON-LD fout %s: %s", detail_url, e)
        return {}
|
||||||
|
|
||||||
|
|
||||||
|
def _dewitte_int(value) -> int | None:
    """Best-effort int conversion of a JSON-LD value; None when empty or non-numeric."""
    if isinstance(value, dict):
        # Structured values (e.g. floorSize) carry the number under "value".
        value = value.get("value")
    if not value:
        return None
    try:
        return int(value)
    except (TypeError, ValueError):
        pass
    try:
        # Decimal strings such as "325000.00" are rejected by int() directly.
        return int(float(value))
    except (TypeError, ValueError, OverflowError):
        return None


def _dewitte_parse_card(card) -> RawListing | None:
    """Build a RawListing from one overview card, or None when the card is skipped."""
    from urllib.parse import urljoin

    a_tag = card.select_one("a.card__anchor")
    href = a_tag.get("href") if a_tag else None
    if not href:
        return None
    # urljoin leaves absolute URLs alone and resolves both "/path" and "path".
    detail_url = urljoin(_DEWITTE_BASE + "/", href)

    # The status is encoded in the pill's "bg-*" class; anything unmapped
    # (including a missing pill) is reported as "onder_bod".
    pill = card.select_one("span.pill")
    pill_classes = pill.get("class", []) if pill else []
    status_key = next((c for c in pill_classes if c.startswith("bg-")), None)
    status = _DEWITTE_PILL_MAP.get(status_key, "onder_bod")

    ld = _dewitte_jsonld(detail_url)
    if not ld or not isinstance(ld, dict):
        return None

    def as_dict(value):
        # JSON null or a plain string in place of an object becomes {}.
        return value if isinstance(value, dict) else {}

    offered = as_dict(ld.get("itemOffered"))
    address = as_dict(offered.get("address"))

    prijs = _dewitte_int(ld.get("price"))
    if prijs is not None and prijs > config.MAX_PRICE:
        return None

    postcode = str(address.get("postalCode") or "").replace(" ", "") or None
    offered_type = offered.get("@type")
    # "@type" may be a list in JSON-LD, which is unhashable as a dict key.
    woningtype = (
        _DEWITTE_TYPE_MAP.get(offered_type) if isinstance(offered_type, str) else None
    )

    # Full-res image from JSON-LD, fall back to card thumbnail.
    hero = ld.get("image")
    if isinstance(hero, list):
        hero = hero[0] if hero else None
    if isinstance(hero, dict):
        hero = hero.get("url") or hero.get("contentUrl")
    if not hero:
        img = card.select_one("picture img")
        hero = img.get("src") if img else None

    return RawListing(
        url=detail_url,
        source_makelaar="dewittegarantiemakelaars",
        status=status,
        adres=address.get("streetAddress") or None,
        postcode=postcode,
        stad=address.get("addressLocality") or None,
        prijs=prijs,
        woningtype=woningtype,
        woonoppervlak=_dewitte_int(offered.get("floorSize")),
        kamers=_dewitte_int(offered.get("numberOfRooms")),
        bouwjaar=_dewitte_int(offered.get("yearBuilt")),
        hero_image_url=hero,
    )


def fetch_dewittegarantiemakelaars() -> list[RawListing]:
    """Scrape the De Witte Garantiemakelaars buy listings up to ``config.MAX_PRICE``.

    Walks the paginated ``/woningaanbod`` overview. For every property card it
    fetches the detail page's JSON-LD and converts it into a ``RawListing``.
    Cards without a link, without JSON-LD, or priced above
    ``config.MAX_PRICE`` are skipped. Numeric fields that cannot be parsed
    become ``None`` rather than discarding the listing.

    Returns:
        All listings collected across the overview pages.

    Raises:
        Whatever ``fetch_soup`` raises for an overview page; only per-card
        failures are caught and logged.
    """
    listings = []
    page = 1

    while True:
        url = (
            f"{_DEWITTE_BASE}/woningaanbod"
            f"?buy_rent=buy&buy_price=1-{config.MAX_PRICE}&page={page}"
        )
        soup = fetch_soup(url)
        cards = soup.select("div.card.card--property")
        if not cards:
            break

        for card in cards:
            try:
                listing = _dewitte_parse_card(card)
            except Exception as e:
                # One broken card must not abort the rest of the page.
                log.warning("dewitte: parse fout: %s", e)
                continue
            if listing is not None:
                listings.append(listing)

        # NOTE(review): treats a page with fewer than 10 cards as the last
        # page — confirm this matches the site's actual page size.
        if len(cards) < 10:
            break
        page += 1

    log.info("dewittegarantiemakelaars: %d listings opgehaald", len(listings))
    return listings
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# SSR helper utils
|
# SSR helper utils
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -228,4 +345,5 @@ def _infer_stad(postcode: str | None) -> str | None:
|
|||||||
# Scraper registry: short source name -> zero-argument fetch function.
SCRAPERS = {
    "ankebodewes": fetch_ankebodewes,
    "woongoed": fetch_woongoed,
    "dewittegarantiemakelaars": fetch_dewittegarantiemakelaars,
}
|
||||||
|
|||||||
Reference in New Issue
Block a user