ever onwards
This commit is contained in:
@@ -219,6 +219,8 @@ If the CMS is unknown, the tool prints structural diagnostics (card selectors, f
|
|||||||
|
|
||||||
## Important Notes
|
## Important Notes
|
||||||
|
|
||||||
|
Don't treat detail pages as optional: always fetch them, because we want all the available info for every listing.
|
||||||
|
|
||||||
### Status Mapping
|
### Status Mapping
|
||||||
Brokers use different status strings. Always map to one of:
|
Brokers use different status strings. Always map to one of:
|
||||||
- `"beschikbaar"` — Available for sale
|
- `"beschikbaar"` — Available for sale
|
||||||
@@ -270,6 +272,7 @@ The database stores this as JSON in the `extra` column.
|
|||||||
- Nominatim (geocoding) has a 1 req/s limiter built into `huizenbot.py`
|
- Nominatim (geocoding) has a 1 req/s limiter built into `huizenbot.py`
|
||||||
- Never spawn parallel requests without the human's approval
|
- Never spawn parallel requests without the human's approval
|
||||||
- Always use the `USER_AGENT` header (includes contact info for respectful scraping)
|
- Always use the `USER_AGENT` header (includes contact info for respectful scraping)
|
||||||
|
- Don't keep curling the same endpoint. Fetch it once, save the response to `<name makelaar>.dump`, and then `rg` through that file to find what you need. You can also pipe it through `bsprettify.py` first and `rg` the prettified output.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|||||||
@@ -30,7 +30,7 @@
|
|||||||
| [x] | Ooms Makelaars Schiedam | ooms.com | Gerrit Verboonstraat 2 |
|
| [x] | Ooms Makelaars Schiedam | ooms.com | Gerrit Verboonstraat 2 |
|
||||||
| [x] | De Witte Garantiemakelaars | dewittegarantiemakelaars.nl | Philippusweg 2 |
|
| [x] | De Witte Garantiemakelaars | dewittegarantiemakelaars.nl | Philippusweg 2 |
|
||||||
| [x] | Makelaardij Wassenaar | makelaardijwassenaar.nl | Gerrit Verboonstraat 12 |
|
| [x] | Makelaardij Wassenaar | makelaardijwassenaar.nl | Gerrit Verboonstraat 12 |
|
||||||
| [ ] | 3D Makelaars | 3dmakelaars.nl | Gerrit Verboonstraat 17 |
|
| [x] | 3D Makelaars | 3dmakelaars.nl | Gerrit Verboonstraat 17 |
|
||||||
| [ ] | Dupont Makelaars | dupont.nl | Rotterdamsedijk 437 |
|
| [ ] | Dupont Makelaars | dupont.nl | Rotterdamsedijk 437 |
|
||||||
| [x] | D&S Makelaardij | densmakelaars.nl | Land van Belofte 50 |
|
| [x] | D&S Makelaardij | densmakelaars.nl | Land van Belofte 50 |
|
||||||
| [ ] | Moerman & De Jong Makelaars | moerman-dejong.nl | Lange Kerkstraat 80B |
|
| [ ] | Moerman & De Jong Makelaars | moerman-dejong.nl | Lange Kerkstraat 80B |
|
||||||
|
|||||||
@@ -608,6 +608,132 @@ def fetch_dens() -> list[RawListing]:
|
|||||||
return listings
|
return listings
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 3D Makelaars (Schiedam/Vlaardingen)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Site root for 3D Makelaars; the adapter prefixes listing, detail and image paths with it.
_3D_BASE = "https://3dmakelaars.nl"
|
||||||
|
|
||||||
|
|
||||||
|
def _3d_first_int(text):
    """Return the first run of digits in *text* as an int, or None if there is none.

    Tolerates decorated values such as "5 kamers" or "ca. 1930" as well as
    empty/None input, so that one oddly formatted field cannot raise.
    """
    if not text:
        return None
    match = re.search(r"\d+", text)
    return int(match.group()) if match else None


def _3dmakelaars_detail(detail_url: str) -> dict:
    """Fetch a 3dmakelaars detail page and extract its structured info block.

    Args:
        detail_url: Absolute URL of the listing's detail page.

    Returns:
        A dict with the keys ``kamers``, ``slaapkamers``, ``bouwjaar``,
        ``woningtype``, ``woonoppervlak`` and ``postcode``; each value is
        None when the page does not provide it. Returns an empty dict when
        the page cannot be fetched or parsed (the failure is logged, never
        raised), so callers can always use ``.get()`` on the result.
    """
    try:
        soup = fetch_soup(detail_url)

        # The info block is a list of <li> items, each holding a <span> label
        # and a <p> value. The misspelled class names are the selectors the
        # scraper targets on the site, so they must stay exactly as written.
        kv: dict[str, str] = {}
        for li in soup.select("div.tl-adiltional-inforamtion ul.tl-adiltional-listed li"):
            label_el = li.select_one("span")
            value_el = li.select_one("p")
            if label_el and value_el:
                label = label_el.get_text(strip=True).lower()
                kv[label] = value_el.get_text(strip=True)

        # The postcode is looked up in the first paragraph of the description.
        postcode = None
        p_tag = soup.select_one(".omschrijving > p:nth-child(1)")
        if p_tag:
            postcode = _extract_postcode(p_tag.get_text())

        # Numeric fields are parsed leniently: previously a single value that
        # int() rejected raised inside this try block and threw away every
        # other field (including the postcode) along with it.
        return {
            "kamers": _3d_first_int(kv.get("aantal kamers")),
            "slaapkamers": _3d_first_int(kv.get("aantal slaapkamers")),
            "bouwjaar": _3d_first_int(kv.get("bouwjaar")),
            "woningtype": kv.get("bouwvorm"),
            "woonoppervlak": parse_m2(kv.get("oppervlakte")),
            "postcode": postcode,
        }
    except Exception as e:
        # Deliberate best-effort boundary: a broken detail page must not
        # abort the whole scrape, the caller falls back to card-level data.
        log.warning("3dmakelaars: detail fetch fout %s: %s", detail_url, e)
        return {}
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_3dmakelaars() -> list[RawListing]:
    """Fetch 3D Makelaars listings, following pagination.

    Walks the paginated overview, reads address, price, rooms, living area
    and hero image from each card, and enriches every listing with data from
    its detail page via ``_3dmakelaars_detail``.

    Pagination stops when a page has no cards, has fewer cards than a full
    page, or contains no detail URL that has not been seen before (guards
    against a site that keeps serving the same page for out-of-range page
    numbers). A listing URL is only processed once.

    Returns:
        The collected ``RawListing`` objects. Cards that fail to parse are
        logged and skipped.
    """
    # Function-scope import so this adapter does not depend on the module's
    # import block.
    from urllib.parse import urljoin

    # NOTE(review): 7 is taken to be the number of cards on a full overview
    # page; a shorter page is treated as the last one — confirm against the site.
    page_size = 7

    listings = []
    seen_urls = set()
    page = 1

    while True:
        url = (
            f"{_3D_BASE}/woningen-te-koop-in-schiedam-en-vlaardingen"
            f"?kamers=&oppervlakte=&woonplaats=&video=&prijs=3&page={page}"
        )
        soup = fetch_soup(url)
        cards = soup.select("div.tl-properties-item")
        if not cards:
            break

        new_on_page = 0
        for card in cards:
            try:
                # The detail link is taken from the card's inline onclick
                # handler (window.location = '...').
                m = re.search(
                    r"window\.location\s*=\s*['\"]([^'\"]+)['\"]",
                    card.get("onclick", ""),
                )
                if not m:
                    continue

                # urljoin copes with root-relative, relative and absolute
                # targets; plain concatenation only worked for "/path".
                detail_url = urljoin(_3D_BASE, m.group(1))
                if detail_url in seen_urls:
                    # Already handled: skip it to avoid a duplicate listing
                    # and a redundant detail-page request.
                    continue
                seen_urls.add(detail_url)
                new_on_page += 1

                # Listing-level info.
                # NOTE(review): the address is read from `h3.price` and the
                # price from `span.address`; the selectors look swapped —
                # confirm this matches the site's markup.
                adres = _text(card, "h3.price")
                prijs_text = _text(card, "span.address")
                prijs = parse_prijs(prijs_text)

                # Rooms and living area come from the card's meta list.
                kamers = None
                woonoppervlak = None
                for li in card.select("ul.tl-meta-listed > li"):
                    text = li.get_text(strip=True)
                    if "kamers" in text.lower():
                        digits = re.search(r"(\d+)", text)
                        if digits:
                            kamers = int(digits.group(1))
                    elif "m²" in text or "m2" in text:
                        woonoppervlak = parse_m2(text)

                # Hero image. `.get` instead of indexing: an <img> without a
                # `src` attribute used to raise KeyError and drop the listing.
                img_tag = card.select_one("img")
                src = img_tag.get("src") if img_tag else None
                hero = urljoin(_3D_BASE, src) if src else None

                # Detail page for the full info set.
                detail_data = _3dmakelaars_detail(detail_url)

                # Postcode from the detail page, falling back to the address.
                postcode = detail_data.get("postcode")
                if not postcode and adres:
                    postcode = _extract_postcode(adres)

                listings.append(RawListing(
                    url=detail_url,
                    source_makelaar="3dmakelaars",
                    adres=adres,
                    postcode=postcode,
                    stad=_infer_stad(postcode),
                    prijs=prijs,
                    woningtype=detail_data.get("woningtype"),
                    bouwjaar=detail_data.get("bouwjaar"),
                    # Card values win; the detail page fills the gaps.
                    woonoppervlak=woonoppervlak or detail_data.get("woonoppervlak"),
                    kamers=kamers or detail_data.get("kamers"),
                    slaapkamers=detail_data.get("slaapkamers"),
                    hero_image_url=hero,
                ))
            except Exception as e:
                # Best-effort per card: one malformed card must not stop the run.
                log.warning("3dmakelaars: parse fout: %s", e)

        if len(cards) < page_size:
            break
        if new_on_page == 0:
            # Nothing new on a full page: the site is repeating itself.
            break
        page += 1

    log.info("3dmakelaars: %d listings opgehaald", len(listings))
    return listings
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# SCRAPERS — exporteer hier alle actieve SSR adapters
|
# SCRAPERS — exporteer hier alle actieve SSR adapters
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -618,4 +744,5 @@ SCRAPERS = {
|
|||||||
'dewittegarantiemakelaars': fetch_dewittegarantiemakelaars,
|
'dewittegarantiemakelaars': fetch_dewittegarantiemakelaars,
|
||||||
'wassenaar': fetch_wassenaar,
|
'wassenaar': fetch_wassenaar,
|
||||||
'dens': fetch_dens,
|
'dens': fetch_dens,
|
||||||
|
'3dmakelaars': fetch_3dmakelaars,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -16,7 +16,7 @@ logging.basicConfig(
|
|||||||
)
|
)
|
||||||
|
|
||||||
# --- change this to test a different adapter ---
|
# --- change this to test a different adapter ---
|
||||||
ADAPTER = SCRAPERS['wassenaar']
|
ADAPTER = SCRAPERS['3dmakelaars']
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
print(f"Testing adapter: {ADAPTER.__name__}")
|
print(f"Testing adapter: {ADAPTER.__name__}")
|
||||||
|
|||||||
Reference in New Issue
Block a user