- Split src/adapters/ssr.py (2160 LOC) into ssr/ package grouped by CMS: realworks.py, sure.py, schiedam.py, denhaag.py, overige.py - Add _og_detail() to api.py; all OG Online scrapers now fall back to detail page fetch when energielabel/bouwjaar are missing from the API - Fix run() to recalculate travel times for existing listings where fiets_mark IS NULL; upsert() now writes travel cols on existing rows too - Update tests/cache.py to patch fetch_soup in every ssr submodule - Update docs to reflect new package structure and mark API enrichment TODO done Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
62 lines
2.0 KiB
Python
62 lines
2.0 KiB
Python
"""
|
|
cache.py — import this before anything else in a test file to enable
|
|
file-based caching of fetch_json and fetch_soup calls.
|
|
|
|
Cache is stored in tests/cache/ keyed by a hash of the URL + params.
|
|
Delete the cache directory to bust it.
|
|
"""
|
|
|
|
import hashlib
|
|
import json
|
|
import pickle
|
|
from pathlib import Path
|
|
|
|
# Directory holding the cached responses; it sits next to this file and is
# created on import if it does not exist yet.
CACHE_DIR = Path(__file__).parent.joinpath("cache")
CACHE_DIR.mkdir(exist_ok=True)
|
|
|
|
|
|
def _key(url: str, params: dict[str,str] | None) -> str:
|
|
raw = json.dumps({"url": url, "params": params or {}}, sort_keys=True)
|
|
return hashlib.sha256(raw.encode()).hexdigest()
|
|
|
|
|
|
def _patch():
    """Replace the project's fetch helpers with file-caching wrappers.

    ``adapters.api.fetch_json`` and the ``fetch_soup`` name in
    ``adapters.ssr._shared`` and in each ssr submodule listed below are
    rebound to wrappers that look for a cached result in ``CACHE_DIR`` first
    and only call the real function on a miss, storing what it returns.
    """
    import adapters.api as api_mod
    import adapters.ssr._shared as ssr_shared

    # Grab the real implementations before any name is rebound, so the
    # wrappers below always delegate to the un-cached functions.
    real_fetch_json = api_mod.fetch_json
    real_fetch_soup = ssr_shared.fetch_soup

    def cached_fetch_json(url, *, params: dict[str, str] | None = None, headers=None):
        """Return the cached JSON for (url, params), fetching it on a miss.

        ``headers`` is forwarded to the real function but is not part of the
        cache key.
        """
        cache_file = CACHE_DIR / f"{_key(url, params)}.json"
        if not cache_file.exists():
            fresh = real_fetch_json(url, params=params, headers=headers)
            cache_file.write_text(json.dumps(fresh))
            return fresh
        print(f"[cache hit] {url}")
        return json.loads(cache_file.read_text())

    def cached_fetch_soup(url, *, params=None):
        """Return the cached fetch_soup result for (url, params), fetching it on a miss."""
        cache_file = CACHE_DIR / f"{_key(url, params)}.pickle"
        if not cache_file.exists():
            fresh = real_fetch_soup(url, params=params)
            cache_file.write_bytes(pickle.dumps(fresh))
            return fresh
        print(f"[cache hit] {url}")
        # NOTE(review): unpickling is only safe when the cache directory holds
        # files written by this module — never load a cache from an untrusted
        # source.
        return pickle.loads(cache_file.read_bytes())

    api_mod.fetch_json = cached_fetch_json

    # Each submodule does `from ._shared import fetch_soup`, which gives it
    # its own binding of the name; rebinding only `_shared.fetch_soup` would
    # leave those copies pointing at the real function.
    import adapters.ssr.realworks as realworks_mod
    import adapters.ssr.sure as sure_mod
    import adapters.ssr.schiedam as schiedam_mod
    import adapters.ssr.denhaag as denhaag_mod
    import adapters.ssr.overige as overige_mod

    targets = (
        ssr_shared,
        realworks_mod,
        sure_mod,
        schiedam_mod,
        denhaag_mod,
        overige_mod,
    )
    for target in targets:
        setattr(target, "fetch_soup", cached_fetch_soup)

    print("[cache] fetch_json and fetch_soup patched")
|
|
|
|
|
|
# Patch at import time, so importing this module is all a test file has to do
# to enable caching.
_patch()
|