Files
SPRUCE-scraper/spruce/progress.py
T
poprhythm f2193011ca Add --metadata-only mode; harden resume and idempotency
- Add --metadata-only flag: fetches scan detail pages, writes
  metadata.json + scans.csv rows, skips all image downloads.
  Re-runs skip scans whose metadata.json already exists.
- Atomic progress.json saves (temp-file rename).
- Heal-on-resume: tiles on disk but not in progress are silently
  re-marked before building the pending list.
- scans.csv dedup: skip row if mosaic URL already in progress.
- Rename mosaic_downloaded -> mosaic_on_disk (reflects disk state).
- --recheck now checks mosaics as well as tiles.
- RunStats dataclass replaces raw int return; richer run summary.
- Fix argparse allow_abbrev reverted; fix --scan-id + --metadata-only
  glob fallback when scan_time is absent.
- Add .venv/ to .gitignore.
- README: fix typo, update worker counts, document all new behaviour.
2026-04-24 09:44:57 -04:00

85 lines
2.6 KiB
Python

"""
Progress tracking (JSON) and CSV writing.
"""
import csv
import json
import logging
from pathlib import Path
from typing import Iterator
# Module-level logger, named after this module; ProgressTracker reports
# resume/load diagnostics through it.
log = logging.getLogger(__name__)
class ProgressTracker:
    """
    Tracks successfully downloaded URLs in a JSON file so runs can be resumed.

    The file holds a single object, ``{"completed_urls": [...]}``.  It is read
    once at construction; later changes live in memory until ``save()`` is
    called.

    Public API (all external code should use only these methods):
        is_done(url)    — True if url has been downloaded
        mark_done(url)  — Record url as complete (call save() to persist)
        discard(url)    — Remove url from the completed set
        iter_urls()     — Iterate over a snapshot of all completed URLs
        __len__()       — Number of completed URLs
        save()          — Flush state to disk

    Note: because ``__len__`` is defined, an empty tracker is falsy; test
    ``tracker is not None`` rather than ``if tracker``.
    """

    def __init__(self, path: Path) -> None:
        """Bind the tracker to *path* and load any existing progress."""
        self.path = path
        self._done: set[str] = set()
        self._load()

    def _load(self) -> None:
        """Populate the completed set from ``self.path`` if it is usable.

        A missing file means a first run and is silent.  An unreadable,
        non-JSON, or wrongly shaped file is reported with its cause and the
        tracker starts empty (best effort: never raises for a bad file).
        """
        try:
            data = json.loads(self.path.read_text(encoding="utf-8"))
            if not isinstance(data, dict):
                raise ValueError("top-level JSON value is not an object")
            urls = data.get("completed_urls", [])
            # A bare string would otherwise be split into single characters
            # by set(); reject anything that is not a list of strings.
            if not isinstance(urls, list) or not all(
                isinstance(u, str) for u in urls
            ):
                raise ValueError('"completed_urls" is not a list of strings')
        except FileNotFoundError:
            return  # nothing to resume from
        except (OSError, ValueError) as exc:
            # ValueError also covers JSONDecodeError and UnicodeDecodeError.
            log.warning(
                "Could not read progress file %s (%s); starting fresh.",
                self.path,
                exc,
            )
            return
        self._done = set(urls)
        log.info("Resuming: %d URLs already downloaded.", len(self._done))

    def is_done(self, url: str) -> bool:
        """Return True if *url* is in the completed set."""
        return url in self._done

    def mark_done(self, url: str) -> None:
        """Record *url* as complete in memory (call save() to persist)."""
        self._done.add(url)

    def discard(self, url: str) -> None:
        """Remove a URL from the completed set (re-queues it for download)."""
        self._done.discard(url)

    def iter_urls(self) -> Iterator[str]:
        """Iterate over all completed URLs.

        Iterates a snapshot, so callers may call ``discard()`` or
        ``mark_done()`` inside the loop without a "set changed size during
        iteration" error.
        """
        return iter(tuple(self._done))

    def __len__(self) -> int:
        """Return the number of completed URLs."""
        return len(self._done)

    def save(self) -> None:
        """Write the completed set to ``self.path`` as sorted, indented JSON.

        The data goes to a sibling ``<name>.tmp`` file first and is then
        renamed over the target, so a crash mid-write cannot leave a
        truncated progress file behind.
        """
        self.path.parent.mkdir(parents=True, exist_ok=True)
        # Append ".tmp" to the full file name instead of replacing the suffix,
        # so the temp name is derived from the real name whatever its extension.
        tmp = self.path.with_name(self.path.name + ".tmp")
        tmp.write_text(
            json.dumps({"completed_urls": sorted(self._done)}, indent=2),
            encoding="utf-8",
        )
        tmp.replace(self.path)  # atomic on POSIX; avoids corrupt JSON on crash
class CsvWriter:
    """Append-mode CSV writer that writes a header when the file is empty.

    Usable directly (call ``close()`` when done) or as a context manager,
    which closes the file on exit.
    """

    def __init__(self, path: Path, fields: list[str]) -> None:
        """Open *path* for appending, creating parent directories as needed.

        *fields* fixes the column order.  The header row is written only when
        the file has no content yet, which covers both a newly created file
        and a pre-existing empty one.
        """
        path.parent.mkdir(parents=True, exist_ok=True)
        # Copy so later mutation of the caller's list cannot change columns.
        self._fields = list(fields)
        self._fh = open(path, "a", newline="", encoding="utf-8")
        try:
            self._writer = csv.DictWriter(self._fh, fieldnames=self._fields)
            # Decide by size rather than prior existence: an existing but
            # empty file still needs its header.
            if path.stat().st_size == 0:
                self._writer.writeheader()
        except Exception:
            # Do not leak the handle if setup fails after the file is open.
            self._fh.close()
            raise

    def __enter__(self) -> "CsvWriter":
        """Return the writer itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        """Close the underlying file; exceptions are not suppressed."""
        self.close()

    def write(self, row: dict) -> None:
        """Append one row and flush it to the file immediately.

        Keys missing from *row* are written as empty strings; keys that are
        not in the configured field list are dropped.
        """
        self._writer.writerow({f: row.get(f, "") for f in self._fields})
        self._fh.flush()

    def close(self) -> None:
        """Close the underlying file handle."""
        self._fh.close()