f2193011ca
- Add --metadata-only flag: fetches scan detail pages, writes metadata.json + scans.csv rows, skips all image downloads. Re-runs skip scans whose metadata.json already exists. - Atomic progress.json saves (temp-file rename). - Heal-on-resume: tiles on disk but not in progress are silently re-marked before building the pending list. - scans.csv dedup: skip row if mosaic URL already in progress. - Rename mosaic_downloaded -> mosaic_on_disk (reflects disk state). - --recheck now checks mosaics as well as tiles. - RunStats dataclass replaces raw int return; richer run summary. - Fix argparse allow_abbrev reverted; fix --scan-id + --metadata-only glob fallback when scan_time is absent. - Add .venv/ to .gitignore. - README: fix typo, update worker counts, document all new behaviour.
85 lines
2.6 KiB
Python
85 lines
2.6 KiB
Python
"""
|
|
Progress tracking (JSON) and CSV writing.
|
|
"""
|
|
|
|
import csv
|
|
import json
|
|
import logging
|
|
from pathlib import Path
|
|
from typing import Iterator
|
|
|
|
# Module-level logger named after this module; ProgressTracker uses it to
# report resume status and unreadable progress files.
log = logging.getLogger(__name__)
|
|
|
|
|
|
class ProgressTracker:
    """
    Tracks successfully downloaded URLs in a JSON file so runs can be resumed.

    Public API (all external code should use only these methods):
        is_done(url)    — True if url has been downloaded
        mark_done(url)  — Record url as complete (call save() to persist)
        discard(url)    — Remove url from the completed set
        iter_urls()     — Iterate over all completed URLs
        __len__()       — Number of completed URLs
        save()          — Flush state to disk

    On-disk format: a JSON object of the form ``{"completed_urls": [...]}``.
    """

    def __init__(self, path: Path) -> None:
        """Bind the tracker to *path* and load any previously saved state."""
        self.path = path
        self._done: set[str] = set()
        self._load()

    def _load(self) -> None:
        """Populate the completed set from ``self.path``.

        Loading is best-effort: a missing file means an empty state, and an
        unreadable or malformed file is logged (with the reason) and also
        results in an empty state rather than an exception.
        """
        if not self.path.exists():
            return

        try:
            data = json.loads(self.path.read_text(encoding="utf-8"))
        except (OSError, ValueError) as exc:
            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
            log.warning(
                "Could not read progress file %s (%s); starting fresh.",
                self.path,
                exc,
            )
            return

        urls = data.get("completed_urls", []) if isinstance(data, dict) else None
        if not isinstance(urls, list):
            log.warning(
                "Progress file %s has an unexpected structure; starting fresh.",
                self.path,
            )
            return

        # Keep only string entries: anything else cannot be a URL and would
        # make sorted() in save() fail on mixed types.
        valid = [u for u in urls if isinstance(u, str)]
        if len(valid) != len(urls):
            log.warning(
                "Ignoring %d non-string entries in progress file %s.",
                len(urls) - len(valid),
                self.path,
            )

        self._done = set(valid)
        log.info("Resuming: %d URLs already downloaded.", len(self._done))

    def is_done(self, url: str) -> bool:
        """Return True if *url* is in the completed set."""
        return url in self._done

    def mark_done(self, url: str) -> None:
        """Add *url* to the completed set (in memory only until save())."""
        self._done.add(url)

    def discard(self, url: str) -> None:
        """Remove a URL from the completed set (re-queues it for download)."""
        self._done.discard(url)

    def iter_urls(self) -> Iterator[str]:
        """Iterate over all completed URLs."""
        return iter(self._done)

    def __len__(self) -> int:
        """Return the number of completed URLs."""
        return len(self._done)

    def save(self) -> None:
        """Write the completed set to ``self.path`` via a temp-file rename.

        The parent directory is created if needed. URLs are written sorted so
        the file content is deterministic.
        """
        self.path.parent.mkdir(parents=True, exist_ok=True)
        tmp = self.path.with_suffix(".json.tmp")
        payload = json.dumps({"completed_urls": sorted(self._done)}, indent=2)
        tmp.write_text(payload, encoding="utf-8")
        tmp.replace(self.path)  # atomic on POSIX; avoids corrupt JSON on crash
|
|
|
|
|
|
class CsvWriter:
    """Append-mode CSV writer that writes a header when the file is new or empty.

    Rows are restricted to the configured field names; missing fields are
    written as empty strings and extra keys are ignored. The file is flushed
    after every row. Instances can be used as context managers, in which case
    the file is closed on exit.
    """

    def __init__(self, path: Path, fields: list[str]) -> None:
        """Open *path* for appending and emit the header row if needed.

        Args:
            path: Destination CSV file; parent directories are created.
            fields: Column names, in output order.
        """
        # An existing zero-byte file (e.g. a previous run died before the
        # buffered header was flushed) still needs a header.
        needs_header = not path.exists() or path.stat().st_size == 0
        path.parent.mkdir(parents=True, exist_ok=True)

        # Copy so later mutation of the caller's list cannot desynchronise
        # the header from the rows.
        self._fields = list(fields)
        self._fh = open(path, "a", newline="", encoding="utf-8")
        try:
            self._writer = csv.DictWriter(self._fh, fieldnames=self._fields)
            if needs_header:
                self._writer.writeheader()
        except BaseException:
            # Do not leak the handle if setup fails after the file is open.
            self._fh.close()
            raise

    def __enter__(self) -> "CsvWriter":
        """Return the writer itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        """Close the underlying file; exceptions are not suppressed."""
        self.close()

    def write(self, row: dict) -> None:
        """Append one row and flush it to disk.

        Only keys listed in the configured fields are written; absent keys
        become empty strings.
        """
        self._writer.writerow({f: row.get(f, "") for f in self._fields})
        self._fh.flush()

    def close(self) -> None:
        """Close the underlying file handle."""
        self._fh.close()
|