Scraping resilience, metadata tooling, and repository hygiene
Consolidates mosaic and session hardening (login retry, skipping already-processed scans, no retry on 404, a started_at timestamp persisted in the progress file), progress reporting (Markdown tables, by-year rollup, rolling-window rate/ETA), and metadata workflow scripts (run_metadata_scan.sh, scan_progress_report.py, export_machine_metadata.py). Adds the mosaic reconstruction sample JPEGs referenced by the report. Updates .gitignore for backup/ and .claude/. The sample_random_scans helper is documented for branch testing/sample-runs only (see README).
This commit is contained in:
+8
-1
@@ -5,6 +5,7 @@ Progress tracking (JSON) and CSV writing.
|
||||
import csv
|
||||
import json
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Iterator
|
||||
|
||||
@@ -27,6 +28,7 @@ class ProgressTracker:
|
||||
def __init__(self, path: Path) -> None:
|
||||
self.path = path
|
||||
self._done: set[str] = set()
|
||||
self.started_at: str = datetime.now(timezone.utc).isoformat()
|
||||
self._load()
|
||||
|
||||
def _load(self) -> None:
|
||||
@@ -34,6 +36,8 @@ class ProgressTracker:
|
||||
try:
|
||||
data = json.loads(self.path.read_text())
|
||||
self._done = set(data.get("completed_urls", []))
|
||||
if "started_at" in data:
|
||||
self.started_at = data["started_at"]
|
||||
log.info("Resuming: %d URLs already downloaded.", len(self._done))
|
||||
except Exception:
|
||||
log.warning("Could not read progress file; starting fresh.")
|
||||
@@ -59,7 +63,10 @@ class ProgressTracker:
|
||||
self.path.parent.mkdir(parents=True, exist_ok=True)
|
||||
tmp = self.path.with_suffix(".json.tmp")
|
||||
tmp.write_text(
|
||||
json.dumps({"completed_urls": sorted(self._done)}, indent=2)
|
||||
json.dumps(
|
||||
{"started_at": self.started_at, "completed_urls": sorted(self._done)},
|
||||
indent=2,
|
||||
)
|
||||
)
|
||||
tmp.replace(self.path) # atomic on POSIX; avoids corrupt JSON on crash
|
||||
|
||||
|
||||
Reference in New Issue
Block a user