Scraping resilience, metadata tooling, and repository hygiene

Consolidates mosaic and session hardening (login retry, skip processed scans, no retry on 404, started_at), progress reporting (Markdown tables, by-year rollup, rolling-window rate/ETA), and metadata workflow scripts (run_metadata_scan.sh, scan_progress_report.py, export_machine_metadata.py). Adds mosaic reconstruction sample JPEGs referenced by the report. Updates .gitignore for backup/ and .claude/; sample_random_scans helper is documented for branch testing/sample-runs only (see README).
This commit is contained in:
2026-05-14 19:52:53 -04:00
parent 752c278dff
commit 6390f5d529
23 changed files with 788 additions and 188 deletions
+8 -1
View File
@@ -5,6 +5,7 @@ Progress tracking (JSON) and CSV writing.
import csv
import json
import logging
from datetime import datetime, timezone
from pathlib import Path
from typing import Iterator
@@ -27,6 +28,7 @@ class ProgressTracker:
# Set up a tracker backed by the JSON progress file at `path`, then load any
# state that file already holds.
def __init__(self, path: Path) -> None:
self.path = path
# URLs recorded as completed; _load() fills this from "completed_urls".
self._done: set[str] = set()
# Default start time: the current UTC time as an ISO 8601 string. _load()
# overwrites it with the stored "started_at" when the progress file has one.
self.started_at: str = datetime.now(timezone.utc).isoformat()
# Must run last so values read from disk take precedence over the defaults above.
self._load()
def _load(self) -> None:
@@ -34,6 +36,8 @@ class ProgressTracker:
try:
data = json.loads(self.path.read_text())
self._done = set(data.get("completed_urls", []))
if "started_at" in data:
self.started_at = data["started_at"]
log.info("Resuming: %d URLs already downloaded.", len(self._done))
except Exception:
log.warning("Could not read progress file; starting fresh.")
@@ -59,7 +63,10 @@ class ProgressTracker:
self.path.parent.mkdir(parents=True, exist_ok=True)
tmp = self.path.with_suffix(".json.tmp")
tmp.write_text(
json.dumps({"completed_urls": sorted(self._done)}, indent=2)
json.dumps(
{"started_at": self.started_at, "completed_urls": sorted(self._done)},
indent=2,
)
)
tmp.replace(self.path) # atomic on POSIX; avoids corrupt JSON on crash