Scraping resilience, metadata tooling, and repository hygiene

Consolidates mosaic and session hardening (login retry, skip processed scans, no retry on 404, started_at), progress reporting (Markdown tables, by-year rollup, rolling-window rate/ETA), and metadata workflow scripts (run_metadata_scan.sh, scan_progress_report.py, export_machine_metadata.py). Adds mosaic reconstruction sample JPEGs referenced by the report. Updates .gitignore for backup/ and .claude/; sample_random_scans helper is documented for branch testing/sample-runs only (see README).
2026-05-14 19:52:53 -04:00
parent 752c278dff
commit 6390f5d529
23 changed files with 788 additions and 188 deletions
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+"""
+Split scans.csv into per-machine metadata CSVs.
+
+Reads the combined scans.csv produced by the scraper and writes one CSV per
+machine containing only the website-sourced metadata columns (no mosaic paths,
+download status, or error fields).
+
+Usage:
+  python scripts/export_machine_metadata.py
+  python scripts/export_machine_metadata.py --input archives/scans.csv --output-dir archives/by_machine
+"""
+
+from __future__ import annotations
+
+import argparse
+import csv
+import sys
+from collections import defaultdict
+from pathlib import Path
+
+# Columns copied into every per-machine CSV, in this order (used as the
+# DictWriter field names in main()).  main() also refuses to run when any of
+# these names is absent from the input header.
+METADATA_COLUMNS = [
+    "machine",
+    "machine_id",
+    "scan_id",
+    "name",
+    "scan_time",
+    "start_x",
+    "start_y",
+    "end_x",
+    "end_y",
+    "dx",
+    "dy",
+    "nx",
+    "ny",
+    "total_tiles",
+    "scan_lines",
+    "scan_mode",
+    "start_datetime",
+    "end_datetime",
+    "status",
+    "user",
+    "disk_space_mb",
+]
+
+
+# Single-pass character map: brackets are dropped; spaces and path separators
+# become underscores so the result can never name a nested or parent path.
+_LABEL_TRANSLATION = str.maketrans(
+    {"[": None, "]": None, " ": "_", "/": "_", "\\": "_"}
+)
+
+
+def sanitize_machine_label(label: str) -> str:
+    """Return *label* reduced to a string usable as a file-name stem.
+
+    Square brackets are removed, spaces and path separators (``/`` and ``\\``)
+    are replaced by underscores, and leading/trailing underscores are
+    stripped.  The result may be empty (for example for ``""`` or ``"[ ]"``);
+    callers that build a file name from it should handle that case.
+    """
+    return label.translate(_LABEL_TRANSLATION).strip("_")
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the command line and return the resulting namespace.
+
+    Two optional string options are recognised: ``--input`` (stored as
+    ``input``) and ``--output-dir`` (stored as ``output_dir``).
+    """
+    parser = argparse.ArgumentParser(description="Split scans.csv into per-machine metadata CSVs.")
+
+    # (flag, default, metavar, help) for every supported option.
+    option_specs = (
+        (
+            "--input",
+            "archives/scans.csv",
+            "FILE",
+            "Path to scans.csv (default: archives/scans.csv)",
+        ),
+        (
+            "--output-dir",
+            "archives/by_machine",
+            "DIR",
+            "Directory for output CSVs (default: archives/by_machine)",
+        ),
+    )
+    for flag, default_value, placeholder, help_text in option_specs:
+        parser.add_argument(flag, default=default_value, metavar=placeholder, help=help_text)
+
+    return parser.parse_args()
+
+
+def _read_rows_by_machine(input_path: Path) -> dict[str, list[dict]]:
+    """Read *input_path* and group its rows by the ``machine`` column.
+
+    Exits the process with a message when the file has no header or lacks any
+    of the columns listed in METADATA_COLUMNS.
+    """
+    # NOTE(review): no explicit encoding is passed, so the platform default is
+    # used - confirm it matches how the scraper writes scans.csv.
+    with input_path.open(newline="") as fh:
+        reader = csv.DictReader(fh)
+        if reader.fieldnames is None:
+            sys.exit(f"{input_path} appears to be empty.")
+
+        missing = [c for c in METADATA_COLUMNS if c not in reader.fieldnames]
+        if missing:
+            sys.exit(f"Expected columns not found in {input_path}: {missing}")
+
+        rows_by_machine: dict[str, list[dict]] = defaultdict(list)
+        for row in reader:
+            # DictReader fills the missing fields of a short row with None;
+            # fold that into "" so every key is a sortable string.
+            rows_by_machine[row["machine"] or ""].append(row)
+    return rows_by_machine
+
+
+def _plan_output_paths(machine_labels, output_dir: Path) -> dict[Path, str]:
+    """Map each output CSV path to its machine label, in sorted label order.
+
+    Exits the process when two labels would share one file, because the
+    second write would silently replace the first machine's rows.
+    """
+    label_by_path: dict[Path, str] = {}
+    for machine_label in sorted(machine_labels):
+        # An empty stem would yield the file name "_scans_metadata.csv".
+        safe_name = sanitize_machine_label(machine_label) or "unknown"
+        out_path = output_dir / f"{safe_name}_scans_metadata.csv"
+        if out_path in label_by_path:
+            sys.exit(
+                f"Machine labels {label_by_path[out_path]!r} and {machine_label!r} "
+                f"both map to {out_path}; refusing to overwrite."
+            )
+        label_by_path[out_path] = machine_label
+    return label_by_path
+
+
+def main() -> None:
+    """Split the input CSV into one metadata CSV per machine.
+
+    Reads the file named by ``--input``, groups rows by their ``machine``
+    value, and writes ``<label>_scans_metadata.csv`` into ``--output-dir``
+    containing only the METADATA_COLUMNS fields.  Prints one line per file
+    written and a final summary.  Exits with a message if the input is
+    missing, empty, lacks a required column, or if two machine labels would
+    be written to the same file.
+    """
+    args = parse_args()
+    input_path = Path(args.input)
+    output_dir = Path(args.output_dir)
+
+    if not input_path.exists():
+        sys.exit(f"Input file not found: {input_path}")
+
+    rows_by_machine = _read_rows_by_machine(input_path)
+
+    # Resolve every destination before touching the disk so a name collision
+    # aborts the run without leaving a partial export behind.
+    label_by_path = _plan_output_paths(rows_by_machine, output_dir)
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    for out_path, machine_label in label_by_path.items():
+        rows = rows_by_machine[machine_label]
+        with out_path.open("w", newline="") as fh:
+            # extrasaction="ignore" drops every input column that is not in
+            # METADATA_COLUMNS instead of raising ValueError.
+            writer = csv.DictWriter(fh, fieldnames=METADATA_COLUMNS, extrasaction="ignore")
+            writer.writeheader()
+            writer.writerows(rows)
+        print(f"  {out_path}  ({len(rows)} rows)")
+
+    total = sum(len(r) for r in rows_by_machine.values())
+    print(f"\n{len(rows_by_machine)} machine(s), {total} total rows → {output_dir}/")
+
+
+if __name__ == "__main__":
+    main()