Scraping resilience, metadata tooling, and repository hygiene

Consolidates mosaic and session hardening (login retried up to 3 times, scans already recorded in scans.csv with a non-pending status are skipped, no retry on permanently missing URLs such as 404/410, started_at timestamp persisted in the progress file), progress reporting (Markdown tables, by-year rollup, rolling-window rate/ETA), and metadata workflow scripts (run_metadata_scan.sh, scan_progress_report.py, export_machine_metadata.py). Adds the mosaic reconstruction sample JPEGs referenced by the report. Updates .gitignore to exclude backup/ and .claude/. The sample_random_scans helper is documented for branch testing/sample-runs only (see README).
2026-05-14 19:52:53 -04:00
parent 752c278dff
commit 6390f5d529
23 changed files with 788 additions and 188 deletions
@@ -241,6 +241,7 @@ def process_scan(
    mosaic_only: bool,
    metadata_only: bool = False,
    max_tiles: int | None = None,
+    scans_csv_existing_ids: set[int] | None = None,
 ) -> RunStats:
    """
    Process one scan: fetch metadata, download mosaic and (optionally) tiles.
@@ -379,9 +380,16 @@ def process_scan(
        mds, mer, mco, mcl = "", "", "", ""

    # Write scan-level CSV row only if this scan hasn't been recorded before.
-    if mosaic_already_done and not metadata_only:
+    # Skip if: (1) mosaic URL already in .progress.json, or (2) scan already
+    # has a non-pending row in scans.csv from a prior run.
+    already_recorded = (mosaic_already_done and not metadata_only) or (
+        not metadata_only
+        and scans_csv_existing_ids is not None
+        and scan_id in scans_csv_existing_ids
+    )
+    if already_recorded:
        log.debug(
-            "[%s] Scan %d: already in scans.csv (mosaic was previously downloaded), skipping CSV row.",
+            "[%s] Scan %d: already in scans.csv, skipping CSV row.",
            machine["label"],
            scan_id,
        )
@@ -494,7 +502,20 @@ def scrape_machine(
 ) -> RunStats:
    """Login, fetch scans, and download all content for one machine."""
    sess = MachineSession(machine, config)
-    if not sess.login():
+    login_ok = False
+    for attempt in range(1, 4):
+        if sess.login():
+            login_ok = True
+            break
+        if attempt < 3:
+            log.warning(
+                "[%s] Login failed (attempt %d/3) — retrying in 10s.",
+                machine["label"],
+                attempt,
+            )
+            time.sleep(10)
+    if not login_ok:
+        log.error("[%s] Login failed after 3 attempts — skipping machine.", machine["label"])
        return RunStats()

    if scan_id_filter is not None:
@@ -508,8 +529,33 @@ def scrape_machine(
            log.warning("[%s] No scans found.", machine["label"])
            return RunStats()

+    # Build a set of scan_ids already fully processed in a prior run so we can
+    # skip them entirely (no metadata fetch, no mosaic request).
+    # Only scans with a definitive non-pending status count; skipped_metadata_only
+    # rows still need to be processed in mosaic mode.
+    PENDING_STATUSES = {"skipped_metadata_only", ""}
+    existing_ids: set[int] = set()
+    if not metadata_only and scans_csv._fh.name:
+        existing_path = Path(scans_csv._fh.name)
+        if existing_path.exists():
+            import csv as _csv
+            with open(existing_path, newline="", encoding="utf-8") as _f:
+                for _row in _csv.DictReader(_f):
+                    if _row.get("machine") == machine["label"]:
+                        if _row.get("mosaic_download_status", "") not in PENDING_STATUSES:
+                            existing_ids.add(int(_row["scan_id"]))
+
    stats = RunStats()
    for scan in scans:
+        # Skip scans already fully processed in a prior run — avoids redundant
+        # metadata fetches and mosaic requests for known-failed / known-done scans.
+        if not metadata_only and scan["scan_id"] in existing_ids:
+            log.debug(
+                "[%s] Scan %d: already processed, skipping.",
+                machine["label"],
+                scan["scan_id"],
+            )
+            continue
        stats.merge(process_scan(
            sess=sess,
            scan=scan,
@@ -523,5 +569,6 @@ def scrape_machine(
            mosaic_only=mosaic_only,
            metadata_only=metadata_only,
            max_tiles=max_tiles,
+            scans_csv_existing_ids=existing_ids,
        ))
    return stats
@@ -5,6 +5,7 @@ Progress tracking (JSON) and CSV writing.
 import csv
 import json
 import logging
+from datetime import datetime, timezone
 from pathlib import Path
 from typing import Iterator

@@ -27,6 +28,7 @@ class ProgressTracker:
    def __init__(self, path: Path) -> None:
        self.path = path
        self._done: set[str] = set()
+        self.started_at: str = datetime.now(timezone.utc).isoformat()
        self._load()

    def _load(self) -> None:
@@ -34,6 +36,8 @@ class ProgressTracker:
            try:
                data = json.loads(self.path.read_text())
                self._done = set(data.get("completed_urls", []))
+                if "started_at" in data:
+                    self.started_at = data["started_at"]
                log.info("Resuming: %d URLs already downloaded.", len(self._done))
            except Exception:
                log.warning("Could not read progress file; starting fresh.")
@@ -59,7 +63,10 @@ class ProgressTracker:
        self.path.parent.mkdir(parents=True, exist_ok=True)
        tmp = self.path.with_suffix(".json.tmp")
        tmp.write_text(
-            json.dumps({"completed_urls": sorted(self._done)}, indent=2)
+            json.dumps(
+                {"started_at": self.started_at, "completed_urls": sorted(self._done)},
+                indent=2,
+            )
        )
        tmp.replace(self.path)  # atomic on POSIX; avoids corrupt JSON on crash

@@ -14,6 +14,7 @@ from bs4 import BeautifulSoup

 from spruce.download_result import (
    OK,
+    PERMANENT_MISSING,
    UNKNOWN,
    DownloadResult,
    classify_http_error,
@@ -263,6 +264,10 @@ class MachineSession:
                    and exc.response is not None
                ):
                    sc = exc.response.status_code
+                cl = classify_http_error(sc, exc)
+                if cl == PERMANENT_MISSING:
+                    # 404/410 will never succeed — don't waste time retrying.
+                    return DownloadResult(0, sc, str(exc), cl)
                if attempt < retries:
                    log.warning(
                        "Attempt %d/%d failed %s: %s — retrying in %.0fs",
@@ -281,7 +286,6 @@ class MachineSession:
                        url,
                        exc,
                    )
-                    cl = classify_http_error(sc, exc)
                    return DownloadResult(0, sc, str(exc), cl)
        return DownloadResult(0, None, "download_file: exhausted", UNKNOWN)