Add --retry-failed mode and mosaic retry estimates to progress report

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-17 17:40:15 -04:00
parent 6390f5d529
commit 8593808cf3
5 changed files with 399 additions and 27 deletions
@@ -84,6 +84,33 @@ def parse_args() -> argparse.Namespace:
            "inventorying all scans across all machines."
        ),
    )
+    p.add_argument(
+        "--retry-failed",
+        action="store_true",
+        help=(
+            "Mosaic-only: re-attempt scans whose latest scans.csv row has "
+            "mosaic_download_status=failed (queue from CSV, not the server list). "
+            "Implies --mosaic-only."
+        ),
+    )
+    p.add_argument(
+        "--retry-since",
+        metavar="YEAR",
+        default=None,
+        help=(
+            "With --retry-failed: only scans with scan_time year >= YEAR "
+            "(e.g. 2023)."
+        ),
+    )
+    p.add_argument(
+        "--retry-error-code",
+        metavar="CODE",
+        default=None,
+        help=(
+            "With --retry-failed: filter by mosaic_error_code "
+            "(e.g. 200 for empty-body failures)."
+        ),
+    )
    p.add_argument(
        "--dry-run",
        action="store_true",
@@ -159,6 +186,16 @@ def main() -> None:
    if args.scan_id is not None and args.scan_id <= 0:
        sys.exit("--scan-id must be a positive integer")

+    if args.retry_since and not args.retry_failed:
+        sys.exit("--retry-since requires --retry-failed.")
+    if args.retry_error_code and not args.retry_failed:
+        sys.exit("--retry-error-code requires --retry-failed.")
+
+    if args.retry_failed:
+        if args.metadata_only:
+            sys.exit("--retry-failed cannot be used with --metadata-only.")
+        args.mosaic_only = True  # implied
+
    # --list-machines doesn't need credentials
    if args.list_machines:
        base_url = "http://205.149.147.131:8010/"
@@ -261,7 +298,13 @@ def main() -> None:
    if args.metadata_only:
        log.info("Mode: metadata only (mosaics and tiles skipped)")
    elif args.mosaic_only:
-        log.info("Mode: mosaics only (individual tiles skipped)")
+        if args.retry_failed:
+            log.info(
+                "Mode: mosaic retry (failed scans from %s)",
+                SCANS_CSV_FILENAME,
+            )
+        else:
+            log.info("Mode: mosaics only (individual tiles skipped)")
    if args.dry_run:
        log.info("Mode: dry-run (no files will be written)")

@@ -285,6 +328,9 @@ def main() -> None:
                metadata_only=args.metadata_only,
                scan_id_filter=args.scan_id,
                max_tiles=args.max_tiles,
+                retry_failed=args.retry_failed,
+                retry_since_year=args.retry_since,
+                retry_error_code=args.retry_error_code,
            )
            totals.merge(stats)
    finally:
@@ -2,14 +2,22 @@
 High-level scrape orchestration: drives the per-machine and per-scan loops.
 """

+import csv
 import json
 import logging
+import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any

+from tqdm import tqdm
+
 from spruce.download_result import PERMANENT_MISSING, UNKNOWN, error_code_str
+from spruce.exif import write_mosaic_exif
+from spruce.paths import machine_dir_name, tile_dest, mosaic_dest, _extract_date
+from spruce.progress import ProgressTracker, CsvWriter
+from spruce.session import MachineSession

 # RootView returns ~43-byte 1×1 JPEG placeholders for empty cells; stay well
 # below smallest observed real tile (~7 KiB in production samples).
@@ -49,16 +57,64 @@ class RunStats:
        self.scans_probe_skipped += other.scans_probe_skipped
        self.scans_disk_space_skipped += other.scans_disk_space_skipped

-from tqdm import tqdm
-
-from spruce.exif import write_mosaic_exif
-from spruce.paths import machine_dir_name, tile_dest, mosaic_dest, _extract_date
-from spruce.progress import ProgressTracker, CsvWriter
-from spruce.session import MachineSession
-
 log = logging.getLogger(__name__)


+def _read_scans_csv_latest(scans_csv_path: Path) -> dict[tuple[str, str], dict[str, str]]:
+    """
+    Return the most recent scans.csv row for every (machine, scan_id) pair.
+
+    Rows are read in file order and a later row replaces an earlier one with
+    the same key, so the last row wins. A missing file yields an empty dict.
+
+    ``csv.DictReader`` fills the missing cells of a short row (for example a
+    row cut off by an interrupted append) with ``None`` and collects surplus
+    cells under the ``None`` key. Both are normalised here: missing cells
+    become ``""`` and the surplus entry is dropped, so every returned row is a
+    real ``dict[str, str]`` and callers can compare values against strings.
+    """
+    latest: dict[tuple[str, str], dict[str, str]] = {}
+    if not scans_csv_path.exists():
+        return latest
+    with open(scans_csv_path, newline="", encoding="utf-8") as fh:
+        for raw in csv.DictReader(fh):
+            row = {
+                col: ("" if val is None else val)
+                for col, val in raw.items()
+                if col is not None
+            }
+            key = (row.get("machine", ""), row.get("scan_id", ""))
+            latest[key] = row
+    return latest
+
+
+def load_failed_scans_from_csv(
+    scans_csv_path: Path,
+    machine_label: str,
+    *,
+    since_year: str | None = None,
+    error_code: str | None = None,
+) -> list[dict[str, Any]]:
+    """
+    Build the mosaic retry queue for one machine from scans.csv.
+
+    scans.csv is deduplicated by (machine, scan_id) with the last row winning;
+    only rows whose latest ``mosaic_download_status`` is ``"failed"`` are kept.
+
+    Args:
+        scans_csv_path: Location of scans.csv. A missing file gives an empty
+            queue.
+        machine_label: Only rows whose ``machine`` column equals this label
+            are returned.
+        since_year: Optional lower bound on the year taken from the first four
+            characters of ``scan_time``. Rows whose ``scan_time`` does not
+            start with four ASCII digits are dropped when this is set.
+        error_code: Optional exact match on the ``mosaic_error_code`` column.
+
+    Returns:
+        Scan dicts sorted by ascending ``scan_id``, each with the keys
+        ``scan_id`` (int), ``scan_time``, ``name``, ``status`` (falls back to
+        ``"Completed"`` when blank), ``user``, ``scan_lines`` and
+        ``scan_mode``, intended as the ``scan`` argument to ``process_scan``.
+
+    Raises:
+        ValueError: If ``since_year`` is given but is not an integer year.
+    """
+    # Compare years numerically. A plain string comparison lets non-numeric
+    # prefixes such as "Mon " sort above "2023" and slip through the filter,
+    # and silently yields nothing for a malformed ``since_year``.
+    min_year: int | None = None
+    if since_year is not None:
+        try:
+            min_year = int(str(since_year).strip())
+        except ValueError as exc:
+            raise ValueError(
+                f"since_year must be a numeric year such as 2023, got {since_year!r}"
+            ) from exc
+
+    out: list[dict[str, Any]] = []
+    for row in _read_scans_csv_latest(scans_csv_path).values():
+        if row.get("machine") != machine_label:
+            continue
+        if row.get("mosaic_download_status") != "failed":
+            continue
+        if error_code is not None and (row.get("mosaic_error_code") or "") != error_code:
+            continue
+        st = row.get("scan_time") or ""
+        if min_year is not None:
+            yr = st[:4]
+            if len(yr) < 4 or not (yr.isascii() and yr.isdigit()):
+                continue
+            if int(yr) < min_year:
+                continue
+        # One malformed row must not abort the whole retry queue.
+        try:
+            sid = int(row["scan_id"])
+        except (KeyError, TypeError, ValueError):
+            log.warning(
+                "[%s] Skipping scans.csv row with unusable scan_id %r.",
+                machine_label,
+                row.get("scan_id"),
+            )
+            continue
+        out.append(
+            {
+                "scan_id": sid,
+                "scan_time": st,
+                "name": row.get("name") or "",
+                "status": row.get("status") or "Completed",
+                "user": row.get("user") or "",
+                "scan_lines": row.get("scan_lines") or "",
+                "scan_mode": row.get("scan_mode") or "",
+            }
+        )
+    out.sort(key=lambda s: s["scan_id"])
+    return out
+
+
 # ---------------------------------------------------------------------------
 # Per-scan helpers
 # ---------------------------------------------------------------------------
@@ -499,6 +555,9 @@ def scrape_machine(
    metadata_only: bool = False,
    scan_id_filter: int | None = None,
    max_tiles: int | None = None,
+    retry_failed: bool = False,
+    retry_since_year: str | None = None,
+    retry_error_code: str | None = None,
 ) -> RunStats:
    """Login, fetch scans, and download all content for one machine."""
    sess = MachineSession(machine, config)
@@ -518,8 +577,37 @@ def scrape_machine(
        log.error("[%s] Login failed after 3 attempts — skipping machine.", machine["label"])
        return RunStats()

-    if scan_id_filter is not None:
-        scans: list[dict[str, Any]] = [
+    if retry_failed:
+        scans = load_failed_scans_from_csv(
+            scans_csv.path,
+            machine["label"],
+            since_year=retry_since_year,
+            error_code=retry_error_code,
+        )
+        if scan_id_filter is not None:
+            scans = [s for s in scans if s["scan_id"] == scan_id_filter]
+            if not scans:
+                log.warning(
+                    "[%s] Retry: scan_id %d not among failed rows for this machine.",
+                    machine["label"],
+                    scan_id_filter,
+                )
+                return RunStats()
+            log.info("[%s] Mosaic retry: single scan %d.", machine["label"], scan_id_filter)
+        elif not scans:
+            log.warning(
+                "[%s] No failed mosaic rows in scans.csv match retry filters.",
+                machine["label"],
+            )
+            return RunStats()
+        else:
+            log.info(
+                "[%s] Mosaic retry: %d failed scan(s) from scans.csv.",
+                machine["label"],
+                len(scans),
+            )
+    elif scan_id_filter is not None:
+        scans = [
            {"scan_id": scan_id_filter, "status": "Completed"}
        ]
        log.info("[%s] Targeting scan ID %d.", machine["label"], scan_id_filter)
@@ -529,21 +617,25 @@ def scrape_machine(
            log.warning("[%s] No scans found.", machine["label"])
            return RunStats()

-    # Build a set of scan_ids already fully processed in a prior run so we can
-    # skip them entirely (no metadata fetch, no mosaic request).
-    # Only scans with a definitive non-pending status count; skipped_metadata_only
-    # rows still need to be processed in mosaic mode.
+    # Build existing_ids: scan_ids to skip entirely (no metadata fetch, no HTTP).
+    # In normal mode: skip anything with a definitive non-pending status.
+    # In retry mode: only skip scans that are already downloaded or skipped for
+    # disk-space reasons — failed scans must be re-attempted.
    PENDING_STATUSES = {"skipped_metadata_only", ""}
+    BLOCK_AFTER_RETRY_STATUSES = {"downloaded", "skipped_zero_disk_space"}
    existing_ids: set[int] = set()
-    if not metadata_only and scans_csv._fh.name:
-        existing_path = Path(scans_csv._fh.name)
-        if existing_path.exists():
-            import csv as _csv
-            with open(existing_path, newline="", encoding="utf-8") as _f:
-                for _row in _csv.DictReader(_f):
-                    if _row.get("machine") == machine["label"]:
-                        if _row.get("mosaic_download_status", "") not in PENDING_STATUSES:
-                            existing_ids.add(int(_row["scan_id"]))
+    if not metadata_only:
+        latest_rows = _read_scans_csv_latest(scans_csv.path)
+        for (_mlabel, _sid), _row in latest_rows.items():
+            if _mlabel != machine["label"]:
+                continue
+            st = _row.get("mosaic_download_status", "")
+            if retry_failed:
+                if st in BLOCK_AFTER_RETRY_STATUSES:
+                    existing_ids.add(int(_row["scan_id"]))
+            else:
+                if st not in PENDING_STATUSES:
+                    existing_ids.add(int(_row["scan_id"]))

    stats = RunStats()
    for scan in scans:
@@ -77,6 +77,7 @@ class CsvWriter:
    def __init__(self, path: Path, fields: list[str]) -> None:
        is_new = not path.exists()
        path.parent.mkdir(parents=True, exist_ok=True)
+        self.path = path
        self._fh = open(path, "a", newline="", encoding="utf-8")
        self._writer = csv.DictWriter(self._fh, fieldnames=fields)
        if is_new: