Add --metadata-only mode; harden resume and idempotency

- Add --metadata-only flag: fetches scan detail pages, writes metadata.json + scans.csv rows, skips all image downloads. Re-runs skip scans whose metadata.json already exists.
- Atomic progress.json saves (temp-file rename).
- Heal-on-resume: tiles that are on disk but not recorded in progress are silently re-marked as done before building the pending list.
- scans.csv dedup: skip the row if the mosaic URL is already in progress.
- Rename mosaic_downloaded -> mosaic_on_disk (reflects disk state).
- --recheck now checks mosaics as well as tiles.
- RunStats dataclass replaces raw int return; richer run summary.
- Fix argparse allow_abbrev reverted; fix --scan-id + --metadata-only glob fallback when scan_time is absent.
- Add .venv/ to .gitignore.
- README: fix typo, update worker counts, document all new behaviour.
2026-04-24 09:44:57 -04:00
parent e122f6435a
commit f2193011ca
8 changed files with 294 additions and 93 deletions
@@ -5,9 +5,30 @@ High-level scrape orchestration: drives the per-machine and per-scan loops.
 import json
 import logging
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any

+
+@dataclass
+class RunStats:
+    """Counters for one run; per-scan results are folded in via merge()."""
+
+    scans_fetched: int = 0      # detail fetched and grid params (nx/ny) present
+    scans_skipped: int = 0      # metadata-only: metadata.json already on disk; no HTTP request
+    scans_failed: int = 0       # fetch error or missing grid params
+    metadata_written: int = 0   # new metadata.json files created
+    mosaics_downloaded: int = 0  # mosaics downloaded by this run
+    tiles_downloaded: int = 0    # tiles downloaded by this run
+
+    def merge(self, other: "RunStats") -> None:  # add other's counters into self, in place
+        self.scans_fetched += other.scans_fetched
+        self.scans_skipped += other.scans_skipped
+        self.scans_failed += other.scans_failed
+        self.metadata_written += other.metadata_written
+        self.mosaics_downloaded += other.mosaics_downloaded
+        self.tiles_downloaded += other.tiles_downloaded
+
 from tqdm import tqdm

 from spruce.paths import machine_dir_name, tile_dest, mosaic_dest, _extract_date
@@ -66,6 +87,24 @@ def _download_tiles_for_scan(
    dry_run: bool,
 ) -> int:
    """Download all pending tiles for a scan. Returns count of tiles downloaded."""
+    # Heal progress for tiles that exist on disk but weren't recorded (e.g.
+    # crash between write and batch save). Prevents duplicate tiles.csv rows.
+    healed = 0
+    for t in tiles:
+        if not progress.is_done(t["url"]):
+            dest = tile_dest(output_dir, machine, scan_meta, t)
+            if dest.exists() and dest.stat().st_size > 0:
+                progress.mark_done(t["url"])
+                healed += 1
+    if healed:
+        log.debug(
+            "[%s] Scan %d: healed %d tile(s) already on disk into progress.",
+            machine["label"],
+            scan_id,
+            healed,
+        )
+        progress.save()
+
    pending = [t for t in tiles if not progress.is_done(t["url"])]
    log.info(
        "[%s] Scan %d: %d tiles total, %d pending.",
@@ -152,12 +191,43 @@ def process_scan(
    tiles_csv: CsvWriter,
    dry_run: bool,
    mosaic_only: bool,
-) -> int:
+    metadata_only: bool = False,
+) -> RunStats:
    """
    Process one scan: fetch metadata, download mosaic and (optionally) tiles.
-    Returns total files downloaded for this scan.
+    Returns a RunStats with counters for what happened this call.
+
+    If metadata_only is True, writes metadata.json and the scans.csv row but
+    skips both the mosaic and the tiles.
    """
    scan_id: int = scan["scan_id"]
+    stats = RunStats()
+
+    # In metadata-only mode, skip the HTTP fetch if metadata.json already exists.
+    # Try the date-hinted path first; fall back to a glob when scan_time is
+    # absent (e.g. when --scan-id is used and the synthetic scan dict has no
+    # scan_time field).
+    if metadata_only and not dry_run:
+        machine_root = output_dir / machine_dir_name(machine)
+        scan_date_hint = _extract_date(scan.get("scan_time", ""))
+        found_meta: Path | None = None
+        if scan_date_hint and scan_date_hint != "unknown":
+            candidate = machine_root / scan_date_hint / str(scan_id) / "metadata.json"
+            if candidate.exists():
+                found_meta = candidate
+        if found_meta is None:
+            matches = list(machine_root.glob(f"*/{scan_id}/metadata.json"))
+            if matches:
+                found_meta = matches[0]
+        if found_meta is not None:
+            log.debug(
+                "[%s] Scan %d: metadata.json already exists, skipping fetch.",
+                machine["label"],
+                scan_id,
+            )
+            stats.scans_skipped += 1
+            return stats
+
    log.info("[%s] Processing scan %d …", machine["label"], scan_id)

    try:
@@ -166,7 +236,8 @@ def process_scan(
        log.error(
            "[%s] Cannot fetch scan %d metadata: %s", machine["label"], scan_id, exc
        )
-        return 0
+        stats.scans_failed += 1
+        return stats

    if not scan_meta.get("nx") or not scan_meta.get("ny"):
        log.warning(
@@ -174,7 +245,10 @@ def process_scan(
            machine["label"],
            scan_id,
        )
-        return 0
+        stats.scans_failed += 1
+        return stats
+
+    stats.scans_fetched += 1

    # Merge list-level metadata into scan_meta (detail page takes precedence)
    for k in (
@@ -199,51 +273,64 @@ def process_scan(
            meta_file.write_text(
                json.dumps(scan_meta, indent=2, default=str), encoding="utf-8"
            )
+            stats.metadata_written += 1

-    # Mosaic
+    # Mosaic (skipped entirely in metadata-only mode)
    mosaic_path = mosaic_dest(output_dir, machine, scan_meta, scan_id)
    mosaic_url = sess.mosaic_url(scan_id)
-    mosaic_downloaded = _download_mosaic(
-        sess, scan_meta, scan_id, mosaic_path, progress, machine, dry_run
-    )
-    total = 1 if mosaic_downloaded else 0
+    mosaic_already_done = progress.is_done(mosaic_url)
+    if metadata_only:
+        mosaic_just_downloaded = False
+    else:
+        mosaic_just_downloaded = _download_mosaic(
+            sess, scan_meta, scan_id, mosaic_path, progress, machine, dry_run
+        )
+    if mosaic_just_downloaded:
+        stats.mosaics_downloaded += 1

-    # Write scan-level CSV row
-    scans_csv.write(
-        {
-            "machine": machine["label"],
-            "machine_id": machine["machine_id"],
-            "scan_id": scan_id,
-            "name": scan_meta.get("name", ""),
-            "scan_time": scan_meta.get("scan_time", ""),
-            "start_x": scan_meta.get("start_x", ""),
-            "start_y": scan_meta.get("start_y", ""),
-            "end_x": scan_meta.get("end_x", ""),
-            "end_y": scan_meta.get("end_y", ""),
-            "dx": scan_meta.get("dx", ""),
-            "dy": scan_meta.get("dy", ""),
-            "nx": scan_meta.get("nx", ""),
-            "ny": scan_meta.get("ny", ""),
-            "total_tiles": scan_meta.get("total_tiles", ""),
-            "scan_lines": scan_meta.get("scan_lines", ""),
-            "scan_mode": scan_meta.get("scan_mode", ""),
-            "start_datetime": scan_meta.get("start_datetime", ""),
-            "end_datetime": scan_meta.get("end_datetime", ""),
-            "status": scan_meta.get("status", ""),
-            "user": scan_meta.get("user", ""),
-            "disk_space_mb": scan_meta.get("disk_space_mb", ""),
-            "mosaic_url": mosaic_url,
-            "mosaic_local_path": str(mosaic_path),
-            "mosaic_downloaded": mosaic_downloaded,
-        }
-    )
+    # Write the scan-level CSV row unless the mosaic URL is already marked done in progress (proxy for "row already written"); metadata-only runs always write it.
+    if mosaic_already_done and not metadata_only:
+        log.debug(
+            "[%s] Scan %d: already in scans.csv (mosaic was previously downloaded), skipping CSV row.",
+            machine["label"],
+            scan_id,
+        )
+    else:
+        scans_csv.write(
+            {
+                "machine": machine["label"],
+                "machine_id": machine["machine_id"],
+                "scan_id": scan_id,
+                "name": scan_meta.get("name", ""),
+                "scan_time": scan_meta.get("scan_time", ""),
+                "start_x": scan_meta.get("start_x", ""),
+                "start_y": scan_meta.get("start_y", ""),
+                "end_x": scan_meta.get("end_x", ""),
+                "end_y": scan_meta.get("end_y", ""),
+                "dx": scan_meta.get("dx", ""),
+                "dy": scan_meta.get("dy", ""),
+                "nx": scan_meta.get("nx", ""),
+                "ny": scan_meta.get("ny", ""),
+                "total_tiles": scan_meta.get("total_tiles", ""),
+                "scan_lines": scan_meta.get("scan_lines", ""),
+                "scan_mode": scan_meta.get("scan_mode", ""),
+                "start_datetime": scan_meta.get("start_datetime", ""),
+                "end_datetime": scan_meta.get("end_datetime", ""),
+                "status": scan_meta.get("status", ""),
+                "user": scan_meta.get("user", ""),
+                "disk_space_mb": scan_meta.get("disk_space_mb", ""),
+                "mosaic_url": mosaic_url,
+                "mosaic_local_path": str(mosaic_path),
+                "mosaic_on_disk": mosaic_path.exists(),
+            }
+        )

-    if mosaic_only:
-        return total
+    if mosaic_only or metadata_only:
+        return stats

    # Tiles
    tiles = sess.enumerate_tiles(scan_meta)
-    total += _download_tiles_for_scan(
+    stats.tiles_downloaded += _download_tiles_for_scan(
        sess,
        tiles,
        scan_meta,
@@ -255,7 +342,7 @@ def process_scan(
        tiles_csv,
        dry_run,
    )
-    return total
+    return stats


 # ---------------------------------------------------------------------------
@@ -272,12 +359,13 @@ def scrape_machine(
    scans_csv: CsvWriter,
    dry_run: bool,
    mosaic_only: bool,
-    scan_id_filter: int | None,
-) -> int:
+    metadata_only: bool = False,
+    scan_id_filter: int | None = None,
+) -> RunStats:
    """Login, fetch scans, and download all content for one machine."""
    sess = MachineSession(machine, config)
    if not sess.login():
-        return 0
+        return RunStats()

    if scan_id_filter is not None:
        scans: list[dict[str, Any]] = [
@@ -288,11 +376,11 @@ def scrape_machine(
        scans = sess.get_all_scans()
        if not scans:
            log.warning("[%s] No scans found.", machine["label"])
-            return 0
+            return RunStats()

-    total = 0
+    stats = RunStats()
    for scan in scans:
-        total += process_scan(
+        stats.merge(process_scan(
            sess=sess,
            scan=scan,
            output_dir=output_dir,
@@ -303,5 +391,6 @@ def scrape_machine(
            tiles_csv=tiles_csv,
            dry_run=dry_run,
            mosaic_only=mosaic_only,
-        )
-    return total
+            metadata_only=metadata_only,
+        ))
+    return stats