From f2193011ca2d62928054e878199f3737f6367b9a Mon Sep 17 00:00:00 2001
From: James Kolpack <james.kolpack@gmail.com>
Date: Fri, 24 Apr 2026 09:44:57 -0400
Subject: [PATCH] Add --metadata-only mode; harden resume and idempotency

- Add --metadata-only flag: fetches scan detail pages, writes
  metadata.json + scans.csv rows, skips all image downloads.
  Re-runs skip scans whose metadata.json already exists.
- Atomic progress.json saves (temp-file rename).
- Heal-on-resume: non-empty tiles on disk but not in progress are
  silently re-marked before building the pending list.
- scans.csv dedup: skip row if mosaic URL already in progress.
- Rename mosaic_downloaded -> mosaic_on_disk (reflects disk state).
- --recheck now checks mosaics as well as tiles.
- RunStats dataclass replaces raw int return; richer run summary.
- Fix argparse allow_abbrev reverted; fix --scan-id + --metadata-only
  glob fallback when scan_time is absent.
- Add .venv/ to .gitignore.
- README: fix typo, update worker counts, document all new behaviour.
---
 .gitignore             |   1 +
 README.md              |  62 +++++++++++---
 spruce/cli.py          |  79 ++++++++++++++---
 spruce/orchestrator.py | 189 ++++++++++++++++++++++++++++++-----------
 spruce/progress.py     |   4 +-
 spruce/recheck.py      |  46 +++++++---
 spruce/session.py      |   4 -
 spruce/settings.py     |   2 +-
 8 files changed, 294 insertions(+), 93 deletions(-)

diff --git a/.gitignore b/.gitignore
index 0022c39..d2f8359 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@ __pycache__/
 *.pyc
 .DS_Store
 explore_dumps/
+.venv/
diff --git a/README.md b/README.md
index 3e6c1a5..4d865d7 100644
--- a/README.md
+++ b/README.md
@@ -42,15 +42,15 @@ A full-tube scan covers a 310 mm × 740 mm cylinder at 3.01 × 2.26 mm steps, pr
 
 ### Download speed
 
-Tile downloads are server-limited: the RootView PHP backend renders tiles on-demand, sustaining ~**0.67 tiles/sec** with 8 parallel workers regardless of local bandwidth. Mosaics are pre-rendered and download ~20× faster per MB.
+Tile downloads are server-limited: the RootView PHP backend renders tiles on-demand, sustaining ~**0.67 tiles/sec** with 4 parallel workers regardless of local bandwidth. Mosaics are pre-rendered and download ~20× faster per MB.
 
 | Scenario | Estimated time |
 |---|---|
 | All mosaics (4 workers) | ~3 months |
-| Full tiles for one scan (8 workers) | ~14 hours |
+| Full tiles for one scan (4 workers) | ~14 hours |
 | All tiles, full-tube machines only | Years — not recommended |
 
-**Recommended approach:** archive mosaics first (`--mosaic-only`), then selectively download tiles for priority scans.
+**Recommended approach:** inventory all scans first (`--metadata-only`, ~80 hours serial or ~7 hours if machines run in parallel), then archive mosaics (`--mosaic-only`), then selectively download tiles for priority scans.
 
 ---
 
@@ -58,7 +58,7 @@ Tile downloads are server-limited: the RootView PHP backend renders tiles on-dem
 
 ```bash
 # 1. Clone / download this repo
-cd spruce_scrapper
+cd spruce_scraper
 
 # 2. Install dependencies (Python 3.10+)
 pip install -r requirements.txt
@@ -84,6 +84,10 @@ python scraper.py --list-scans --machine "BW3-20 [AMR-26]"
 # Preview what would be downloaded (dry run)
 python scraper.py --machine "BW3-20 [AMR-26]" --dry-run
 
+# Inventory scan parameters only (no images downloaded) — much faster than downloading mosaics or tiles
+python scraper.py --metadata-only
+python scraper.py --machine "BW3-20 [AMR-26]" --metadata-only
+
 # Download mosaics only for one machine
 python scraper.py --machine "BW3-20 [AMR-26]" --mosaic-only
 
@@ -103,11 +107,12 @@ python scraper.py --machine "BW3-20 [AMR-26]" --scan-id 158374 --workers 4
 |---|---|
 | `--config FILE` | Config file path (default: `config.yaml`) |
 | `--machine LABEL` | Restrict to one machine, e.g. `"BW3-20 [AMR-26]"` |
-| `--scan-id ID` | Download only this scan (use with `--machine`) |
+| `--scan-id ID` | Restrict to one scan ID (use with `--machine`; works with all modes) |
 | `--mosaic-only` | Download mosaics only; skip individual tiles |
+| `--metadata-only` | Fetch scan parameters only; write `metadata.json` + `scans.csv` rows, skip all images. Re-runs skip scans whose `metadata.json` already exists |
 | `--dry-run` | Print what would be downloaded without saving |
 | `--workers N` | Parallel download threads (default: 2, hard cap: 4) |
-| `--recheck` | Scan archive for zero-byte/missing tiles and remove them from `.progress.json` so they re-download on next run |
+| `--recheck` | Scan archive for zero-byte/missing tiles and mosaics; remove bad entries from `.progress.json` so they re-download on next run |
 | `--list-machines` | Print all machines and exit |
 | `--list-scans` | Print all scans for `--machine` and exit |
 | `--verbose` / `-v` | Debug logging |
@@ -128,7 +133,7 @@ archives/
             ├── metadata.json   # full scan parameters (grid, timestamps, etc.)
             ├── mosaic.jpg      # pre-stitched full image (~16 MB)
             └── tiles/
-                ├── tile_r000_c000.jpg   # row 0, column 0
+                ├── tile_r000_c000.jpg   # row 0, column 0 (zero-padding matches grid size)
                 ├── tile_r000_c001.jpg
                 └── ...                 # 33,784 tiles total for a full-tube scan
 ```
@@ -137,10 +142,14 @@ Tile filenames encode position: `tile_r{row}_c{col}.jpg` where row increases wit
 
 ### Metadata files
 
-**`scans.csv`** columns: `machine`, `machine_id`, `scan_id`, `name`, `scan_time`, `start_x`, `start_y`, `end_x`, `end_y`, `dx`, `dy`, `nx`, `ny`, `total_tiles`, `scan_lines`, `scan_mode`, `start_datetime`, `end_datetime`, `status`, `user`, `disk_space_mb`, `mosaic_url`, `mosaic_local_path`, `mosaic_downloaded`
+**`scans.csv`** columns: `machine`, `machine_id`, `scan_id`, `name`, `scan_time`, `start_x`, `start_y`, `end_x`, `end_y`, `dx`, `dy`, `nx`, `ny`, `total_tiles`, `scan_lines`, `scan_mode`, `start_datetime`, `end_datetime`, `status`, `user`, `disk_space_mb`, `mosaic_url`, `mosaic_local_path`, `mosaic_on_disk`
+
+- `mosaic_on_disk`: `True` if `mosaic.jpg` exists on disk at row-write time, regardless of which run downloaded it. Useful for inventory — reflects actual archive state rather than what happened in the current run.
 
 **`tiles.csv`** columns: `machine`, `machine_id`, `scan_id`, `scan_time`, `row_index`, `col_index`, `x_mm`, `y_mm`, `url`, `local_path`, `downloaded_at`, `file_size_bytes`
 
+- `downloaded_at`: ISO 8601 UTC timestamp of when the tile was fetched. Empty if the download failed.
+
 ---
 
 ## Site structure (RootView)
@@ -161,20 +170,47 @@ Grid coordinates (X, Y) are in millimetres, starting from `(start_x, start_y)` w
 
 ## Resume and reliability
 
-- **Resumable**: `.progress.json` records every completed URL. Re-running the same command skips already-downloaded files.
+- **Resumable**: `.progress.json` records every completed URL. Re-running the same command skips already-downloaded files. `--metadata-only` re-runs additionally skip any scan whose `metadata.json` already exists on disk — that scan's detail page is not re-fetched.
+- **Atomic progress saves**: `.progress.json` is written via a temp-file rename, so a crash mid-save never produces a corrupt or empty progress file.
+- **Heal on resume**: at the start of each scan's tile pass, any non-empty tile file that exists on disk but isn't recorded in progress is silently re-marked as complete, preventing duplicate `tiles.csv` rows and redundant re-downloads.
 - **Retry logic**: each tile download retries up to 3 times with exponential backoff (5 s → 10 s → 20 s) before logging a warning and moving on.
-- **Worker cap**: the RootView server renders tiles on a single-threaded PHP process. Running more than 4 concurrent requests causes cascading read timeouts. The default is 2 workers; the scraper hard-caps at 4 and warns loudly if you try to exceed it.
-- **Crash recovery**: if a run is killed mid-flight, some in-progress tiles may have been written as zero-byte files without being marked complete. Run `--recheck` before resuming — it deletes zero-byte files on disk and removes their URLs from `.progress.json` so they are cleanly re-downloaded.
+- **Worker cap**: the RootView server renders tiles on a single-threaded PHP process. Running more than 4 concurrent requests causes cascading timeouts. The default is 2 workers; the scraper hard-caps at 4 and warns if you try to exceed it.
+- **Crash recovery**: run `--recheck` to find zero-byte or missing tile and mosaic files and remove their entries from `.progress.json` so they are cleanly re-downloaded on the next run.
 
 ```bash
-# After any interrupted run, always do this first:
+# After a hard crash, optionally run recheck before resuming:
 python scraper.py --recheck
-# Then resume normally:
+# Then resume normally — the scraper picks up where it left off:
 python scraper.py --machine "BW3-20 [AMR-26]" --scan-id 158374
 ```
 
 ---
 
+## Run summary
+
+Every run prints a summary table on completion:
+
+```
+──────────────────────────────────────────────
+  Run complete
+──────────────────────────────────────────────
+  Machines:             1
+  Scans fetched:        428  (2 already cached, 0 failed)
+  Metadata written:     428  (new JSON files)
+──────────────────────────────────────────────
+  Scans CSV:            archives/scans.csv
+  Progress:             archives/.progress.json
+──────────────────────────────────────────────
+```
+
+- **Scans fetched**: metadata detail page was retrieved from the server this run.
+- **Already cached**: `metadata.json` already existed on disk, so the scan's detail page was not re-fetched (counted in `--metadata-only` mode only).
+- **Failed**: fetch error or scan missing required grid parameters.
+- **Metadata written**: new `metadata.json` files created (shown in `--metadata-only` mode).
+- Mosaic and tile counts appear in their respective modes.
+
+---
+
 ## Dependencies
 
 | Package | Purpose |
diff --git a/spruce/cli.py b/spruce/cli.py
index 6370ea9..9232104 100644
--- a/spruce/cli.py
+++ b/spruce/cli.py
@@ -10,7 +10,7 @@ from pathlib import Path
 
 import yaml
 
-from spruce.orchestrator import scrape_machine
+from spruce.orchestrator import scrape_machine, RunStats
 from spruce.parsers import parse_machine_option
 from spruce.progress import ProgressTracker, CsvWriter
 from spruce.recheck import recheck_archive, recheck_tile_files
@@ -75,6 +75,15 @@ def parse_args() -> argparse.Namespace:
         action="store_true",
         help="Download mosaics only; skip individual tiles",
     )
+    p.add_argument(
+        "--metadata-only",
+        action="store_true",
+        help=(
+            "Fetch scan parameters only; write metadata.json and scans.csv "
+            "rows but skip mosaics and tiles. Very fast — suitable for "
+            "inventorying all scans across all machines."
+        ),
+    )
     p.add_argument(
         "--dry-run",
         action="store_true",
@@ -220,7 +229,11 @@ def main() -> None:
         len(machines),
         ", ".join(m["label"] for m in machines),
     )
-    if args.mosaic_only:
+    if args.mosaic_only and args.metadata_only:
+        sys.exit("--mosaic-only and --metadata-only are mutually exclusive.")
+    if args.metadata_only:
+        log.info("Mode: metadata only (mosaics and tiles skipped)")
+    elif args.mosaic_only:
         log.info("Mode: mosaics only (individual tiles skipped)")
     if args.dry_run:
         log.info("Mode: dry-run (no files will be written)")
@@ -230,10 +243,10 @@ def main() -> None:
     tiles_csv = CsvWriter(output_dir / TILES_CSV_FILENAME, TILES_CSV_FIELDS)
     scans_csv = CsvWriter(output_dir / SCANS_CSV_FILENAME, SCANS_CSV_FIELDS)
 
-    total = 0
+    totals = RunStats()
     try:
         for machine in machines:
-            count = scrape_machine(
+            stats = scrape_machine(
                 machine=machine,
                 config=config,
                 output_dir=output_dir,
@@ -242,18 +255,62 @@ def main() -> None:
                 scans_csv=scans_csv,
                 dry_run=args.dry_run,
                 mosaic_only=args.mosaic_only,
+                metadata_only=args.metadata_only,
                 scan_id_filter=args.scan_id,
             )
-            total += count
+            totals.merge(stats)
     finally:
         tiles_csv.close()
         scans_csv.close()
         progress.save()
 
-    if args.dry_run:
-        log.info("Dry run complete.")
+    _print_summary(
+        totals=totals,
+        machines=machines,
+        output_dir=output_dir,
+        dry_run=args.dry_run,
+        metadata_only=args.metadata_only,
+        mosaic_only=args.mosaic_only,
+    )
+
+
+def _print_summary(
+    totals: RunStats,
+    machines: list[dict],
+    output_dir: Path,
+    dry_run: bool,
+    metadata_only: bool,
+    mosaic_only: bool,
+) -> None:
+    W = 46
+    sep = "─" * W
+
+    def row(label: str, value: str, note: str = "") -> str:
+        note_str = f"  ({note})" if note else ""
+        return f"  {label:<22}{value}{note_str}"
+
+    log.info(sep)
+    if dry_run:
+        log.info("  Dry run complete — no files written.")
     else:
-        log.info("Done. Total files downloaded: %d", total)
-        log.info("Tiles CSV : %s", output_dir / TILES_CSV_FILENAME)
-        log.info("Scans CSV : %s", output_dir / SCANS_CSV_FILENAME)
-        log.info("Progress  : %s", output_dir / PROGRESS_FILENAME)
+        log.info("  Run complete")
+        log.info(sep)
+        log.info(row("Machines:", str(len(machines))))
+        log.info(
+            row("Scans fetched:", str(totals.scans_fetched),
+                f"{totals.scans_skipped} already cached, "
+                f"{totals.scans_failed} failed"
+                if totals.scans_skipped or totals.scans_failed else "")
+        )
+        if not metadata_only:
+            log.info(row("Mosaics downloaded:", str(totals.mosaics_downloaded)))
+        if not metadata_only and not mosaic_only:
+            log.info(row("Tiles downloaded:", str(totals.tiles_downloaded)))
+        if metadata_only:
+            log.info(row("Metadata written:", str(totals.metadata_written), "new JSON files"))
+    log.info(sep)
+    log.info(row("Scans CSV:", str(output_dir / SCANS_CSV_FILENAME)))
+    if not metadata_only:
+        log.info(row("Tiles CSV:", str(output_dir / TILES_CSV_FILENAME)))
+    log.info(row("Progress:", str(output_dir / PROGRESS_FILENAME)))
+    log.info(sep)
diff --git a/spruce/orchestrator.py b/spruce/orchestrator.py
index 1525bf5..dff2371 100644
--- a/spruce/orchestrator.py
+++ b/spruce/orchestrator.py
@@ -5,9 +5,30 @@ High-level scrape orchestration: drives the per-machine and per-scan loops.
 import json
 import logging
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any
 
+
+@dataclass
+class RunStats:
+    """Accumulated counters for one or more machines."""
+
+    scans_fetched: int = 0      # metadata fetched from server this run
+    scans_skipped: int = 0      # metadata.json already on disk; no HTTP request
+    scans_failed: int = 0       # fetch error or missing grid params
+    metadata_written: int = 0   # new metadata.json files created
+    mosaics_downloaded: int = 0
+    tiles_downloaded: int = 0
+
+    def merge(self, other: "RunStats") -> None:
+        self.scans_fetched += other.scans_fetched
+        self.scans_skipped += other.scans_skipped
+        self.scans_failed += other.scans_failed
+        self.metadata_written += other.metadata_written
+        self.mosaics_downloaded += other.mosaics_downloaded
+        self.tiles_downloaded += other.tiles_downloaded
+
 from tqdm import tqdm
 
 from spruce.paths import machine_dir_name, tile_dest, mosaic_dest, _extract_date
@@ -66,6 +87,24 @@ def _download_tiles_for_scan(
     dry_run: bool,
 ) -> int:
     """Download all pending tiles for a scan. Returns count of tiles downloaded."""
+    # Heal progress for tiles that exist on disk but weren't recorded (e.g.
+    # crash between write and batch save). Prevents duplicate tiles.csv rows.
+    healed = 0
+    for t in tiles:
+        if not progress.is_done(t["url"]):
+            dest = tile_dest(output_dir, machine, scan_meta, t)
+            if dest.exists() and dest.stat().st_size > 0:
+                progress.mark_done(t["url"])
+                healed += 1
+    if healed:
+        log.debug(
+            "[%s] Scan %d: healed %d tile(s) already on disk into progress.",
+            machine["label"],
+            scan_id,
+            healed,
+        )
+        progress.save()
+
     pending = [t for t in tiles if not progress.is_done(t["url"])]
     log.info(
         "[%s] Scan %d: %d tiles total, %d pending.",
@@ -152,12 +191,43 @@ def process_scan(
     tiles_csv: CsvWriter,
     dry_run: bool,
     mosaic_only: bool,
-) -> int:
+    metadata_only: bool = False,
+) -> RunStats:
     """
     Process one scan: fetch metadata, download mosaic and (optionally) tiles.
-    Returns total files downloaded for this scan.
+    Returns a RunStats with counters for what happened this call.
+
+    If metadata_only is True, writes metadata.json and the scans.csv row but
+    skips both the mosaic and the tiles.
     """
     scan_id: int = scan["scan_id"]
+    stats = RunStats()
+
+    # In metadata-only mode, skip the HTTP fetch if metadata.json already exists.
+    # Try the date-hinted path first; fall back to a glob when scan_time is
+    # absent (e.g. when --scan-id is used and the synthetic scan dict has no
+    # scan_time field).
+    if metadata_only and not dry_run:
+        machine_root = output_dir / machine_dir_name(machine)
+        scan_date_hint = _extract_date(scan.get("scan_time", ""))
+        found_meta: Path | None = None
+        if scan_date_hint and scan_date_hint != "unknown":
+            candidate = machine_root / scan_date_hint / str(scan_id) / "metadata.json"
+            if candidate.exists():
+                found_meta = candidate
+        if found_meta is None:
+            matches = list(machine_root.glob(f"*/{scan_id}/metadata.json"))
+            if matches:
+                found_meta = matches[0]
+        if found_meta is not None:
+            log.debug(
+                "[%s] Scan %d: metadata.json already exists, skipping fetch.",
+                machine["label"],
+                scan_id,
+            )
+            stats.scans_skipped += 1
+            return stats
+
     log.info("[%s] Processing scan %d …", machine["label"], scan_id)
 
     try:
@@ -166,7 +236,8 @@ def process_scan(
         log.error(
             "[%s] Cannot fetch scan %d metadata: %s", machine["label"], scan_id, exc
         )
-        return 0
+        stats.scans_failed += 1
+        return stats
 
     if not scan_meta.get("nx") or not scan_meta.get("ny"):
         log.warning(
@@ -174,7 +245,10 @@ def process_scan(
             machine["label"],
             scan_id,
         )
-        return 0
+        stats.scans_failed += 1
+        return stats
+
+    stats.scans_fetched += 1
 
     # Merge list-level metadata into scan_meta (detail page takes precedence)
     for k in (
@@ -199,51 +273,64 @@ def process_scan(
             meta_file.write_text(
                 json.dumps(scan_meta, indent=2, default=str), encoding="utf-8"
             )
+            stats.metadata_written += 1
 
-    # Mosaic
+    # Mosaic (skipped entirely in metadata-only mode)
     mosaic_path = mosaic_dest(output_dir, machine, scan_meta, scan_id)
     mosaic_url = sess.mosaic_url(scan_id)
-    mosaic_downloaded = _download_mosaic(
-        sess, scan_meta, scan_id, mosaic_path, progress, machine, dry_run
-    )
-    total = 1 if mosaic_downloaded else 0
+    mosaic_already_done = progress.is_done(mosaic_url)
+    if metadata_only:
+        mosaic_just_downloaded = False
+    else:
+        mosaic_just_downloaded = _download_mosaic(
+            sess, scan_meta, scan_id, mosaic_path, progress, machine, dry_run
+        )
+    if mosaic_just_downloaded:
+        stats.mosaics_downloaded += 1
 
-    # Write scan-level CSV row
-    scans_csv.write(
-        {
-            "machine": machine["label"],
-            "machine_id": machine["machine_id"],
-            "scan_id": scan_id,
-            "name": scan_meta.get("name", ""),
-            "scan_time": scan_meta.get("scan_time", ""),
-            "start_x": scan_meta.get("start_x", ""),
-            "start_y": scan_meta.get("start_y", ""),
-            "end_x": scan_meta.get("end_x", ""),
-            "end_y": scan_meta.get("end_y", ""),
-            "dx": scan_meta.get("dx", ""),
-            "dy": scan_meta.get("dy", ""),
-            "nx": scan_meta.get("nx", ""),
-            "ny": scan_meta.get("ny", ""),
-            "total_tiles": scan_meta.get("total_tiles", ""),
-            "scan_lines": scan_meta.get("scan_lines", ""),
-            "scan_mode": scan_meta.get("scan_mode", ""),
-            "start_datetime": scan_meta.get("start_datetime", ""),
-            "end_datetime": scan_meta.get("end_datetime", ""),
-            "status": scan_meta.get("status", ""),
-            "user": scan_meta.get("user", ""),
-            "disk_space_mb": scan_meta.get("disk_space_mb", ""),
-            "mosaic_url": mosaic_url,
-            "mosaic_local_path": str(mosaic_path),
-            "mosaic_downloaded": mosaic_downloaded,
-        }
-    )
+    # Write scan-level CSV row unless the mosaic URL was already in progress (proxy for an existing row); metadata-only always writes.
+    if mosaic_already_done and not metadata_only:
+        log.debug(
+            "[%s] Scan %d: already in scans.csv (mosaic was previously downloaded), skipping CSV row.",
+            machine["label"],
+            scan_id,
+        )
+    else:
+        scans_csv.write(
+            {
+                "machine": machine["label"],
+                "machine_id": machine["machine_id"],
+                "scan_id": scan_id,
+                "name": scan_meta.get("name", ""),
+                "scan_time": scan_meta.get("scan_time", ""),
+                "start_x": scan_meta.get("start_x", ""),
+                "start_y": scan_meta.get("start_y", ""),
+                "end_x": scan_meta.get("end_x", ""),
+                "end_y": scan_meta.get("end_y", ""),
+                "dx": scan_meta.get("dx", ""),
+                "dy": scan_meta.get("dy", ""),
+                "nx": scan_meta.get("nx", ""),
+                "ny": scan_meta.get("ny", ""),
+                "total_tiles": scan_meta.get("total_tiles", ""),
+                "scan_lines": scan_meta.get("scan_lines", ""),
+                "scan_mode": scan_meta.get("scan_mode", ""),
+                "start_datetime": scan_meta.get("start_datetime", ""),
+                "end_datetime": scan_meta.get("end_datetime", ""),
+                "status": scan_meta.get("status", ""),
+                "user": scan_meta.get("user", ""),
+                "disk_space_mb": scan_meta.get("disk_space_mb", ""),
+                "mosaic_url": mosaic_url,
+                "mosaic_local_path": str(mosaic_path),
+                "mosaic_on_disk": mosaic_path.exists(),
+            }
+        )
 
-    if mosaic_only:
-        return total
+    if mosaic_only or metadata_only:
+        return stats
 
     # Tiles
     tiles = sess.enumerate_tiles(scan_meta)
-    total += _download_tiles_for_scan(
+    stats.tiles_downloaded += _download_tiles_for_scan(
         sess,
         tiles,
         scan_meta,
@@ -255,7 +342,7 @@ def process_scan(
         tiles_csv,
         dry_run,
     )
-    return total
+    return stats
 
 
 # ---------------------------------------------------------------------------
@@ -272,12 +359,13 @@ def scrape_machine(
     scans_csv: CsvWriter,
     dry_run: bool,
     mosaic_only: bool,
-    scan_id_filter: int | None,
-) -> int:
+    metadata_only: bool = False,
+    scan_id_filter: int | None = None,
+) -> RunStats:
     """Login, fetch scans, and download all content for one machine."""
     sess = MachineSession(machine, config)
     if not sess.login():
-        return 0
+        return RunStats()
 
     if scan_id_filter is not None:
         scans: list[dict[str, Any]] = [
@@ -288,11 +376,11 @@ def scrape_machine(
         scans = sess.get_all_scans()
         if not scans:
             log.warning("[%s] No scans found.", machine["label"])
-            return 0
+            return RunStats()
 
-    total = 0
+    stats = RunStats()
     for scan in scans:
-        total += process_scan(
+        stats.merge(process_scan(
             sess=sess,
             scan=scan,
             output_dir=output_dir,
@@ -303,5 +391,6 @@ def scrape_machine(
             tiles_csv=tiles_csv,
             dry_run=dry_run,
             mosaic_only=mosaic_only,
-        )
-    return total
+            metadata_only=metadata_only,
+        ))
+    return stats
diff --git a/spruce/progress.py b/spruce/progress.py
index 0d8b826..4d7fa67 100644
--- a/spruce/progress.py
+++ b/spruce/progress.py
@@ -57,9 +57,11 @@ class ProgressTracker:
 
     def save(self) -> None:
         self.path.parent.mkdir(parents=True, exist_ok=True)
-        self.path.write_text(
+        tmp = self.path.with_suffix(".json.tmp")
+        tmp.write_text(
             json.dumps({"completed_urls": sorted(self._done)}, indent=2)
         )
+        tmp.replace(self.path)  # atomic on POSIX; avoids corrupt JSON on crash
 
 
 class CsvWriter:
diff --git a/spruce/recheck.py b/spruce/recheck.py
index aeeffe6..a6db0b2 100644
--- a/spruce/recheck.py
+++ b/spruce/recheck.py
@@ -86,47 +86,67 @@ def recheck_archive(output_dir: Path, progress: ProgressTracker) -> int:
     non-empty. Removes bad entries from progress so the next run re-downloads
     them. Returns the count of entries removed.
 
-    Only tile URLs are checked (mosaic URLs are skipped — mosaics are large
-    single files and are unlikely to be partially written due to streaming).
+    Both tile URLs and mosaic URLs are checked.
     """
     if len(progress) == 0:
         log.info("Progress file is empty — nothing to recheck.")
         return 0
 
-    tile_urls = [u for u in progress.iter_urls() if "cmd=image" in u]
-    mosaic_count = len(progress) - len(tile_urls)
+    all_urls = list(progress.iter_urls())
+    tile_urls = [u for u in all_urls if "cmd=image" in u]
+    mosaic_urls = [u for u in all_urls if "mosaic.jpg" in u]
     log.info(
-        "Rechecking %d tile URLs (%d mosaic URLs not rechecked) …",
+        "Rechecking %d tile URL(s) and %d mosaic URL(s) …",
         len(tile_urls),
-        mosaic_count,
+        len(mosaic_urls),
     )
 
-    # Build a disk index once
-    existing_files = _build_disk_index(output_dir)
-    log.debug("Found %d tile files on disk.", len(existing_files))
+    # Build a disk index of all tile files once
+    existing_tiles = _build_disk_index(output_dir)
+    log.debug("Found %d tile files on disk.", len(existing_tiles))
 
     bad_urls: list[str] = []
 
+    # --- Tile check ---
     for url in tile_urls:
         p = _parse_tile_url(url)
         scan_id = p["scan_id"]
 
         # Find tile files that live under a directory named after this scan_id
-        candidates = [path for path in existing_files if str(scan_id) in path.parts]
+        candidates = [path for path in existing_tiles if str(scan_id) in path.parts]
 
         if not candidates:
             bad_urls.append(url)
             continue
 
-        if not any(existing_files[path] > 0 for path in candidates):
+        if not any(existing_tiles[path] > 0 for path in candidates):
+            bad_urls.append(url)
+
+    # --- Mosaic check ---
+    for url in mosaic_urls:
+        # Mosaic URLs: http://<host>:8011/RootView_Database/<scan_id>/mosaic.jpg
+        # Corresponding local path: <output_dir>/<machine>/<date>/<scan_id>/mosaic.jpg
+        try:
+            scan_id = url.rstrip("/").split("/")[-2]
+        except IndexError:
+            bad_urls.append(url)
+            continue
+
+        matches = list(output_dir.glob(f"*/*/{scan_id}/mosaic.jpg"))
+        if not matches or not any(p.stat().st_size > 0 for p in matches):
+            log.debug("Mosaic missing or zero-byte for scan %s: %s", scan_id, url)
             bad_urls.append(url)
 
     if not bad_urls:
-        log.info("All %d tile URLs look healthy.", len(tile_urls))
+        log.info(
+            "All %d tile URL(s) and %d mosaic URL(s) look healthy.",
+            len(tile_urls),
+            len(mosaic_urls),
+        )
         return 0
 
     log.warning(
-        "Found %d suspect tile URL(s). Removing from progress.",
+        "Found %d suspect URL(s). Removing from progress.",
         len(bad_urls),
     )
     for url in bad_urls:
diff --git a/spruce/session.py b/spruce/session.py
index d992c06..bbb3416 100644
--- a/spruce/session.py
+++ b/spruce/session.py
@@ -263,10 +263,6 @@ class MachineSession:
         }
         if dry_run:
             return row
-        if dest.exists():
-            row["downloaded_at"] = "already_exists"
-            row["file_size_bytes"] = dest.stat().st_size
-            return row
         size = self.download_file(tile["url"], dest)
         if size:
             row["downloaded_at"] = datetime.now(timezone.utc).isoformat()
diff --git a/spruce/settings.py b/spruce/settings.py
index d33f910..55fc292 100644
--- a/spruce/settings.py
+++ b/spruce/settings.py
@@ -46,7 +46,7 @@ SCANS_CSV_FIELDS: list[str] = [
     "disk_space_mb",
     "mosaic_url",
     "mosaic_local_path",
-    "mosaic_downloaded",
+    "mosaic_on_disk",
 ]
 
 TILES_CSV_FIELDS: list[str] = [