From f2193011ca2d62928054e878199f3737f6367b9a Mon Sep 17 00:00:00 2001 From: James Kolpack Date: Fri, 24 Apr 2026 09:44:57 -0400 Subject: [PATCH] Add --metadata-only mode; harden resume and idempotency - Add --metadata-only flag: fetches scan detail pages, writes metadata.json + scans.csv rows, skips all image downloads. Re-runs skip scans whose metadata.json already exists. - Atomic progress.json saves (temp-file rename). - Heal-on-resume: tiles on disk but not in progress are silently re-marked before building the pending list. - scans.csv dedup: skip row if mosaic URL already in progress. - Rename mosaic_downloaded -> mosaic_on_disk (reflects disk state). - --recheck now checks mosaics as well as tiles. - RunStats dataclass replaces raw int return; richer run summary. - Fix argparse allow_abbrev reverted; fix --scan-id + --metadata-only glob fallback when scan_time is absent. - Add .venv/ to .gitignore. - README: fix typo, update worker counts, document all new behaviour. --- .gitignore | 1 + README.md | 62 +++++++++++--- spruce/cli.py | 79 ++++++++++++++--- spruce/orchestrator.py | 189 ++++++++++++++++++++++++++++++----------- spruce/progress.py | 4 +- spruce/recheck.py | 46 +++++++--- spruce/session.py | 4 - spruce/settings.py | 2 +- 8 files changed, 294 insertions(+), 93 deletions(-) diff --git a/.gitignore b/.gitignore index 0022c39..d2f8359 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ __pycache__/ *.pyc .DS_Store explore_dumps/ +.venv/ diff --git a/README.md b/README.md index 3e6c1a5..4d865d7 100644 --- a/README.md +++ b/README.md @@ -42,15 +42,15 @@ A full-tube scan covers a 310 mm × 740 mm cylinder at 3.01 × 2.26 mm steps, pr ### Download speed -Tile downloads are server-limited: the RootView PHP backend renders tiles on-demand, sustaining ~**0.67 tiles/sec** with 8 parallel workers regardless of local bandwidth. Mosaics are pre-rendered and download ~20× faster per MB. 
+Tile downloads are server-limited: the RootView PHP backend renders tiles on-demand, sustaining ~**0.67 tiles/sec** with 4 parallel workers regardless of local bandwidth. Mosaics are pre-rendered and download ~20× faster per MB. | Scenario | Estimated time | |---|---| | All mosaics (4 workers) | ~3 months | -| Full tiles for one scan (8 workers) | ~14 hours | +| Full tiles for one scan (4 workers) | ~14 hours | | All tiles, full-tube machines only | Years — not recommended | -**Recommended approach:** archive mosaics first (`--mosaic-only`), then selectively download tiles for priority scans. +**Recommended approach:** inventory all scans first (`--metadata-only`, ~80 hours serial or ~7 hours if machines run in parallel), then archive mosaics (`--mosaic-only`), then selectively download tiles for priority scans. --- @@ -58,7 +58,7 @@ Tile downloads are server-limited: the RootView PHP backend renders tiles on-dem ```bash # 1. Clone / download this repo -cd spruce_scrapper +cd spruce_scraper # 2. Install dependencies (Python 3.10+) pip install -r requirements.txt @@ -84,6 +84,10 @@ python scraper.py --list-scans --machine "BW3-20 [AMR-26]" # Preview what would be downloaded (dry run) python scraper.py --machine "BW3-20 [AMR-26]" --dry-run +# Inventory scan parameters only (no images downloaded) — very fast +python scraper.py --metadata-only +python scraper.py --machine "BW3-20 [AMR-26]" --metadata-only + # Download mosaics only for one machine python scraper.py --machine "BW3-20 [AMR-26]" --mosaic-only @@ -103,11 +107,12 @@ python scraper.py --machine "BW3-20 [AMR-26]" --scan-id 158374 --workers 4 |---|---| | `--config FILE` | Config file path (default: `config.yaml`) | | `--machine LABEL` | Restrict to one machine, e.g. 
`"BW3-20 [AMR-26]"` | -| `--scan-id ID` | Download only this scan (use with `--machine`) | +| `--scan-id ID` | Restrict to one scan ID (use with `--machine`; works with all modes) | | `--mosaic-only` | Download mosaics only; skip individual tiles | +| `--metadata-only` | Fetch scan parameters only; write `metadata.json` + `scans.csv` rows, skip all images. Re-runs skip scans whose `metadata.json` already exists | | `--dry-run` | Print what would be downloaded without saving | | `--workers N` | Parallel download threads (default: 2, hard cap: 4) | -| `--recheck` | Scan archive for zero-byte/missing tiles and remove them from `.progress.json` so they re-download on next run | +| `--recheck` | Scan archive for zero-byte/missing tiles and mosaics; remove bad entries from `.progress.json` so they re-download on next run | | `--list-machines` | Print all machines and exit | | `--list-scans` | Print all scans for `--machine` and exit | | `--verbose` / `-v` | Debug logging | @@ -128,7 +133,7 @@ archives/ ├── metadata.json # full scan parameters (grid, timestamps, etc.) ├── mosaic.jpg # pre-stitched full image (~16 MB) └── tiles/ - ├── tile_r000_c000.jpg # row 0, column 0 + ├── tile_r000_c000.jpg # row 0, column 0 (zero-padding matches grid size) ├── tile_r000_c001.jpg └── ... 
# 33,784 tiles total for a full-tube scan ``` @@ -137,10 +142,14 @@ Tile filenames encode position: `tile_r{row}_c{col}.jpg` where row increases wit ### Metadata files -**`scans.csv`** columns: `machine`, `machine_id`, `scan_id`, `name`, `scan_time`, `start_x`, `start_y`, `end_x`, `end_y`, `dx`, `dy`, `nx`, `ny`, `total_tiles`, `scan_lines`, `scan_mode`, `start_datetime`, `end_datetime`, `status`, `user`, `disk_space_mb`, `mosaic_url`, `mosaic_local_path`, `mosaic_downloaded` +**`scans.csv`** columns: `machine`, `machine_id`, `scan_id`, `name`, `scan_time`, `start_x`, `start_y`, `end_x`, `end_y`, `dx`, `dy`, `nx`, `ny`, `total_tiles`, `scan_lines`, `scan_mode`, `start_datetime`, `end_datetime`, `status`, `user`, `disk_space_mb`, `mosaic_url`, `mosaic_local_path`, `mosaic_on_disk` + +- `mosaic_on_disk`: `True` if `mosaic.jpg` exists on disk at row-write time, regardless of which run downloaded it. Useful for inventory — reflects actual archive state rather than what happened in the current run. **`tiles.csv`** columns: `machine`, `machine_id`, `scan_id`, `scan_time`, `row_index`, `col_index`, `x_mm`, `y_mm`, `url`, `local_path`, `downloaded_at`, `file_size_bytes` +- `downloaded_at`: ISO 8601 UTC timestamp of when the tile was fetched. Empty if the download failed. + --- ## Site structure (RootView) @@ -161,20 +170,47 @@ Grid coordinates (X, Y) are in millimetres, starting from `(start_x, start_y)` w ## Resume and reliability -- **Resumable**: `.progress.json` records every completed URL. Re-running the same command skips already-downloaded files. +- **Resumable**: `.progress.json` records every completed URL. Re-running the same command skips already-downloaded files. `--metadata-only` re-runs additionally skip any scan whose `metadata.json` already exists on disk — no HTTP request is made. +- **Atomic progress saves**: `.progress.json` is written via a temp-file rename, so a crash mid-save never produces a corrupt or empty progress file. 
+- **Heal on resume**: at the start of each scan's tile pass, any non-empty tile file that exists on disk but isn't recorded in progress is silently re-marked as complete, preventing duplicate `tiles.csv` rows and redundant re-downloads. - **Retry logic**: each tile download retries up to 3 times with exponential backoff (5 s → 10 s → 20 s) before logging a warning and moving on. -- **Worker cap**: the RootView server renders tiles on a single-threaded PHP process. Running more than 4 concurrent requests causes cascading read timeouts. The default is 2 workers; the scraper hard-caps at 4 and warns loudly if you try to exceed it. -- **Crash recovery**: if a run is killed mid-flight, some in-progress tiles may have been written as zero-byte files without being marked complete. Run `--recheck` before resuming — it deletes zero-byte files on disk and removes their URLs from `.progress.json` so they are cleanly re-downloaded. +- **Worker cap**: the RootView server renders tiles on a single-threaded PHP process. Running more than 4 concurrent requests causes cascading timeouts. The default is 2 workers; the scraper hard-caps at 4 and warns if you try to exceed it. +- **Crash recovery**: run `--recheck` to find tile and mosaic files that are zero-byte or missing on disk and remove their entries from `.progress.json`, so they are cleanly re-downloaded on the next run. 
```bash -# After any interrupted run, always do this first: +# After a hard crash, optionally run recheck before resuming: python scraper.py --recheck -# Then resume normally: +# Then resume normally — the scraper picks up where it left off: python scraper.py --machine "BW3-20 [AMR-26]" --scan-id 158374 ``` --- +## Run summary + +Every run prints a summary table on completion: + +``` +────────────────────────────────────────────── + Run complete +────────────────────────────────────────────── + Machines: 1 + Scans fetched: 428 (2 already cached, 0 failed) + Metadata written: 428 (new JSON files) +────────────────────────────────────────────── + Scans CSV: archives/scans.csv + Progress: archives/.progress.json +────────────────────────────────────────────── +``` + +- **Scans fetched**: metadata detail page was retrieved from the server this run. +- **Already cached**: `metadata.json` already existed on disk; no HTTP request was made. +- **Failed**: fetch error or scan missing required grid parameters. +- **Metadata written**: new `metadata.json` files created (shown in `--metadata-only` mode). +- Mosaic and tile counts appear in their respective modes. + +--- + ## Dependencies | Package | Purpose | diff --git a/spruce/cli.py b/spruce/cli.py index 6370ea9..9232104 100644 --- a/spruce/cli.py +++ b/spruce/cli.py @@ -10,7 +10,7 @@ from pathlib import Path import yaml -from spruce.orchestrator import scrape_machine +from spruce.orchestrator import scrape_machine, RunStats from spruce.parsers import parse_machine_option from spruce.progress import ProgressTracker, CsvWriter from spruce.recheck import recheck_archive, recheck_tile_files @@ -75,6 +75,15 @@ def parse_args() -> argparse.Namespace: action="store_true", help="Download mosaics only; skip individual tiles", ) + p.add_argument( + "--metadata-only", + action="store_true", + help=( + "Fetch scan parameters only; write metadata.json and scans.csv " + "rows but skip mosaics and tiles. 
Very fast — suitable for " + "inventorying all scans across all machines." + ), + ) p.add_argument( "--dry-run", action="store_true", @@ -220,7 +229,11 @@ def main() -> None: len(machines), ", ".join(m["label"] for m in machines), ) - if args.mosaic_only: + if args.mosaic_only and args.metadata_only: + sys.exit("--mosaic-only and --metadata-only are mutually exclusive.") + if args.metadata_only: + log.info("Mode: metadata only (mosaics and tiles skipped)") + elif args.mosaic_only: log.info("Mode: mosaics only (individual tiles skipped)") if args.dry_run: log.info("Mode: dry-run (no files will be written)") @@ -230,10 +243,10 @@ def main() -> None: tiles_csv = CsvWriter(output_dir / TILES_CSV_FILENAME, TILES_CSV_FIELDS) scans_csv = CsvWriter(output_dir / SCANS_CSV_FILENAME, SCANS_CSV_FIELDS) - total = 0 + totals = RunStats() try: for machine in machines: - count = scrape_machine( + stats = scrape_machine( machine=machine, config=config, output_dir=output_dir, @@ -242,18 +255,62 @@ def main() -> None: scans_csv=scans_csv, dry_run=args.dry_run, mosaic_only=args.mosaic_only, + metadata_only=args.metadata_only, scan_id_filter=args.scan_id, ) - total += count + totals.merge(stats) finally: tiles_csv.close() scans_csv.close() progress.save() - if args.dry_run: - log.info("Dry run complete.") + _print_summary( + totals=totals, + machines=machines, + output_dir=output_dir, + dry_run=args.dry_run, + metadata_only=args.metadata_only, + mosaic_only=args.mosaic_only, + ) + + +def _print_summary( + totals: RunStats, + machines: list[dict], + output_dir: Path, + dry_run: bool, + metadata_only: bool, + mosaic_only: bool, +) -> None: + W = 46 + sep = "─" * W + + def row(label: str, value: str, note: str = "") -> str: + note_str = f" ({note})" if note else "" + return f" {label:<22}{value}{note_str}" + + log.info(sep) + if dry_run: + log.info(" Dry run complete — no files written.") else: - log.info("Done. 
Total files downloaded: %d", total) - log.info("Tiles CSV : %s", output_dir / TILES_CSV_FILENAME) - log.info("Scans CSV : %s", output_dir / SCANS_CSV_FILENAME) - log.info("Progress : %s", output_dir / PROGRESS_FILENAME) + log.info(" Run complete") + log.info(sep) + log.info(row("Machines:", str(len(machines)))) + log.info( + row("Scans fetched:", str(totals.scans_fetched), + f"{totals.scans_skipped} already cached, " + f"{totals.scans_failed} failed" + if totals.scans_skipped or totals.scans_failed else "") + ) + if not metadata_only: + log.info(row("Mosaics downloaded:", str(totals.mosaics_downloaded))) + if not metadata_only and not mosaic_only: + log.info(row("Tiles downloaded:", str(totals.tiles_downloaded))) + if metadata_only: + log.info(row("Metadata written:", str(totals.metadata_written), "new JSON files")) + log.info(sep) + log.info(row("Scans CSV:", str(output_dir / SCANS_CSV_FILENAME))) + if not metadata_only: + log.info(row("Tiles CSV:", str(output_dir / TILES_CSV_FILENAME))) + log.info(row("Progress:", str(output_dir / PROGRESS_FILENAME))) + log.info(sep) diff --git a/spruce/orchestrator.py b/spruce/orchestrator.py index 1525bf5..dff2371 100644 --- a/spruce/orchestrator.py +++ b/spruce/orchestrator.py @@ -5,9 +5,30 @@ High-level scrape orchestration: drives the per-machine and per-scan loops. 
import json import logging from concurrent.futures import ThreadPoolExecutor, as_completed +from dataclasses import dataclass, field from pathlib import Path from typing import Any + +@dataclass +class RunStats: + """Accumulated counters for one or more machines.""" + + scans_fetched: int = 0 # metadata fetched from server this run + scans_skipped: int = 0 # metadata.json already on disk; no HTTP request + scans_failed: int = 0 # fetch error or missing grid params + metadata_written: int = 0 # new metadata.json files created + mosaics_downloaded: int = 0 + tiles_downloaded: int = 0 + + def merge(self, other: "RunStats") -> None: + self.scans_fetched += other.scans_fetched + self.scans_skipped += other.scans_skipped + self.scans_failed += other.scans_failed + self.metadata_written += other.metadata_written + self.mosaics_downloaded += other.mosaics_downloaded + self.tiles_downloaded += other.tiles_downloaded + from tqdm import tqdm from spruce.paths import machine_dir_name, tile_dest, mosaic_dest, _extract_date @@ -66,6 +87,24 @@ def _download_tiles_for_scan( dry_run: bool, ) -> int: """Download all pending tiles for a scan. Returns count of tiles downloaded.""" + # Heal progress for tiles that exist on disk but weren't recorded (e.g. + # crash between write and batch save). Prevents duplicate tiles.csv rows. 
+ healed = 0 + for t in tiles: + if not progress.is_done(t["url"]): + dest = tile_dest(output_dir, machine, scan_meta, t) + if dest.exists() and dest.stat().st_size > 0: + progress.mark_done(t["url"]) + healed += 1 + if healed: + log.debug( + "[%s] Scan %d: healed %d tile(s) already on disk into progress.", + machine["label"], + scan_id, + healed, + ) + progress.save() + pending = [t for t in tiles if not progress.is_done(t["url"])] log.info( "[%s] Scan %d: %d tiles total, %d pending.", @@ -152,12 +191,43 @@ def process_scan( tiles_csv: CsvWriter, dry_run: bool, mosaic_only: bool, -) -> int: + metadata_only: bool = False, +) -> RunStats: """ Process one scan: fetch metadata, download mosaic and (optionally) tiles. - Returns total files downloaded for this scan. + Returns a RunStats with counters for what happened this call. + + If metadata_only is True, writes metadata.json and the scans.csv row but + skips both the mosaic and the tiles. """ scan_id: int = scan["scan_id"] + stats = RunStats() + + # In metadata-only mode, skip the HTTP fetch if metadata.json already exists. + # Try the date-hinted path first; fall back to a glob when scan_time is + # absent (e.g. when --scan-id is used and the synthetic scan dict has no + # scan_time field). 
+ if metadata_only and not dry_run: + machine_root = output_dir / machine_dir_name(machine) + scan_date_hint = _extract_date(scan.get("scan_time", "")) + found_meta: Path | None = None + if scan_date_hint and scan_date_hint != "unknown": + candidate = machine_root / scan_date_hint / str(scan_id) / "metadata.json" + if candidate.exists(): + found_meta = candidate + if found_meta is None: + matches = list(machine_root.glob(f"*/{scan_id}/metadata.json")) + if matches: + found_meta = matches[0] + if found_meta is not None: + log.debug( + "[%s] Scan %d: metadata.json already exists, skipping fetch.", + machine["label"], + scan_id, + ) + stats.scans_skipped += 1 + return stats + log.info("[%s] Processing scan %d …", machine["label"], scan_id) try: @@ -166,7 +236,8 @@ def process_scan( log.error( "[%s] Cannot fetch scan %d metadata: %s", machine["label"], scan_id, exc ) - return 0 + stats.scans_failed += 1 + return stats if not scan_meta.get("nx") or not scan_meta.get("ny"): log.warning( @@ -174,7 +245,10 @@ def process_scan( machine["label"], scan_id, ) - return 0 + stats.scans_failed += 1 + return stats + + stats.scans_fetched += 1 # Merge list-level metadata into scan_meta (detail page takes precedence) for k in ( @@ -199,51 +273,64 @@ def process_scan( meta_file.write_text( json.dumps(scan_meta, indent=2, default=str), encoding="utf-8" ) + stats.metadata_written += 1 - # Mosaic + # Mosaic (skipped entirely in metadata-only mode) mosaic_path = mosaic_dest(output_dir, machine, scan_meta, scan_id) mosaic_url = sess.mosaic_url(scan_id) - mosaic_downloaded = _download_mosaic( - sess, scan_meta, scan_id, mosaic_path, progress, machine, dry_run - ) - total = 1 if mosaic_downloaded else 0 + mosaic_already_done = progress.is_done(mosaic_url) + if metadata_only: + mosaic_just_downloaded = False + else: + mosaic_just_downloaded = _download_mosaic( + sess, scan_meta, scan_id, mosaic_path, progress, machine, dry_run + ) + if mosaic_just_downloaded: + stats.mosaics_downloaded += 1 
- # Write scan-level CSV row - scans_csv.write( - { - "machine": machine["label"], - "machine_id": machine["machine_id"], - "scan_id": scan_id, - "name": scan_meta.get("name", ""), - "scan_time": scan_meta.get("scan_time", ""), - "start_x": scan_meta.get("start_x", ""), - "start_y": scan_meta.get("start_y", ""), - "end_x": scan_meta.get("end_x", ""), - "end_y": scan_meta.get("end_y", ""), - "dx": scan_meta.get("dx", ""), - "dy": scan_meta.get("dy", ""), - "nx": scan_meta.get("nx", ""), - "ny": scan_meta.get("ny", ""), - "total_tiles": scan_meta.get("total_tiles", ""), - "scan_lines": scan_meta.get("scan_lines", ""), - "scan_mode": scan_meta.get("scan_mode", ""), - "start_datetime": scan_meta.get("start_datetime", ""), - "end_datetime": scan_meta.get("end_datetime", ""), - "status": scan_meta.get("status", ""), - "user": scan_meta.get("user", ""), - "disk_space_mb": scan_meta.get("disk_space_mb", ""), - "mosaic_url": mosaic_url, - "mosaic_local_path": str(mosaic_path), - "mosaic_downloaded": mosaic_downloaded, - } - ) + # Write scan-level CSV row only if this scan hasn't been recorded before. 
+ if mosaic_already_done and not metadata_only: + log.debug( + "[%s] Scan %d: already in scans.csv (mosaic was previously downloaded), skipping CSV row.", + machine["label"], + scan_id, + ) + else: + scans_csv.write( + { + "machine": machine["label"], + "machine_id": machine["machine_id"], + "scan_id": scan_id, + "name": scan_meta.get("name", ""), + "scan_time": scan_meta.get("scan_time", ""), + "start_x": scan_meta.get("start_x", ""), + "start_y": scan_meta.get("start_y", ""), + "end_x": scan_meta.get("end_x", ""), + "end_y": scan_meta.get("end_y", ""), + "dx": scan_meta.get("dx", ""), + "dy": scan_meta.get("dy", ""), + "nx": scan_meta.get("nx", ""), + "ny": scan_meta.get("ny", ""), + "total_tiles": scan_meta.get("total_tiles", ""), + "scan_lines": scan_meta.get("scan_lines", ""), + "scan_mode": scan_meta.get("scan_mode", ""), + "start_datetime": scan_meta.get("start_datetime", ""), + "end_datetime": scan_meta.get("end_datetime", ""), + "status": scan_meta.get("status", ""), + "user": scan_meta.get("user", ""), + "disk_space_mb": scan_meta.get("disk_space_mb", ""), + "mosaic_url": mosaic_url, + "mosaic_local_path": str(mosaic_path), + "mosaic_on_disk": mosaic_path.exists(), + } + ) - if mosaic_only: - return total + if mosaic_only or metadata_only: + return stats # Tiles tiles = sess.enumerate_tiles(scan_meta) - total += _download_tiles_for_scan( + stats.tiles_downloaded += _download_tiles_for_scan( sess, tiles, scan_meta, @@ -255,7 +342,7 @@ def process_scan( tiles_csv, dry_run, ) - return total + return stats # --------------------------------------------------------------------------- @@ -272,12 +359,13 @@ def scrape_machine( scans_csv: CsvWriter, dry_run: bool, mosaic_only: bool, - scan_id_filter: int | None, -) -> int: + metadata_only: bool = False, + scan_id_filter: int | None = None, +) -> RunStats: """Login, fetch scans, and download all content for one machine.""" sess = MachineSession(machine, config) if not sess.login(): - return 0 + return RunStats() 
if scan_id_filter is not None: scans: list[dict[str, Any]] = [ @@ -288,11 +376,11 @@ def scrape_machine( scans = sess.get_all_scans() if not scans: log.warning("[%s] No scans found.", machine["label"]) - return 0 + return RunStats() - total = 0 + stats = RunStats() for scan in scans: - total += process_scan( + stats.merge(process_scan( sess=sess, scan=scan, output_dir=output_dir, @@ -303,5 +391,6 @@ def scrape_machine( tiles_csv=tiles_csv, dry_run=dry_run, mosaic_only=mosaic_only, - ) - return total + metadata_only=metadata_only, + )) + return stats diff --git a/spruce/progress.py b/spruce/progress.py index 0d8b826..4d7fa67 100644 --- a/spruce/progress.py +++ b/spruce/progress.py @@ -57,9 +57,11 @@ class ProgressTracker: def save(self) -> None: self.path.parent.mkdir(parents=True, exist_ok=True) - self.path.write_text( + tmp = self.path.with_suffix(".json.tmp") + tmp.write_text( json.dumps({"completed_urls": sorted(self._done)}, indent=2) ) + tmp.replace(self.path) # atomic on POSIX; avoids corrupt JSON on crash class CsvWriter: diff --git a/spruce/recheck.py b/spruce/recheck.py index aeeffe6..a6db0b2 100644 --- a/spruce/recheck.py +++ b/spruce/recheck.py @@ -86,47 +86,67 @@ def recheck_archive(output_dir: Path, progress: ProgressTracker) -> int: non-empty. Removes bad entries from progress so the next run re-downloads them. Returns the count of entries removed. - Only tile URLs are checked (mosaic URLs are skipped — mosaics are large - single files and are unlikely to be partially written due to streaming). + Both tile URLs and mosaic URLs are checked. 
""" if len(progress) == 0: log.info("Progress file is empty — nothing to recheck.") return 0 - tile_urls = [u for u in progress.iter_urls() if "cmd=image" in u] - mosaic_count = len(progress) - len(tile_urls) + all_urls = list(progress.iter_urls()) + tile_urls = [u for u in all_urls if "cmd=image" in u] + mosaic_urls = [u for u in all_urls if "mosaic.jpg" in u] log.info( - "Rechecking %d tile URLs (%d mosaic URLs not rechecked) …", + "Rechecking %d tile URL(s) and %d mosaic URL(s) …", len(tile_urls), - mosaic_count, + len(mosaic_urls), ) - # Build a disk index once - existing_files = _build_disk_index(output_dir) - log.debug("Found %d tile files on disk.", len(existing_files)) + # Build a disk index of all tile files once + existing_tiles = _build_disk_index(output_dir) + log.debug("Found %d tile files on disk.", len(existing_tiles)) bad_urls: list[str] = [] + # --- Tile check --- for url in tile_urls: p = _parse_tile_url(url) scan_id = p["scan_id"] # Find tile files that live under a directory named after this scan_id - candidates = [path for path in existing_files if str(scan_id) in path.parts] + candidates = [path for path in existing_tiles if str(scan_id) in path.parts] if not candidates: bad_urls.append(url) continue - if not any(existing_files[path] > 0 for path in candidates): + if not any(existing_tiles[path] > 0 for path in candidates): + bad_urls.append(url) + + # --- Mosaic check --- + for url in mosaic_urls: + # Mosaic URLs: http://:8011/RootView_Database//mosaic.jpg + # Corresponding local path: /**//mosaic.jpg + try: + scan_id = url.rstrip("/").split("/")[-2] + except IndexError: + bad_urls.append(url) + continue + + matches = list(output_dir.glob(f"*/*/{scan_id}/mosaic.jpg")) + if not matches or not any(p.stat().st_size > 0 for p in matches): + log.debug("Mosaic missing or zero-byte for scan %s: %s", scan_id, url) bad_urls.append(url) if not bad_urls: - log.info("All %d tile URLs look healthy.", len(tile_urls)) + log.info( + "All %d tile URL(s) and 
%d mosaic URL(s) look healthy.", + len(tile_urls), + len(mosaic_urls), + ) return 0 log.warning( - "Found %d suspect tile URL(s). Removing from progress.", + "Found %d suspect URL(s). Removing from progress.", len(bad_urls), ) for url in bad_urls: diff --git a/spruce/session.py b/spruce/session.py index d992c06..bbb3416 100644 --- a/spruce/session.py +++ b/spruce/session.py @@ -263,10 +263,6 @@ class MachineSession: } if dry_run: return row - if dest.exists(): - row["downloaded_at"] = "already_exists" - row["file_size_bytes"] = dest.stat().st_size - return row size = self.download_file(tile["url"], dest) if size: row["downloaded_at"] = datetime.now(timezone.utc).isoformat() diff --git a/spruce/settings.py b/spruce/settings.py index d33f910..55fc292 100644 --- a/spruce/settings.py +++ b/spruce/settings.py @@ -46,7 +46,7 @@ SCANS_CSV_FIELDS: list[str] = [ "disk_space_mb", "mosaic_url", "mosaic_local_path", - "mosaic_downloaded", + "mosaic_on_disk", ] TILES_CSV_FIELDS: list[str] = [