diff --git a/spruce/cli.py b/spruce/cli.py index 81a3746..c5151b9 100644 --- a/spruce/cli.py +++ b/spruce/cli.py @@ -122,6 +122,17 @@ def parse_args() -> argparse.Namespace: "and report how many were re-queued. Run before resuming after a crash." ), ) + p.add_argument( + "--max-tiles", + type=int, + default=None, + metavar="N", + help=( + "Download at most N tiles per scan (default: all). " + "Pass 1 to probe a single tile — useful for quickly checking " + "whether a scan has real imagery or only placeholder responses." + ), + ) p.add_argument( "--verbose", "-v", @@ -145,6 +156,9 @@ def main() -> None: if args.list_scans_first_page_only and not args.list_scans: sys.exit("--list-scans-first-page-only requires --list-scans") + if args.scan_id is not None and args.scan_id <= 0: + sys.exit("--scan-id must be a positive integer") + # --list-machines doesn't need credentials if args.list_machines: base_url = "http://205.149.147.131:8010/" @@ -270,6 +284,7 @@ def main() -> None: mosaic_only=args.mosaic_only, metadata_only=args.metadata_only, scan_id_filter=args.scan_id, + max_tiles=args.max_tiles, ) totals.merge(stats) finally: @@ -331,6 +346,22 @@ def _print_summary( ) if not metadata_only and not mosaic_only: log.info(row("Tiles downloaded:", str(totals.tiles_downloaded))) + if totals.scans_probe_skipped: + log.info( + row( + "Probe-skipped scans:", + str(totals.scans_probe_skipped), + "probe tile was 404 or placeholder; tile pool skipped", + ) + ) + if not metadata_only and totals.scans_disk_space_skipped: + log.info( + row( + "Zero-disk-space skipped:", + str(totals.scans_disk_space_skipped), + "disk_space_mb=0; mosaic and tiles not attempted", + ) + ) if not dry_run and not metadata_only: log.info( row( diff --git a/spruce/orchestrator.py b/spruce/orchestrator.py index de3e34f..369f2ce 100644 --- a/spruce/orchestrator.py +++ b/spruce/orchestrator.py @@ -9,7 +9,19 @@ from dataclasses import dataclass from pathlib import Path from typing import Any -from spruce.download_result import error_code_str +from spruce.download_result import PERMANENT_MISSING, UNKNOWN, error_code_str + +# RootView returns ~43-byte 1×1 JPEG placeholders for empty cells; stay well +# below smallest observed real tile (~7 KiB in production samples). +PLACEHOLDER_MAX_BYTES = 200 + + +def _is_placeholder_tile(path: Path) -> bool: + """Return True if a downloaded tile looks like a 1×1 server placeholder.""" + try: + return path.is_file() and path.stat().st_size <= PLACEHOLDER_MAX_BYTES + except OSError: + return False @dataclass @@ -21,8 +33,10 @@ class RunStats: scans_failed: int = 0 # metadata fetch error or missing grid params metadata_written: int = 0 # new metadata.json files created mosaics_downloaded: int = 0 - mosaics_failed: int = 0 # mosaic URL attempted but 0 bytes / HTTP error + mosaics_failed: int = 0 # mosaic URL attempted but 0 bytes / HTTP error tiles_downloaded: int = 0 + scans_probe_skipped: int = 0 # probe tile was 404 or placeholder; full tile pool skipped + scans_disk_space_skipped: int = 0 # disk_space_mb == 0; no mosaic or tiles attempted def merge(self, other: "RunStats") -> None: self.scans_fetched += other.scans_fetched @@ -32,6 +46,8 @@ class RunStats: self.mosaics_downloaded += other.mosaics_downloaded self.mosaics_failed += other.mosaics_failed self.tiles_downloaded += other.tiles_downloaded + self.scans_probe_skipped += other.scans_probe_skipped + self.scans_disk_space_skipped += other.scans_disk_space_skipped from tqdm import tqdm @@ -224,6 +240,7 @@ def process_scan( dry_run: bool, mosaic_only: bool, metadata_only: bool = False, + max_tiles: int | None = None, ) -> RunStats: """ Process one scan: fetch metadata, download mosaic and (optionally) tiles. @@ -247,7 +264,8 @@ def process_scan( candidate = machine_root / scan_date_hint / str(scan_id) / "metadata.json" if candidate.exists(): found_meta = candidate - if found_meta is None: + # Date hint is reliable — don't glob if candidate wasn't found. + else: matches = list(machine_root.glob(f"*/{scan_id}/metadata.json")) if matches: found_meta = matches[0] @@ -295,6 +313,23 @@ def process_scan( ): scan_meta.setdefault(k, scan.get(k, "")) + # disk_space_mb == 0 is a reliable signal that the scan has no imagery. + # A 300-scan investigation (50 per bucket) found 0% viability in this bucket. + # Skip the mosaic and tile downloads entirely; write a record so scans.csv + # stays complete. + disk_space_skip = False + if not metadata_only: + try: + if float(scan_meta.get("disk_space_mb") or "nan") == 0.0: + disk_space_skip = True + log.info( + "[%s] Scan %d: disk_space_mb=0 — skipping mosaic and tiles.", + machine["label"], + scan_id, + ) + except (ValueError, TypeError): + pass + # Save per-scan metadata.json scan_date = _extract_date(scan_meta.get("scan_time", "")) scan_dir = output_dir / machine_dir_name(machine) / scan_date / str(scan_id) @@ -307,11 +342,11 @@ def process_scan( ) stats.metadata_written += 1 - # Mosaic (skipped entirely in metadata-only mode) + # Mosaic (skipped entirely in metadata-only or disk_space_skip mode) mosaic_path = mosaic_dest(output_dir, machine, scan_meta, scan_id) mosaic_url = sess.mosaic_url(scan_id) mosaic_already_done = progress.is_done(mosaic_url) - if metadata_only: + if metadata_only or disk_space_skip: mosaic_attempt: MosaicAttempt | None = None else: mosaic_attempt = _download_mosaic( @@ -332,6 +367,9 @@ def process_scan( if metadata_only: mds, mer, mco, mcl = "skipped_metadata_only", "", "", "" + elif disk_space_skip: + mds, mer, mco, mcl = "skipped_zero_disk_space", "", "", "" + stats.scans_disk_space_skipped += 1 elif mosaic_attempt is not None: mds = mosaic_attempt.csv_status mer = mosaic_attempt.error @@ -381,11 +419,46 @@ def process_scan( } ) - if mosaic_only or metadata_only: + if mosaic_only or metadata_only or disk_space_skip: return stats # Tiles tiles = sess.enumerate_tiles(scan_meta) + if max_tiles is not None: + tiles = tiles[:max_tiles] + + # Tile probe: always download one tile before launching the full thread + # pool. Two failure modes justify this: + # 1. Mosaic failed (404/410 or empty body) — scan was set up but never + # run; tile grid is all placeholders or 404s. + # 2. Mosaic succeeded but tiles are server-side placeholders (1x1 JPEG, + # ~43 B) — mosaic was generated from empty data; downloading the full + # grid would fire thousands of guaranteed-placeholder requests. + if ( + not dry_run + and tiles + and not progress.is_done(tiles[0]["url"]) + ): + probe_tile = tiles[0] + probe_dest = tile_dest(output_dir, machine, scan_meta, probe_tile) + probe_res = sess.download_file(probe_tile["url"], probe_dest) + if not probe_res.ok or _is_placeholder_tile(probe_dest): + probe_dest.unlink(missing_ok=True) + detail = ( + "is placeholder" + if probe_res.ok + else f"failed ({probe_res.error_class or probe_res.error or 'unknown'})" + ) + log.info( + "[%s] Scan %d: probe tile %s — empty/placeholder scan, skipping %d tile(s).", + machine["label"], + scan_id, + detail, + len(tiles), + ) + stats.scans_probe_skipped += 1 + return stats + stats.tiles_downloaded += _download_tiles_for_scan( sess, tiles, @@ -417,6 +490,7 @@ def scrape_machine( mosaic_only: bool, metadata_only: bool = False, scan_id_filter: int | None = None, + max_tiles: int | None = None, ) -> RunStats: """Login, fetch scans, and download all content for one machine.""" sess = MachineSession(machine, config) @@ -448,5 +522,6 @@ def scrape_machine( dry_run=dry_run, mosaic_only=mosaic_only, metadata_only=metadata_only, + max_tiles=max_tiles, )) return stats