Added skip logic: based on random sampling, a scan with disk_space_mb=0 can safely be skipped entirely (no mosaic or tile downloads attempted)
This commit is contained in:
@@ -122,6 +122,17 @@ def parse_args() -> argparse.Namespace:
|
|||||||
"and report how many were re-queued. Run before resuming after a crash."
|
"and report how many were re-queued. Run before resuming after a crash."
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
p.add_argument(
|
||||||
|
"--max-tiles",
|
||||||
|
type=int,
|
||||||
|
default=None,
|
||||||
|
metavar="N",
|
||||||
|
help=(
|
||||||
|
"Download at most N tiles per scan (default: all). "
|
||||||
|
"Pass 1 to probe a single tile — useful for quickly checking "
|
||||||
|
"whether a scan has real imagery or only placeholder responses."
|
||||||
|
),
|
||||||
|
)
|
||||||
p.add_argument(
|
p.add_argument(
|
||||||
"--verbose",
|
"--verbose",
|
||||||
"-v",
|
"-v",
|
||||||
@@ -145,6 +156,9 @@ def main() -> None:
|
|||||||
if args.list_scans_first_page_only and not args.list_scans:
|
if args.list_scans_first_page_only and not args.list_scans:
|
||||||
sys.exit("--list-scans-first-page-only requires --list-scans")
|
sys.exit("--list-scans-first-page-only requires --list-scans")
|
||||||
|
|
||||||
|
if args.scan_id is not None and args.scan_id <= 0:
|
||||||
|
sys.exit("--scan-id must be a positive integer")
|
||||||
|
|
||||||
# --list-machines doesn't need credentials
|
# --list-machines doesn't need credentials
|
||||||
if args.list_machines:
|
if args.list_machines:
|
||||||
base_url = "http://205.149.147.131:8010/"
|
base_url = "http://205.149.147.131:8010/"
|
||||||
@@ -270,6 +284,7 @@ def main() -> None:
|
|||||||
mosaic_only=args.mosaic_only,
|
mosaic_only=args.mosaic_only,
|
||||||
metadata_only=args.metadata_only,
|
metadata_only=args.metadata_only,
|
||||||
scan_id_filter=args.scan_id,
|
scan_id_filter=args.scan_id,
|
||||||
|
max_tiles=args.max_tiles,
|
||||||
)
|
)
|
||||||
totals.merge(stats)
|
totals.merge(stats)
|
||||||
finally:
|
finally:
|
||||||
@@ -331,6 +346,22 @@ def _print_summary(
|
|||||||
)
|
)
|
||||||
if not metadata_only and not mosaic_only:
|
if not metadata_only and not mosaic_only:
|
||||||
log.info(row("Tiles downloaded:", str(totals.tiles_downloaded)))
|
log.info(row("Tiles downloaded:", str(totals.tiles_downloaded)))
|
||||||
|
if totals.scans_probe_skipped:
|
||||||
|
log.info(
|
||||||
|
row(
|
||||||
|
"Probe-skipped scans:",
|
||||||
|
str(totals.scans_probe_skipped),
|
||||||
|
"probe tile was 404 or placeholder; tile pool skipped",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
if not metadata_only and totals.scans_disk_space_skipped:
|
||||||
|
log.info(
|
||||||
|
row(
|
||||||
|
"Zero-disk-space skipped:",
|
||||||
|
str(totals.scans_disk_space_skipped),
|
||||||
|
"disk_space_mb=0; mosaic and tiles not attempted",
|
||||||
|
)
|
||||||
|
)
|
||||||
if not dry_run and not metadata_only:
|
if not dry_run and not metadata_only:
|
||||||
log.info(
|
log.info(
|
||||||
row(
|
row(
|
||||||
|
|||||||
+80
-5
@@ -9,7 +9,19 @@ from dataclasses import dataclass
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
from spruce.download_result import error_code_str
|
from spruce.download_result import PERMANENT_MISSING, UNKNOWN, error_code_str
|
||||||
|
|
||||||
|
# RootView returns ~43-byte 1×1 JPEG placeholders for empty cells; stay well
|
||||||
|
# below smallest observed real tile (~7 KiB in production samples).
|
||||||
|
PLACEHOLDER_MAX_BYTES = 200
|
||||||
|
|
||||||
|
|
||||||
|
def _is_placeholder_tile(path: Path) -> bool:
|
||||||
|
"""Return True if a downloaded tile looks like a 1×1 server placeholder."""
|
||||||
|
try:
|
||||||
|
return path.is_file() and path.stat().st_size <= PLACEHOLDER_MAX_BYTES
|
||||||
|
except OSError:
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -23,6 +35,8 @@ class RunStats:
|
|||||||
mosaics_downloaded: int = 0
|
mosaics_downloaded: int = 0
|
||||||
mosaics_failed: int = 0 # mosaic URL attempted but 0 bytes / HTTP error
|
mosaics_failed: int = 0 # mosaic URL attempted but 0 bytes / HTTP error
|
||||||
tiles_downloaded: int = 0
|
tiles_downloaded: int = 0
|
||||||
|
scans_probe_skipped: int = 0 # probe tile was 404 or placeholder; full tile pool skipped
|
||||||
|
scans_disk_space_skipped: int = 0 # disk_space_mb == 0; no mosaic or tiles attempted
|
||||||
|
|
||||||
def merge(self, other: "RunStats") -> None:
|
def merge(self, other: "RunStats") -> None:
|
||||||
self.scans_fetched += other.scans_fetched
|
self.scans_fetched += other.scans_fetched
|
||||||
@@ -32,6 +46,8 @@ class RunStats:
|
|||||||
self.mosaics_downloaded += other.mosaics_downloaded
|
self.mosaics_downloaded += other.mosaics_downloaded
|
||||||
self.mosaics_failed += other.mosaics_failed
|
self.mosaics_failed += other.mosaics_failed
|
||||||
self.tiles_downloaded += other.tiles_downloaded
|
self.tiles_downloaded += other.tiles_downloaded
|
||||||
|
self.scans_probe_skipped += other.scans_probe_skipped
|
||||||
|
self.scans_disk_space_skipped += other.scans_disk_space_skipped
|
||||||
|
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
@@ -224,6 +240,7 @@ def process_scan(
|
|||||||
dry_run: bool,
|
dry_run: bool,
|
||||||
mosaic_only: bool,
|
mosaic_only: bool,
|
||||||
metadata_only: bool = False,
|
metadata_only: bool = False,
|
||||||
|
max_tiles: int | None = None,
|
||||||
) -> RunStats:
|
) -> RunStats:
|
||||||
"""
|
"""
|
||||||
Process one scan: fetch metadata, download mosaic and (optionally) tiles.
|
Process one scan: fetch metadata, download mosaic and (optionally) tiles.
|
||||||
@@ -247,7 +264,8 @@ def process_scan(
|
|||||||
candidate = machine_root / scan_date_hint / str(scan_id) / "metadata.json"
|
candidate = machine_root / scan_date_hint / str(scan_id) / "metadata.json"
|
||||||
if candidate.exists():
|
if candidate.exists():
|
||||||
found_meta = candidate
|
found_meta = candidate
|
||||||
if found_meta is None:
|
# Date hint is reliable — don't glob if candidate wasn't found.
|
||||||
|
else:
|
||||||
matches = list(machine_root.glob(f"*/{scan_id}/metadata.json"))
|
matches = list(machine_root.glob(f"*/{scan_id}/metadata.json"))
|
||||||
if matches:
|
if matches:
|
||||||
found_meta = matches[0]
|
found_meta = matches[0]
|
||||||
@@ -295,6 +313,23 @@ def process_scan(
|
|||||||
):
|
):
|
||||||
scan_meta.setdefault(k, scan.get(k, ""))
|
scan_meta.setdefault(k, scan.get(k, ""))
|
||||||
|
|
||||||
|
# disk_space_mb == 0 is a reliable signal that the scan has no imagery.
|
||||||
|
# A 300-scan investigation (50 per bucket) found 0% viability in this bucket.
|
||||||
|
# Skip the mosaic and tile downloads entirely; write a record so scans.csv
|
||||||
|
# stays complete.
|
||||||
|
disk_space_skip = False
|
||||||
|
if not metadata_only:
|
||||||
|
try:
|
||||||
|
if float(scan_meta.get("disk_space_mb") or "nan") == 0.0:
|
||||||
|
disk_space_skip = True
|
||||||
|
log.info(
|
||||||
|
"[%s] Scan %d: disk_space_mb=0 — skipping mosaic and tiles.",
|
||||||
|
machine["label"],
|
||||||
|
scan_id,
|
||||||
|
)
|
||||||
|
except (ValueError, TypeError):
|
||||||
|
pass
|
||||||
|
|
||||||
# Save per-scan metadata.json
|
# Save per-scan metadata.json
|
||||||
scan_date = _extract_date(scan_meta.get("scan_time", ""))
|
scan_date = _extract_date(scan_meta.get("scan_time", ""))
|
||||||
scan_dir = output_dir / machine_dir_name(machine) / scan_date / str(scan_id)
|
scan_dir = output_dir / machine_dir_name(machine) / scan_date / str(scan_id)
|
||||||
@@ -307,11 +342,11 @@ def process_scan(
|
|||||||
)
|
)
|
||||||
stats.metadata_written += 1
|
stats.metadata_written += 1
|
||||||
|
|
||||||
# Mosaic (skipped entirely in metadata-only mode)
|
# Mosaic (skipped entirely in metadata-only or disk_space_skip mode)
|
||||||
mosaic_path = mosaic_dest(output_dir, machine, scan_meta, scan_id)
|
mosaic_path = mosaic_dest(output_dir, machine, scan_meta, scan_id)
|
||||||
mosaic_url = sess.mosaic_url(scan_id)
|
mosaic_url = sess.mosaic_url(scan_id)
|
||||||
mosaic_already_done = progress.is_done(mosaic_url)
|
mosaic_already_done = progress.is_done(mosaic_url)
|
||||||
if metadata_only:
|
if metadata_only or disk_space_skip:
|
||||||
mosaic_attempt: MosaicAttempt | None = None
|
mosaic_attempt: MosaicAttempt | None = None
|
||||||
else:
|
else:
|
||||||
mosaic_attempt = _download_mosaic(
|
mosaic_attempt = _download_mosaic(
|
||||||
@@ -332,6 +367,9 @@ def process_scan(
|
|||||||
|
|
||||||
if metadata_only:
|
if metadata_only:
|
||||||
mds, mer, mco, mcl = "skipped_metadata_only", "", "", ""
|
mds, mer, mco, mcl = "skipped_metadata_only", "", "", ""
|
||||||
|
elif disk_space_skip:
|
||||||
|
mds, mer, mco, mcl = "skipped_zero_disk_space", "", "", ""
|
||||||
|
stats.scans_disk_space_skipped += 1
|
||||||
elif mosaic_attempt is not None:
|
elif mosaic_attempt is not None:
|
||||||
mds = mosaic_attempt.csv_status
|
mds = mosaic_attempt.csv_status
|
||||||
mer = mosaic_attempt.error
|
mer = mosaic_attempt.error
|
||||||
@@ -381,11 +419,46 @@ def process_scan(
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
if mosaic_only or metadata_only:
|
if mosaic_only or metadata_only or disk_space_skip:
|
||||||
return stats
|
return stats
|
||||||
|
|
||||||
# Tiles
|
# Tiles
|
||||||
tiles = sess.enumerate_tiles(scan_meta)
|
tiles = sess.enumerate_tiles(scan_meta)
|
||||||
|
if max_tiles is not None:
|
||||||
|
tiles = tiles[:max_tiles]
|
||||||
|
|
||||||
|
# Tile probe: always download one tile before launching the full thread
|
||||||
|
# pool. Two failure modes justify this:
|
||||||
|
# 1. Mosaic failed (404/410 or empty body) — scan was set up but never
|
||||||
|
# run; tile grid is all placeholders or 404s.
|
||||||
|
# 2. Mosaic succeeded but tiles are server-side placeholders (1x1 JPEG,
|
||||||
|
# ~43 B) — mosaic was generated from empty data; downloading the full
|
||||||
|
# grid would fire thousands of guaranteed-placeholder requests.
|
||||||
|
if (
|
||||||
|
not dry_run
|
||||||
|
and tiles
|
||||||
|
and not progress.is_done(tiles[0]["url"])
|
||||||
|
):
|
||||||
|
probe_tile = tiles[0]
|
||||||
|
probe_dest = tile_dest(output_dir, machine, scan_meta, probe_tile)
|
||||||
|
probe_res = sess.download_file(probe_tile["url"], probe_dest)
|
||||||
|
if not probe_res.ok or _is_placeholder_tile(probe_dest):
|
||||||
|
probe_dest.unlink(missing_ok=True)
|
||||||
|
detail = (
|
||||||
|
"is placeholder"
|
||||||
|
if probe_res.ok
|
||||||
|
else f"failed ({probe_res.error_class or probe_res.error or 'unknown'})"
|
||||||
|
)
|
||||||
|
log.info(
|
||||||
|
"[%s] Scan %d: probe tile %s — empty/placeholder scan, skipping %d tile(s).",
|
||||||
|
machine["label"],
|
||||||
|
scan_id,
|
||||||
|
detail,
|
||||||
|
len(tiles),
|
||||||
|
)
|
||||||
|
stats.scans_probe_skipped += 1
|
||||||
|
return stats
|
||||||
|
|
||||||
stats.tiles_downloaded += _download_tiles_for_scan(
|
stats.tiles_downloaded += _download_tiles_for_scan(
|
||||||
sess,
|
sess,
|
||||||
tiles,
|
tiles,
|
||||||
@@ -417,6 +490,7 @@ def scrape_machine(
|
|||||||
mosaic_only: bool,
|
mosaic_only: bool,
|
||||||
metadata_only: bool = False,
|
metadata_only: bool = False,
|
||||||
scan_id_filter: int | None = None,
|
scan_id_filter: int | None = None,
|
||||||
|
max_tiles: int | None = None,
|
||||||
) -> RunStats:
|
) -> RunStats:
|
||||||
"""Login, fetch scans, and download all content for one machine."""
|
"""Login, fetch scans, and download all content for one machine."""
|
||||||
sess = MachineSession(machine, config)
|
sess = MachineSession(machine, config)
|
||||||
@@ -448,5 +522,6 @@ def scrape_machine(
|
|||||||
dry_run=dry_run,
|
dry_run=dry_run,
|
||||||
mosaic_only=mosaic_only,
|
mosaic_only=mosaic_only,
|
||||||
metadata_only=metadata_only,
|
metadata_only=metadata_only,
|
||||||
|
max_tiles=max_tiles,
|
||||||
))
|
))
|
||||||
return stats
|
return stats
|
||||||
|
|||||||
Reference in New Issue
Block a user