Add --metadata-only mode; harden resume and idempotency
- Add --metadata-only flag: fetches scan detail pages, writes metadata.json + scans.csv rows, and skips all image downloads. Re-runs skip scans whose metadata.json already exists.
- Atomic progress.json saves (temp-file rename).
- Heal-on-resume: tiles present on disk but missing from progress are silently re-marked as done before building the pending list.
- scans.csv dedup: skip the row if the mosaic URL is already in progress.
- Rename mosaic_downloaded -> mosaic_on_disk (reflects disk state).
- --recheck now checks mosaics as well as tiles.
- RunStats dataclass replaces raw int return; richer run summary.
- Fix argparse allow_abbrev reverted; fix --scan-id + --metadata-only glob fallback when scan_time is absent.
- Add .venv/ to .gitignore.
- README: fix typo, update worker counts, document all new behaviour.
This commit is contained in:
+139
-50
@@ -5,9 +5,30 @@ High-level scrape orchestration: drives the per-machine and per-scan loops.
|
||||
import json
|
||||
import logging
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
@dataclass
class RunStats:
    """Accumulated counters for one or more machines.

    Every field is an integer counter that starts at zero. Use ``merge`` to
    fold the counters of another instance into this one.
    """

    scans_fetched: int = 0  # metadata fetched from server this run
    scans_skipped: int = 0  # metadata.json already on disk; no HTTP request
    scans_failed: int = 0  # fetch error or missing grid params
    metadata_written: int = 0  # new metadata.json files created
    mosaics_downloaded: int = 0  # mosaics downloaded this run
    tiles_downloaded: int = 0  # tiles downloaded this run

    def merge(self, other: "RunStats") -> None:
        """Add every counter of *other* onto this instance, in place.

        Args:
            other: The stats to fold in; it is left unmodified.

        Returns:
            None. The receiver is mutated.
        """
        # Imported locally because the module-level import only brings in
        # ``dataclass`` and ``field``.
        from dataclasses import fields

        # Walk the declared fields rather than naming each counter, so a
        # counter added to the class later is merged automatically instead of
        # being silently dropped.
        for counter in fields(self):
            combined = getattr(self, counter.name) + getattr(other, counter.name)
            setattr(self, counter.name, combined)
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
from spruce.paths import machine_dir_name, tile_dest, mosaic_dest, _extract_date
|
||||
@@ -66,6 +87,24 @@ def _download_tiles_for_scan(
|
||||
dry_run: bool,
|
||||
) -> int:
|
||||
"""Download all pending tiles for a scan. Returns count of tiles downloaded."""
|
||||
# Heal progress for tiles that exist on disk but weren't recorded (e.g.
|
||||
# crash between write and batch save). Prevents duplicate tiles.csv rows.
|
||||
healed = 0
|
||||
for t in tiles:
|
||||
if not progress.is_done(t["url"]):
|
||||
dest = tile_dest(output_dir, machine, scan_meta, t)
|
||||
if dest.exists() and dest.stat().st_size > 0:
|
||||
progress.mark_done(t["url"])
|
||||
healed += 1
|
||||
if healed:
|
||||
log.debug(
|
||||
"[%s] Scan %d: healed %d tile(s) already on disk into progress.",
|
||||
machine["label"],
|
||||
scan_id,
|
||||
healed,
|
||||
)
|
||||
progress.save()
|
||||
|
||||
pending = [t for t in tiles if not progress.is_done(t["url"])]
|
||||
log.info(
|
||||
"[%s] Scan %d: %d tiles total, %d pending.",
|
||||
@@ -152,12 +191,43 @@ def process_scan(
|
||||
tiles_csv: CsvWriter,
|
||||
dry_run: bool,
|
||||
mosaic_only: bool,
|
||||
) -> int:
|
||||
metadata_only: bool = False,
|
||||
) -> RunStats:
|
||||
"""
|
||||
Process one scan: fetch metadata, download mosaic and (optionally) tiles.
|
||||
Returns total files downloaded for this scan.
|
||||
Returns a RunStats with counters for what happened this call.
|
||||
|
||||
If metadata_only is True, writes metadata.json and the scans.csv row but
|
||||
skips both the mosaic and the tiles.
|
||||
"""
|
||||
scan_id: int = scan["scan_id"]
|
||||
stats = RunStats()
|
||||
|
||||
# In metadata-only mode, skip the HTTP fetch if metadata.json already exists.
|
||||
# Try the date-hinted path first; fall back to a glob when scan_time is
|
||||
# absent (e.g. when --scan-id is used and the synthetic scan dict has no
|
||||
# scan_time field).
|
||||
if metadata_only and not dry_run:
|
||||
machine_root = output_dir / machine_dir_name(machine)
|
||||
scan_date_hint = _extract_date(scan.get("scan_time", ""))
|
||||
found_meta: Path | None = None
|
||||
if scan_date_hint and scan_date_hint != "unknown":
|
||||
candidate = machine_root / scan_date_hint / str(scan_id) / "metadata.json"
|
||||
if candidate.exists():
|
||||
found_meta = candidate
|
||||
if found_meta is None:
|
||||
matches = list(machine_root.glob(f"*/{scan_id}/metadata.json"))
|
||||
if matches:
|
||||
found_meta = matches[0]
|
||||
if found_meta is not None:
|
||||
log.debug(
|
||||
"[%s] Scan %d: metadata.json already exists, skipping fetch.",
|
||||
machine["label"],
|
||||
scan_id,
|
||||
)
|
||||
stats.scans_skipped += 1
|
||||
return stats
|
||||
|
||||
log.info("[%s] Processing scan %d …", machine["label"], scan_id)
|
||||
|
||||
try:
|
||||
@@ -166,7 +236,8 @@ def process_scan(
|
||||
log.error(
|
||||
"[%s] Cannot fetch scan %d metadata: %s", machine["label"], scan_id, exc
|
||||
)
|
||||
return 0
|
||||
stats.scans_failed += 1
|
||||
return stats
|
||||
|
||||
if not scan_meta.get("nx") or not scan_meta.get("ny"):
|
||||
log.warning(
|
||||
@@ -174,7 +245,10 @@ def process_scan(
|
||||
machine["label"],
|
||||
scan_id,
|
||||
)
|
||||
return 0
|
||||
stats.scans_failed += 1
|
||||
return stats
|
||||
|
||||
stats.scans_fetched += 1
|
||||
|
||||
# Merge list-level metadata into scan_meta (detail page takes precedence)
|
||||
for k in (
|
||||
@@ -199,51 +273,64 @@ def process_scan(
|
||||
meta_file.write_text(
|
||||
json.dumps(scan_meta, indent=2, default=str), encoding="utf-8"
|
||||
)
|
||||
stats.metadata_written += 1
|
||||
|
||||
# Mosaic
|
||||
# Mosaic (skipped entirely in metadata-only mode)
|
||||
mosaic_path = mosaic_dest(output_dir, machine, scan_meta, scan_id)
|
||||
mosaic_url = sess.mosaic_url(scan_id)
|
||||
mosaic_downloaded = _download_mosaic(
|
||||
sess, scan_meta, scan_id, mosaic_path, progress, machine, dry_run
|
||||
)
|
||||
total = 1 if mosaic_downloaded else 0
|
||||
mosaic_already_done = progress.is_done(mosaic_url)
|
||||
if metadata_only:
|
||||
mosaic_just_downloaded = False
|
||||
else:
|
||||
mosaic_just_downloaded = _download_mosaic(
|
||||
sess, scan_meta, scan_id, mosaic_path, progress, machine, dry_run
|
||||
)
|
||||
if mosaic_just_downloaded:
|
||||
stats.mosaics_downloaded += 1
|
||||
|
||||
# Write scan-level CSV row
|
||||
scans_csv.write(
|
||||
{
|
||||
"machine": machine["label"],
|
||||
"machine_id": machine["machine_id"],
|
||||
"scan_id": scan_id,
|
||||
"name": scan_meta.get("name", ""),
|
||||
"scan_time": scan_meta.get("scan_time", ""),
|
||||
"start_x": scan_meta.get("start_x", ""),
|
||||
"start_y": scan_meta.get("start_y", ""),
|
||||
"end_x": scan_meta.get("end_x", ""),
|
||||
"end_y": scan_meta.get("end_y", ""),
|
||||
"dx": scan_meta.get("dx", ""),
|
||||
"dy": scan_meta.get("dy", ""),
|
||||
"nx": scan_meta.get("nx", ""),
|
||||
"ny": scan_meta.get("ny", ""),
|
||||
"total_tiles": scan_meta.get("total_tiles", ""),
|
||||
"scan_lines": scan_meta.get("scan_lines", ""),
|
||||
"scan_mode": scan_meta.get("scan_mode", ""),
|
||||
"start_datetime": scan_meta.get("start_datetime", ""),
|
||||
"end_datetime": scan_meta.get("end_datetime", ""),
|
||||
"status": scan_meta.get("status", ""),
|
||||
"user": scan_meta.get("user", ""),
|
||||
"disk_space_mb": scan_meta.get("disk_space_mb", ""),
|
||||
"mosaic_url": mosaic_url,
|
||||
"mosaic_local_path": str(mosaic_path),
|
||||
"mosaic_downloaded": mosaic_downloaded,
|
||||
}
|
||||
)
|
||||
# Write scan-level CSV row only if this scan hasn't been recorded before.
|
||||
if mosaic_already_done and not metadata_only:
|
||||
log.debug(
|
||||
"[%s] Scan %d: already in scans.csv (mosaic was previously downloaded), skipping CSV row.",
|
||||
machine["label"],
|
||||
scan_id,
|
||||
)
|
||||
else:
|
||||
scans_csv.write(
|
||||
{
|
||||
"machine": machine["label"],
|
||||
"machine_id": machine["machine_id"],
|
||||
"scan_id": scan_id,
|
||||
"name": scan_meta.get("name", ""),
|
||||
"scan_time": scan_meta.get("scan_time", ""),
|
||||
"start_x": scan_meta.get("start_x", ""),
|
||||
"start_y": scan_meta.get("start_y", ""),
|
||||
"end_x": scan_meta.get("end_x", ""),
|
||||
"end_y": scan_meta.get("end_y", ""),
|
||||
"dx": scan_meta.get("dx", ""),
|
||||
"dy": scan_meta.get("dy", ""),
|
||||
"nx": scan_meta.get("nx", ""),
|
||||
"ny": scan_meta.get("ny", ""),
|
||||
"total_tiles": scan_meta.get("total_tiles", ""),
|
||||
"scan_lines": scan_meta.get("scan_lines", ""),
|
||||
"scan_mode": scan_meta.get("scan_mode", ""),
|
||||
"start_datetime": scan_meta.get("start_datetime", ""),
|
||||
"end_datetime": scan_meta.get("end_datetime", ""),
|
||||
"status": scan_meta.get("status", ""),
|
||||
"user": scan_meta.get("user", ""),
|
||||
"disk_space_mb": scan_meta.get("disk_space_mb", ""),
|
||||
"mosaic_url": mosaic_url,
|
||||
"mosaic_local_path": str(mosaic_path),
|
||||
"mosaic_on_disk": mosaic_path.exists(),
|
||||
}
|
||||
)
|
||||
|
||||
if mosaic_only:
|
||||
return total
|
||||
if mosaic_only or metadata_only:
|
||||
return stats
|
||||
|
||||
# Tiles
|
||||
tiles = sess.enumerate_tiles(scan_meta)
|
||||
total += _download_tiles_for_scan(
|
||||
stats.tiles_downloaded += _download_tiles_for_scan(
|
||||
sess,
|
||||
tiles,
|
||||
scan_meta,
|
||||
@@ -255,7 +342,7 @@ def process_scan(
|
||||
tiles_csv,
|
||||
dry_run,
|
||||
)
|
||||
return total
|
||||
return stats
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -272,12 +359,13 @@ def scrape_machine(
|
||||
scans_csv: CsvWriter,
|
||||
dry_run: bool,
|
||||
mosaic_only: bool,
|
||||
scan_id_filter: int | None,
|
||||
) -> int:
|
||||
metadata_only: bool = False,
|
||||
scan_id_filter: int | None = None,
|
||||
) -> RunStats:
|
||||
"""Login, fetch scans, and download all content for one machine."""
|
||||
sess = MachineSession(machine, config)
|
||||
if not sess.login():
|
||||
return 0
|
||||
return RunStats()
|
||||
|
||||
if scan_id_filter is not None:
|
||||
scans: list[dict[str, Any]] = [
|
||||
@@ -288,11 +376,11 @@ def scrape_machine(
|
||||
scans = sess.get_all_scans()
|
||||
if not scans:
|
||||
log.warning("[%s] No scans found.", machine["label"])
|
||||
return 0
|
||||
return RunStats()
|
||||
|
||||
total = 0
|
||||
stats = RunStats()
|
||||
for scan in scans:
|
||||
total += process_scan(
|
||||
stats.merge(process_scan(
|
||||
sess=sess,
|
||||
scan=scan,
|
||||
output_dir=output_dir,
|
||||
@@ -303,5 +391,6 @@ def scrape_machine(
|
||||
tiles_csv=tiles_csv,
|
||||
dry_run=dry_run,
|
||||
mosaic_only=mosaic_only,
|
||||
)
|
||||
return total
|
||||
metadata_only=metadata_only,
|
||||
))
|
||||
return stats
|
||||
|
||||
Reference in New Issue
Block a user