Add --metadata-only mode; harden resume and idempotency
- Add --metadata-only flag: fetches scan detail pages, writes metadata.json and scans.csv rows, and skips all image downloads. Re-runs skip scans whose metadata.json already exists.
- Atomic progress.json saves (temp file + rename).
- Heal-on-resume: tiles that are on disk but missing from progress are silently re-marked before the pending list is built.
- scans.csv dedup: skip a row if its mosaic URL is already in progress.
- Rename mosaic_downloaded -> mosaic_on_disk (reflects disk state).
- --recheck now checks mosaics as well as tiles.
- RunStats dataclass replaces the raw int return value; richer run summary.
- argparse allow_abbrev fix reverted; fix the --scan-id + --metadata-only glob fallback when scan_time is absent.
- Add .venv/ to .gitignore.
- README: fix typo, update worker counts, document all new behaviour.
This commit is contained in:
+68
-11
@@ -10,7 +10,7 @@ from pathlib import Path
|
||||
|
||||
import yaml
|
||||
|
||||
from spruce.orchestrator import scrape_machine
|
||||
from spruce.orchestrator import scrape_machine, RunStats
|
||||
from spruce.parsers import parse_machine_option
|
||||
from spruce.progress import ProgressTracker, CsvWriter
|
||||
from spruce.recheck import recheck_archive, recheck_tile_files
|
||||
@@ -75,6 +75,15 @@ def parse_args() -> argparse.Namespace:
|
||||
action="store_true",
|
||||
help="Download mosaics only; skip individual tiles",
|
||||
)
|
||||
p.add_argument(
|
||||
"--metadata-only",
|
||||
action="store_true",
|
||||
help=(
|
||||
"Fetch scan parameters only; write metadata.json and scans.csv "
|
||||
"rows but skip mosaics and tiles. Very fast — suitable for "
|
||||
"inventorying all scans across all machines."
|
||||
),
|
||||
)
|
||||
p.add_argument(
|
||||
"--dry-run",
|
||||
action="store_true",
|
||||
@@ -220,7 +229,11 @@ def main() -> None:
|
||||
len(machines),
|
||||
", ".join(m["label"] for m in machines),
|
||||
)
|
||||
if args.mosaic_only:
|
||||
if args.mosaic_only and args.metadata_only:
|
||||
sys.exit("--mosaic-only and --metadata-only are mutually exclusive.")
|
||||
if args.metadata_only:
|
||||
log.info("Mode: metadata only (mosaics and tiles skipped)")
|
||||
elif args.mosaic_only:
|
||||
log.info("Mode: mosaics only (individual tiles skipped)")
|
||||
if args.dry_run:
|
||||
log.info("Mode: dry-run (no files will be written)")
|
||||
@@ -230,10 +243,10 @@ def main() -> None:
|
||||
tiles_csv = CsvWriter(output_dir / TILES_CSV_FILENAME, TILES_CSV_FIELDS)
|
||||
scans_csv = CsvWriter(output_dir / SCANS_CSV_FILENAME, SCANS_CSV_FIELDS)
|
||||
|
||||
total = 0
|
||||
totals = RunStats()
|
||||
try:
|
||||
for machine in machines:
|
||||
count = scrape_machine(
|
||||
stats = scrape_machine(
|
||||
machine=machine,
|
||||
config=config,
|
||||
output_dir=output_dir,
|
||||
@@ -242,18 +255,62 @@ def main() -> None:
|
||||
scans_csv=scans_csv,
|
||||
dry_run=args.dry_run,
|
||||
mosaic_only=args.mosaic_only,
|
||||
metadata_only=args.metadata_only,
|
||||
scan_id_filter=args.scan_id,
|
||||
)
|
||||
total += count
|
||||
totals.merge(stats)
|
||||
finally:
|
||||
tiles_csv.close()
|
||||
scans_csv.close()
|
||||
progress.save()
|
||||
|
||||
if args.dry_run:
|
||||
log.info("Dry run complete.")
|
||||
_print_summary(
|
||||
totals=totals,
|
||||
machines=machines,
|
||||
output_dir=output_dir,
|
||||
dry_run=args.dry_run,
|
||||
metadata_only=args.metadata_only,
|
||||
mosaic_only=args.mosaic_only,
|
||||
)
|
||||
|
||||
|
||||
def _print_summary(
|
||||
totals: RunStats,
|
||||
machines: list[dict],
|
||||
output_dir: Path,
|
||||
dry_run: bool,
|
||||
metadata_only: bool,
|
||||
mosaic_only: bool,
|
||||
) -> None:
|
||||
W = 46
|
||||
sep = "─" * W
|
||||
|
||||
def row(label: str, value: str, note: str = "") -> str:
|
||||
note_str = f" ({note})" if note else ""
|
||||
return f" {label:<22}{value}{note_str}"
|
||||
|
||||
log.info(sep)
|
||||
if dry_run:
|
||||
log.info(" Dry run complete — no files written.")
|
||||
else:
|
||||
log.info("Done. Total files downloaded: %d", total)
|
||||
log.info("Tiles CSV : %s", output_dir / TILES_CSV_FILENAME)
|
||||
log.info("Scans CSV : %s", output_dir / SCANS_CSV_FILENAME)
|
||||
log.info("Progress : %s", output_dir / PROGRESS_FILENAME)
|
||||
log.info(" Run complete")
|
||||
log.info(sep)
|
||||
log.info(row("Machines:", str(len(machines))))
|
||||
log.info(
|
||||
row("Scans fetched:", str(totals.scans_fetched),
|
||||
f"{totals.scans_skipped} already cached, "
|
||||
f"{totals.scans_failed} failed"
|
||||
if totals.scans_skipped or totals.scans_failed else "")
|
||||
)
|
||||
if not metadata_only:
|
||||
log.info(row("Mosaics downloaded:", str(totals.mosaics_downloaded)))
|
||||
if not metadata_only and not mosaic_only:
|
||||
log.info(row("Tiles downloaded:", str(totals.tiles_downloaded)))
|
||||
if metadata_only:
|
||||
log.info(row("Metadata written:", str(totals.metadata_written), "new JSON files"))
|
||||
log.info(sep)
|
||||
log.info(row("Scans CSV:", str(output_dir / SCANS_CSV_FILENAME)))
|
||||
if not metadata_only:
|
||||
log.info(row("Tiles CSV:", str(output_dir / TILES_CSV_FILENAME)))
|
||||
log.info(row("Progress:", str(output_dir / PROGRESS_FILENAME)))
|
||||
log.info(sep)
|
||||
|
||||
Reference in New Issue
Block a user