Scraping resilience, metadata tooling, and repository hygiene

Consolidates mosaic and session hardening (login retry, skipping already-processed scans, no retry on 404, started_at), progress reporting (Markdown tables, by-year rollup, rolling-window rate/ETA), and metadata workflow scripts (run_metadata_scan.sh, scan_progress_report.py, export_machine_metadata.py). Adds the mosaic reconstruction sample JPEGs referenced by the report. Updates .gitignore for backup/ and .claude/. The sample_random_scans helper is documented for branch testing/sample-runs only (see README).
This commit is contained in:
2026-05-14 19:52:53 -04:00
parent 752c278dff
commit 6390f5d529
23 changed files with 788 additions and 188 deletions
+50 -3
View File
@@ -241,6 +241,7 @@ def process_scan(
mosaic_only: bool,
metadata_only: bool = False,
max_tiles: int | None = None,
scans_csv_existing_ids: set[int] | None = None,
) -> RunStats:
"""
Process one scan: fetch metadata, download mosaic and (optionally) tiles.
@@ -379,9 +380,16 @@ def process_scan(
mds, mer, mco, mcl = "", "", "", ""
# Write scan-level CSV row only if this scan hasn't been recorded before.
if mosaic_already_done and not metadata_only:
# Skip if: (1) mosaic URL already in .progress.json, or (2) scan already
# has a non-pending row in scans.csv from a prior run.
already_recorded = (mosaic_already_done and not metadata_only) or (
not metadata_only
and scans_csv_existing_ids is not None
and scan_id in scans_csv_existing_ids
)
if already_recorded:
log.debug(
"[%s] Scan %d: already in scans.csv (mosaic was previously downloaded), skipping CSV row.",
"[%s] Scan %d: already in scans.csv, skipping CSV row.",
machine["label"],
scan_id,
)
@@ -494,7 +502,20 @@ def scrape_machine(
) -> RunStats:
"""Login, fetch scans, and download all content for one machine."""
sess = MachineSession(machine, config)
if not sess.login():
login_ok = False
for attempt in range(1, 4):
if sess.login():
login_ok = True
break
if attempt < 3:
log.warning(
"[%s] Login failed (attempt %d/3) — retrying in 10s.",
machine["label"],
attempt,
)
time.sleep(10)
if not login_ok:
log.error("[%s] Login failed after 3 attempts — skipping machine.", machine["label"])
return RunStats()
if scan_id_filter is not None:
@@ -508,8 +529,33 @@ def scrape_machine(
log.warning("[%s] No scans found.", machine["label"])
return RunStats()
# Build a set of scan_ids already fully processed in a prior run so we can
# skip them entirely (no metadata fetch, no mosaic request).
# Only scans with a definitive non-pending status count; skipped_metadata_only
# rows still need to be processed in mosaic mode.
PENDING_STATUSES = {"skipped_metadata_only", ""}
existing_ids: set[int] = set()
if not metadata_only and scans_csv._fh.name:
existing_path = Path(scans_csv._fh.name)
if existing_path.exists():
import csv as _csv
with open(existing_path, newline="", encoding="utf-8") as _f:
for _row in _csv.DictReader(_f):
if _row.get("machine") == machine["label"]:
if _row.get("mosaic_download_status", "") not in PENDING_STATUSES:
existing_ids.add(int(_row["scan_id"]))
stats = RunStats()
for scan in scans:
# Skip scans already fully processed in a prior run — avoids redundant
# metadata fetches and mosaic requests for known-failed / known-done scans.
if not metadata_only and scan["scan_id"] in existing_ids:
log.debug(
"[%s] Scan %d: already processed, skipping.",
machine["label"],
scan["scan_id"],
)
continue
stats.merge(process_scan(
sess=sess,
scan=scan,
@@ -523,5 +569,6 @@ def scrape_machine(
mosaic_only=mosaic_only,
metadata_only=metadata_only,
max_tiles=max_tiles,
scans_csv_existing_ids=existing_ids,
))
return stats