Enhance CSV metadata with error tracking for mosaics and tiles
This commit is contained in:
+54
-23
@@ -5,10 +5,12 @@ High-level scrape orchestration: drives the per-machine and per-scan loops.
|
||||
import json
|
||||
import logging
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from dataclasses import dataclass, field
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from spruce.download_result import error_code_str
|
||||
|
||||
|
||||
@dataclass
|
||||
class RunStats:
|
||||
@@ -46,6 +48,17 @@ log = logging.getLogger(__name__)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@dataclass
|
||||
class MosaicAttempt:
|
||||
"""Outcome of a mosaic download attempt (for scans.csv and RunStats)."""
|
||||
|
||||
downloaded: bool
|
||||
csv_status: str
|
||||
error: str
|
||||
error_code: str
|
||||
error_class: str
|
||||
|
||||
|
||||
def _download_mosaic(
|
||||
sess: MachineSession,
|
||||
scan_meta: dict[str, Any],
|
||||
@@ -55,17 +68,19 @@ def _download_mosaic(
|
||||
machine: dict[str, Any],
|
||||
config: dict[str, Any],
|
||||
dry_run: bool,
|
||||
) -> bool:
|
||||
"""Download the scan mosaic if not already done. Returns True if downloaded."""
|
||||
) -> MosaicAttempt:
|
||||
"""Download the scan mosaic if not already done."""
|
||||
url = sess.mosaic_url(scan_id)
|
||||
if progress.is_done(url):
|
||||
return False
|
||||
return MosaicAttempt(
|
||||
False, "already_done", "", "", ""
|
||||
)
|
||||
if dry_run:
|
||||
log.info("[DRY-RUN] Mosaic: %s → %s", url, mosaic_path)
|
||||
return False
|
||||
return MosaicAttempt(False, "dry_run", "", "", "")
|
||||
log.info("[%s] Downloading mosaic for scan %d …", machine["label"], scan_id)
|
||||
size = sess.download_file(url, mosaic_path)
|
||||
if size:
|
||||
res = sess.download_file(url, mosaic_path)
|
||||
if res.ok:
|
||||
if config.get("write_exif", True):
|
||||
mmeta: dict[str, Any] | None = config.get("machine_metadata", {}).get(
|
||||
machine["label"]
|
||||
@@ -79,10 +94,16 @@ def _download_mosaic(
|
||||
"[%s] Mosaic saved: %s (%.1f MB)",
|
||||
machine["label"],
|
||||
mosaic_path,
|
||||
size / 1e6,
|
||||
res.size / 1e6,
|
||||
)
|
||||
return True
|
||||
return False
|
||||
return MosaicAttempt(True, "downloaded", "", "", "")
|
||||
return MosaicAttempt(
|
||||
False,
|
||||
"failed",
|
||||
res.error or "",
|
||||
error_code_str(res.status_code),
|
||||
res.error_class,
|
||||
)
|
||||
|
||||
|
||||
def _download_tiles_for_scan(
|
||||
@@ -161,8 +182,8 @@ def _download_tiles_for_scan(
|
||||
) as pbar:
|
||||
for future in as_completed(futures):
|
||||
result = future.result()
|
||||
if result.get("file_size_bytes"):
|
||||
batch.append(result)
|
||||
batch.append(result)
|
||||
if result.get("status") == "downloaded":
|
||||
progress.mark_done(result["url"])
|
||||
downloaded += 1
|
||||
pbar.update(1)
|
||||
@@ -291,9 +312,9 @@ def process_scan(
|
||||
mosaic_url = sess.mosaic_url(scan_id)
|
||||
mosaic_already_done = progress.is_done(mosaic_url)
|
||||
if metadata_only:
|
||||
mosaic_just_downloaded = False
|
||||
mosaic_attempt: MosaicAttempt | None = None
|
||||
else:
|
||||
mosaic_just_downloaded = _download_mosaic(
|
||||
mosaic_attempt = _download_mosaic(
|
||||
sess,
|
||||
scan_meta,
|
||||
scan_id,
|
||||
@@ -303,15 +324,21 @@ def process_scan(
|
||||
config,
|
||||
dry_run,
|
||||
)
|
||||
if not metadata_only and mosaic_just_downloaded:
|
||||
stats.mosaics_downloaded += 1
|
||||
elif (
|
||||
not metadata_only
|
||||
and not dry_run
|
||||
and not mosaic_already_done
|
||||
and not mosaic_just_downloaded
|
||||
):
|
||||
stats.mosaics_failed += 1
|
||||
if not metadata_only and mosaic_attempt:
|
||||
if mosaic_attempt.downloaded:
|
||||
stats.mosaics_downloaded += 1
|
||||
elif not dry_run and mosaic_attempt.csv_status == "failed":
|
||||
stats.mosaics_failed += 1
|
||||
|
||||
if metadata_only:
|
||||
mds, mer, mco, mcl = "skipped_metadata_only", "", "", ""
|
||||
elif mosaic_attempt is not None:
|
||||
mds = mosaic_attempt.csv_status
|
||||
mer = mosaic_attempt.error
|
||||
mco = mosaic_attempt.error_code
|
||||
mcl = mosaic_attempt.error_class
|
||||
else:
|
||||
mds, mer, mco, mcl = "", "", "", ""
|
||||
|
||||
# Write scan-level CSV row only if this scan hasn't been recorded before.
|
||||
if mosaic_already_done and not metadata_only:
|
||||
@@ -347,6 +374,10 @@ def process_scan(
|
||||
"mosaic_url": mosaic_url,
|
||||
"mosaic_local_path": str(mosaic_path),
|
||||
"mosaic_on_disk": mosaic_path.exists(),
|
||||
"mosaic_download_status": mds,
|
||||
"mosaic_error": mer,
|
||||
"mosaic_error_code": mco,
|
||||
"mosaic_error_class": mcl,
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user