Enhance CSV metadata with error tracking for mosaics and tiles

This commit is contained in:
2026-04-25 16:06:54 -04:00
parent e8d3bf7180
commit ae37c06f15
10 changed files with 406 additions and 34 deletions
+9 -1
View File
@@ -313,11 +313,19 @@ def _print_summary(
row(
"Mosaics failed:",
str(totals.mosaics_failed),
"0 bytes or HTTP error; see log above",
"0 bytes or HTTP error; see scans.csv and logs",
)
)
if not metadata_only and not mosaic_only:
log.info(row("Tiles downloaded:", str(totals.tiles_downloaded)))
if not dry_run and not metadata_only:
log.info(
row(
"Mosaic & tile errors:",
"",
f"{SCANS_CSV_FILENAME} & {TILES_CSV_FILENAME} (error_class, error, error_code)",
)
)
if metadata_only:
log.info(row("Metadata written:", str(totals.metadata_written), "new JSON files"))
log.info(sep)
+63
View File
@@ -0,0 +1,63 @@
"""
Structured HTTP download result and error classification.
"""
from __future__ import annotations
from dataclasses import dataclass
import requests
# Public constants for the ``error_class`` value carried by a download result
# and written to the CSV error columns.
# PERMANENT_MISSING: the resource is likely gone for good (HTTP 404/410).
PERMANENT_MISSING = "permanent_missing"
# TRANSIENT: a retry may help (e.g. HTTP 5xx, transport errors, timeouts).
TRANSIENT = "transient"
# UNKNOWN: the failure could not be placed in either category above.
UNKNOWN = "unknown"
OK = ""  # success (no error class)
@dataclass(frozen=True)
class DownloadResult:
    """Immutable outcome of one streaming download, reported once any retries are over."""

    # Number of bytes received; 0 when the download failed or the body was empty.
    size: int
    # HTTP status code when one is available, otherwise None.
    status_code: int | None
    # Final error message, or None when no error was recorded.
    error: str | None
    # One of the module's error_class constants ("" on success).
    error_class: str

    @property
    def ok(self) -> bool:
        """Return True only when data was received and no error was recorded."""
        received_data = self.size > 0
        return received_data and self.error is None
def classify_http_error(
    status_code: int | None, exc: BaseException | None
) -> str:
    """
    Map a failed download to one of the ``error_class`` constants.

    Rules, first match wins:

    * 404/410 => ``PERMANENT_MISSING`` (likely gone forever).
    * 5xx, 408 (Request Timeout) and 429 (Too Many Requests) => ``TRANSIENT``
      (retry may help).
    * any other 4xx => ``UNKNOWN``.
    * no decisive status code: transport failures and timeouts => ``TRANSIENT``.
    * everything else => ``UNKNOWN``.

    Args:
        status_code: HTTP status of the failed response, or None if no
            response was received.
        exc: The exception that ended the attempt, or None.

    Returns:
        ``PERMANENT_MISSING``, ``TRANSIENT`` or ``UNKNOWN``.
    """
    if status_code in (404, 410):
        return PERMANENT_MISSING
    if status_code is not None:
        if 500 <= status_code < 600 or status_code in (408, 429):
            return TRANSIENT
        if 400 <= status_code < 500:
            # The server answered with a definite client-error status. Do not
            # let the exception type below relabel it as a transport problem.
            return UNKNOWN
    if exc is None:
        return UNKNOWN
    # Every requests exception derives from OSError (RequestException
    # subclasses IOError), so HTTPError must be excluded explicitly: it
    # signals a bad status, not a transport failure.
    if isinstance(exc, requests.HTTPError):
        return UNKNOWN
    # Timeout, ConnectionError and ChunkedEncodingError are already OSError
    # subclasses; they are listed to make the intent explicit.
    transport_errors = (
        requests.Timeout,
        requests.exceptions.ConnectionError,
        requests.exceptions.ChunkedEncodingError,
        OSError,
    )
    if isinstance(exc, transport_errors):
        return TRANSIENT
    return UNKNOWN
def error_code_str(status_code: int | None) -> str:
    """Render an optional HTTP status code as text; a missing code becomes ""."""
    return "" if status_code is None else str(status_code)
+54 -23
View File
@@ -5,10 +5,12 @@ High-level scrape orchestration: drives the per-machine and per-scan loops.
import json
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass, field
from dataclasses import dataclass
from pathlib import Path
from typing import Any
from spruce.download_result import error_code_str
@dataclass
class RunStats:
@@ -46,6 +48,17 @@ log = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
@dataclass
class MosaicAttempt:
    """Record of what happened when a scan's mosaic download was attempted.

    The fields feed the mosaic columns of scans.csv and the RunStats counters.
    """

    # True only when the mosaic was fetched during this attempt.
    downloaded: bool
    # Status label for the CSV row, e.g. "downloaded", "failed",
    # "already_done" or "dry_run".
    csv_status: str
    # Error message of a failed attempt; "" otherwise.
    error: str
    # HTTP status code as text; "" when none is available.
    error_code: str
    # Error classification of a failed attempt; "" otherwise.
    error_class: str
def _download_mosaic(
sess: MachineSession,
scan_meta: dict[str, Any],
@@ -55,17 +68,19 @@ def _download_mosaic(
machine: dict[str, Any],
config: dict[str, Any],
dry_run: bool,
) -> bool:
"""Download the scan mosaic if not already done. Returns True if downloaded."""
) -> MosaicAttempt:
"""Download the scan mosaic if not already done."""
url = sess.mosaic_url(scan_id)
if progress.is_done(url):
return False
return MosaicAttempt(
False, "already_done", "", "", ""
)
if dry_run:
log.info("[DRY-RUN] Mosaic: %s%s", url, mosaic_path)
return False
return MosaicAttempt(False, "dry_run", "", "", "")
log.info("[%s] Downloading mosaic for scan %d", machine["label"], scan_id)
size = sess.download_file(url, mosaic_path)
if size:
res = sess.download_file(url, mosaic_path)
if res.ok:
if config.get("write_exif", True):
mmeta: dict[str, Any] | None = config.get("machine_metadata", {}).get(
machine["label"]
@@ -79,10 +94,16 @@ def _download_mosaic(
"[%s] Mosaic saved: %s (%.1f MB)",
machine["label"],
mosaic_path,
size / 1e6,
res.size / 1e6,
)
return True
return False
return MosaicAttempt(True, "downloaded", "", "", "")
return MosaicAttempt(
False,
"failed",
res.error or "",
error_code_str(res.status_code),
res.error_class,
)
def _download_tiles_for_scan(
@@ -161,8 +182,8 @@ def _download_tiles_for_scan(
) as pbar:
for future in as_completed(futures):
result = future.result()
if result.get("file_size_bytes"):
batch.append(result)
batch.append(result)
if result.get("status") == "downloaded":
progress.mark_done(result["url"])
downloaded += 1
pbar.update(1)
@@ -291,9 +312,9 @@ def process_scan(
mosaic_url = sess.mosaic_url(scan_id)
mosaic_already_done = progress.is_done(mosaic_url)
if metadata_only:
mosaic_just_downloaded = False
mosaic_attempt: MosaicAttempt | None = None
else:
mosaic_just_downloaded = _download_mosaic(
mosaic_attempt = _download_mosaic(
sess,
scan_meta,
scan_id,
@@ -303,15 +324,21 @@ def process_scan(
config,
dry_run,
)
if not metadata_only and mosaic_just_downloaded:
stats.mosaics_downloaded += 1
elif (
not metadata_only
and not dry_run
and not mosaic_already_done
and not mosaic_just_downloaded
):
stats.mosaics_failed += 1
if not metadata_only and mosaic_attempt:
if mosaic_attempt.downloaded:
stats.mosaics_downloaded += 1
elif not dry_run and mosaic_attempt.csv_status == "failed":
stats.mosaics_failed += 1
if metadata_only:
mds, mer, mco, mcl = "skipped_metadata_only", "", "", ""
elif mosaic_attempt is not None:
mds = mosaic_attempt.csv_status
mer = mosaic_attempt.error
mco = mosaic_attempt.error_code
mcl = mosaic_attempt.error_class
else:
mds, mer, mco, mcl = "", "", "", ""
# Write scan-level CSV row only if this scan hasn't been recorded before.
if mosaic_already_done and not metadata_only:
@@ -347,6 +374,10 @@ def process_scan(
"mosaic_url": mosaic_url,
"mosaic_local_path": str(mosaic_path),
"mosaic_on_disk": mosaic_path.exists(),
"mosaic_download_status": mds,
"mosaic_error": mer,
"mosaic_error_code": mco,
"mosaic_error_class": mcl,
}
)
+46 -7
View File
@@ -12,6 +12,13 @@ from typing import Any
import requests
from bs4 import BeautifulSoup
from spruce.download_result import (
OK,
UNKNOWN,
DownloadResult,
classify_http_error,
error_code_str,
)
from spruce.parsers import parse_scan_row, parse_scan_view, _grid_values
log = logging.getLogger(__name__)
@@ -205,8 +212,14 @@ class MachineSession:
# Downloads
# ------------------------------------------------------------------
def download_file(self, url: str, dest: Path, retries: int = 3) -> int:
"""Stream-download url to dest with retries. Returns bytes written (0 on failure)."""
def download_file(
self, url: str, dest: Path, retries: int = 3
) -> DownloadResult:
"""
Stream-download url to dest with retries.
Returns a DownloadResult (bytes, optional HTTP code, final error, class).
"""
dest.parent.mkdir(parents=True, exist_ok=True)
backoff = 5.0
for attempt in range(1, retries + 1):
@@ -221,8 +234,21 @@ class MachineSession:
if chunk:
fh.write(chunk)
size += len(chunk)
return size
if size == 0:
return DownloadResult(
0,
resp.status_code,
"0 bytes in response body",
UNKNOWN,
)
return DownloadResult(size, resp.status_code, None, OK)
except Exception as exc:
sc: int | None = None
if (
isinstance(exc, requests.HTTPError)
and exc.response is not None
):
sc = exc.response.status_code
if attempt < retries:
log.debug(
"Attempt %d/%d failed %s: %s — retrying in %.0fs",
@@ -241,7 +267,9 @@ class MachineSession:
url,
exc,
)
return 0
cl = classify_http_error(sc, exc)
return DownloadResult(0, sc, str(exc), cl)
return DownloadResult(0, None, "download_file: exhausted", UNKNOWN)
def download_tile(
self, tile: dict[str, Any], dest: Path, dry_run: bool
@@ -260,11 +288,22 @@ class MachineSession:
"local_path": str(dest),
"downloaded_at": "",
"file_size_bytes": "",
"status": "",
"error": "",
"error_code": "",
"error_class": "",
}
if dry_run:
row["status"] = "dry_run"
return row
size = self.download_file(tile["url"], dest)
if size:
res = self.download_file(tile["url"], dest)
if res.ok:
row["status"] = "downloaded"
row["downloaded_at"] = datetime.now(timezone.utc).isoformat()
row["file_size_bytes"] = size
row["file_size_bytes"] = res.size
else:
row["status"] = "failed"
row["error"] = res.error or ""
row["error_code"] = error_code_str(res.status_code)
row["error_class"] = res.error_class
return row
+8
View File
@@ -47,6 +47,10 @@ SCANS_CSV_FIELDS: list[str] = [
"mosaic_url",
"mosaic_local_path",
"mosaic_on_disk",
"mosaic_download_status",
"mosaic_error",
"mosaic_error_code",
"mosaic_error_class",
]
TILES_CSV_FIELDS: list[str] = [
@@ -60,6 +64,10 @@ TILES_CSV_FIELDS: list[str] = [
"y_mm",
"url",
"local_path",
"status",
"error",
"error_code",
"error_class",
"downloaded_at",
"file_size_bytes",
]