Enhance CSV metadata with error tracking for mosaics and tiles

This commit is contained in:
2026-04-25 16:06:54 -04:00
parent e8d3bf7180
commit ae37c06f15
10 changed files with 406 additions and 34 deletions
+46 -7
View File
@@ -12,6 +12,13 @@ from typing import Any
import requests
from bs4 import BeautifulSoup
from spruce.download_result import (
OK,
UNKNOWN,
DownloadResult,
classify_http_error,
error_code_str,
)
from spruce.parsers import parse_scan_row, parse_scan_view, _grid_values
log = logging.getLogger(__name__)
@@ -205,8 +212,14 @@ class MachineSession:
# Downloads
# ------------------------------------------------------------------
def download_file(self, url: str, dest: Path, retries: int = 3) -> int:
"""Stream-download url to dest with retries. Returns bytes written (0 on failure)."""
def download_file(
self, url: str, dest: Path, retries: int = 3
) -> DownloadResult:
"""
Stream-download url to dest with retries.
Returns a DownloadResult (bytes, optional HTTP code, final error, class).
"""
dest.parent.mkdir(parents=True, exist_ok=True)
backoff = 5.0
for attempt in range(1, retries + 1):
@@ -221,8 +234,21 @@ class MachineSession:
if chunk:
fh.write(chunk)
size += len(chunk)
return size
if size == 0:
return DownloadResult(
0,
resp.status_code,
"0 bytes in response body",
UNKNOWN,
)
return DownloadResult(size, resp.status_code, None, OK)
except Exception as exc:
sc: int | None = None
if (
isinstance(exc, requests.HTTPError)
and exc.response is not None
):
sc = exc.response.status_code
if attempt < retries:
log.debug(
"Attempt %d/%d failed %s: %s — retrying in %.0fs",
@@ -241,7 +267,9 @@ class MachineSession:
url,
exc,
)
return 0
cl = classify_http_error(sc, exc)
return DownloadResult(0, sc, str(exc), cl)
return DownloadResult(0, None, "download_file: exhausted", UNKNOWN)
def download_tile(
self, tile: dict[str, Any], dest: Path, dry_run: bool
@@ -260,11 +288,22 @@ class MachineSession:
"local_path": str(dest),
"downloaded_at": "",
"file_size_bytes": "",
"status": "",
"error": "",
"error_code": "",
"error_class": "",
}
if dry_run:
row["status"] = "dry_run"
return row
size = self.download_file(tile["url"], dest)
if size:
res = self.download_file(tile["url"], dest)
if res.ok:
row["status"] = "downloaded"
row["downloaded_at"] = datetime.now(timezone.utc).isoformat()
row["file_size_bytes"] = size
row["file_size_bytes"] = res.size
else:
row["status"] = "failed"
row["error"] = res.error or ""
row["error_code"] = error_code_str(res.status_code)
row["error_class"] = res.error_class
return row