Enhance CSV metadata with error tracking for mosaics and tiles
This commit is contained in:
@@ -153,13 +153,17 @@ Tile filenames encode position: `tile_r{row}_c{col}.jpg` where row increases wit
|
|||||||
|
|
||||||
### Metadata files
|
### Metadata files
|
||||||
|
|
||||||
**`scans.csv`** columns: `machine`, `machine_id`, `scan_id`, `name`, `scan_time`, `start_x`, `start_y`, `end_x`, `end_y`, `dx`, `dy`, `nx`, `ny`, `total_tiles`, `scan_lines`, `scan_mode`, `start_datetime`, `end_datetime`, `status`, `user`, `disk_space_mb`, `mosaic_url`, `mosaic_local_path`, `mosaic_on_disk`
|
**`scans.csv`** columns: `machine`, `machine_id`, `scan_id`, `name`, `scan_time`, `start_x`, `start_y`, `end_x`, `end_y`, `dx`, `dy`, `nx`, `ny`, `total_tiles`, `scan_lines`, `scan_mode`, `start_datetime`, `end_datetime`, `status`, `user`, `disk_space_mb`, `mosaic_url`, `mosaic_local_path`, `mosaic_on_disk`, `mosaic_download_status`, `mosaic_error`, `mosaic_error_code`, `mosaic_error_class`
|
||||||
|
|
||||||
- `mosaic_on_disk`: `True` if `mosaic.jpg` exists on disk at row-write time, regardless of which run downloaded it. Useful for inventory — reflects actual archive state rather than what happened in the current run.
|
- `mosaic_on_disk`: `True` if `mosaic.jpg` exists on disk at row-write time, regardless of which run downloaded it. Useful for inventory — reflects actual archive state rather than what happened in the current run.
|
||||||
|
- `mosaic_download_status`: one of `downloaded`, `failed`, `already_done`, `dry_run`, `skipped_metadata_only` (in `--metadata-only` mode). Failed attempts are still written so you can see missing server-side images in the same CSV.
|
||||||
|
- `mosaic_error` / `mosaic_error_code` / `mosaic_error_class`: set when the URL was tried and the file was not stored successfully. **`mosaic_error_class`** is a coarse hint: `permanent_missing` for HTTP 404/410, `transient` for 5xx or common network/timeout-style failures, and `unknown` for other cases (including a 200 with an empty body). **Rows are append-only;** a failed download leaves an audit record without overwriting prior runs’ history. Delete or rotate the CSVs if you need a new header (see `spruce.settings.SCANS_CSV_FIELDS` / `TILES_CSV_FIELDS`).
|
||||||
|
|
||||||
**`tiles.csv`** columns: `machine`, `machine_id`, `scan_id`, `scan_time`, `row_index`, `col_index`, `x_mm`, `y_mm`, `url`, `local_path`, `downloaded_at`, `file_size_bytes`
|
**`tiles.csv`** columns: `machine`, `machine_id`, `scan_id`, `scan_time`, `row_index`, `col_index`, `x_mm`, `y_mm`, `url`, `local_path`, `status`, `error`, `error_code`, `error_class`, `downloaded_at`, `file_size_bytes`
|
||||||
|
|
||||||
- `downloaded_at`: ISO 8601 UTC timestamp of when the tile was fetched. Empty if the download failed.
|
- `status`: `downloaded`, `failed`, or `dry_run` (if `--dry-run`). Failed rows are kept for the same reason as mosaics.
|
||||||
|
- `error` / `error_code` / `error_class`: same rough semantics as the mosaic fields (`permanent_missing` / `transient` / `unknown`). `error_code` is the HTTP status when available.
|
||||||
|
- `downloaded_at`: ISO 8601 UTC timestamp when the tile was fetched. Empty on failure.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|||||||
+9
-1
@@ -313,11 +313,19 @@ def _print_summary(
|
|||||||
row(
|
row(
|
||||||
"Mosaics failed:",
|
"Mosaics failed:",
|
||||||
str(totals.mosaics_failed),
|
str(totals.mosaics_failed),
|
||||||
"0 bytes or HTTP error; see log above",
|
"0 bytes or HTTP error; see scans.csv and logs",
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
if not metadata_only and not mosaic_only:
|
if not metadata_only and not mosaic_only:
|
||||||
log.info(row("Tiles downloaded:", str(totals.tiles_downloaded)))
|
log.info(row("Tiles downloaded:", str(totals.tiles_downloaded)))
|
||||||
|
if not dry_run and not metadata_only:
|
||||||
|
log.info(
|
||||||
|
row(
|
||||||
|
"Mosaic & tile errors:",
|
||||||
|
"",
|
||||||
|
f"{SCANS_CSV_FILENAME} & {TILES_CSV_FILENAME} (error_class, error, error_code)",
|
||||||
|
)
|
||||||
|
)
|
||||||
if metadata_only:
|
if metadata_only:
|
||||||
log.info(row("Metadata written:", str(totals.metadata_written), "new JSON files"))
|
log.info(row("Metadata written:", str(totals.metadata_written), "new JSON files"))
|
||||||
log.info(sep)
|
log.info(sep)
|
||||||
|
|||||||
@@ -0,0 +1,63 @@
|
|||||||
|
"""
|
||||||
|
Structured HTTP download result and error classification.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
# public constants for error_class
|
||||||
|
PERMANENT_MISSING = "permanent_missing"
|
||||||
|
TRANSIENT = "transient"
|
||||||
|
UNKNOWN = "unknown"
|
||||||
|
OK = "" # success (no error class)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class DownloadResult:
|
||||||
|
"""Result of a streaming download (after all retries if applicable)."""
|
||||||
|
|
||||||
|
size: int
|
||||||
|
status_code: int | None
|
||||||
|
error: str | None
|
||||||
|
error_class: str
|
||||||
|
|
||||||
|
@property
|
||||||
|
def ok(self) -> bool:
|
||||||
|
return self.size > 0 and self.error is None
|
||||||
|
|
||||||
|
|
||||||
|
def classify_http_error(
|
||||||
|
status_code: int | None, exc: BaseException | None
|
||||||
|
) -> str:
|
||||||
|
"""
|
||||||
|
404/410 => likely gone forever.
|
||||||
|
5xx and transport/timeouts => retry may help.
|
||||||
|
"""
|
||||||
|
if status_code in (404, 410):
|
||||||
|
return PERMANENT_MISSING
|
||||||
|
if status_code is not None and 500 <= status_code < 600:
|
||||||
|
return TRANSIENT
|
||||||
|
if exc is not None:
|
||||||
|
if isinstance(
|
||||||
|
exc,
|
||||||
|
(
|
||||||
|
requests.Timeout,
|
||||||
|
requests.ConnectTimeout,
|
||||||
|
requests.ReadTimeout,
|
||||||
|
),
|
||||||
|
):
|
||||||
|
return TRANSIENT
|
||||||
|
if isinstance(exc, (requests.exceptions.ConnectionError, OSError)):
|
||||||
|
return TRANSIENT
|
||||||
|
if isinstance(exc, requests.exceptions.ChunkedEncodingError):
|
||||||
|
return TRANSIENT
|
||||||
|
return UNKNOWN
|
||||||
|
|
||||||
|
|
||||||
|
def error_code_str(status_code: int | None) -> str:
|
||||||
|
if status_code is None:
|
||||||
|
return ""
|
||||||
|
return str(status_code)
|
||||||
+51
-20
@@ -5,10 +5,12 @@ High-level scrape orchestration: drives the per-machine and per-scan loops.
|
|||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
|
from spruce.download_result import error_code_str
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class RunStats:
|
class RunStats:
|
||||||
@@ -46,6 +48,17 @@ log = logging.getLogger(__name__)
|
|||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class MosaicAttempt:
|
||||||
|
"""Outcome of a mosaic download attempt (for scans.csv and RunStats)."""
|
||||||
|
|
||||||
|
downloaded: bool
|
||||||
|
csv_status: str
|
||||||
|
error: str
|
||||||
|
error_code: str
|
||||||
|
error_class: str
|
||||||
|
|
||||||
|
|
||||||
def _download_mosaic(
|
def _download_mosaic(
|
||||||
sess: MachineSession,
|
sess: MachineSession,
|
||||||
scan_meta: dict[str, Any],
|
scan_meta: dict[str, Any],
|
||||||
@@ -55,17 +68,19 @@ def _download_mosaic(
|
|||||||
machine: dict[str, Any],
|
machine: dict[str, Any],
|
||||||
config: dict[str, Any],
|
config: dict[str, Any],
|
||||||
dry_run: bool,
|
dry_run: bool,
|
||||||
) -> bool:
|
) -> MosaicAttempt:
|
||||||
"""Download the scan mosaic if not already done. Returns True if downloaded."""
|
"""Download the scan mosaic if not already done."""
|
||||||
url = sess.mosaic_url(scan_id)
|
url = sess.mosaic_url(scan_id)
|
||||||
if progress.is_done(url):
|
if progress.is_done(url):
|
||||||
return False
|
return MosaicAttempt(
|
||||||
|
False, "already_done", "", "", ""
|
||||||
|
)
|
||||||
if dry_run:
|
if dry_run:
|
||||||
log.info("[DRY-RUN] Mosaic: %s → %s", url, mosaic_path)
|
log.info("[DRY-RUN] Mosaic: %s → %s", url, mosaic_path)
|
||||||
return False
|
return MosaicAttempt(False, "dry_run", "", "", "")
|
||||||
log.info("[%s] Downloading mosaic for scan %d …", machine["label"], scan_id)
|
log.info("[%s] Downloading mosaic for scan %d …", machine["label"], scan_id)
|
||||||
size = sess.download_file(url, mosaic_path)
|
res = sess.download_file(url, mosaic_path)
|
||||||
if size:
|
if res.ok:
|
||||||
if config.get("write_exif", True):
|
if config.get("write_exif", True):
|
||||||
mmeta: dict[str, Any] | None = config.get("machine_metadata", {}).get(
|
mmeta: dict[str, Any] | None = config.get("machine_metadata", {}).get(
|
||||||
machine["label"]
|
machine["label"]
|
||||||
@@ -79,10 +94,16 @@ def _download_mosaic(
|
|||||||
"[%s] Mosaic saved: %s (%.1f MB)",
|
"[%s] Mosaic saved: %s (%.1f MB)",
|
||||||
machine["label"],
|
machine["label"],
|
||||||
mosaic_path,
|
mosaic_path,
|
||||||
size / 1e6,
|
res.size / 1e6,
|
||||||
|
)
|
||||||
|
return MosaicAttempt(True, "downloaded", "", "", "")
|
||||||
|
return MosaicAttempt(
|
||||||
|
False,
|
||||||
|
"failed",
|
||||||
|
res.error or "",
|
||||||
|
error_code_str(res.status_code),
|
||||||
|
res.error_class,
|
||||||
)
|
)
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def _download_tiles_for_scan(
|
def _download_tiles_for_scan(
|
||||||
@@ -161,8 +182,8 @@ def _download_tiles_for_scan(
|
|||||||
) as pbar:
|
) as pbar:
|
||||||
for future in as_completed(futures):
|
for future in as_completed(futures):
|
||||||
result = future.result()
|
result = future.result()
|
||||||
if result.get("file_size_bytes"):
|
|
||||||
batch.append(result)
|
batch.append(result)
|
||||||
|
if result.get("status") == "downloaded":
|
||||||
progress.mark_done(result["url"])
|
progress.mark_done(result["url"])
|
||||||
downloaded += 1
|
downloaded += 1
|
||||||
pbar.update(1)
|
pbar.update(1)
|
||||||
@@ -291,9 +312,9 @@ def process_scan(
|
|||||||
mosaic_url = sess.mosaic_url(scan_id)
|
mosaic_url = sess.mosaic_url(scan_id)
|
||||||
mosaic_already_done = progress.is_done(mosaic_url)
|
mosaic_already_done = progress.is_done(mosaic_url)
|
||||||
if metadata_only:
|
if metadata_only:
|
||||||
mosaic_just_downloaded = False
|
mosaic_attempt: MosaicAttempt | None = None
|
||||||
else:
|
else:
|
||||||
mosaic_just_downloaded = _download_mosaic(
|
mosaic_attempt = _download_mosaic(
|
||||||
sess,
|
sess,
|
||||||
scan_meta,
|
scan_meta,
|
||||||
scan_id,
|
scan_id,
|
||||||
@@ -303,16 +324,22 @@ def process_scan(
|
|||||||
config,
|
config,
|
||||||
dry_run,
|
dry_run,
|
||||||
)
|
)
|
||||||
if not metadata_only and mosaic_just_downloaded:
|
if not metadata_only and mosaic_attempt:
|
||||||
|
if mosaic_attempt.downloaded:
|
||||||
stats.mosaics_downloaded += 1
|
stats.mosaics_downloaded += 1
|
||||||
elif (
|
elif not dry_run and mosaic_attempt.csv_status == "failed":
|
||||||
not metadata_only
|
|
||||||
and not dry_run
|
|
||||||
and not mosaic_already_done
|
|
||||||
and not mosaic_just_downloaded
|
|
||||||
):
|
|
||||||
stats.mosaics_failed += 1
|
stats.mosaics_failed += 1
|
||||||
|
|
||||||
|
if metadata_only:
|
||||||
|
mds, mer, mco, mcl = "skipped_metadata_only", "", "", ""
|
||||||
|
elif mosaic_attempt is not None:
|
||||||
|
mds = mosaic_attempt.csv_status
|
||||||
|
mer = mosaic_attempt.error
|
||||||
|
mco = mosaic_attempt.error_code
|
||||||
|
mcl = mosaic_attempt.error_class
|
||||||
|
else:
|
||||||
|
mds, mer, mco, mcl = "", "", "", ""
|
||||||
|
|
||||||
# Write scan-level CSV row only if this scan hasn't been recorded before.
|
# Write scan-level CSV row only if this scan hasn't been recorded before.
|
||||||
if mosaic_already_done and not metadata_only:
|
if mosaic_already_done and not metadata_only:
|
||||||
log.debug(
|
log.debug(
|
||||||
@@ -347,6 +374,10 @@ def process_scan(
|
|||||||
"mosaic_url": mosaic_url,
|
"mosaic_url": mosaic_url,
|
||||||
"mosaic_local_path": str(mosaic_path),
|
"mosaic_local_path": str(mosaic_path),
|
||||||
"mosaic_on_disk": mosaic_path.exists(),
|
"mosaic_on_disk": mosaic_path.exists(),
|
||||||
|
"mosaic_download_status": mds,
|
||||||
|
"mosaic_error": mer,
|
||||||
|
"mosaic_error_code": mco,
|
||||||
|
"mosaic_error_class": mcl,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
+46
-7
@@ -12,6 +12,13 @@ from typing import Any
|
|||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
from spruce.download_result import (
|
||||||
|
OK,
|
||||||
|
UNKNOWN,
|
||||||
|
DownloadResult,
|
||||||
|
classify_http_error,
|
||||||
|
error_code_str,
|
||||||
|
)
|
||||||
from spruce.parsers import parse_scan_row, parse_scan_view, _grid_values
|
from spruce.parsers import parse_scan_row, parse_scan_view, _grid_values
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
@@ -205,8 +212,14 @@ class MachineSession:
|
|||||||
# Downloads
|
# Downloads
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
def download_file(self, url: str, dest: Path, retries: int = 3) -> int:
|
def download_file(
|
||||||
"""Stream-download url to dest with retries. Returns bytes written (0 on failure)."""
|
self, url: str, dest: Path, retries: int = 3
|
||||||
|
) -> DownloadResult:
|
||||||
|
"""
|
||||||
|
Stream-download url to dest with retries.
|
||||||
|
|
||||||
|
Returns a DownloadResult (bytes, optional HTTP code, final error, class).
|
||||||
|
"""
|
||||||
dest.parent.mkdir(parents=True, exist_ok=True)
|
dest.parent.mkdir(parents=True, exist_ok=True)
|
||||||
backoff = 5.0
|
backoff = 5.0
|
||||||
for attempt in range(1, retries + 1):
|
for attempt in range(1, retries + 1):
|
||||||
@@ -221,8 +234,21 @@ class MachineSession:
|
|||||||
if chunk:
|
if chunk:
|
||||||
fh.write(chunk)
|
fh.write(chunk)
|
||||||
size += len(chunk)
|
size += len(chunk)
|
||||||
return size
|
if size == 0:
|
||||||
|
return DownloadResult(
|
||||||
|
0,
|
||||||
|
resp.status_code,
|
||||||
|
"0 bytes in response body",
|
||||||
|
UNKNOWN,
|
||||||
|
)
|
||||||
|
return DownloadResult(size, resp.status_code, None, OK)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
|
sc: int | None = None
|
||||||
|
if (
|
||||||
|
isinstance(exc, requests.HTTPError)
|
||||||
|
and exc.response is not None
|
||||||
|
):
|
||||||
|
sc = exc.response.status_code
|
||||||
if attempt < retries:
|
if attempt < retries:
|
||||||
log.debug(
|
log.debug(
|
||||||
"Attempt %d/%d failed %s: %s — retrying in %.0fs",
|
"Attempt %d/%d failed %s: %s — retrying in %.0fs",
|
||||||
@@ -241,7 +267,9 @@ class MachineSession:
|
|||||||
url,
|
url,
|
||||||
exc,
|
exc,
|
||||||
)
|
)
|
||||||
return 0
|
cl = classify_http_error(sc, exc)
|
||||||
|
return DownloadResult(0, sc, str(exc), cl)
|
||||||
|
return DownloadResult(0, None, "download_file: exhausted", UNKNOWN)
|
||||||
|
|
||||||
def download_tile(
|
def download_tile(
|
||||||
self, tile: dict[str, Any], dest: Path, dry_run: bool
|
self, tile: dict[str, Any], dest: Path, dry_run: bool
|
||||||
@@ -260,11 +288,22 @@ class MachineSession:
|
|||||||
"local_path": str(dest),
|
"local_path": str(dest),
|
||||||
"downloaded_at": "",
|
"downloaded_at": "",
|
||||||
"file_size_bytes": "",
|
"file_size_bytes": "",
|
||||||
|
"status": "",
|
||||||
|
"error": "",
|
||||||
|
"error_code": "",
|
||||||
|
"error_class": "",
|
||||||
}
|
}
|
||||||
if dry_run:
|
if dry_run:
|
||||||
|
row["status"] = "dry_run"
|
||||||
return row
|
return row
|
||||||
size = self.download_file(tile["url"], dest)
|
res = self.download_file(tile["url"], dest)
|
||||||
if size:
|
if res.ok:
|
||||||
|
row["status"] = "downloaded"
|
||||||
row["downloaded_at"] = datetime.now(timezone.utc).isoformat()
|
row["downloaded_at"] = datetime.now(timezone.utc).isoformat()
|
||||||
row["file_size_bytes"] = size
|
row["file_size_bytes"] = res.size
|
||||||
|
else:
|
||||||
|
row["status"] = "failed"
|
||||||
|
row["error"] = res.error or ""
|
||||||
|
row["error_code"] = error_code_str(res.status_code)
|
||||||
|
row["error_class"] = res.error_class
|
||||||
return row
|
return row
|
||||||
|
|||||||
@@ -47,6 +47,10 @@ SCANS_CSV_FIELDS: list[str] = [
|
|||||||
"mosaic_url",
|
"mosaic_url",
|
||||||
"mosaic_local_path",
|
"mosaic_local_path",
|
||||||
"mosaic_on_disk",
|
"mosaic_on_disk",
|
||||||
|
"mosaic_download_status",
|
||||||
|
"mosaic_error",
|
||||||
|
"mosaic_error_code",
|
||||||
|
"mosaic_error_class",
|
||||||
]
|
]
|
||||||
|
|
||||||
TILES_CSV_FIELDS: list[str] = [
|
TILES_CSV_FIELDS: list[str] = [
|
||||||
@@ -60,6 +64,10 @@ TILES_CSV_FIELDS: list[str] = [
|
|||||||
"y_mm",
|
"y_mm",
|
||||||
"url",
|
"url",
|
||||||
"local_path",
|
"local_path",
|
||||||
|
"status",
|
||||||
|
"error",
|
||||||
|
"error_code",
|
||||||
|
"error_class",
|
||||||
"downloaded_at",
|
"downloaded_at",
|
||||||
"file_size_bytes",
|
"file_size_bytes",
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -0,0 +1,36 @@
|
|||||||
|
"""Tests for spruce.download_result classification."""
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
from spruce.download_result import (
|
||||||
|
PERMANENT_MISSING,
|
||||||
|
TRANSIENT,
|
||||||
|
UNKNOWN,
|
||||||
|
classify_http_error,
|
||||||
|
error_code_str,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_classify_404_permanent():
|
||||||
|
assert classify_http_error(404, None) == PERMANENT_MISSING
|
||||||
|
|
||||||
|
|
||||||
|
def test_classify_410_permanent():
|
||||||
|
assert classify_http_error(410, None) == PERMANENT_MISSING
|
||||||
|
|
||||||
|
|
||||||
|
def test_classify_503_transient():
|
||||||
|
assert classify_http_error(503, None) == TRANSIENT
|
||||||
|
|
||||||
|
|
||||||
|
def test_classify_timeout_transient():
|
||||||
|
assert classify_http_error(None, requests.Timeout()) == TRANSIENT
|
||||||
|
|
||||||
|
|
||||||
|
def test_classify_unknown_4xx():
|
||||||
|
assert classify_http_error(403, None) == UNKNOWN
|
||||||
|
|
||||||
|
|
||||||
|
def test_error_code_str():
|
||||||
|
assert error_code_str(None) == ""
|
||||||
|
assert error_code_str(404) == "404"
|
||||||
@@ -0,0 +1,85 @@
|
|||||||
|
"""Mosaic download outcomes for scans.csv (RunStats / MosaicAttempt)."""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import MagicMock
|
||||||
|
|
||||||
|
from spruce.download_result import (
|
||||||
|
DownloadResult,
|
||||||
|
PERMANENT_MISSING,
|
||||||
|
TRANSIENT,
|
||||||
|
UNKNOWN,
|
||||||
|
)
|
||||||
|
from spruce.orchestrator import _download_mosaic
|
||||||
|
|
||||||
|
_MACHINE = {"label": "M", "option_value": "v", "machine_id": "1"}
|
||||||
|
_CONFIG: dict = {
|
||||||
|
"write_exif": False,
|
||||||
|
"timeout": 10,
|
||||||
|
"request_delay": 0.0,
|
||||||
|
"machine_metadata": {},
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def test_download_mosaic_404_permanent_class():
|
||||||
|
sess = MagicMock()
|
||||||
|
url = "http://img/RootView_Database/9/mosaic.jpg"
|
||||||
|
sess.mosaic_url.return_value = url
|
||||||
|
sess.download_file.return_value = DownloadResult(0, 404, "404", PERMANENT_MISSING)
|
||||||
|
|
||||||
|
progress = MagicMock()
|
||||||
|
progress.is_done.return_value = False
|
||||||
|
mpath = Path("/tmp/mosaic_404.jpg")
|
||||||
|
|
||||||
|
out = _download_mosaic(
|
||||||
|
sess, {}, 9, mpath, progress, _MACHINE, _CONFIG, dry_run=False
|
||||||
|
)
|
||||||
|
assert out.csv_status == "failed"
|
||||||
|
assert out.error_class == PERMANENT_MISSING
|
||||||
|
assert out.error_code == "404"
|
||||||
|
progress.mark_done.assert_not_called()
|
||||||
|
|
||||||
|
|
||||||
|
def test_download_mosaic_503_transient_class():
|
||||||
|
sess = MagicMock()
|
||||||
|
sess.mosaic_url.return_value = "http://x/m.jpg"
|
||||||
|
sess.download_file.return_value = DownloadResult(0, 503, "err", TRANSIENT)
|
||||||
|
|
||||||
|
progress = MagicMock()
|
||||||
|
progress.is_done.return_value = False
|
||||||
|
|
||||||
|
out = _download_mosaic(
|
||||||
|
sess,
|
||||||
|
{},
|
||||||
|
1,
|
||||||
|
Path("/tmp/m.jpg"),
|
||||||
|
progress,
|
||||||
|
_MACHINE,
|
||||||
|
_CONFIG,
|
||||||
|
dry_run=False,
|
||||||
|
)
|
||||||
|
assert out.error_class == TRANSIENT
|
||||||
|
assert out.error_code == "503"
|
||||||
|
|
||||||
|
|
||||||
|
def test_download_mosaic_empty_body_unknown():
|
||||||
|
sess = MagicMock()
|
||||||
|
sess.mosaic_url.return_value = "http://x/m.jpg"
|
||||||
|
sess.download_file.return_value = DownloadResult(
|
||||||
|
0, 200, "0 bytes in response body", UNKNOWN
|
||||||
|
)
|
||||||
|
|
||||||
|
progress = MagicMock()
|
||||||
|
progress.is_done.return_value = False
|
||||||
|
|
||||||
|
out = _download_mosaic(
|
||||||
|
sess,
|
||||||
|
{},
|
||||||
|
1,
|
||||||
|
Path("/tmp/m.jpg"),
|
||||||
|
progress,
|
||||||
|
_MACHINE,
|
||||||
|
_CONFIG,
|
||||||
|
dry_run=False,
|
||||||
|
)
|
||||||
|
assert out.error_class == UNKNOWN
|
||||||
|
assert "bytes" in out.error or out.error
|
||||||
@@ -0,0 +1,75 @@
|
|||||||
|
"""HTTP download result wiring in MachineSession."""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import requests
|
||||||
|
|
||||||
|
from spruce.download_result import PERMANENT_MISSING, TRANSIENT
|
||||||
|
from spruce.session import MachineSession
|
||||||
|
|
||||||
|
|
||||||
|
def _minimal_config() -> dict:
|
||||||
|
return {
|
||||||
|
"base_url": "http://127.0.0.1:8010/",
|
||||||
|
"image_base_url": "http://127.0.0.1:8011/",
|
||||||
|
"username": "u",
|
||||||
|
"password": "p",
|
||||||
|
"timeout": 10,
|
||||||
|
"request_delay": 0.0,
|
||||||
|
"workers": 2,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _machine() -> dict:
|
||||||
|
return {
|
||||||
|
"label": "T [AMR-0]",
|
||||||
|
"option_value": "x",
|
||||||
|
"machine_id": "0",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def test_download_file_http_404_result(tmp_path: Path):
|
||||||
|
dest = tmp_path / "f.bin"
|
||||||
|
sess = MachineSession(_machine(), _minimal_config())
|
||||||
|
|
||||||
|
def fail_get(*_a, **_k):
|
||||||
|
r = requests.Response()
|
||||||
|
r.status_code = 404
|
||||||
|
r.url = "http://example/mosaic.jpg"
|
||||||
|
r.reason = "Not Found"
|
||||||
|
r.raise_for_status()
|
||||||
|
|
||||||
|
sess.http.get = fail_get # type: ignore[method-assign]
|
||||||
|
|
||||||
|
res = sess.download_file("http://example/mosaic.jpg", dest, retries=1)
|
||||||
|
assert res.size == 0
|
||||||
|
assert res.status_code == 404
|
||||||
|
assert res.error_class == PERMANENT_MISSING
|
||||||
|
assert "404" in (res.error or "")
|
||||||
|
|
||||||
|
|
||||||
|
def test_download_tile_row_on_failure(tmp_path: Path):
|
||||||
|
dest = tmp_path / "t.jpg"
|
||||||
|
tile = {
|
||||||
|
"scan_id": 1,
|
||||||
|
"row_index": 0,
|
||||||
|
"col_index": 0,
|
||||||
|
"x_mm": 0.0,
|
||||||
|
"y_mm": 0.0,
|
||||||
|
"url": "http://example/tile",
|
||||||
|
}
|
||||||
|
sess = MachineSession(_machine(), _minimal_config())
|
||||||
|
|
||||||
|
def fail_get(*_a, **_k):
|
||||||
|
r = requests.Response()
|
||||||
|
r.status_code = 500
|
||||||
|
r.url = "http://example"
|
||||||
|
r.reason = "Server Error"
|
||||||
|
r.raise_for_status()
|
||||||
|
|
||||||
|
sess.http.get = fail_get # type: ignore[method-assign]
|
||||||
|
row = sess.download_tile(tile, dest, dry_run=False)
|
||||||
|
assert row["status"] == "failed"
|
||||||
|
assert row["error_code"] == "500"
|
||||||
|
assert row["error_class"] == TRANSIENT
|
||||||
@@ -7,6 +7,8 @@ import yaml
|
|||||||
|
|
||||||
from spruce.settings import (
|
from spruce.settings import (
|
||||||
MAX_SAFE_WORKERS,
|
MAX_SAFE_WORKERS,
|
||||||
|
SCANS_CSV_FIELDS,
|
||||||
|
TILES_CSV_FIELDS,
|
||||||
_clamp_workers,
|
_clamp_workers,
|
||||||
load_config,
|
load_config,
|
||||||
)
|
)
|
||||||
@@ -91,3 +93,24 @@ def test_load_config_missing_password_exits(tmp_path):
|
|||||||
path.write_text(yaml.dump({"username": "x"}))
|
path.write_text(yaml.dump({"username": "x"}))
|
||||||
with pytest.raises(SystemExit):
|
with pytest.raises(SystemExit):
|
||||||
load_config(str(path))
|
load_config(str(path))
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# CSV schemas (failure columns)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_scans_csv_fields_includes_mosaic_failure_columns():
|
||||||
|
for name in (
|
||||||
|
"mosaic_download_status",
|
||||||
|
"mosaic_error",
|
||||||
|
"mosaic_error_code",
|
||||||
|
"mosaic_error_class",
|
||||||
|
):
|
||||||
|
assert name in SCANS_CSV_FIELDS
|
||||||
|
|
||||||
|
|
||||||
|
def test_tiles_csv_fields_includes_status_and_error_columns():
|
||||||
|
for name in ("status", "error", "error_code", "error_class"):
|
||||||
|
assert name in TILES_CSV_FIELDS
|
||||||
|
assert TILES_CSV_FIELDS.index("status") < TILES_CSV_FIELDS.index("downloaded_at")
|
||||||
|
|||||||
Reference in New Issue
Block a user