diff --git a/README.md b/README.md index 4da9bd9..87b94ab 100644 --- a/README.md +++ b/README.md @@ -153,13 +153,17 @@ Tile filenames encode position: `tile_r{row}_c{col}.jpg` where row increases wit ### Metadata files -**`scans.csv`** columns: `machine`, `machine_id`, `scan_id`, `name`, `scan_time`, `start_x`, `start_y`, `end_x`, `end_y`, `dx`, `dy`, `nx`, `ny`, `total_tiles`, `scan_lines`, `scan_mode`, `start_datetime`, `end_datetime`, `status`, `user`, `disk_space_mb`, `mosaic_url`, `mosaic_local_path`, `mosaic_on_disk` +**`scans.csv`** columns: `machine`, `machine_id`, `scan_id`, `name`, `scan_time`, `start_x`, `start_y`, `end_x`, `end_y`, `dx`, `dy`, `nx`, `ny`, `total_tiles`, `scan_lines`, `scan_mode`, `start_datetime`, `end_datetime`, `status`, `user`, `disk_space_mb`, `mosaic_url`, `mosaic_local_path`, `mosaic_on_disk`, `mosaic_download_status`, `mosaic_error`, `mosaic_error_code`, `mosaic_error_class` - `mosaic_on_disk`: `True` if `mosaic.jpg` exists on disk at row-write time, regardless of which run downloaded it. Useful for inventory — reflects actual archive state rather than what happened in the current run. +- `mosaic_download_status`: one of `downloaded`, `failed`, `already_done`, `dry_run`, `skipped_metadata_only` (in `--metadata-only` mode). Failed attempts are still written so you can see missing server-side images in the same CSV. +- `mosaic_error` / `mosaic_error_code` / `mosaic_error_class`: set when the URL was tried and the file was not stored successfully. **`mosaic_error_class`** is a coarse hint: `permanent_missing` for HTTP 404/410, `transient` for 5xx or common network/timeout-style failures, and `unknown` for other cases (including a 200 with an empty body). **Rows are append-only;** a failed download leaves an audit record without overwriting prior runs’ history. Delete or rotate the CSVs if you need a new header (see `spruce.settings.SCANS_CSV_FIELDS` / `TILES_CSV_FIELDS`). -**`tiles.csv`** columns: `machine`, `machine_id`, `scan_id`, `scan_time`, `row_index`, `col_index`, `x_mm`, `y_mm`, `url`, `local_path`, `downloaded_at`, `file_size_bytes` +**`tiles.csv`** columns: `machine`, `machine_id`, `scan_id`, `scan_time`, `row_index`, `col_index`, `x_mm`, `y_mm`, `url`, `local_path`, `status`, `error`, `error_code`, `error_class`, `downloaded_at`, `file_size_bytes` -- `downloaded_at`: ISO 8601 UTC timestamp of when the tile was fetched. Empty if the download failed. +- `status`: `downloaded`, `failed`, or `dry_run` (if `--dry-run`). Failed rows are kept for the same reason as mosaics. +- `error` / `error_code` / `error_class`: same rough semantics as the mosaic fields (`permanent_missing` / `transient` / `unknown`). `error_code` is the HTTP status when available. +- `downloaded_at`: ISO 8601 UTC timestamp when the tile was fetched. Empty on failure. --- diff --git a/spruce/cli.py b/spruce/cli.py index 853c7e3..3795bea 100644 --- a/spruce/cli.py +++ b/spruce/cli.py @@ -313,11 +313,19 @@ def _print_summary( row( "Mosaics failed:", str(totals.mosaics_failed), - "0 bytes or HTTP error; see log above", + "0 bytes or HTTP error; see scans.csv and logs", ) ) if not metadata_only and not mosaic_only: log.info(row("Tiles downloaded:", str(totals.tiles_downloaded))) + if not dry_run and not metadata_only: + log.info( + row( + "Mosaic & tile errors:", + "", + f"{SCANS_CSV_FILENAME} & {TILES_CSV_FILENAME} (error_class, error, error_code)", + ) + ) if metadata_only: log.info(row("Metadata written:", str(totals.metadata_written), "new JSON files")) log.info(sep) diff --git a/spruce/download_result.py b/spruce/download_result.py new file mode 100644 index 0000000..1591b89 --- /dev/null +++ b/spruce/download_result.py @@ -0,0 +1,63 @@ +""" +Structured HTTP download result and error classification. +""" + +from __future__ import annotations + +from dataclasses import dataclass + +import requests + +# public constants for error_class +PERMANENT_MISSING = "permanent_missing" +TRANSIENT = "transient" +UNKNOWN = "unknown" +OK = "" # success (no error class) + + +@dataclass(frozen=True) +class DownloadResult: + """Result of a streaming download (after all retries if applicable).""" + + size: int + status_code: int | None + error: str | None + error_class: str + + @property + def ok(self) -> bool: + return self.size > 0 and self.error is None + + +def classify_http_error( + status_code: int | None, exc: BaseException | None +) -> str: + """ + 404/410 => likely gone forever. + 5xx and transport/timeouts => retry may help. + """ + if status_code in (404, 410): + return PERMANENT_MISSING + if status_code is not None and 500 <= status_code < 600: + return TRANSIENT + if exc is not None: + if isinstance( + exc, + ( + requests.Timeout, + requests.ConnectTimeout, + requests.ReadTimeout, + ), + ): + return TRANSIENT + if isinstance(exc, (requests.exceptions.ConnectionError, OSError)): + return TRANSIENT + if isinstance(exc, requests.exceptions.ChunkedEncodingError): + return TRANSIENT + return UNKNOWN + + +def error_code_str(status_code: int | None) -> str: + if status_code is None: + return "" + return str(status_code) diff --git a/spruce/orchestrator.py b/spruce/orchestrator.py index 1294ee8..de3e34f 100644 --- a/spruce/orchestrator.py +++ b/spruce/orchestrator.py @@ -5,10 +5,12 @@ High-level scrape orchestration: drives the per-machine and per-scan loops. import json import logging from concurrent.futures import ThreadPoolExecutor, as_completed -from dataclasses import dataclass, field +from dataclasses import dataclass from pathlib import Path from typing import Any +from spruce.download_result import error_code_str + @dataclass class RunStats: @@ -46,6 +48,17 @@ log = logging.getLogger(__name__) # --------------------------------------------------------------------------- +@dataclass +class MosaicAttempt: + """Outcome of a mosaic download attempt (for scans.csv and RunStats).""" + + downloaded: bool + csv_status: str + error: str + error_code: str + error_class: str + + def _download_mosaic( sess: MachineSession, scan_meta: dict[str, Any], @@ -55,17 +68,19 @@ def _download_mosaic( machine: dict[str, Any], config: dict[str, Any], dry_run: bool, -) -> bool: - """Download the scan mosaic if not already done. Returns True if downloaded.""" +) -> MosaicAttempt: + """Download the scan mosaic if not already done.""" url = sess.mosaic_url(scan_id) if progress.is_done(url): - return False + return MosaicAttempt( + False, "already_done", "", "", "" + ) if dry_run: log.info("[DRY-RUN] Mosaic: %s → %s", url, mosaic_path) - return False + return MosaicAttempt(False, "dry_run", "", "", "") log.info("[%s] Downloading mosaic for scan %d …", machine["label"], scan_id) - size = sess.download_file(url, mosaic_path) - if size: + res = sess.download_file(url, mosaic_path) + if res.ok: if config.get("write_exif", True): mmeta: dict[str, Any] | None = config.get("machine_metadata", {}).get( machine["label"] @@ -79,10 +94,16 @@ def _download_mosaic( "[%s] Mosaic saved: %s (%.1f MB)", machine["label"], mosaic_path, - size / 1e6, + res.size / 1e6, ) - return True - return False + return MosaicAttempt(True, "downloaded", "", "", "") + return MosaicAttempt( + False, + "failed", + res.error or "", + error_code_str(res.status_code), + res.error_class, + ) def _download_tiles_for_scan( @@ -161,8 +182,8 @@ def _download_tiles_for_scan( ) as pbar: for future in as_completed(futures): result = future.result() - if result.get("file_size_bytes"): - batch.append(result) + batch.append(result) + if result.get("status") == "downloaded": progress.mark_done(result["url"]) downloaded += 1 pbar.update(1) @@ -291,9 +312,9 @@ def process_scan( mosaic_url = sess.mosaic_url(scan_id) mosaic_already_done = progress.is_done(mosaic_url) if metadata_only: - mosaic_just_downloaded = False + mosaic_attempt: MosaicAttempt | None = None else: - mosaic_just_downloaded = _download_mosaic( + mosaic_attempt = _download_mosaic( sess, scan_meta, scan_id, @@ -303,15 +324,21 @@ def process_scan( config, dry_run, ) - if not metadata_only and mosaic_just_downloaded: - stats.mosaics_downloaded += 1 - elif ( - not metadata_only - and not dry_run - and not mosaic_already_done - and not mosaic_just_downloaded - ): - stats.mosaics_failed += 1 + if not metadata_only and mosaic_attempt: + if mosaic_attempt.downloaded: + stats.mosaics_downloaded += 1 + elif not dry_run and mosaic_attempt.csv_status == "failed": + stats.mosaics_failed += 1 + + if metadata_only: + mds, mer, mco, mcl = "skipped_metadata_only", "", "", "" + elif mosaic_attempt is not None: + mds = mosaic_attempt.csv_status + mer = mosaic_attempt.error + mco = mosaic_attempt.error_code + mcl = mosaic_attempt.error_class + else: + mds, mer, mco, mcl = "", "", "", "" # Write scan-level CSV row only if this scan hasn't been recorded before. if mosaic_already_done and not metadata_only: @@ -347,6 +374,10 @@ def process_scan( "mosaic_url": mosaic_url, "mosaic_local_path": str(mosaic_path), "mosaic_on_disk": mosaic_path.exists(), + "mosaic_download_status": mds, + "mosaic_error": mer, + "mosaic_error_code": mco, + "mosaic_error_class": mcl, } ) diff --git a/spruce/session.py b/spruce/session.py index bbb3416..aecef2b 100644 --- a/spruce/session.py +++ b/spruce/session.py @@ -12,6 +12,13 @@ from typing import Any import requests from bs4 import BeautifulSoup +from spruce.download_result import ( + OK, + UNKNOWN, + DownloadResult, + classify_http_error, + error_code_str, +) from spruce.parsers import parse_scan_row, parse_scan_view, _grid_values log = logging.getLogger(__name__) @@ -205,8 +212,14 @@ class MachineSession: # Downloads # ------------------------------------------------------------------ - def download_file(self, url: str, dest: Path, retries: int = 3) -> int: - """Stream-download url to dest with retries. Returns bytes written (0 on failure).""" + def download_file( + self, url: str, dest: Path, retries: int = 3 + ) -> DownloadResult: + """ + Stream-download url to dest with retries. + + Returns a DownloadResult (bytes, optional HTTP code, final error, class). + """ dest.parent.mkdir(parents=True, exist_ok=True) backoff = 5.0 for attempt in range(1, retries + 1): @@ -221,8 +234,21 @@ class MachineSession: if chunk: fh.write(chunk) size += len(chunk) - return size + if size == 0: + return DownloadResult( + 0, + resp.status_code, + "0 bytes in response body", + UNKNOWN, + ) + return DownloadResult(size, resp.status_code, None, OK) except Exception as exc: + sc: int | None = None + if ( + isinstance(exc, requests.HTTPError) + and exc.response is not None + ): + sc = exc.response.status_code if attempt < retries: log.debug( "Attempt %d/%d failed %s: %s — retrying in %.0fs", @@ -241,7 +267,9 @@ class MachineSession: url, exc, ) - return 0 + cl = classify_http_error(sc, exc) + return DownloadResult(0, sc, str(exc), cl) + return DownloadResult(0, None, "download_file: exhausted", UNKNOWN) def download_tile( self, tile: dict[str, Any], dest: Path, dry_run: bool @@ -260,11 +288,22 @@ class MachineSession: "local_path": str(dest), "downloaded_at": "", "file_size_bytes": "", + "status": "", + "error": "", + "error_code": "", + "error_class": "", } if dry_run: + row["status"] = "dry_run" return row - size = self.download_file(tile["url"], dest) - if size: + res = self.download_file(tile["url"], dest) + if res.ok: + row["status"] = "downloaded" row["downloaded_at"] = datetime.now(timezone.utc).isoformat() - row["file_size_bytes"] = size + row["file_size_bytes"] = res.size + else: + row["status"] = "failed" + row["error"] = res.error or "" + row["error_code"] = error_code_str(res.status_code) + row["error_class"] = res.error_class return row diff --git a/spruce/settings.py b/spruce/settings.py index 33ed042..89b60c4 100644 --- a/spruce/settings.py +++ b/spruce/settings.py @@ -47,6 +47,10 @@ SCANS_CSV_FIELDS: list[str] = [ "mosaic_url", "mosaic_local_path", "mosaic_on_disk", + "mosaic_download_status", + "mosaic_error", + "mosaic_error_code", + "mosaic_error_class", ] TILES_CSV_FIELDS: list[str] = [ @@ -60,6 +64,10 @@ TILES_CSV_FIELDS: list[str] = [ "y_mm", "url", "local_path", + "status", + "error", + "error_code", + "error_class", "downloaded_at", "file_size_bytes", ] diff --git a/tests/test_download_result.py b/tests/test_download_result.py new file mode 100644 index 0000000..0fa0d9c --- /dev/null +++ b/tests/test_download_result.py @@ -0,0 +1,36 @@ +"""Tests for spruce.download_result classification.""" + +import requests + +from spruce.download_result import ( + PERMANENT_MISSING, + TRANSIENT, + UNKNOWN, + classify_http_error, + error_code_str, +) + + +def test_classify_404_permanent(): + assert classify_http_error(404, None) == PERMANENT_MISSING + + +def test_classify_410_permanent(): + assert classify_http_error(410, None) == PERMANENT_MISSING + + +def test_classify_503_transient(): + assert classify_http_error(503, None) == TRANSIENT + + +def test_classify_timeout_transient(): + assert classify_http_error(None, requests.Timeout()) == TRANSIENT + + +def test_classify_unknown_4xx(): + assert classify_http_error(403, None) == UNKNOWN + + +def test_error_code_str(): + assert error_code_str(None) == "" + assert error_code_str(404) == "404" diff --git a/tests/test_orchestrator_mosaic.py b/tests/test_orchestrator_mosaic.py new file mode 100644 index 0000000..559df6a --- /dev/null +++ b/tests/test_orchestrator_mosaic.py @@ -0,0 +1,85 @@ +"""Mosaic download outcomes for scans.csv (RunStats / MosaicAttempt).""" + +from pathlib import Path +from unittest.mock import MagicMock + +from spruce.download_result import ( + DownloadResult, + PERMANENT_MISSING, + TRANSIENT, + UNKNOWN, +) +from spruce.orchestrator import _download_mosaic + +_MACHINE = {"label": "M", "option_value": "v", "machine_id": "1"} +_CONFIG: dict = { + "write_exif": False, + "timeout": 10, + "request_delay": 0.0, + "machine_metadata": {}, +} + + +def test_download_mosaic_404_permanent_class(): + sess = MagicMock() + url = "http://img/RootView_Database/9/mosaic.jpg" + sess.mosaic_url.return_value = url + sess.download_file.return_value = DownloadResult(0, 404, "404", PERMANENT_MISSING) + + progress = MagicMock() + progress.is_done.return_value = False + mpath = Path("/tmp/mosaic_404.jpg") + + out = _download_mosaic( + sess, {}, 9, mpath, progress, _MACHINE, _CONFIG, dry_run=False + ) + assert out.csv_status == "failed" + assert out.error_class == PERMANENT_MISSING + assert out.error_code == "404" + progress.mark_done.assert_not_called() + + +def test_download_mosaic_503_transient_class(): + sess = MagicMock() + sess.mosaic_url.return_value = "http://x/m.jpg" + sess.download_file.return_value = DownloadResult(0, 503, "err", TRANSIENT) + + progress = MagicMock() + progress.is_done.return_value = False + + out = _download_mosaic( + sess, + {}, + 1, + Path("/tmp/m.jpg"), + progress, + _MACHINE, + _CONFIG, + dry_run=False, + ) + assert out.error_class == TRANSIENT + assert out.error_code == "503" + + +def test_download_mosaic_empty_body_unknown(): + sess = MagicMock() + sess.mosaic_url.return_value = "http://x/m.jpg" + sess.download_file.return_value = DownloadResult( + 0, 200, "0 bytes in response body", UNKNOWN + ) + + progress = MagicMock() + progress.is_done.return_value = False + + out = _download_mosaic( + sess, + {}, + 1, + Path("/tmp/m.jpg"), + progress, + _MACHINE, + _CONFIG, + dry_run=False, + ) + assert out.error_class == UNKNOWN + assert "bytes" in out.error or out.error diff --git a/tests/test_session_download.py b/tests/test_session_download.py new file mode 100644 index 0000000..e8f1cf1 --- /dev/null +++ b/tests/test_session_download.py @@ -0,0 +1,75 @@ +"""HTTP download result wiring in MachineSession.""" + +from pathlib import Path + +import pytest +import requests + +from spruce.download_result import PERMANENT_MISSING, TRANSIENT +from spruce.session import MachineSession + + +def _minimal_config() -> dict: + return { + "base_url": "http://127.0.0.1:8010/", + "image_base_url": "http://127.0.0.1:8011/", + "username": "u", + "password": "p", + "timeout": 10, + "request_delay": 0.0, + "workers": 2, + } + + +def _machine() -> dict: + return { + "label": "T [AMR-0]", + "option_value": "x", + "machine_id": "0", + } + + +def test_download_file_http_404_result(tmp_path: Path): + dest = tmp_path / "f.bin" + sess = MachineSession(_machine(), _minimal_config()) + + def fail_get(*_a, **_k): + r = requests.Response() + r.status_code = 404 + r.url = "http://example/mosaic.jpg" + r.reason = "Not Found" + r.raise_for_status() + + sess.http.get = fail_get # type: ignore[method-assign] + + res = sess.download_file("http://example/mosaic.jpg", dest, retries=1) + assert res.size == 0 + assert res.status_code == 404 + assert res.error_class == PERMANENT_MISSING + assert "404" in (res.error or "") + + +def test_download_tile_row_on_failure(tmp_path: Path): + dest = tmp_path / "t.jpg" + tile = { + "scan_id": 1, + "row_index": 0, + "col_index": 0, + "x_mm": 0.0, + "y_mm": 0.0, + "url": "http://example/tile", + } + sess = MachineSession(_machine(), _minimal_config()) + + def fail_get(*_a, **_k): + r = requests.Response() + r.status_code = 500 + r.url = "http://example" + r.reason = "Server Error" + r.raise_for_status() + + sess.http.get = fail_get # type: ignore[method-assign] + row = sess.download_tile(tile, dest, dry_run=False) + assert row["status"] == "failed" + assert row["error_code"] == "500" + assert row["error_class"] == TRANSIENT diff --git a/tests/test_settings.py b/tests/test_settings.py index 6b49ee3..b2c635c 100644 --- a/tests/test_settings.py +++ b/tests/test_settings.py @@ -7,6 +7,8 @@ import yaml from spruce.settings import ( MAX_SAFE_WORKERS, + SCANS_CSV_FIELDS, + TILES_CSV_FIELDS, _clamp_workers, load_config, ) @@ -91,3 +93,24 @@ def test_load_config_missing_password_exits(tmp_path): path.write_text(yaml.dump({"username": "x"})) with pytest.raises(SystemExit): load_config(str(path)) + + +# --------------------------------------------------------------------------- +# CSV schemas (failure columns) +# --------------------------------------------------------------------------- + + +def test_scans_csv_fields_includes_mosaic_failure_columns(): + for name in ( + "mosaic_download_status", + "mosaic_error", + "mosaic_error_code", + "mosaic_error_class", + ): + assert name in SCANS_CSV_FIELDS + + +def test_tiles_csv_fields_includes_status_and_error_columns(): + for name in ("status", "error", "error_code", "error_class"): + assert name in TILES_CSV_FIELDS + assert TILES_CSV_FIELDS.index("status") < TILES_CSV_FIELDS.index("downloaded_at")