Scraping resilience, metadata tooling, and repository hygiene
Consolidates mosaic and session hardening (login retry, skipping already-processed scans, no retry on 404/410, started_at), progress reporting (Markdown tables, by-year rollup, rolling-window rate/ETA), and metadata workflow scripts (run_metadata_scan.sh, scan_progress_report.py, export_machine_metadata.py). Adds the mosaic reconstruction sample JPEGs referenced by the report. Updates .gitignore to cover backup/ and .claude/. The sample_random_scans helper is documented for use on the testing/sample-runs branch only (see README).
This commit is contained in:
+5
-1
@@ -14,6 +14,7 @@ from bs4 import BeautifulSoup
|
||||
|
||||
from spruce.download_result import (
|
||||
OK,
|
||||
PERMANENT_MISSING,
|
||||
UNKNOWN,
|
||||
DownloadResult,
|
||||
classify_http_error,
|
||||
@@ -263,6 +264,10 @@ class MachineSession:
|
||||
and exc.response is not None
|
||||
):
|
||||
sc = exc.response.status_code
|
||||
cl = classify_http_error(sc, exc)
|
||||
if cl == PERMANENT_MISSING:
|
||||
# 404/410 will never succeed — don't waste time retrying.
|
||||
return DownloadResult(0, sc, str(exc), cl)
|
||||
if attempt < retries:
|
||||
log.warning(
|
||||
"Attempt %d/%d failed %s: %s — retrying in %.0fs",
|
||||
@@ -281,7 +286,6 @@ class MachineSession:
|
||||
url,
|
||||
exc,
|
||||
)
|
||||
cl = classify_http_error(sc, exc)
|
||||
return DownloadResult(0, sc, str(exc), cl)
|
||||
return DownloadResult(0, None, "download_file: exhausted", UNKNOWN)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user