Restore: login retry, skip processed scans, no-retry on 404, started_at tracking
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
+50
-3
@@ -241,6 +241,7 @@ def process_scan(
|
||||
mosaic_only: bool,
|
||||
metadata_only: bool = False,
|
||||
max_tiles: int | None = None,
|
||||
scans_csv_existing_ids: set[int] | None = None,
|
||||
) -> RunStats:
|
||||
"""
|
||||
Process one scan: fetch metadata, download mosaic and (optionally) tiles.
|
||||
@@ -379,9 +380,16 @@ def process_scan(
|
||||
mds, mer, mco, mcl = "", "", "", ""
|
||||
|
||||
# Write scan-level CSV row only if this scan hasn't been recorded before.
|
||||
if mosaic_already_done and not metadata_only:
|
||||
# Skip if: (1) mosaic URL already in .progress.json, or (2) scan already
|
||||
# has a non-pending row in scans.csv from a prior run.
|
||||
already_recorded = (mosaic_already_done and not metadata_only) or (
|
||||
not metadata_only
|
||||
and scans_csv_existing_ids is not None
|
||||
and scan_id in scans_csv_existing_ids
|
||||
)
|
||||
if already_recorded:
|
||||
log.debug(
|
||||
"[%s] Scan %d: already in scans.csv (mosaic was previously downloaded), skipping CSV row.",
|
||||
"[%s] Scan %d: already in scans.csv, skipping CSV row.",
|
||||
machine["label"],
|
||||
scan_id,
|
||||
)
|
||||
@@ -494,7 +502,20 @@ def scrape_machine(
|
||||
) -> RunStats:
|
||||
"""Login, fetch scans, and download all content for one machine."""
|
||||
sess = MachineSession(machine, config)
|
||||
if not sess.login():
|
||||
login_ok = False
|
||||
for attempt in range(1, 4):
|
||||
if sess.login():
|
||||
login_ok = True
|
||||
break
|
||||
if attempt < 3:
|
||||
log.warning(
|
||||
"[%s] Login failed (attempt %d/3) — retrying in 10s.",
|
||||
machine["label"],
|
||||
attempt,
|
||||
)
|
||||
time.sleep(10)
|
||||
if not login_ok:
|
||||
log.error("[%s] Login failed after 3 attempts — skipping machine.", machine["label"])
|
||||
return RunStats()
|
||||
|
||||
if scan_id_filter is not None:
|
||||
@@ -508,8 +529,33 @@ def scrape_machine(
|
||||
log.warning("[%s] No scans found.", machine["label"])
|
||||
return RunStats()
|
||||
|
||||
# Build a set of scan_ids already fully processed in a prior run so we can
|
||||
# skip them entirely (no metadata fetch, no mosaic request).
|
||||
# Only scans with a definitive non-pending status count; skipped_metadata_only
|
||||
# rows still need to be processed in mosaic mode.
|
||||
PENDING_STATUSES = {"skipped_metadata_only", ""}
|
||||
existing_ids: set[int] = set()
|
||||
if not metadata_only and scans_csv._fh.name:
|
||||
existing_path = Path(scans_csv._fh.name)
|
||||
if existing_path.exists():
|
||||
import csv as _csv
|
||||
with open(existing_path, newline="", encoding="utf-8") as _f:
|
||||
for _row in _csv.DictReader(_f):
|
||||
if _row.get("machine") == machine["label"]:
|
||||
if _row.get("mosaic_download_status", "") not in PENDING_STATUSES:
|
||||
existing_ids.add(int(_row["scan_id"]))
|
||||
|
||||
stats = RunStats()
|
||||
for scan in scans:
|
||||
# Skip scans already fully processed in a prior run — avoids redundant
|
||||
# metadata fetches and mosaic requests for known-failed / known-done scans.
|
||||
if not metadata_only and scan["scan_id"] in existing_ids:
|
||||
log.debug(
|
||||
"[%s] Scan %d: already processed, skipping.",
|
||||
machine["label"],
|
||||
scan["scan_id"],
|
||||
)
|
||||
continue
|
||||
stats.merge(process_scan(
|
||||
sess=sess,
|
||||
scan=scan,
|
||||
@@ -523,5 +569,6 @@ def scrape_machine(
|
||||
mosaic_only=mosaic_only,
|
||||
metadata_only=metadata_only,
|
||||
max_tiles=max_tiles,
|
||||
scans_csv_existing_ids=existing_ids,
|
||||
))
|
||||
return stats
|
||||
|
||||
+8
-1
@@ -5,6 +5,7 @@ Progress tracking (JSON) and CSV writing.
|
||||
import csv
|
||||
import json
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Iterator
|
||||
|
||||
@@ -27,6 +28,7 @@ class ProgressTracker:
|
||||
def __init__(self, path: Path) -> None:
|
||||
self.path = path
|
||||
self._done: set[str] = set()
|
||||
self.started_at: str = datetime.now(timezone.utc).isoformat()
|
||||
self._load()
|
||||
|
||||
def _load(self) -> None:
|
||||
@@ -34,6 +36,8 @@ class ProgressTracker:
|
||||
try:
|
||||
data = json.loads(self.path.read_text())
|
||||
self._done = set(data.get("completed_urls", []))
|
||||
if "started_at" in data:
|
||||
self.started_at = data["started_at"]
|
||||
log.info("Resuming: %d URLs already downloaded.", len(self._done))
|
||||
except Exception:
|
||||
log.warning("Could not read progress file; starting fresh.")
|
||||
@@ -59,7 +63,10 @@ class ProgressTracker:
|
||||
self.path.parent.mkdir(parents=True, exist_ok=True)
|
||||
tmp = self.path.with_suffix(".json.tmp")
|
||||
tmp.write_text(
|
||||
json.dumps({"completed_urls": sorted(self._done)}, indent=2)
|
||||
json.dumps(
|
||||
{"started_at": self.started_at, "completed_urls": sorted(self._done)},
|
||||
indent=2,
|
||||
)
|
||||
)
|
||||
tmp.replace(self.path) # atomic on POSIX; avoids corrupt JSON on crash
|
||||
|
||||
|
||||
+5
-1
@@ -14,6 +14,7 @@ from bs4 import BeautifulSoup
|
||||
|
||||
from spruce.download_result import (
|
||||
OK,
|
||||
PERMANENT_MISSING,
|
||||
UNKNOWN,
|
||||
DownloadResult,
|
||||
classify_http_error,
|
||||
@@ -263,6 +264,10 @@ class MachineSession:
|
||||
and exc.response is not None
|
||||
):
|
||||
sc = exc.response.status_code
|
||||
cl = classify_http_error(sc, exc)
|
||||
if cl == PERMANENT_MISSING:
|
||||
# 404/410 will never succeed — don't waste time retrying.
|
||||
return DownloadResult(0, sc, str(exc), cl)
|
||||
if attempt < retries:
|
||||
log.warning(
|
||||
"Attempt %d/%d failed %s: %s — retrying in %.0fs",
|
||||
@@ -281,7 +286,6 @@ class MachineSession:
|
||||
url,
|
||||
exc,
|
||||
)
|
||||
cl = classify_http_error(sc, exc)
|
||||
return DownloadResult(0, sc, str(exc), cl)
|
||||
return DownloadResult(0, None, "download_file: exhausted", UNKNOWN)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user