Add --retry-failed mode and mosaic retry estimates to progress report
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@@ -2,11 +2,13 @@
|
||||
"""
|
||||
Report mosaic download progress from archives/scans.csv.
|
||||
|
||||
Output is formatted as Markdown. Add --by-year for a per-machine ×
|
||||
per-year breakdown table.
|
||||
Output is Markdown. Use ``--by-year`` for a per-machine × per-year
|
||||
done/failed table. When the first mosaic pass is complete (no pending rows)
|
||||
but failures remain, a **Mosaic retry estimates** section is printed with
|
||||
queue counts and duration hints.
|
||||
|
||||
Rate/ETA require two calls at least 60 s apart. Mean mosaic size is
|
||||
sampled from up to 100 already-downloaded files and cached for 1 hour.
|
||||
Rate/ETA use a 30-minute rolling window when snapshots show progress.
|
||||
Mean mosaic size is sampled from up to 100 downloads (1-hour cache).
|
||||
|
||||
Usage:
|
||||
python scripts/mosaic_progress_report.py [--archive DIR] [--by-year]
|
||||
@@ -30,6 +32,11 @@ _R_PRE19 = 1.00
|
||||
_R_PURGED = 0.00
|
||||
_R_RECENT = 0.82
|
||||
|
||||
FIRST_PASS_FALLBACK_RATE_PER_HR = 1100.0
|
||||
RETRY_OPTIMISTIC_RATE_PER_HR = 1800.0
|
||||
RETRY_REALISTIC_RATE_PER_HR = 1100.0
|
||||
RETRY_PESSIMISTIC_RATE_PER_HR = 300.0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
@@ -127,6 +134,12 @@ def _expected_remaining(pending_rows: list[dict]) -> float:
|
||||
return count
|
||||
|
||||
|
||||
def _retry_hours_from_rate(n_scans: int, rate_per_hr: float) -> str:
    """Format how long retrying *n_scans* takes at *rate_per_hr* scans/hour.

    Returns the placeholder "—" when either the scan count or the rate is
    zero or negative; otherwise the projected time, converted to seconds,
    is rendered by ``_fmt_duration``.
    """
    nothing_to_estimate = n_scans <= 0 or rate_per_hr <= 0
    if nothing_to_estimate:
        return "—"
    # Hours of work first, then seconds, which is what _fmt_duration is given.
    hours_needed = n_scans / rate_per_hr
    return _fmt_duration(hours_needed * 3600.0)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -213,15 +226,32 @@ def main() -> None:
|
||||
|
||||
rate_per_sec: float | None = None
|
||||
rate_window_str = ""
|
||||
snap_delta_proc = 0
|
||||
if recent:
|
||||
oldest = recent[0]
|
||||
dt = now.timestamp() - oldest["ts"]
|
||||
dp = processed - oldest["proc"]
|
||||
snap_delta_proc = dp
|
||||
if dt >= 60 and dp > 0:
|
||||
rate_per_sec = dp / dt
|
||||
window_min = dt / 60
|
||||
rate_window_str = f"{window_min:.0f}-min avg"
|
||||
|
||||
# One-time baseline after initial mosaic crawl finished (no pending rows).
|
||||
if pending == 0 and "first_pass_mean_rate_per_hr" not in cache:
|
||||
cache["first_pass_completed_at"] = now.isoformat()
|
||||
cache["first_pass_processed"] = total
|
||||
cache["first_pass_mean_rate_per_hr"] = FIRST_PASS_FALLBACK_RATE_PER_HR
|
||||
|
||||
first_pass_rate_hr = float(
|
||||
cache.get("first_pass_mean_rate_per_hr", FIRST_PASS_FALLBACK_RATE_PER_HR)
|
||||
)
|
||||
live_rate_hr = rate_per_sec * 3600 if rate_per_sec else None
|
||||
# Active scrape shows progress in snapshots; idle archive shows dp == 0.
|
||||
retry_estimate_rate_hr = (
|
||||
live_rate_hr if live_rate_hr is not None else first_pass_rate_hr
|
||||
)
|
||||
|
||||
# --- Disk space ---
|
||||
mean_bytes: float | None = None
|
||||
size_note = ""
|
||||
@@ -325,6 +355,76 @@ def main() -> None:
|
||||
align=["l", "r", "r", "r", "r", "r"],
|
||||
))
|
||||
|
||||
# -----------------------------------------------------------------------
|
||||
# Retry estimates (first pass complete: pending == 0, failures remain)
|
||||
# -----------------------------------------------------------------------
|
||||
if failed > 0 and pending == 0:
|
||||
failed_rows_list = [
|
||||
r for r in latest.values()
|
||||
if r.get("mosaic_download_status") == "failed"
|
||||
]
|
||||
n_all = len(failed_rows_list)
|
||||
n_2023 = sum(
|
||||
1 for r in failed_rows_list
|
||||
if (r.get("scan_time") or "")[:4] >= "2023"
|
||||
and len((r.get("scan_time") or "")[:4]) == 4
|
||||
)
|
||||
n_200 = sum(
|
||||
1 for r in failed_rows_list
|
||||
if r.get("mosaic_error_code") == "200"
|
||||
)
|
||||
rate_note = (
|
||||
"rolling 30-min window"
|
||||
if snap_delta_proc > 0
|
||||
else f"first-pass baseline ({first_pass_rate_hr:,.0f}/hr)"
|
||||
)
|
||||
print()
|
||||
print("### Mosaic retry estimates\n")
|
||||
print(
|
||||
f"*Suggested command after server fix:* "
|
||||
f"`python scraper.py --retry-failed --workers 2` "
|
||||
f"(filters: `--retry-since YEAR`, `--retry-error-code CODE`)*\n"
|
||||
)
|
||||
print(
|
||||
f"*ETA column uses **{retry_estimate_rate_hr:,.0f} scans/hr** "
|
||||
f"({rate_note}). Fixed columns use scenario rates.*\n"
|
||||
)
|
||||
est_hdr = (
|
||||
"Retry scope",
|
||||
"Count",
|
||||
f"@{RETRY_OPTIMISTIC_RATE_PER_HR:.0f}/hr",
|
||||
f"@{RETRY_REALISTIC_RATE_PER_HR:.0f}/hr",
|
||||
f"@{RETRY_PESSIMISTIC_RATE_PER_HR:.0f}/hr",
|
||||
f"@{retry_estimate_rate_hr:.0f}/hr",
|
||||
)
|
||||
retry_tbl_rows = [
|
||||
[
|
||||
"HTTP 200 (empty body)",
|
||||
f"{n_200:,}",
|
||||
_retry_hours_from_rate(n_200, RETRY_OPTIMISTIC_RATE_PER_HR),
|
||||
_retry_hours_from_rate(n_200, RETRY_REALISTIC_RATE_PER_HR),
|
||||
_retry_hours_from_rate(n_200, RETRY_PESSIMISTIC_RATE_PER_HR),
|
||||
_retry_hours_from_rate(n_200, retry_estimate_rate_hr),
|
||||
],
|
||||
[
|
||||
"Failed, scan_time ≥ 2023",
|
||||
f"{n_2023:,}",
|
||||
_retry_hours_from_rate(n_2023, RETRY_OPTIMISTIC_RATE_PER_HR),
|
||||
_retry_hours_from_rate(n_2023, RETRY_REALISTIC_RATE_PER_HR),
|
||||
_retry_hours_from_rate(n_2023, RETRY_PESSIMISTIC_RATE_PER_HR),
|
||||
_retry_hours_from_rate(n_2023, retry_estimate_rate_hr),
|
||||
],
|
||||
[
|
||||
"**All failed**",
|
||||
f"**{n_all:,}**",
|
||||
_retry_hours_from_rate(n_all, RETRY_OPTIMISTIC_RATE_PER_HR),
|
||||
_retry_hours_from_rate(n_all, RETRY_REALISTIC_RATE_PER_HR),
|
||||
_retry_hours_from_rate(n_all, RETRY_PESSIMISTIC_RATE_PER_HR),
|
||||
_retry_hours_from_rate(n_all, retry_estimate_rate_hr),
|
||||
],
|
||||
]
|
||||
print(_md_table(list(est_hdr), retry_tbl_rows, align=["l", "r", "r", "r", "r", "r"]))
|
||||
|
||||
# -----------------------------------------------------------------------
|
||||
# --by-year table
|
||||
# -----------------------------------------------------------------------
|
||||
|
||||
Reference in New Issue
Block a user