Add --by-year table and reformat progress report as Markdown
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@@ -2,15 +2,14 @@
|
||||
"""
|
||||
Report mosaic download progress from archives/scans.csv.
|
||||
|
||||
Shows elapsed time, processing rate, estimated time remaining, and
|
||||
projected disk usage (actual downloaded + model-estimated remaining).
|
||||
Output is formatted as Markdown. Add --by-year for a per-machine ×
|
||||
per-year breakdown table.
|
||||
|
||||
Rate/ETA require two calls at least 60s apart. Mean mosaic size is
|
||||
sampled from up to 100 already-downloaded files and cached so later
|
||||
calls skip the filesystem hit unless the sample is stale.
|
||||
Rate/ETA require two calls at least 60 s apart. Mean mosaic size is
|
||||
sampled from up to 100 already-downloaded files and cached for 1 hour.
|
||||
|
||||
Usage:
|
||||
python scripts/mosaic_progress_report.py [--archive ARCHIVE_DIR]
|
||||
python scripts/mosaic_progress_report.py [--archive DIR] [--by-year]
|
||||
"""
|
||||
|
||||
import argparse
|
||||
@@ -23,16 +22,19 @@ from collections import Counter
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Year-based viability model derived from BW1-4 observations:
|
||||
# pre-2019 → kept on server long-term (~100%)
|
||||
# 2019-2022 → purged (~0%)
|
||||
# 2023+ → recent, mostly available (~82%)
|
||||
# pre-2019 → kept on server long-term (~100 %)
|
||||
# 2019-2022 → purged (~ 0 %)
|
||||
# 2023+ → recent, mostly available (~82 %)
|
||||
_R_PRE19 = 1.00
|
||||
_R_PURGED = 0.00
|
||||
_R_RECENT = 0.82
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _parse_dt(s: str) -> datetime | None:
|
||||
try:
|
||||
return datetime.fromisoformat(s.replace("Z", "+00:00"))
|
||||
@@ -52,29 +54,41 @@ def _fmt_duration(seconds: float) -> str:
|
||||
return f"{m}m {int(seconds % 60):02d}s"
|
||||
|
||||
|
||||
def _fmt_size(bytes_: float) -> str:
|
||||
if bytes_ >= 1e12:
|
||||
return f"{bytes_/1e12:.2f} TB"
|
||||
if bytes_ >= 1e9:
|
||||
return f"{bytes_/1e9:.2f} GB"
|
||||
if bytes_ >= 1e6:
|
||||
return f"{bytes_/1e6:.1f} MB"
|
||||
return f"{bytes_/1e3:.0f} KB"
|
||||
def _fmt_size(b: float) -> str:
|
||||
if b >= 1e12:
|
||||
return f"{b/1e12:.2f} TB"
|
||||
if b >= 1e9:
|
||||
return f"{b/1e9:.2f} GB"
|
||||
if b >= 1e6:
|
||||
return f"{b/1e6:.1f} MB"
|
||||
return f"{b/1e3:.0f} KB"
|
||||
|
||||
|
||||
def _md_table(headers: list[str], rows: list[list[str]], *, align: list[str] | None = None) -> str:
|
||||
"""Render a Markdown table. align values: 'l', 'r', 'c' (default 'l')."""
|
||||
if align is None:
|
||||
align = ["l"] * len(headers)
|
||||
sep_map = {"l": ":---", "r": "---:", "c": ":---:"}
|
||||
|
||||
def row_str(cells: list[str]) -> str:
|
||||
return "| " + " | ".join(cells) + " |"
|
||||
|
||||
lines = [
|
||||
row_str(headers),
|
||||
row_str([sep_map.get(a, ":---") for a in align]),
|
||||
]
|
||||
for r in rows:
|
||||
lines.append(row_str(r))
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def _sample_mean_bytes(rows: list[dict], cache: dict, max_sample: int = 100) -> float | None:
|
||||
"""
|
||||
Return mean mosaic size in bytes, using a cached value when fresh
|
||||
(< 1 hour old and based on a similarly-sized download set).
|
||||
Falls back to filesystem sampling of `rows`.
|
||||
"""
|
||||
cached_mean = cache.get("mean_bytes")
|
||||
cached_n = cache.get("sample_n", 0)
|
||||
cached_ts = _parse_dt(cache.get("size_ts", ""))
|
||||
now = datetime.now(timezone.utc)
|
||||
if (
|
||||
cached_mean
|
||||
and cached_ts
|
||||
cached_mean and cached_ts
|
||||
and (now - cached_ts).total_seconds() < 3600
|
||||
and cached_n >= min(max_sample, len(rows))
|
||||
):
|
||||
@@ -100,8 +114,7 @@ def _sample_mean_bytes(rows: list[dict], cache: dict, max_sample: int = 100) ->
|
||||
return mean
|
||||
|
||||
|
||||
def _expected_remaining_downloads(pending_rows: list[dict]) -> float:
|
||||
"""Estimate how many pending scans will produce a successful mosaic."""
|
||||
def _expected_remaining(pending_rows: list[dict]) -> float:
|
||||
count = 0.0
|
||||
for row in pending_rows:
|
||||
yr = row.get("scan_time", "")[:4]
|
||||
@@ -114,9 +127,17 @@ def _expected_remaining_downloads(pending_rows: list[dict]) -> float:
|
||||
return count
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(description=__doc__)
|
||||
parser.add_argument("--archive", default="archives", help="Archives directory")
|
||||
parser.add_argument("--archive", default="archives")
|
||||
parser.add_argument(
|
||||
"--by-year", action="store_true",
|
||||
help="Add a per-machine × per-year done/failed breakdown table",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
archive = Path(args.archive)
|
||||
@@ -127,7 +148,7 @@ def main() -> None:
|
||||
if not scans_csv.exists():
|
||||
sys.exit(f"scans.csv not found: {scans_csv}")
|
||||
|
||||
# --- Load scans (deduplicated: last row wins per machine+scan_id) ---
|
||||
# --- Load & deduplicate (last row per machine+scan_id) ---
|
||||
latest: dict[tuple[str, str], dict] = {}
|
||||
with open(scans_csv, newline="", encoding="utf-8") as f:
|
||||
for row in csv.DictReader(f):
|
||||
@@ -135,15 +156,18 @@ def main() -> None:
|
||||
latest[key] = row
|
||||
|
||||
by_machine: dict[str, Counter] = {}
|
||||
# machine -> year -> Counter(status -> count)
|
||||
by_machine_year: dict[str, dict[str, Counter]] = {}
|
||||
total_counts: Counter = Counter()
|
||||
downloaded_rows: list[dict] = []
|
||||
pending_rows: list[dict] = []
|
||||
|
||||
for (_machine, _sid), row in latest.items():
|
||||
for (_m, _sid), row in latest.items():
|
||||
status = row.get("mosaic_download_status", "")
|
||||
m = row.get("machine", "")
|
||||
by_machine.setdefault(m, Counter())
|
||||
by_machine[m][status] += 1
|
||||
m = row.get("machine", "")
|
||||
yr = (row.get("scan_time") or "")[:4] or "????"
|
||||
by_machine.setdefault(m, Counter())[status] += 1
|
||||
by_machine_year.setdefault(m, {}).setdefault(yr, Counter())[status] += 1
|
||||
total_counts[status] += 1
|
||||
if status == "downloaded":
|
||||
downloaded_rows.append(row)
|
||||
@@ -159,7 +183,7 @@ def main() -> None:
|
||||
attempted = downloaded + failed
|
||||
now = datetime.now(timezone.utc)
|
||||
|
||||
# --- Elapsed time ---
|
||||
# --- Elapsed ---
|
||||
elapsed_str = ""
|
||||
if progress_json.exists():
|
||||
try:
|
||||
@@ -170,22 +194,22 @@ def main() -> None:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# --- Rate cache (load, compute, update) ---
|
||||
# --- Rate cache ---
|
||||
cache: dict = {}
|
||||
if rate_cache_path.exists():
|
||||
try:
|
||||
cache = json.loads(rate_cache_path.read_text())
|
||||
except Exception:
|
||||
cache = {}
|
||||
pass
|
||||
|
||||
rate_scans_per_sec: float | None = None
|
||||
rate_per_sec: float | None = None
|
||||
prev_ts = _parse_dt(cache.get("timestamp", ""))
|
||||
prev_proc = cache.get("processed")
|
||||
if prev_ts and prev_proc is not None:
|
||||
delta_t = (now - prev_ts).total_seconds()
|
||||
delta_p = processed - int(prev_proc)
|
||||
if delta_t >= 60 and delta_p > 0:
|
||||
rate_scans_per_sec = delta_p / delta_t
|
||||
dt = (now - prev_ts).total_seconds()
|
||||
dp = processed - int(prev_proc)
|
||||
if dt >= 60 and dp > 0:
|
||||
rate_per_sec = dp / dt
|
||||
|
||||
# --- Disk space ---
|
||||
mean_bytes: float | None = None
|
||||
@@ -193,16 +217,13 @@ def main() -> None:
|
||||
if downloaded_rows:
|
||||
mean_bytes = _sample_mean_bytes(downloaded_rows, cache)
|
||||
if mean_bytes and cache.get("sample_n"):
|
||||
size_note = f"(mean {_fmt_size(mean_bytes)} from {cache['sample_n']} files)"
|
||||
size_note = f"mean {_fmt_size(mean_bytes)} × {cache['sample_n']} sampled files"
|
||||
|
||||
downloaded_bytes: float | None = None
|
||||
remaining_bytes: float | None = None
|
||||
total_bytes: float | None = None
|
||||
dl_bytes: float | None = None
|
||||
rem_bytes: float | None = None
|
||||
if mean_bytes:
|
||||
downloaded_bytes = downloaded * mean_bytes
|
||||
exp_remaining = _expected_remaining_downloads(pending_rows)
|
||||
remaining_bytes = exp_remaining * mean_bytes
|
||||
total_bytes = downloaded_bytes + remaining_bytes
|
||||
dl_bytes = downloaded * mean_bytes
|
||||
rem_bytes = _expected_remaining(pending_rows) * mean_bytes
|
||||
|
||||
# Update cache
|
||||
cache["timestamp"] = now.isoformat()
|
||||
@@ -212,57 +233,134 @@ def main() -> None:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# --- Formatted output ---
|
||||
rate_str = ""
|
||||
eta_str = ""
|
||||
if rate_scans_per_sec and rate_scans_per_sec > 0:
|
||||
rate_str = f"{rate_scans_per_sec * 3600:,.0f} scans/hr"
|
||||
eta_str = _fmt_duration(pending / rate_scans_per_sec)
|
||||
rate_str = eta_str = ""
|
||||
if rate_per_sec and rate_per_sec > 0:
|
||||
rate_str = f"{rate_per_sec * 3600:,.0f} scans/hr"
|
||||
eta_str = _fmt_duration(pending / rate_per_sec)
|
||||
|
||||
print(f"Mosaic download progress \u2014 {datetime.now().strftime('%Y-%m-%d %H:%M')}")
|
||||
print(f"Archive: {archive.resolve()}")
|
||||
# -----------------------------------------------------------------------
|
||||
# Output — Markdown
|
||||
# -----------------------------------------------------------------------
|
||||
|
||||
timing_parts = []
|
||||
ts = datetime.now().strftime("%Y-%m-%d %H:%M")
|
||||
print(f"# Mosaic download progress — {ts}\n")
|
||||
print(f"**Archive:** `{archive.resolve()}` ")
|
||||
meta_parts = []
|
||||
if elapsed_str:
|
||||
timing_parts.append(f"Elapsed: {elapsed_str}")
|
||||
meta_parts.append(f"**Elapsed:** {elapsed_str}")
|
||||
if rate_str:
|
||||
timing_parts.append(f"Rate: {rate_str}")
|
||||
meta_parts.append(f"**Rate:** {rate_str}")
|
||||
if eta_str:
|
||||
timing_parts.append(f"ETA: {eta_str}")
|
||||
if timing_parts:
|
||||
print(" ".join(timing_parts))
|
||||
|
||||
meta_parts.append(f"**ETA:** {eta_str}")
|
||||
if meta_parts:
|
||||
print(" ".join(meta_parts) + " ")
|
||||
print()
|
||||
print(f" Downloaded: {downloaded:>8,} ({100*downloaded/total:.1f}%)")
|
||||
print(f" Failed: {failed:>8,} ({100*failed/total:.1f}%)")
|
||||
print(f" Skipped (disk=0): {zero_skipped:>8,} ({100*zero_skipped/total:.1f}%)")
|
||||
print(f" Pending: {pending:>8,} ({100*pending/total:.1f}%)")
|
||||
print(f" Total scans: {total:>8,}")
|
||||
|
||||
# Summary table
|
||||
summary_rows = [
|
||||
["Downloaded", f"{downloaded:,}", f"{100*downloaded/total:.1f}%"],
|
||||
["Failed", f"{failed:,}", f"{100*failed/total:.1f}%"],
|
||||
["Skipped (disk=0)",f"{zero_skipped:,}", f"{100*zero_skipped/total:.1f}%"],
|
||||
["Pending", f"{pending:,}", f"{100*pending/total:.1f}%"],
|
||||
["**Total**", f"**{total:,}**", ""],
|
||||
]
|
||||
if attempted:
|
||||
print(f" Success rate: {100*downloaded/attempted:.1f}% (of attempted)")
|
||||
summary_rows.append(["**Success rate**", f"**{100*downloaded/attempted:.1f}%**", "*(of attempted)*"])
|
||||
print(_md_table(["Metric", "Count", ""], summary_rows, align=["l", "r", "l"]))
|
||||
print()
|
||||
|
||||
if total_bytes is not None:
|
||||
print(f" Disk space {size_note}")
|
||||
print(f" Downloaded so far: {_fmt_size(downloaded_bytes):>10}")
|
||||
print(f" Estimated remaining: {_fmt_size(remaining_bytes):>10} (model-based)")
|
||||
print(f" Grand total: {_fmt_size(total_bytes):>10}")
|
||||
# Disk space
|
||||
if dl_bytes is not None and rem_bytes is not None:
|
||||
total_bytes = dl_bytes + rem_bytes
|
||||
print(f"### Disk space\n")
|
||||
print(f"_{size_note}_\n")
|
||||
ds_rows = [
|
||||
["Downloaded so far", _fmt_size(dl_bytes), ""],
|
||||
["Estimated remaining", _fmt_size(rem_bytes), "*(model-based)*"],
|
||||
["**Grand total**", f"**{_fmt_size(total_bytes)}**", ""],
|
||||
]
|
||||
print(_md_table(["", "Size", ""], ds_rows, align=["l", "r", "l"]))
|
||||
print()
|
||||
|
||||
print(f" {'Machine':<25} {'Done':>7} {'Failed':>7} {'Skip0':>6} {'Pending':>8} {'Total':>7}")
|
||||
print(" " + "-" * 70)
|
||||
for machine in sorted(by_machine):
|
||||
mc = by_machine[machine]
|
||||
# Per-machine breakdown
|
||||
print("### Per-machine breakdown\n")
|
||||
machines = sorted(by_machine)
|
||||
mc_rows = []
|
||||
for m in machines:
|
||||
mc = by_machine[m]
|
||||
mt = sum(mc.values())
|
||||
print(
|
||||
f" {machine:<25} {mc['downloaded']:>7,} {mc['failed']:>7,} "
|
||||
f"{mc['skipped_zero_disk_space']:>6,} {mc['skipped_metadata_only']:>8,} {mt:>7,}"
|
||||
mc_rows.append([
|
||||
m,
|
||||
f"{mc['downloaded']:,}",
|
||||
f"{mc['failed']:,}",
|
||||
f"{mc['skipped_zero_disk_space']:,}",
|
||||
f"{mc['skipped_metadata_only']:,}",
|
||||
f"{mt:,}",
|
||||
])
|
||||
mc_rows.append([
|
||||
"**TOTAL**",
|
||||
f"**{downloaded:,}**",
|
||||
f"**{failed:,}**",
|
||||
f"**{zero_skipped:,}**",
|
||||
f"**{pending:,}**",
|
||||
f"**{total:,}**",
|
||||
])
|
||||
print(_md_table(
|
||||
["Machine", "Done", "Failed", "Skip0", "Pending", "Total"],
|
||||
mc_rows,
|
||||
align=["l", "r", "r", "r", "r", "r"],
|
||||
))
|
||||
|
||||
# -----------------------------------------------------------------------
|
||||
# --by-year table
|
||||
# -----------------------------------------------------------------------
|
||||
if args.by_year:
|
||||
print()
|
||||
print("### Downloads by machine and year\n")
|
||||
print("*Format: done / failed*\n")
|
||||
|
||||
# Only include years that have at least one downloaded or failed scan
|
||||
all_years = sorted(
|
||||
yr for yr in set(
|
||||
yr for m_data in by_machine_year.values() for yr in m_data
|
||||
)
|
||||
if any(
|
||||
by_machine_year[m].get(yr, Counter()).get("downloaded", 0)
|
||||
+ by_machine_year[m].get(yr, Counter()).get("failed", 0) > 0
|
||||
for m in by_machine_year
|
||||
)
|
||||
)
|
||||
print(" " + "-" * 70)
|
||||
print(
|
||||
f" {'TOTAL':<25} {downloaded:>7,} {failed:>7,} "
|
||||
f"{zero_skipped:>6,} {pending:>8,} {total:>7,}"
|
||||
)
|
||||
|
||||
# Totals per year
|
||||
yr_totals: dict[str, Counter] = {}
|
||||
for yr in all_years:
|
||||
yr_totals[yr] = Counter()
|
||||
for m in machines:
|
||||
yr_totals[yr] += by_machine_year.get(m, {}).get(yr, Counter())
|
||||
|
||||
year_rows = []
|
||||
for m in machines:
|
||||
row_cells = [m]
|
||||
for yr in all_years:
|
||||
c = by_machine_year.get(m, {}).get(yr, Counter())
|
||||
d = c.get("downloaded", 0)
|
||||
f = c.get("failed", 0)
|
||||
row_cells.append(f"{d:,} / {f:,}" if (d or f) else "—")
|
||||
year_rows.append(row_cells)
|
||||
|
||||
# Totals row
|
||||
total_cells = ["**TOTAL**"]
|
||||
for yr in all_years:
|
||||
d = yr_totals[yr].get("downloaded", 0)
|
||||
f = yr_totals[yr].get("failed", 0)
|
||||
total_cells.append(f"**{d:,} / {f:,}**" if (d or f) else "—")
|
||||
year_rows.append(total_cells)
|
||||
|
||||
print(_md_table(
|
||||
["Machine"] + all_years,
|
||||
year_rows,
|
||||
align=["l"] + ["r"] * len(all_years),
|
||||
))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user