Add --by-year table and reformat progress report as Markdown

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-13 08:16:22 -04:00
parent e3f0c07119
commit fa4a60843c
1 changed files with 185 additions and 87 deletions
@@ -2,15 +2,14 @@
 """
 Report mosaic download progress from archives/scans.csv.

-Shows elapsed time, processing rate, estimated time remaining, and
-projected disk usage (actual downloaded + model-estimated remaining).
+Output is formatted as Markdown.  Add --by-year for a per-machine ×
+per-year breakdown table.

-Rate/ETA require two calls at least 60s apart.  Mean mosaic size is
-sampled from up to 100 already-downloaded files and cached so later
-calls skip the filesystem hit unless the sample is stale.
+Rate/ETA require two calls at least 60 s apart.  Mean mosaic size is
+sampled from up to 100 already-downloaded files and cached for 1 hour.

 Usage:
-    python scripts/mosaic_progress_report.py [--archive ARCHIVE_DIR]
+    python scripts/mosaic_progress_report.py [--archive DIR] [--by-year]
 """

 import argparse
@@ -23,16 +22,19 @@ from collections import Counter
 from datetime import datetime, timezone
 from pathlib import Path

-
 # Year-based viability model derived from BW1-4 observations:
-#   pre-2019  → kept on server long-term (~100%)
-#   2019-2022 → purged                   (~0%)
-#   2023+     → recent, mostly available (~82%)
+#   pre-2019  → kept on server long-term  (~100 %)
+#   2019-2022 → purged                    (~ 0 %)
+#   2023+     → recent, mostly available  (~82 %)
 _R_PRE19  = 1.00
 _R_PURGED = 0.00
 _R_RECENT = 0.82


+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
 def _parse_dt(s: str) -> datetime | None:
    try:
        return datetime.fromisoformat(s.replace("Z", "+00:00"))
@@ -52,29 +54,41 @@ def _fmt_duration(seconds: float) -> str:
    return f"{m}m {int(seconds % 60):02d}s"


-def _fmt_size(bytes_: float) -> str:
-    if bytes_ >= 1e12:
-        return f"{bytes_/1e12:.2f} TB"
-    if bytes_ >= 1e9:
-        return f"{bytes_/1e9:.2f} GB"
-    if bytes_ >= 1e6:
-        return f"{bytes_/1e6:.1f} MB"
-    return f"{bytes_/1e3:.0f} KB"
+def _fmt_size(b: float) -> str:
+    if b >= 1e12:
+        return f"{b/1e12:.2f} TB"
+    if b >= 1e9:
+        return f"{b/1e9:.2f} GB"
+    if b >= 1e6:
+        return f"{b/1e6:.1f} MB"
+    return f"{b/1e3:.0f} KB"
+
+
+def _md_table(headers: list[str], rows: list[list[str]], *, align: list[str] | None = None) -> str:
+    """Render a Markdown table.  align values: 'l', 'r', 'c' (default 'l')."""
+    if align is None:
+        align = ["l"] * len(headers)
+    sep_map = {"l": ":---", "r": "---:", "c": ":---:"}
+
+    def row_str(cells: list[str]) -> str:
+        return "| " + " | ".join(cells) + " |"
+
+    lines = [
+        row_str(headers),
+        row_str([sep_map.get(a, ":---") for a in align]),
+    ]
+    for r in rows:
+        lines.append(row_str(r))
+    return "\n".join(lines)


 def _sample_mean_bytes(rows: list[dict], cache: dict, max_sample: int = 100) -> float | None:
-    """
-    Return mean mosaic size in bytes, using a cached value when fresh
-    (< 1 hour old and based on a similarly-sized download set).
-    Falls back to filesystem sampling of `rows`.
-    """
    cached_mean = cache.get("mean_bytes")
    cached_n    = cache.get("sample_n", 0)
    cached_ts   = _parse_dt(cache.get("size_ts", ""))
    now = datetime.now(timezone.utc)
    if (
-        cached_mean
-        and cached_ts
+        cached_mean and cached_ts
        and (now - cached_ts).total_seconds() < 3600
        and cached_n >= min(max_sample, len(rows))
    ):
@@ -100,8 +114,7 @@ def _sample_mean_bytes(rows: list[dict], cache: dict, max_sample: int = 100) ->
    return mean


-def _expected_remaining_downloads(pending_rows: list[dict]) -> float:
-    """Estimate how many pending scans will produce a successful mosaic."""
+def _expected_remaining(pending_rows: list[dict]) -> float:
    count = 0.0
    for row in pending_rows:
        yr = row.get("scan_time", "")[:4]
@@ -114,9 +127,17 @@ def _expected_remaining_downloads(pending_rows: list[dict]) -> float:
    return count


+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
 def main() -> None:
    parser = argparse.ArgumentParser(description=__doc__)
-    parser.add_argument("--archive", default="archives", help="Archives directory")
+    parser.add_argument("--archive", default="archives")
+    parser.add_argument(
+        "--by-year", action="store_true",
+        help="Add a per-machine × per-year done/failed breakdown table",
+    )
    args = parser.parse_args()

    archive         = Path(args.archive)
@@ -127,7 +148,7 @@ def main() -> None:
    if not scans_csv.exists():
        sys.exit(f"scans.csv not found: {scans_csv}")

-    # --- Load scans (deduplicated: last row wins per machine+scan_id) ---
+    # --- Load & deduplicate (last row per machine+scan_id) ---
    latest: dict[tuple[str, str], dict] = {}
    with open(scans_csv, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
@@ -135,15 +156,18 @@ def main() -> None:
            latest[key] = row

    by_machine: dict[str, Counter] = {}
+    # machine -> year -> Counter(status -> count)
+    by_machine_year: dict[str, dict[str, Counter]] = {}
    total_counts: Counter = Counter()
    downloaded_rows: list[dict] = []
    pending_rows:    list[dict] = []

-    for (_machine, _sid), row in latest.items():
+    for (_m, _sid), row in latest.items():
        status = row.get("mosaic_download_status", "")
-        m = row.get("machine", "")
-        by_machine.setdefault(m, Counter())
-        by_machine[m][status] += 1
+        m  = row.get("machine", "")
+        yr = (row.get("scan_time") or "")[:4] or "????"
+        by_machine.setdefault(m, Counter())[status] += 1
+        by_machine_year.setdefault(m, {}).setdefault(yr, Counter())[status] += 1
        total_counts[status] += 1
        if status == "downloaded":
            downloaded_rows.append(row)
@@ -159,7 +183,7 @@ def main() -> None:
    attempted    = downloaded + failed
    now          = datetime.now(timezone.utc)

-    # --- Elapsed time ---
+    # --- Elapsed ---
    elapsed_str = ""
    if progress_json.exists():
        try:
@@ -170,22 +194,22 @@ def main() -> None:
        except Exception:
            pass

-    # --- Rate cache (load, compute, update) ---
+    # --- Rate cache ---
    cache: dict = {}
    if rate_cache_path.exists():
        try:
            cache = json.loads(rate_cache_path.read_text())
        except Exception:
-            cache = {}
+            pass

-    rate_scans_per_sec: float | None = None
+    rate_per_sec: float | None = None
    prev_ts   = _parse_dt(cache.get("timestamp", ""))
    prev_proc = cache.get("processed")
    if prev_ts and prev_proc is not None:
-        delta_t = (now - prev_ts).total_seconds()
-        delta_p = processed - int(prev_proc)
-        if delta_t >= 60 and delta_p > 0:
-            rate_scans_per_sec = delta_p / delta_t
+        dt = (now - prev_ts).total_seconds()
+        dp = processed - int(prev_proc)
+        if dt >= 60 and dp > 0:
+            rate_per_sec = dp / dt

    # --- Disk space ---
    mean_bytes: float | None = None
@@ -193,16 +217,13 @@ def main() -> None:
    if downloaded_rows:
        mean_bytes = _sample_mean_bytes(downloaded_rows, cache)
        if mean_bytes and cache.get("sample_n"):
-            size_note = f"(mean {_fmt_size(mean_bytes)} from {cache['sample_n']} files)"
+            size_note = f"mean {_fmt_size(mean_bytes)} × {cache['sample_n']} sampled files"

-    downloaded_bytes: float | None = None
-    remaining_bytes:  float | None = None
-    total_bytes:      float | None = None
+    dl_bytes: float | None = None
+    rem_bytes: float | None = None
    if mean_bytes:
-        downloaded_bytes = downloaded * mean_bytes
-        exp_remaining    = _expected_remaining_downloads(pending_rows)
-        remaining_bytes  = exp_remaining * mean_bytes
-        total_bytes      = downloaded_bytes + remaining_bytes
+        dl_bytes  = downloaded * mean_bytes
+        rem_bytes = _expected_remaining(pending_rows) * mean_bytes

    # Update cache
    cache["timestamp"] = now.isoformat()
@@ -212,57 +233,134 @@ def main() -> None:
    except Exception:
        pass

-    # --- Formatted output ---
-    rate_str = ""
-    eta_str  = ""
-    if rate_scans_per_sec and rate_scans_per_sec > 0:
-        rate_str = f"{rate_scans_per_sec * 3600:,.0f} scans/hr"
-        eta_str  = _fmt_duration(pending / rate_scans_per_sec)
+    rate_str = eta_str = ""
+    if rate_per_sec and rate_per_sec > 0:
+        rate_str = f"{rate_per_sec * 3600:,.0f} scans/hr"
+        eta_str  = _fmt_duration(pending / rate_per_sec)

-    print(f"Mosaic download progress  \u2014  {datetime.now().strftime('%Y-%m-%d %H:%M')}")
-    print(f"Archive: {archive.resolve()}")
+    # -----------------------------------------------------------------------
+    # Output — Markdown
+    # -----------------------------------------------------------------------

-    timing_parts = []
+    ts = datetime.now().strftime("%Y-%m-%d %H:%M")
+    print(f"# Mosaic download progress — {ts}\n")
+    print(f"**Archive:** `{archive.resolve()}`  ")
+    meta_parts = []
    if elapsed_str:
-        timing_parts.append(f"Elapsed: {elapsed_str}")
+        meta_parts.append(f"**Elapsed:** {elapsed_str}")
    if rate_str:
-        timing_parts.append(f"Rate: {rate_str}")
+        meta_parts.append(f"**Rate:** {rate_str}")
    if eta_str:
-        timing_parts.append(f"ETA: {eta_str}")
-    if timing_parts:
-        print("  ".join(timing_parts))
-
+        meta_parts.append(f"**ETA:** {eta_str}")
+    if meta_parts:
+        print("  ".join(meta_parts) + "  ")
    print()
-    print(f"  Downloaded:            {downloaded:>8,}  ({100*downloaded/total:.1f}%)")
-    print(f"  Failed:                {failed:>8,}  ({100*failed/total:.1f}%)")
-    print(f"  Skipped (disk=0):      {zero_skipped:>8,}  ({100*zero_skipped/total:.1f}%)")
-    print(f"  Pending:               {pending:>8,}  ({100*pending/total:.1f}%)")
-    print(f"  Total scans:           {total:>8,}")
+
+    # Summary table
+    summary_rows = [
+        ["Downloaded",      f"{downloaded:,}",   f"{100*downloaded/total:.1f}%"],
+        ["Failed",          f"{failed:,}",        f"{100*failed/total:.1f}%"],
+        ["Skipped (disk=0)",f"{zero_skipped:,}",  f"{100*zero_skipped/total:.1f}%"],
+        ["Pending",         f"{pending:,}",       f"{100*pending/total:.1f}%"],
+        ["**Total**",       f"**{total:,}**",     ""],
+    ]
    if attempted:
-        print(f"  Success rate:          {100*downloaded/attempted:.1f}%  (of attempted)")
+        summary_rows.append(["**Success rate**", f"**{100*downloaded/attempted:.1f}%**", "*(of attempted)*"])
+    print(_md_table(["Metric", "Count", ""], summary_rows, align=["l", "r", "l"]))
    print()

-    if total_bytes is not None:
-        print(f"  Disk space {size_note}")
-        print(f"    Downloaded so far:   {_fmt_size(downloaded_bytes):>10}")
-        print(f"    Estimated remaining: {_fmt_size(remaining_bytes):>10}  (model-based)")
-        print(f"    Grand total:         {_fmt_size(total_bytes):>10}")
+    # Disk space
+    if dl_bytes is not None and rem_bytes is not None:
+        total_bytes = dl_bytes + rem_bytes
+        print(f"### Disk space\n")
+        print(f"_{size_note}_\n")
+        ds_rows = [
+            ["Downloaded so far",    _fmt_size(dl_bytes),    ""],
+            ["Estimated remaining",  _fmt_size(rem_bytes),   "*(model-based)*"],
+            ["**Grand total**",      f"**{_fmt_size(total_bytes)}**", ""],
+        ]
+        print(_md_table(["", "Size", ""], ds_rows, align=["l", "r", "l"]))
        print()

-    print(f"  {'Machine':<25} {'Done':>7}  {'Failed':>7}  {'Skip0':>6}  {'Pending':>8}  {'Total':>7}")
-    print("  " + "-" * 70)
-    for machine in sorted(by_machine):
-        mc = by_machine[machine]
+    # Per-machine breakdown
+    print("### Per-machine breakdown\n")
+    machines = sorted(by_machine)
+    mc_rows = []
+    for m in machines:
+        mc = by_machine[m]
        mt = sum(mc.values())
-        print(
-            f"  {machine:<25} {mc['downloaded']:>7,}  {mc['failed']:>7,}  "
-            f"{mc['skipped_zero_disk_space']:>6,}  {mc['skipped_metadata_only']:>8,}  {mt:>7,}"
+        mc_rows.append([
+            m,
+            f"{mc['downloaded']:,}",
+            f"{mc['failed']:,}",
+            f"{mc['skipped_zero_disk_space']:,}",
+            f"{mc['skipped_metadata_only']:,}",
+            f"{mt:,}",
+        ])
+    mc_rows.append([
+        "**TOTAL**",
+        f"**{downloaded:,}**",
+        f"**{failed:,}**",
+        f"**{zero_skipped:,}**",
+        f"**{pending:,}**",
+        f"**{total:,}**",
+    ])
+    print(_md_table(
+        ["Machine", "Done", "Failed", "Skip0", "Pending", "Total"],
+        mc_rows,
+        align=["l", "r", "r", "r", "r", "r"],
+    ))
+
+    # -----------------------------------------------------------------------
+    # --by-year table
+    # -----------------------------------------------------------------------
+    if args.by_year:
+        print()
+        print("### Downloads by machine and year\n")
+        print("*Format: done / failed*\n")
+
+        # Only include years that have at least one downloaded or failed scan
+        all_years = sorted(
+            yr for yr in set(
+                yr for m_data in by_machine_year.values() for yr in m_data
+            )
+            if any(
+                by_machine_year[m].get(yr, Counter()).get("downloaded", 0)
+                + by_machine_year[m].get(yr, Counter()).get("failed", 0) > 0
+                for m in by_machine_year
+            )
        )
-    print("  " + "-" * 70)
-    print(
-        f"  {'TOTAL':<25} {downloaded:>7,}  {failed:>7,}  "
-        f"{zero_skipped:>6,}  {pending:>8,}  {total:>7,}"
-    )
+
+        # Totals per year
+        yr_totals: dict[str, Counter] = {}
+        for yr in all_years:
+            yr_totals[yr] = Counter()
+            for m in machines:
+                yr_totals[yr] += by_machine_year.get(m, {}).get(yr, Counter())
+
+        year_rows = []
+        for m in machines:
+            row_cells = [m]
+            for yr in all_years:
+                c = by_machine_year.get(m, {}).get(yr, Counter())
+                d = c.get("downloaded", 0)
+                f = c.get("failed", 0)
+                row_cells.append(f"{d:,} / {f:,}" if (d or f) else "—")
+            year_rows.append(row_cells)
+
+        # Totals row
+        total_cells = ["**TOTAL**"]
+        for yr in all_years:
+            d = yr_totals[yr].get("downloaded", 0)
+            f = yr_totals[yr].get("failed", 0)
+            total_cells.append(f"**{d:,} / {f:,}**" if (d or f) else "—")
+        year_rows.append(total_cells)
+
+        print(_md_table(
+            ["Machine"] + all_years,
+            year_rows,
+            align=["l"] + ["r"] * len(all_years),
+        ))


 if __name__ == "__main__":