#!/usr/bin/env python3
"""
Report metadata-scan progress and projected completion times for all machines.

Usage:
    python scripts/scan_progress_report.py [--archive ARCHIVE_DIR] [--recent N] [--mermaid] [--rate-chart]

Options:
    --archive DIR   Path to archives directory (default: archives)
    --recent N      Number of recent files used to compute current rate (default: 500)
    --mermaid       Also print a Mermaid Gantt chart
    --rate-chart    Also print a Mermaid XY chart of s/scan rate by hour
"""

import argparse
import glob
import os
import re
import sys
from collections import defaultdict
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional

# Canonical machine order and total scan counts (from README inventory, April 2026)
MACHINES = [
    ("BW1-4 [AMR-15]", 6121),
    ("BW1-6 [AMR-19]", 18198),
    ("BW1-7 [AMR-18]", 430),
    ("BW2-8 [AMR-25]", 8191),
    ("BW2-10 [AMR-22]", 16537),
    ("BW2-11 [AMR-23]", 26763),
    ("BW2-13 [AMR-24]", 13537),
    ("BW3-16 [AMR-16]", 7325),
    ("BW3-17 [AMR-20]", 471),
    ("BW3-19 [AMR-21]", 15186),
    ("BW3-20 [AMR-26]", 23052),
    ("BW3-21 [AMR-17]", 10115),
]

TOTAL_SCANS = sum(total for _, total in MACHINES)

# Gaps of this many seconds or more are excluded from the hourly rate chart
# (the chart treats them as inter-machine gaps rather than scan time).
MAX_SCAN_GAP_SECONDS = 300

# Seconds per scan assumed when no rate can be measured from the archive.
FALLBACK_SECONDS_PER_SCAN = 5.0


def dir_name(label: str) -> str:
    """Return the archive directory name for a machine label.

    Every character other than word characters, ``-`` and ``.`` becomes
    ``_``; leading and trailing underscores are then stripped.
    """
    return re.sub(r"[^\w\-.]", "_", label).strip("_")


def get_timestamps(machine_dir: Path) -> list[float]:
    """Return the sorted mtimes of every ``metadata.json`` below *machine_dir*.

    The directory part of the pattern is escaped so that glob
    metacharacters in the path (``[``, ``]``, ``*``, ``?``) are matched
    literally instead of silently yielding no files.
    """
    pattern = os.path.join(glob.escape(str(machine_dir)), "**", "metadata.json")
    return sorted(os.path.getmtime(f) for f in glob.glob(pattern, recursive=True))


def seconds_per_scan(timestamps: list[float]) -> Optional[float]:
    """Return the mean interval between consecutive timestamps.

    *timestamps* must be sorted ascending.  N timestamps bound N - 1
    intervals, so the elapsed span is divided by ``len(timestamps) - 1``.
    Returns ``None`` when there are fewer than two timestamps or when the
    span is zero (no usable rate).
    """
    if len(timestamps) < 2:
        return None
    span = timestamps[-1] - timestamps[0]
    if span <= 0:
        return None
    return span / (len(timestamps) - 1)


def print_rate_chart(all_timestamps: list[float]) -> None:
    """Print a Mermaid xychart-beta of average s/scan per hour.

    Consecutive-timestamp gaps shorter than ``MAX_SCAN_GAP_SECONDS`` are
    grouped by the local clock hour of the later timestamp and averaged.
    The last hour bin is dropped as partial.  Prints a placeholder message
    when no complete hour remains.
    """
    # Sort defensively: an unsorted input would produce negative gaps that
    # pass the "< MAX_SCAN_GAP_SECONDS" filter.
    ordered = sorted(all_timestamps)

    # Key bins by the hour-truncated datetime (not a formatted string) so the
    # bins sort chronologically even across a year boundary.
    bins: dict[datetime, list[float]] = defaultdict(list)
    for previous, current in zip(ordered, ordered[1:]):
        gap = current - previous
        if gap < MAX_SCAN_GAP_SECONDS:  # ignore inter-machine gaps
            hour = datetime.fromtimestamp(current).replace(
                minute=0, second=0, microsecond=0
            )
            bins[hour].append(gap)

    # Drop the last (partial) hour
    hours = sorted(bins)[:-1]
    if not hours:
        print("(not enough data for rate chart)")
        return

    values = [f"{sum(bins[h]) / len(bins[h]):.2f}" for h in hours]
    # The axis ceiling is derived from the rounded values that are plotted.
    y_ceil = int(max(float(v) for v in values)) + 3
    n = len(hours)

    # Numeric x-axis: Mermaid auto-picks readable tick positions
    start_label = hours[0].strftime("%b %d %H:%M")
    print("```mermaid")
    print("xychart-beta")
    print(f' title "Metadata scan rate (s/scan) — hourly, starting {start_label}"')
    print(f' x-axis "Hours elapsed" 0 --> {n}')
    print(f' y-axis "s / scan" 0 --> {y_ceil}')
    print(f" line [{', '.join(values)}]")
    print("```")


def fmt_dt(dt: datetime) -> str:
    """Format *dt* as e.g. ``Mon Apr 06 09:05`` for the timetable."""
    return dt.strftime("%a %b %d %H:%M")


def positive_int(value: str) -> int:
    """argparse ``type=`` callable accepting only integers >= 1.

    Raises:
        argparse.ArgumentTypeError: if *value* is not an integer or is < 1.
    """
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
    if number < 1:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {number}")
    return number


def print_gantt(gantt_rows: list[tuple[str, datetime, datetime, str]]) -> None:
    """Print a Mermaid Gantt chart for rows of (label, start, end, status).

    Rows are grouped into sections by the first three characters of the
    label's first word (BW1, BW2, BW3); a new section header is printed
    whenever that prefix changes.
    """
    status_tags = {"done": "done, ", "active": "active, "}

    print("```mermaid")
    print("gantt")
    print(f" title Metadata scan progress — all {len(MACHINES)} machines")
    print(" dateFormat YYYY-MM-DD HH:mm")
    print(" axisFormat %b %d")
    print()
    section = None
    for label, start, finish, status in gantt_rows:
        prefix = label.split()[0][:3]  # BW1, BW2, BW3
        if prefix != section:
            section = prefix
            print(f" section {section}")
        # Task id: the label without brackets, spaces replaced by dashes.
        safe = label.replace("[", "").replace("]", "").replace(" ", "-")
        tag = status_tags.get(status, "")
        s = start.strftime("%Y-%m-%d %H:%M")
        e = finish.strftime("%Y-%m-%d %H:%M")
        print(f" {label} :{tag}{safe}, {s}, {e}")
    print("```")


def main() -> None:
    """Parse CLI arguments, scan the archive, and print the progress report.

    Exits with an error message when the archive directory does not exist.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # Keep the usage/options layout of the module docstring intact.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--archive", default="archives", help="Archives directory")
    parser.add_argument(
        "--recent",
        type=positive_int,
        default=500,
        help="Files used to compute recent rate (default: 500)",
    )
    parser.add_argument("--mermaid", action="store_true", help="Print Mermaid Gantt chart")
    parser.add_argument(
        "--rate-chart",
        action="store_true",
        help="Print Mermaid XY rate-over-time chart",
    )
    args = parser.parse_args()

    archive = Path(args.archive)
    if not archive.is_dir():
        sys.exit(f"Archive directory not found: {archive}")

    now = datetime.now()

    # --- Gather per-machine data ---
    machine_data = []  # (label, total, done, first_ts, last_ts)
    all_timestamps: list[float] = []

    for label, total in MACHINES:
        mdir = archive / dir_name(label)
        times = get_timestamps(mdir) if mdir.is_dir() else []
        first_ts = datetime.fromtimestamp(times[0]) if times else None
        last_ts = datetime.fromtimestamp(times[-1]) if times else None
        machine_data.append((label, total, len(times), first_ts, last_ts))
        all_timestamps.extend(times)

    all_timestamps.sort()
    total_done = sum(done for _, _, done, *_ in machine_data)

    # --- Rate calculation ---
    # args.recent is validated >= 1, so the negative slice always selects the
    # newest files (a 0 would have selected the whole list).
    recent_rate = seconds_per_scan(all_timestamps[-args.recent:])
    overall_rate = seconds_per_scan(all_timestamps)
    rate = recent_rate or overall_rate or FALLBACK_SECONDS_PER_SCAN  # fallback

    # --- Print timetable ---
    print(f"Metadata scan progress — {now.strftime('%Y-%m-%d %H:%M')}")
    if overall_rate is not None:
        print(f"Overall rate : {overall_rate:.2f} s/scan")
    else:
        print("Overall rate : n/a")
    if recent_rate is not None:
        print(f"Recent rate : {recent_rate:.2f} s/scan (last {args.recent} files)")
    else:
        print("Recent rate : n/a")
    print(f"Rate used : {rate:.2f} s/scan")
    print(f"Done : {total_done:,} / {TOTAL_SCANS:,} ({100*total_done/TOTAL_SCANS:.1f}%)")
    print()
    print(f"{'Machine':<20} {'Total':>7} {'Done':>7} {'Pct':>6} {'Completion'}")
    print("-" * 68)

    # Unfinished machines are projected back-to-back starting from now;
    # `cursor` is the projected finish time of the most recent unfinished one.
    cursor = now
    gantt_rows: list[tuple[str, datetime, datetime, str]] = []  # label, start, end, status

    for label, total, done, first_ts, last_ts in machine_data:
        pct = 100 * done / total if total else 0
        # Machines at >= 99.9% of the expected count are treated as complete.
        complete = done >= total or (done > 0 and done / total >= 0.999)

        if done == 0:
            # Not started yet
            start = cursor
            finish = cursor + timedelta(seconds=total * rate)
            status = "pending"
            print(f"{label:<20} {total:>7,} {'—':>7} {'—':>5} {fmt_dt(finish)}")
        elif complete:
            # Complete — use actual timestamps
            start = first_ts
            finish = last_ts
            status = "done"
            print(f"{label:<20} {total:>7,} {done:>7,} {pct:>5.1f}% complete")
        else:
            # In progress
            remaining = total - done
            start = first_ts
            finish = cursor + timedelta(seconds=remaining * rate)
            status = "active"
            print(f"{label:<20} {total:>7,} {done:>7,} {pct:>5.1f}% {fmt_dt(finish)} ← in progress")

        gantt_rows.append((label, start or cursor, finish, status))
        if status != "done":
            cursor = finish

    print("-" * 68)
    print(f"{'All done':<20} {TOTAL_SCANS:>7,} {total_done:>7,} {100*total_done/TOTAL_SCANS:>5.1f}% {fmt_dt(cursor)}")

    # --- Mermaid rate chart ---
    if args.rate_chart:
        print()
        print_rate_chart(all_timestamps)

    # --- Mermaid Gantt ---
    if args.mermaid:
        print()
        print_gantt(gantt_rows)


if __name__ == "__main__":
    main()