Scraping resilience, metadata tooling, and repository hygiene
Consolidates mosaic and session hardening (login retry, skip processed scans, no retry on 404, started_at), progress reporting (Markdown tables, by-year rollup, rolling-window rate/ETA), and metadata workflow scripts (run_metadata_scan.sh, scan_progress_report.py, export_machine_metadata.py). Adds mosaic reconstruction sample JPEGs referenced by the report. Updates .gitignore for backup/ and .claude/; sample_random_scans helper is documented for branch testing/sample-runs only (see README).
This commit is contained in:
@@ -0,0 +1,218 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Report metadata-scan progress and projected completion times for all machines.
|
||||
|
||||
Usage:
|
||||
python scripts/scan_progress_report.py [--archive ARCHIVE_DIR] [--recent N] [--mermaid] [--rate-chart]
|
||||
|
||||
Options:
|
||||
--archive DIR Path to archives directory (default: archives)
|
||||
--recent N Number of recent files used to compute current rate (default: 500)
|
||||
--mermaid Also print a Mermaid Gantt chart
|
||||
--rate-chart Also print a Mermaid XY chart of s/scan rate by hour
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import glob
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Machine inventory in canonical report order, as (label, total scan count)
# pairs. Totals are taken from the README inventory (April 2026).
MACHINES = [
    ("BW1-4 [AMR-15]", 6121),
    ("BW1-6 [AMR-19]", 18198),
    ("BW1-7 [AMR-18]", 430),
    ("BW2-8 [AMR-25]", 8191),
    ("BW2-10 [AMR-22]", 16537),
    ("BW2-11 [AMR-23]", 26763),
    ("BW2-13 [AMR-24]", 13537),
    ("BW3-16 [AMR-16]", 7325),
    ("BW3-17 [AMR-20]", 471),
    ("BW3-19 [AMR-21]", 15186),
    ("BW3-20 [AMR-26]", 23052),
    ("BW3-21 [AMR-17]", 10115),
]

# Grand total of scans across every machine listed above.
TOTAL_SCANS = sum(scan_count for _label, scan_count in MACHINES)
|
||||
|
||||
|
||||
def dir_name(label: str) -> str:
    """Turn a machine label into the directory name used under the archive.

    Every character that is not a word character, a hyphen, or a dot is
    replaced by an underscore; underscores at either end are then removed.
    """
    sanitized = re.sub(r"[^\w\-.]", "_", label)
    return sanitized.strip("_")
|
||||
|
||||
|
||||
def get_timestamps(machine_dir: Path) -> list[float]:
    """Return the mtimes of all ``metadata.json`` files below *machine_dir*.

    The directory tree is searched recursively and the modification times
    (seconds since the epoch) are returned in ascending order.
    """
    pattern = str(machine_dir / "**" / "metadata.json")
    mtimes = [os.path.getmtime(path) for path in glob.glob(pattern, recursive=True)]
    mtimes.sort()
    return mtimes
|
||||
|
||||
|
||||
def print_rate_chart(all_timestamps: list[float]) -> None:
    """Print a Mermaid xychart-beta of average s/scan per hour.

    Consecutive timestamps are paired and each gap shorter than 300 seconds
    is credited to the local clock hour of the later timestamp. The average
    gap of every hour except the chronologically last one (treated as
    partial) is plotted as one point of a line chart.

    Args:
        all_timestamps: Completion times in seconds since the epoch.
            The caller in this file passes them sorted ascending; unsorted
            input is not rejected (negative gaps would count as < 300).

    Prints "(not enough data for rate chart)" when fewer than two hourly
    bins exist.
    """
    # Bins are keyed by the hour-truncated datetime rather than a formatted
    # "%m-%d %Hh" string: datetimes sort chronologically even across a year
    # boundary, whereas the string form would place January before December
    # and merge the same hour of different years.
    bins: dict[datetime, list[float]] = defaultdict(list)
    for previous, current in zip(all_timestamps, all_timestamps[1:]):
        gap = current - previous
        if gap < 300:  # ignore inter-machine gaps
            hour = datetime.fromtimestamp(current).replace(
                minute=0, second=0, microsecond=0
            )
            bins[hour].append(gap)

    # Hour of the first qualifying gap (dicts preserve insertion order).
    start_hour = next(iter(bins), None)

    # Drop the last (partial) hour
    hours = sorted(bins)[:-1]

    if not hours or start_hour is None:
        print("(not enough data for rate chart)")
        return

    values = [f"{sum(bins[h]) / len(bins[h]):.2f}" for h in hours]
    # The axis ceiling is derived from the rounded values that are plotted.
    y_max = max(float(v) for v in values)
    y_ceil = int(y_max) + 3
    # NOTE(review): hours without any qualifying gap have no bin, so ``n``
    # counts plotted hours, not wall-clock hours since ``start_hour``.
    n = len(hours)

    # Numeric x-axis: Mermaid auto-picks readable tick positions
    start_label = start_hour.strftime("%b %d %H:%M")
    print("```mermaid")
    print("xychart-beta")
    print(f' title "Metadata scan rate (s/scan) — hourly, starting {start_label}"')
    print(f' x-axis "Hours elapsed" 0 --> {n}')
    print(f' y-axis "s / scan" 0 --> {y_ceil}')
    print(f" line [{', '.join(values)}]")
    print("```")
|
||||
|
||||
|
||||
def fmt_dt(dt: datetime) -> str:
    """Format *dt* as weekday, month, day and 24-hour time (e.g. ``Wed Apr 01 13:05``)."""
    return f"{dt:%a %b %d %H:%M}"
|
||||
|
||||
|
||||
def _seconds_per_scan(timestamps: list[float]) -> "float | None":
    """Return the mean seconds between consecutive sorted timestamps, or None if fewer than two."""
    if len(timestamps) < 2:
        return None
    # N completion times delimit N - 1 scan intervals.
    return (timestamps[-1] - timestamps[0]) / (len(timestamps) - 1)


def _print_gantt(gantt_rows: "list[tuple[str, datetime, datetime, str]]") -> None:
    """Print a Mermaid Gantt chart with one task per (label, start, end, status) row."""
    status_tags = {"done": "done, ", "active": "active, "}

    print("```mermaid")
    print("gantt")
    print(f" title Metadata scan progress — all {len(gantt_rows)} machines")
    print(" dateFormat YYYY-MM-DD HH:mm")
    print(" axisFormat %b %d")
    print()

    section = None
    for label, start, finish, status in gantt_rows:
        prefix = label.split()[0][:3]  # BW1, BW2, BW3
        if prefix != section:
            section = prefix
            print(f" section {section}")
        # Task id: the label without brackets and with spaces turned into hyphens.
        safe = label.replace("[", "").replace("]", "").replace(" ", "-")
        tag = status_tags.get(status, "")
        s = start.strftime("%Y-%m-%d %H:%M")
        e = finish.strftime("%Y-%m-%d %H:%M")
        print(f" {label} :{tag}{safe}, {s}, {e}")
    print("```")


def main() -> None:
    """Print the metadata-scan progress timetable and optional Mermaid charts.

    Reads ``--archive``, ``--recent``, ``--mermaid`` and ``--rate-chart`` from
    the command line, counts the ``metadata.json`` files found for every
    machine in ``MACHINES``, derives a seconds-per-scan rate from their
    modification times, and projects a completion time for each machine by
    chaining the unfinished machines one after another starting from now.

    Exits with an error message when the archive directory does not exist
    or when ``--recent`` is smaller than 2.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--archive", default="archives", help="Archives directory")
    parser.add_argument("--recent", type=int, default=500,
                        help="Files used to compute recent rate (default: 500)")
    parser.add_argument("--mermaid", action="store_true", help="Print Mermaid Gantt chart")
    parser.add_argument("--rate-chart", action="store_true", help="Print Mermaid XY rate-over-time chart")
    args = parser.parse_args()

    # A rate needs two timestamps; 0 or negative values would also make the
    # ``[-recent:]`` slice below select the wrong window.
    if args.recent < 2:
        parser.error("--recent must be at least 2")

    archive = Path(args.archive)
    if not archive.is_dir():
        sys.exit(f"Archive directory not found: {archive}")

    now = datetime.now()

    # --- Gather per-machine data ---
    machine_data = []  # (label, total, done, first_ts, last_ts)
    all_timestamps: list[float] = []

    for label, total in MACHINES:
        mdir = archive / dir_name(label)
        times = get_timestamps(mdir) if mdir.is_dir() else []
        first_ts = datetime.fromtimestamp(times[0]) if times else None
        last_ts = datetime.fromtimestamp(times[-1]) if times else None
        machine_data.append((label, total, len(times), first_ts, last_ts))
        all_timestamps.extend(times)

    all_timestamps.sort()
    total_done = len(all_timestamps)

    # --- Rate calculation ---
    recent_times = all_timestamps[-args.recent:]
    recent_rate = _seconds_per_scan(recent_times)
    overall_rate = _seconds_per_scan(all_timestamps)

    # A rate of exactly 0.0 is treated like a missing rate, so the projection
    # never collapses to "finishes immediately".
    rate = recent_rate or overall_rate or 5.0  # fallback

    # --- Print timetable ---
    print(f"Metadata scan progress — {now.strftime('%Y-%m-%d %H:%M')}")
    print(f"Overall rate : {overall_rate:.2f} s/scan" if overall_rate else "Overall rate : n/a")
    # Report the number of files actually in the window, which can be smaller
    # than --recent early in a run.
    print(f"Recent rate : {recent_rate:.2f} s/scan (last {len(recent_times)} files)" if recent_rate else "Recent rate : n/a")
    print(f"Rate used : {rate:.2f} s/scan")
    print(f"Done : {total_done:,} / {TOTAL_SCANS:,} ({100*total_done/TOTAL_SCANS:.1f}%)")
    print()
    print(f"{'Machine':<20} {'Total':>7} {'Done':>7} {'Pct':>6} {'Completion'}")
    print("-" * 68)

    # ``cursor`` is the projected moment the next unfinished machine can start.
    cursor = now
    gantt_rows: list[tuple[str, datetime, datetime, str]] = []  # label, start, end, status

    for label, total, done, first_ts, last_ts in machine_data:
        pct = 100 * done / total if total else 0

        # Machines at 99.9% or more are counted as complete.
        complete = done >= total or (done > 0 and done / total >= 0.999)

        if done == 0:
            # Not started yet
            start = cursor
            finish = cursor + timedelta(seconds=total * rate)
            status = "pending"
            # Percent placeholder is 6 wide to line up with "{pct:>5.1f}%".
            print(f"{label:<20} {total:>7,} {'—':>7} {'—':>6} {fmt_dt(finish)}")
        elif complete:
            # Complete — use actual timestamps
            start = first_ts
            finish = last_ts
            status = "done"
            print(f"{label:<20} {total:>7,} {done:>7,} {pct:>5.1f}% complete")
        else:
            # In progress
            remaining = total - done
            start = first_ts
            finish = cursor + timedelta(seconds=remaining * rate)
            status = "active"
            print(f"{label:<20} {total:>7,} {done:>7,} {pct:>5.1f}% {fmt_dt(finish)} ← in progress")

        gantt_rows.append((label, start or cursor, finish, status))
        if status != "done":
            cursor = finish

    print("-" * 68)
    print(f"{'All done':<20} {TOTAL_SCANS:>7,} {total_done:>7,} {100*total_done/TOTAL_SCANS:>5.1f}% {fmt_dt(cursor)}")

    # --- Mermaid rate chart ---
    if args.rate_chart:
        print()
        print_rate_chart(all_timestamps)

    # --- Mermaid Gantt ---
    if args.mermaid:
        print()
        _print_gantt(gantt_rows)
|
||||
|
||||
|
||||
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user