Testing: sample_random_scans helper, run checkpoint, investigation manifest

Scraping resilience, metadata tooling, and repository hygiene
Consolidates mosaic and session hardening (login retry, skip processed scans, no retry on 404, started_at), progress reporting (Markdown tables, by-year rollup, rolling-window rate/ETA), and metadata workflow scripts (run_metadata_scan.sh, scan_progress_report.py, export_machine_metadata.py). Adds mosaic reconstruction sample JPEGs referenced by the report. Updates .gitignore for backup/ and .claude/; sample_random_scans helper is documented for branch testing/sample-runs only (see README).
2026-05-14 19:53:56 -04:00 · 2026-05-14 19:52:53 -04:00
18 changed files with 344 additions and 5 deletions
@@ -7,3 +7,6 @@ __pycache__/
 .DS_Store
 explore_dumps/
 .venv/
+scripts/sync_to_nas.sh
+backup/
+.claude/
@@ -97,10 +97,8 @@ python scraper.py --machine "BW3-20 [AMR-26]" --mosaic-only
 # Download mosaics for all machines
 python scraper.py --mosaic-only

-# One random completed scan per machine: mosaic + all tiles (from machines.txt; uses --list-scans + --scan-id)
-# MOSAIC_ONLY=1 ./scripts/sample_random_scans.sh machines.txt   # optional: mosaics only, no tiles
-# cp scripts/machines.example.txt machines.txt   # then edit: one label per line
-# ./scripts/sample_random_scans.sh machines.txt
+# One random completed scan per machine (helper script): check out branch `testing/sample-runs`,
+# then see `scripts/sample_random_scans.sh` and `docs/sample_random_scans_run_progress.md`.

 # Download all tiles for a specific scan
 python scraper.py --machine "BW3-20 [AMR-26]" --scan-id 158374 --workers 4
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+# Run a metadata-only scrape inside a dedicated virtualenv.
+#
+# On first use this creates the venv, installs requirements.txt (looked up next
+# to this script) into it, then runs scraper.py --metadata-only, forwarding any
+# extra command-line arguments unchanged.
+#
+# Environment:
+#   SPRUCE_VENV   venv location (default: /tmp/spruce_venv)
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+VENV="${SPRUCE_VENV:-/tmp/spruce_venv}"
+# Written only after pip succeeds, so a failed or interrupted install is
+# retried on the next run instead of leaving a half-provisioned venv in use.
+STAMP="$VENV/.requirements-installed"
+
+if [[ ! -x "$VENV/bin/python" || ! -f "$STAMP" ]]; then
+    echo "Setting up venv at $VENV..."
+    python3 -m venv "$VENV"
+    "$VENV/bin/python" -m ensurepip --upgrade
+    # Invoke pip through the venv interpreter so the right environment is
+    # targeted even if the bin/pip launcher is missing or stale.
+    "$VENV/bin/python" -m pip install -q -r "$SCRIPT_DIR/requirements.txt"
+    touch "$STAMP"
+fi
+
+echo "Starting metadata-only scan of all machines..."
+"$VENV/bin/python" "$SCRIPT_DIR/scraper.py" --metadata-only "$@"
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+"""
+Split scans.csv into per-machine metadata CSVs.
+
+Reads the combined scans.csv produced by the scraper and writes one CSV per
+machine containing only the website-sourced metadata columns (no mosaic paths,
+download status, or error fields).
+
+Usage:
+  python scripts/export_machine_metadata.py
+  python scripts/export_machine_metadata.py --input archives/scans.csv --output-dir archives/by_machine
+"""
+
+from __future__ import annotations
+
+import argparse
+import csv
+import sys
+from collections import defaultdict
+from pathlib import Path
+
+# Columns copied into every per-machine CSV, in output order.  main() requires
+# all of them to be present in the input and drops any other column on write.
+METADATA_COLUMNS = [
+    "machine", "machine_id", "scan_id", "name", "scan_time",
+    "start_x", "start_y", "end_x", "end_y",
+    "dx", "dy", "nx", "ny",
+    "total_tiles", "scan_lines", "scan_mode",
+    "start_datetime", "end_datetime",
+    "status", "user", "disk_space_mb",
+]
+
+
+def sanitize_machine_label(label: str) -> str:
+    """Turn a machine label into a string that is safe as a filename stem.
+
+    Square brackets are removed and spaces become underscores.  Any remaining
+    character that is not alphanumeric or one of ``-``, ``_``, ``.`` (path
+    separators in particular) is replaced by an underscore so the result can
+    never point into another directory.  Leading and trailing underscores are
+    stripped.
+    """
+    cleaned = label.replace("[", "").replace("]", "").replace(" ", "_")
+    cleaned = "".join(
+        ch if (ch.isalnum() or ch in "-_.") else "_" for ch in cleaned
+    )
+    return cleaned.strip("_")
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the command line: ``--input`` (source CSV) and ``--output-dir``."""
+    parser = argparse.ArgumentParser(
+        description="Split scans.csv into per-machine metadata CSVs."
+    )
+    # (flag, default, metavar, help) for each string-valued option.
+    option_specs = (
+        (
+            "--input",
+            "archives/scans.csv",
+            "FILE",
+            "Path to scans.csv (default: archives/scans.csv)",
+        ),
+        (
+            "--output-dir",
+            "archives/by_machine",
+            "DIR",
+            "Directory for output CSVs (default: archives/by_machine)",
+        ),
+    )
+    for flag, default_value, placeholder, help_text in option_specs:
+        parser.add_argument(
+            flag, default=default_value, metavar=placeholder, help=help_text
+        )
+    return parser.parse_args()
+
+
+def main() -> None:
+    """Split the combined scans CSV into one metadata CSV per machine.
+
+    Exits with a message when the input file is missing, has no header row,
+    lacks any column listed in METADATA_COLUMNS, or when two machine labels
+    would be written to the same output file.
+    """
+    args = parse_args()
+    input_path = Path(args.input)
+    output_dir = Path(args.output_dir)
+
+    if not input_path.exists():
+        sys.exit(f"Input file not found: {input_path}")
+
+    rows_by_machine: dict[str, list[dict]] = defaultdict(list)
+    with input_path.open(newline="") as fh:
+        reader = csv.DictReader(fh)
+        if reader.fieldnames is None:
+            sys.exit(f"{input_path} appears to be empty.")
+
+        missing = [c for c in METADATA_COLUMNS if c not in reader.fieldnames]
+        if missing:
+            sys.exit(f"Expected columns not found in {input_path}: {missing}")
+
+        for row in reader:
+            # DictReader fills fields absent from a short row with None; fold
+            # that into the empty label so grouping and sorting only see str.
+            rows_by_machine[row["machine"] or ""].append(row)
+
+    # Resolve every output path before writing anything, so a filename
+    # collision aborts the run instead of silently overwriting another
+    # machine's file.
+    label_by_path: dict[Path, str] = {}
+    for machine_label in sorted(rows_by_machine):
+        # An empty/blank label would otherwise yield "_scans_metadata.csv".
+        safe_name = sanitize_machine_label(machine_label) or "unknown"
+        out_path = output_dir / f"{safe_name}_scans_metadata.csv"
+        if out_path in label_by_path:
+            sys.exit(
+                f"Machine labels {label_by_path[out_path]!r} and "
+                f"{machine_label!r} both map to {out_path}; refusing to overwrite."
+            )
+        label_by_path[out_path] = machine_label
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    for out_path, machine_label in label_by_path.items():
+        rows = rows_by_machine[machine_label]
+        with out_path.open("w", newline="") as fh:
+            # extrasaction="ignore" drops every column not in METADATA_COLUMNS.
+            writer = csv.DictWriter(fh, fieldnames=METADATA_COLUMNS, extrasaction="ignore")
+            writer.writeheader()
+            writer.writerows(rows)
+        print(f"  {out_path}  ({len(rows)} rows)")
+
+    total = sum(len(r) for r in rows_by_machine.values())
+    print(f"\n{len(rows_by_machine)} machine(s), {total} total rows → {output_dir}/")
+
+
+if __name__ == "__main__":
+    main()
@@ -1,6 +1,6 @@
 # All RootView minirhizotron machine labels (same set as `machine_metadata` in config.example.yaml).
 # Copy to the repo root as machines.txt, or: cp scripts/machines.example.txt machines.txt
-# sample_random_scans.sh: by default one random scan per line = mosaic + tiles; use MOSAIC_ONLY=1 for mosaics only
+# Random-sample helper `scripts/sample_random_scans.sh` lives on branch `testing/sample-runs` only.
 BW1-4 [AMR-15]
 BW1-6 [AMR-19]
 BW1-7 [AMR-18]
@@ -0,0 +1,218 @@
+#!/usr/bin/env python3
+"""
+Report metadata-scan progress and projected completion times for all machines.
+
+Usage:
+    python scripts/scan_progress_report.py [--archive ARCHIVE_DIR] [--recent N] [--mermaid] [--rate-chart]
+
+Options:
+    --archive DIR   Path to archives directory (default: archives)
+    --recent N      Number of recent files used to compute current rate (default: 500)
+    --mermaid       Also print a Mermaid Gantt chart
+    --rate-chart    Also print a Mermaid XY chart of s/scan rate by hour
+"""
+
+import argparse
+import glob
+import os
+import re
+import sys
+from collections import defaultdict
+from datetime import datetime, timedelta
+from pathlib import Path
+
+
+# Machines in canonical order, each paired with its expected total number of
+# scans (figures taken from the README inventory, April 2026).
+MACHINES = [
+    ("BW1-4 [AMR-15]", 6121),
+    ("BW1-6 [AMR-19]", 18198),
+    ("BW1-7 [AMR-18]", 430),
+    ("BW2-8 [AMR-25]", 8191),
+    ("BW2-10 [AMR-22]", 16537),
+    ("BW2-11 [AMR-23]", 26763),
+    ("BW2-13 [AMR-24]", 13537),
+    ("BW3-16 [AMR-16]", 7325),
+    ("BW3-17 [AMR-20]", 471),
+    ("BW3-19 [AMR-21]", 15186),
+    ("BW3-20 [AMR-26]", 23052),
+    ("BW3-21 [AMR-17]", 10115),
+]
+# Expected scan count across all machines.
+TOTAL_SCANS = sum(scan_count for _label, scan_count in MACHINES)
+
+
+def dir_name(label: str) -> str:
+    """Map a machine label to the directory name used under the archive root.
+
+    Every character other than a word character, ``-`` or ``.`` becomes an
+    underscore; leading and trailing underscores are then removed.
+    """
+    underscored = re.sub(r"[^\w\-.]", "_", label)
+    return underscored.strip("_")
+
+
+def get_timestamps(machine_dir: Path) -> list[float]:
+    """Return the sorted modification times of every metadata.json under a directory.
+
+    The search is recursive.  The directory path itself is escaped before it
+    is placed in the glob pattern, so characters such as ``[`` or ``*`` in the
+    archive path are matched literally instead of being read as wildcards.
+    """
+    pattern = os.path.join(glob.escape(str(machine_dir)), "**", "metadata.json")
+    return sorted(os.path.getmtime(f) for f in glob.glob(pattern, recursive=True))
+
+
+def print_rate_chart(all_timestamps: list[float]) -> None:
+    """Print a Mermaid xychart-beta of the average s/scan for each hour.
+
+    Consecutive timestamps are paired into gaps; gaps of 300 s or more are
+    discarded, and each remaining gap is binned under the local clock hour of
+    its later timestamp.  Bins are keyed by the hour-floored datetime so they
+    sort chronologically even when the data crosses a year boundary.  The
+    latest bin is treated as partial and left out.  Prints a placeholder line
+    instead of a chart when no complete bin remains.
+
+    Expects ``all_timestamps`` (epoch seconds) in ascending order.
+    """
+    bins: dict[datetime, list[float]] = defaultdict(list)
+    for earlier, later in zip(all_timestamps, all_timestamps[1:]):
+        gap = later - earlier
+        if gap < 300:  # ignore inter-machine gaps
+            hour = datetime.fromtimestamp(later).replace(
+                minute=0, second=0, microsecond=0
+            )
+            bins[hour].append(gap)
+
+    # Drop the last (partial) hour
+    hours = sorted(bins)[:-1]
+    if not hours:
+        print("(not enough data for rate chart)")
+        return
+
+    # Formatted first, then parsed back, so the y-axis ceiling is derived from
+    # exactly the rounded numbers that are plotted.
+    values = [f"{sum(bins[h]) / len(bins[h]):.2f}" for h in hours]
+    y_ceil = int(max(float(v) for v in values)) + 3
+    n = len(hours)
+
+    # The x-axis is a plain numeric range; one value is emitted per hour that
+    # has data, in chronological order.
+    start_label = hours[0].strftime("%b %d %H:%M")
+    print("```mermaid")
+    print("xychart-beta")
+    print(f'    title "Metadata scan rate (s/scan) — hourly, starting {start_label}"')
+    print(f'    x-axis "Hours elapsed" 0 --> {n}')
+    print(f'    y-axis "s / scan" 0 --> {y_ceil}')
+    print(f"    line [{', '.join(values)}]")
+    print("```")
+
+
+def fmt_dt(dt: datetime) -> str:
+    """Format a datetime as weekday, month, day and HH:MM for the report table."""
+    return f"{dt:%a %b %d  %H:%M}"
+
+
+def _mean_interval(times: list[float]):
+    """Return the mean gap in seconds between consecutive sorted timestamps.
+
+    ``len(times)`` timestamps bound ``len(times) - 1`` intervals, so that is
+    the divisor.  Returns None when fewer than two timestamps are given.
+    """
+    if len(times) < 2:
+        return None
+    return (times[-1] - times[0]) / (len(times) - 1)
+
+
+def main() -> None:
+    """Print a progress table with projected completion times per machine.
+
+    Counts the metadata.json files under each machine's archive directory,
+    derives a seconds-per-scan rate from their modification times, and projects
+    when each unfinished machine will complete, assuming the machines are
+    processed one after another in MACHINES order.  With --rate-chart and/or
+    --mermaid the corresponding Mermaid blocks are printed after the table.
+
+    Exits with a message when the archive directory does not exist, and with a
+    usage error when --recent is smaller than 2.
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--archive", default="archives", help="Archives directory")
+    parser.add_argument("--recent", type=int, default=500,
+                        help="Files used to compute recent rate (default: 500)")
+    parser.add_argument("--mermaid", action="store_true", help="Print Mermaid Gantt chart")
+    parser.add_argument("--rate-chart", action="store_true", help="Print Mermaid XY rate-over-time chart")
+    args = parser.parse_args()
+    if args.recent < 2:
+        # A rate needs at least one interval, i.e. two timestamps; zero or
+        # negative values would also change what the tail slice below selects.
+        parser.error("--recent must be at least 2")
+
+    archive = Path(args.archive)
+    if not archive.is_dir():
+        sys.exit(f"Archive directory not found: {archive}")
+
+    now = datetime.now()
+
+    # --- Gather per-machine data ---
+    machine_data = []  # (label, total, done, first_ts, last_ts, times)
+    all_timestamps: list[float] = []
+
+    for label, total in MACHINES:
+        mdir = archive / dir_name(label)
+        times = get_timestamps(mdir) if mdir.is_dir() else []
+        first_ts = datetime.fromtimestamp(times[0]) if times else None
+        last_ts = datetime.fromtimestamp(times[-1]) if times else None
+        machine_data.append((label, total, len(times), first_ts, last_ts, times))
+        all_timestamps.extend(times)
+
+    all_timestamps.sort()
+    total_done = len(all_timestamps)
+
+    # --- Rate calculation ---
+    recent_rate = _mean_interval(all_timestamps[-args.recent:])
+    overall_rate = _mean_interval(all_timestamps)
+
+    # `or` on purpose: a rate of 0.0 (all files share one mtime) is as useless
+    # for projection as a missing rate, so fall through to the next candidate.
+    rate = recent_rate or overall_rate or 5.0  # fallback
+
+    # --- Print timetable ---
+    print(f"Metadata scan progress  —  {now.strftime('%Y-%m-%d %H:%M')}")
+    print(f"Overall rate : {overall_rate:.2f} s/scan" if overall_rate else "Overall rate : n/a")
+    print(f"Recent rate  : {recent_rate:.2f} s/scan (last {args.recent} files)" if recent_rate else "Recent rate  : n/a")
+    print(f"Rate used    : {rate:.2f} s/scan")
+    print(f"Done         : {total_done:,} / {TOTAL_SCANS:,}  ({100*total_done/TOTAL_SCANS:.1f}%)")
+    print()
+    print(f"{'Machine':<20} {'Total':>7}  {'Done':>7}  {'Pct':>6}  {'Completion'}")
+    print("-" * 68)
+
+    # `cursor` is the projected moment the next unfinished machine can start;
+    # it advances past each pending/active machine in turn.
+    cursor = now
+    gantt_rows: list[tuple[str, datetime, datetime, str]] = []  # label, start, end, status
+
+    for label, total, done, first_ts, last_ts, _times in machine_data:
+        pct = 100 * done / total if total else 0
+
+        # Treat a machine as complete once at least 99.9% of its expected
+        # scans are present.
+        complete = done >= total or (done > 0 and done / total >= 0.999)
+
+        if done == 0:
+            # Not started yet
+            start = cursor
+            finish = cursor + timedelta(seconds=total * rate)
+            status = "pending"
+            print(f"{label:<20} {total:>7,}  {'—':>7}   {'—':>5}   {fmt_dt(finish)}")
+        elif complete:
+            # Complete — use actual timestamps
+            start = first_ts
+            finish = last_ts
+            status = "done"
+            print(f"{label:<20} {total:>7,}  {done:>7,}  {pct:>5.1f}%  complete")
+        else:
+            # In progress
+            remaining = total - done
+            start = first_ts
+            finish = cursor + timedelta(seconds=remaining * rate)
+            status = "active"
+            print(f"{label:<20} {total:>7,}  {done:>7,}  {pct:>5.1f}%  {fmt_dt(finish)}  ← in progress")
+
+        gantt_rows.append((label, start or cursor, finish, status))
+        if status != "done":
+            cursor = finish
+
+    print("-" * 68)
+    print(f"{'All done':<20} {TOTAL_SCANS:>7,}  {total_done:>7,}  {100*total_done/TOTAL_SCANS:>5.1f}%  {fmt_dt(cursor)}")
+
+    # --- Mermaid rate chart ---
+    if args.rate_chart:
+        print()
+        print_rate_chart(all_timestamps)
+
+    # --- Mermaid Gantt ---
+    if args.mermaid:
+        print()
+        print("```mermaid")
+        print("gantt")
+        print(f"    title Metadata scan progress — all {len(MACHINES)} machines")
+        print("    dateFormat YYYY-MM-DD HH:mm")
+        print("    axisFormat %b %d")
+        print()
+
+        gantt_tags = {"done": "done, ", "active": "active, "}
+        section = None
+        for label, start, finish, status in gantt_rows:
+            prefix = label.split()[0][:3]  # BW1, BW2, BW3
+            if prefix != section:
+                section = prefix
+                print(f"    section {section}")
+            safe = label.replace("[", "").replace("]", "").replace(" ", "-")
+            tag = gantt_tags.get(status, "")  # pending rows carry no tag
+            s = start.strftime("%Y-%m-%d %H:%M")
+            e = finish.strftime("%Y-%m-%d %H:%M")
+            print(f"    {label} :{tag}{safe}, {s}, {e}")
+        print("```")
+
+
+if __name__ == "__main__":
+    main()