Merge main into testing/sample-runs

2026-05-14 19:44:26 -04:00
parent 1e1695a27d 9f341ea27d
commit 4b06ab4516
19 changed files with 344 additions and 183 deletions
@@ -7,3 +7,6 @@ __pycache__/
 .DS_Store
 explore_dumps/
 .venv/
 scripts/sync_to_nas.sh
 backup/
 .claude/
@@ -97,10 +97,8 @@ python scraper.py --machine "BW3-20 [AMR-26]" --mosaic-only
 # Download mosaics for all machines
 python scraper.py --mosaic-only
-# One random completed scan per machine: mosaic + all tiles (from machines.txt; uses --list-scans + --scan-id)
+# One random completed scan per machine (helper script): check out branch `testing/sample-runs`,
-# MOSAIC_ONLY=1 ./scripts/sample_random_scans.sh machines.txt   # optional: mosaics only, no tiles
+# then see `scripts/sample_random_scans.sh` and `docs/sample_random_scans_run_progress.md`.
 # cp scripts/machines.example.txt machines.txt   # then edit: one label per line
 # ./scripts/sample_random_scans.sh machines.txt
 # Download all tiles for a specific scan
 python scraper.py --machine "BW3-20 [AMR-26]" --scan-id 158374 --workers 4
@@ -0,0 +1,15 @@
 #!/usr/bin/env bash
 # Bootstrap a virtualenv (when needed) and run scraper.py with --metadata-only.
 # Any arguments given to this script are forwarded to scraper.py unchanged.
 set -euo pipefail
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 # NOTE(review): fixed, predictable path under /tmp — on a shared host another
 # user could pre-create it; confirm this is acceptable for the target machines.
 VENV="/tmp/spruce_venv"
 # Marker created only after the requirements install succeeds. Without it, a
 # run that aborts mid-install (set -e) would leave bin/python in place and every
 # later run would skip setup and start the scraper with missing dependencies.
 VENV_READY="$VENV/.requirements_installed"
 if [[ ! -x "$VENV/bin/python" || ! -f "$VENV_READY" ]]; then
    echo "Setting up venv at $VENV..."
    python3 -m venv "$VENV"
    "$VENV/bin/python" -m ensurepip --upgrade
    "$VENV/bin/pip" install -q -r "$SCRIPT_DIR/requirements.txt"
    touch "$VENV_READY"
 fi
 echo "Starting metadata-only scan of all machines..."
 "$VENV/bin/python" "$SCRIPT_DIR/scraper.py" --metadata-only "$@"
@@ -0,0 +1,105 @@
 #!/usr/bin/env python3
 """
 Split scans.csv into per-machine metadata CSVs.
 Reads the combined scans.csv produced by the scraper and writes one CSV per
 machine containing only the website-sourced metadata columns (no mosaic paths,
 download status, or error fields).
 Usage:
  python scripts/export_machine_metadata.py
  python scripts/export_machine_metadata.py --input archives/scans.csv --output-dir archives/by_machine
 """
 from __future__ import annotations
 import argparse
 import csv
 import sys
 from collections import defaultdict
 from pathlib import Path
 # Columns copied into each per-machine CSV, in output order. Every name listed
 # here must be present in the input header; any other input column is dropped
 # when the per-machine files are written.
 METADATA_COLUMNS = [
    "machine",
    "machine_id",
    "scan_id",
    "name",
    "scan_time",
    "start_x",
    "start_y",
    "end_x",
    "end_y",
    "dx",
    "dy",
    "nx",
    "ny",
    "total_tiles",
    "scan_lines",
    "scan_mode",
    "start_datetime",
    "end_datetime",
    "status",
    "user",
    "disk_space_mb",
 ]
 def sanitize_machine_label(label: str) -> str:
    """Turn a machine label into a string that is safe to embed in a file name.

    Square brackets are removed; spaces and path separators (forward slash and
    backslash) become underscores; leading and trailing underscores are then
    trimmed. Labels made only of letters, digits, hyphens, brackets and spaces
    give the same result as before, e.g. ``"BW3-20 [AMR-26]"`` ->
    ``"BW3-20_AMR-26"``.
    """
    # Path separators are neutralised so a label read from the CSV can never
    # point the output file into another directory.
    table = str.maketrans({"[": None, "]": None, " ": "_", "/": "_", "\\": "_"})
    return label.translate(table).strip("_")
 def parse_args() -> argparse.Namespace:
    """Parse the command line and return the resulting namespace.

    Two options are recognised: ``--input`` (the combined scans.csv) and
    ``--output-dir`` (where the per-machine CSVs go). Both have defaults
    under ``archives/``.
    """
    parser = argparse.ArgumentParser(
        description="Split scans.csv into per-machine metadata CSVs."
    )
    # (flag, default, metavar, help text) for each supported option.
    option_specs = (
        (
            "--input",
            "archives/scans.csv",
            "FILE",
            "Path to scans.csv (default: archives/scans.csv)",
        ),
        (
            "--output-dir",
            "archives/by_machine",
            "DIR",
            "Directory for output CSVs (default: archives/by_machine)",
        ),
    )
    for flag, default, metavar, help_text in option_specs:
        parser.add_argument(flag, default=default, metavar=metavar, help=help_text)
    return parser.parse_args()
 def main() -> None:
    """Group the rows of scans.csv by machine and write one metadata CSV each.

    Exits with a message when the input file is missing, has no header, or
    lacks any column named in METADATA_COLUMNS. Each output file is named
    ``<sanitized label>_scans_metadata.csv`` and holds only the
    METADATA_COLUMNS fields; a per-file line and a final summary are printed.
    """
    opts = parse_args()
    source = Path(opts.input)
    target_dir = Path(opts.output_dir)
    if not source.exists():
        sys.exit(f"Input file not found: {source}")
    grouped: dict[str, list[dict]] = defaultdict(list)
    with source.open(newline="") as src:
        reader = csv.DictReader(src)
        header = reader.fieldnames
        if header is None:
            sys.exit(f"{source} appears to be empty.")
        absent = [col for col in METADATA_COLUMNS if col not in header]
        if absent:
            sys.exit(f"Expected columns not found in {source}: {absent}")
        for record in reader:
            grouped[record["machine"]].append(record)
    target_dir.mkdir(parents=True, exist_ok=True)
    row_total = 0
    # Labels are visited in sorted order so the printed listing is stable.
    for label in sorted(grouped):
        records = grouped[label]
        file_stem = sanitize_machine_label(label)
        destination = target_dir / f"{file_stem}_scans_metadata.csv"
        with destination.open("w", newline="") as dst:
            # extrasaction="ignore" drops every column not in METADATA_COLUMNS.
            writer = csv.DictWriter(dst, fieldnames=METADATA_COLUMNS, extrasaction="ignore")
            writer.writeheader()
            writer.writerows(records)
        row_total += len(records)
        print(f"  {destination}  ({len(records)} rows)")
    print(f"\n{len(grouped)} machine(s), {row_total} total rows → {target_dir}/")
 if __name__ == "__main__":
    main()
@@ -1,6 +1,6 @@
 # All RootView minirhizotron machine labels (same set as `machine_metadata` in config.example.yaml).
 # Copy to the repo root as machines.txt, or: cp scripts/machines.example.txt machines.txt
-# sample_random_scans.sh: by default one random scan per line = mosaic + tiles; use MOSAIC_ONLY=1 for mosaics only
+# Random-sample helper `scripts/sample_random_scans.sh` lives on branch `testing/sample-runs` only.
 BW1-4 [AMR-15]
 BW1-6 [AMR-19]
 BW1-7 [AMR-18]
@@ -1,178 +0,0 @@
 #!/usr/bin/env bash
 # For each machine label in a text file, pick one random completed scan and download
 # it: by default the mosaic and all tiles (same as: --machine "…" --scan-id N).
 # For mosaic only (faster, no tile downloads), set: MOSAIC_ONLY=1
 #
 # Usage:
 #   ./scripts/sample_random_scans.sh [PATH_TO_machines.txt]
 # Config path defaults to config.yaml in the repo root. Override with:
 #   CONFIG=/path/to/config.yaml ./scripts/sample_random_scans.sh machines.txt
 # Dry-run the download step (listing still does real HTTP to fetch scan list):
 #   DRY_RUN=1 ./scripts/sample_random_scans.sh machines.txt
 # Verbose / debug (extra per-step lines, scan counts from the list step):
 #   DEBUG=1 ./scripts/sample_random_scans.sh machines.txt
 # By default, --list-scans fetches only the first page (one HTTP request, up to
 # 320 scans). To paginate the full archive for the random pick (slower when many
 # scans exist), set:
 #   LIST_SCANS_ALL_PAGES=1 ./scripts/sample_random_scans.sh machines.txt
 #
 # machines.txt: one machine label per line (same as --machine and config machine names).
 # See scripts/machines.example.txt
 # Abort on command failure, on use of an unset variable, and on a failure
 # anywhere in a pipeline.
 set -euo pipefail
 # Repo root is the parent of the directory that contains this script.
 REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
 # CONFIG may come from the environment; otherwise fall back to <repo>/config.yaml.
 CONFIG="${CONFIG:-$REPO_ROOT/config.yaml}"
 # First positional argument overrides the default <repo>/machines.txt.
 MACHINES_FILE="${1:-$REPO_ROOT/machines.txt}"
 # Base scraper command kept as an array so each path stays a single argument.
 SCRAPER=(python3 "$REPO_ROOT/scraper.py" --config "$CONFIG")
 # Write a message to stderr, prefixed with the script tag.
 log() {
  echo "[sample_random_scans] $*" >&2
 }
 # Write a debug message to stderr, but only when DEBUG is set and non-empty.
 log_debug() {
  [[ -z "${DEBUG:-}" ]] && return 0
  echo "[sample_random_scans] debug: $*" >&2
 }
 # Pre-flight: both the machines list and the config file must exist.
 if [[ ! -f "$MACHINES_FILE" ]]; then
  log "error: file not found: $MACHINES_FILE"
  log "Create it with one machine label per line, or: cp scripts/machines.example.txt machines.txt"
  exit 1
 fi
 if [[ ! -f "$CONFIG" ]]; then
  log "error: config not found: $CONFIG"
  exit 1
 fi
 # Non-empty, non-comment lines (same rules as the main loop)
 # `|| true` keeps a non-zero grep status (e.g. a count of zero) from aborting
 # the script under `set -e` / pipefail; the zero case is reported just below.
 TOTAL_MACHINES="$(
  grep -v '^[[:space:]]*#' "$MACHINES_FILE" | grep -c -v '^[[:space:]]*$' || true
 )"
 if [[ -z "$TOTAL_MACHINES" || "$TOTAL_MACHINES" -eq 0 ]]; then
  log "error: no machine lines in: $MACHINES_FILE"
  exit 1
 fi
 # Start banner: echo the effective settings so a log of the run is self-describing.
 log "starting  repo=$REPO_ROOT"
 log "         config=$CONFIG"
 log "         machines_file=$MACHINES_FILE  (${TOTAL_MACHINES} machine(s) in file)"
 # Mode line depends on the MOSAIC_ONLY / DRY_RUN combination (any non-empty value enables a flag).
 if [[ -n "${MOSAIC_ONLY:-}" ]]; then
  if [[ -n "${DRY_RUN:-}" ]]; then
    log "         mode: MOSAIC_ONLY + DRY_RUN (mosaic only, --dry-run on download step)"
  else
    log "         mode: MOSAIC_ONLY=1 (mosaics only, no tiles; use for a lighter sample)"
  fi
 else
  if [[ -n "${DRY_RUN:-}" ]]; then
    log "         mode: DRY_RUN (list + full scan download use --dry-run; no files written)"
  else
    log "         mode: full scan — mosaic + all tiles (workers from config)"
  fi
 fi
 if [[ -n "${DEBUG:-}" ]]; then
  log "         DEBUG=1 (extra diagnostics enabled)"
 fi
 if [[ -n "${LIST_SCANS_ALL_PAGES:-}" ]]; then
  log "         list step: list-scans = full archive (all pages, slower)"
 else
  log "         list step: list-scans --list-scans-first-page-only (one page, up to 320 IDs)"
 fi
 log "────────────────────────────────────────"
 # Export the settings that the embedded Python helper reads from its environment.
 export REPO_ROOT CONFIG
 [[ -n "${DEBUG:-}" ]] && export DEBUG
 [[ -n "${LIST_SCANS_ALL_PAGES:-}" ]] && export LIST_SCANS_ALL_PAGES
 # Counters for the final summary: PROCESSED = downloads that exited 0,
 # SKIPPED = machines whose scan list could not be obtained or parsed,
 # IDX = machine lines seen so far.
 PROCESSED=0
 SKIPPED=0
 IDX=0
 # `|| [[ -n "${line-}" ]]` also processes a final line that lacks a trailing newline.
 while IFS= read -r line || [[ -n "${line-}" ]]; do
  # trim, strip CR, skip blanks / comments
  line="${line//$'\r'/}"
  # Remove leading whitespace, then trailing whitespace.
  label="${line#"${line%%[![:space:]]*}"}"
  label="${label%"${label##*[![:space:]]}"}"
  [[ -z "$label" || "$label" == \#* ]] && continue
  IDX=$((IDX + 1))
  log "[$IDX/$TOTAL_MACHINES] machine: $label"
  log "         status: listing scans (--list-scans) …"
  # Run an inline Python helper that lists the machine's scans via scraper.py
  # and prints one randomly chosen scan id on stdout (captured into random_id).
  # The label is passed through the environment (LABEL), not the command line.
  random_id="$(
    REPO_ROOT="$REPO_ROOT" CONFIG="$CONFIG" LABEL="$label" python3 - <<'PY'
 import os, random, subprocess, sys
 # Inputs arrive through environment variables set by the surrounding shell.
 label = os.environ["LABEL"]
 repo = os.environ["REPO_ROOT"]
 cfg = os.environ["CONFIG"]
 debug = bool(os.environ.get("DEBUG"))
 full = bool(os.environ.get("LIST_SCANS_ALL_PAGES"))
 scraper = os.path.join(repo, "scraper.py")
 if debug:
    print(
        f"[sample_random_scans] debug: running list-scans for {label!r} "
        f"({'all pages' if full else 'first page only'})",
        file=sys.stderr,
    )
 cmd = [sys.executable, scraper, "--list-scans", "--machine", label, "--config", cfg]
 if not full:
    # Index 3 places the flag directly after "--list-scans".
    cmd.insert(3, "--list-scans-first-page-only")
 # check_output raises on a non-zero exit, which makes this helper fail too.
 # stderr is merged into stdout, so scraper diagnostics are part of the text parsed below.
 out = subprocess.check_output(
    cmd,
    text=True,
    stderr=subprocess.STDOUT,
 )
 # Keep the first whitespace-separated token of every line when it is all digits;
 # blank lines, "---" separators and lines containing "Total" are ignored.
 ids = []
 for line in out.splitlines():
    line = line.rstrip()
    if not line or line.startswith("---") or "Total" in line:
        continue
    parts = line.split()
    if parts and parts[0].isdigit():
        ids.append(parts[0])
 if not ids:
    print(f"no scans parsed for {label!r} — check login and output", file=sys.stderr)
    sys.exit(1)
 if debug:
    print(
        f"[sample_random_scans] debug: parsed {len(ids)} scan id(s) for {label!r}",
        file=sys.stderr,
    )
 # end="" keeps a trailing newline out of the value captured by the shell.
 print(random.choice(ids), end="")
 PY
  )" || {
    # A failed list/pick step skips this machine instead of aborting the run.
    log "         status: SKIPPED (could not get scan list or pick id)"
    SKIPPED=$((SKIPPED + 1))
    continue
  }
  log "         status: picked random scan_id=$random_id (uniform among IDs from this list step — first page by default, see start banner)"
  if [[ -n "${MOSAIC_ONLY:-}" ]]; then
    log "         status: running scraper: --mosaic-only --scan-id (mosaic only) …"
  else
    log "         status: running scraper: --scan-id (mosaic + tiles) …"
  fi
  if [[ -n "${DRY_RUN:-}" ]]; then
    log "         status: (dry-run — no files written for this scan)"
  fi
  # Assemble the download command as an array; --dry-run is appended last when requested.
  if [[ -n "${MOSAIC_ONLY:-}" ]]; then
    run_cmd=("${SCRAPER[@]}" --mosaic-only --machine "$label" --scan-id "$random_id")
  else
    run_cmd=("${SCRAPER[@]}" --machine "$label" --scan-id "$random_id")
  fi
  if [[ -n "${DRY_RUN:-}" ]]; then
    run_cmd+=(--dry-run)
  fi
  # Unlike a list failure, a failed download stops the whole run with the scraper's exit code.
  if "${run_cmd[@]}"; then
    log "         status: OK — finished this machine (exit 0)"
    PROCESSED=$((PROCESSED + 1))
  else
    rc=$?
    log "         status: FAILED — scraper exit code $rc (stopping; fix or remove this machine and re-run)"
    exit "$rc"
  fi
  log "────────────────────────────────────────"
 done < "$MACHINES_FILE"
 log "done. summary: $PROCESSED machine(s) with sampled scan download completed, $SKIPPED skipped, $IDX line(s) processed out of $TOTAL_MACHINES in file."
 exit 0
@@ -0,0 +1,218 @@
 #!/usr/bin/env python3
 """
 Report metadata-scan progress and projected completion times for all machines.
 Usage:
    python scripts/scan_progress_report.py [--archive ARCHIVE_DIR] [--recent N] [--mermaid] [--rate-chart]
 Options:
    --archive DIR   Path to archives directory (default: archives)
    --recent N      Number of recent files used to compute current rate (default: 500)
    --mermaid       Also print a Mermaid Gantt chart
    --rate-chart    Also print a Mermaid XY chart of s/scan rate by hour
 """
 import argparse
 import glob
 import os
 import re
 import sys
 from collections import defaultdict
 from datetime import datetime, timedelta
 from pathlib import Path
 # Canonical machine order and total scan counts (from README inventory, April 2026)
 # Each entry is (machine label, expected number of scans). The report walks the
 # list in this order when projecting completion times.
 MACHINES = [
    ("BW1-4 [AMR-15]",  6121),
    ("BW1-6 [AMR-19]", 18198),
    ("BW1-7 [AMR-18]",   430),
    ("BW2-8 [AMR-25]",  8191),
    ("BW2-10 [AMR-22]", 16537),
    ("BW2-11 [AMR-23]", 26763),
    ("BW2-13 [AMR-24]", 13537),
    ("BW3-16 [AMR-16]",  7325),
    ("BW3-17 [AMR-20]",   471),
    ("BW3-19 [AMR-21]", 15186),
    ("BW3-20 [AMR-26]", 23052),
    ("BW3-21 [AMR-17]", 10115),
 ]
 # Sum of the per-machine counts above.
 TOTAL_SCANS = sum(t for _, t in MACHINES)
 def dir_name(label: str) -> str:
    """Convert a machine label into the directory name used under the archive.

    Every character that is not a word character, a hyphen, or a dot is
    replaced with an underscore; leading and trailing underscores are then
    stripped.
    """
    unsafe_chars = re.compile(r"[^\w\-.]")
    replaced = unsafe_chars.sub("_", label)
    return replaced.strip("_")
 def get_timestamps(machine_dir: Path) -> list[float]:
    """Return the sorted modification times of every metadata.json under *machine_dir*.

    The search is recursive and includes a metadata.json placed directly in
    *machine_dir*. A directory that does not exist yields an empty list.
    """
    # Escape the directory part so glob metacharacters in the path itself
    # ("[", "]", "*", "?") are matched literally; only the trailing "**"
    # component should act as a wildcard.
    pattern = os.path.join(glob.escape(str(machine_dir)), "**", "metadata.json")
    return sorted(os.path.getmtime(found) for found in glob.iglob(pattern, recursive=True))
 def print_rate_chart(all_timestamps: list[float]) -> None:
    """Print a Mermaid xychart-beta of average s/scan per hour.

    *all_timestamps* is expected in ascending order. The gap between each pair
    of neighbouring timestamps is assigned to the local-time hour of the later
    one; gaps of 300 s or more are discarded. The most recent hour is treated
    as partial and left out. If no complete hour remains, a short notice is
    printed instead of a chart.
    """
    # Bins are keyed by the hour as a datetime rather than a "%m-%d %Hh"
    # string, so sorting is chronological even when the data crosses a year
    # boundary (a string key would order "01-01" before "12-31").
    bins: dict[datetime, list[float]] = defaultdict(list)
    for earlier, later in zip(all_timestamps, all_timestamps[1:]):
        gap = later - earlier
        if gap < 300:  # ignore inter-machine gaps
            hour = datetime.fromtimestamp(later).replace(minute=0, second=0, microsecond=0)
            bins[hour].append(gap)
    ordered_hours = sorted(bins)
    # Drop the last (partial) hour
    hours = ordered_hours[:-1]
    if not hours:
        print("(not enough data for rate chart)")
        return
    start_hour = hours[0]
    values = [f"{sum(bins[h]) / len(bins[h]):.2f}" for h in hours]
    # The axis ceiling is derived from the rounded values that are actually plotted.
    y_max = max(float(v) for v in values)
    y_ceil = int(y_max) + 3
    n = len(hours)
    # Numeric x-axis: Mermaid auto-picks readable tick positions
    # NOTE(review): hours with no qualifying gap produce no point, so the x
    # position counts plotted hours, not wall-clock hours elapsed.
    start_label = start_hour.strftime("%b %d %H:%M")
    print("```mermaid")
    print("xychart-beta")
    print(f'    title "Metadata scan rate (s/scan) — hourly, starting {start_label}"')
    print(f'    x-axis "Hours elapsed" 0 --> {n}')
    print(f'    y-axis "s / scan" 0 --> {y_ceil}')
    print(f"    line [{', '.join(values)}]")
    print("```")
 def fmt_dt(dt: datetime) -> str:
    """Render *dt* as abbreviated weekday, month, day and 24-hour time.

    Example: ``Thu May 14  19:44`` (two spaces separate the date and time).
    """
    return f"{dt:%a %b %d  %H:%M}"
 def main() -> None:
    """Print the per-machine progress table and, optionally, Mermaid charts.

    Finished scans are counted as metadata.json files under
    ``<archive>/<dir_name(label)>`` for each entry in MACHINES. Their
    modification times give a seconds-per-scan rate, which is used to project
    a finish time for every unfinished machine, assuming machines are worked
    through one after another in MACHINES order starting from now.

    Exits with a message when the archive directory does not exist.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--archive", default="archives", help="Archives directory")
    parser.add_argument("--recent", type=int, default=500,
                        help="Files used to compute recent rate (default: 500)")
    parser.add_argument("--mermaid", action="store_true", help="Print Mermaid Gantt chart")
    parser.add_argument("--rate-chart", action="store_true", help="Print Mermaid XY rate-over-time chart")
    args = parser.parse_args()
    archive = Path(args.archive)
    if not archive.is_dir():
        sys.exit(f"Archive directory not found: {archive}")
    now = datetime.now()
    # --- Gather per-machine data ---
    machine_data = []  # (label, total, done, first_ts, last_ts, times)
    all_timestamps: list[float] = []
    for label, total in MACHINES:
        mdir = archive / dir_name(label)
        times = get_timestamps(mdir) if mdir.is_dir() else []
        done = len(times)
        first_ts = datetime.fromtimestamp(times[0]) if times else None
        last_ts = datetime.fromtimestamp(times[-1]) if times else None
        machine_data.append((label, total, done, first_ts, last_ts, times))
        all_timestamps.extend(times)
    all_timestamps.sort()
    total_done = sum(d for _, _, d, *_ in machine_data)
    # --- Rate calculation ---
    # N timestamps bound N - 1 scan intervals, so the elapsed span is divided
    # by N - 1; dividing by N would understate the seconds per scan.
    recent_times = all_timestamps[-args.recent:] if len(all_timestamps) >= 2 else all_timestamps
    if len(recent_times) >= 2:
        recent_rate = (recent_times[-1] - recent_times[0]) / (len(recent_times) - 1)
    else:
        recent_rate = None
    if len(all_timestamps) >= 2:
        overall_rate = (all_timestamps[-1] - all_timestamps[0]) / (len(all_timestamps) - 1)
    else:
        overall_rate = None
    # A rate of None or 0.0 falls through to the next candidate.
    rate = recent_rate or overall_rate or 5.0  # fallback
    # --- Print timetable ---
    print(f"Metadata scan progress  —  {now.strftime('%Y-%m-%d %H:%M')}")
    print(f"Overall rate : {overall_rate:.2f} s/scan" if overall_rate else "Overall rate : n/a")
    print(f"Recent rate  : {recent_rate:.2f} s/scan (last {args.recent} files)" if recent_rate else "Recent rate  : n/a")
    print(f"Rate used    : {rate:.2f} s/scan")
    print(f"Done         : {total_done:,} / {TOTAL_SCANS:,}  ({100*total_done/TOTAL_SCANS:.1f}%)")
    print()
    print(f"{'Machine':<20} {'Total':>7}  {'Done':>7}  {'Pct':>6}  {'Completion'}")
    print("-" * 68)
    # `cursor` is the projected moment the next unfinished machine can start;
    # it advances past each machine that still has work left.
    cursor = now
    gantt_rows: list[tuple[str, datetime, datetime, str]] = []  # label, start, end, status
    for label, total, done, first_ts, last_ts, times in machine_data:
        pct = 100 * done / total if total else 0
        # Treat a machine as complete once at least 99.9% of its scans are present.
        complete = done >= total or (done > 0 and done / total >= 0.999)
        if done == 0:
            # Not started yet
            start = cursor
            finish = cursor + timedelta(seconds=total * rate)
            status = "pending"
            print(f"{label:<20} {total:>7,}  {'—':>7}   {'—':>5}   {fmt_dt(finish)}")
        elif complete:
            # Complete — use actual timestamps
            start = first_ts
            finish = last_ts
            status = "done"
            print(f"{label:<20} {total:>7,}  {done:>7,}  {pct:>5.1f}%  complete")
        else:
            # In progress
            remaining = total - done
            start = first_ts
            finish = cursor + timedelta(seconds=remaining * rate)
            status = "active"
            print(f"{label:<20} {total:>7,}  {done:>7,}  {pct:>5.1f}%  {fmt_dt(finish)}  ← in progress")
        gantt_rows.append((label, start or cursor, finish, status))
        if status != "done":
            cursor = finish
    print("-" * 68)
    print(f"{'All done':<20} {TOTAL_SCANS:>7,}  {total_done:>7,}  {100*total_done/TOTAL_SCANS:>5.1f}%  {fmt_dt(cursor)}")
    # --- Mermaid rate chart ---
    if args.rate_chart:
        print()
        print_rate_chart(all_timestamps)
    # --- Mermaid Gantt ---
    if args.mermaid:
        print()
        print("```mermaid")
        print("gantt")
        # The machine count comes from MACHINES so the title stays correct if the list changes.
        print(f"    title Metadata scan progress — all {len(MACHINES)} machines")
        print("    dateFormat YYYY-MM-DD HH:mm")
        print("    axisFormat %b %d")
        print()
        section = None
        for label, start, finish, status in gantt_rows:
            prefix = label.split()[0][:3]  # BW1, BW2, BW3
            if prefix != section:
                section = prefix
                print(f"    section {section}")
            # Task id: the label without brackets and with spaces turned into hyphens.
            safe = label.replace("[", "").replace("]", "").replace(" ", "-")
            tag = "done, " if status == "done" else ("active, " if status == "active" else "")
            s = start.strftime("%Y-%m-%d %H:%M")
            e = finish.strftime("%Y-%m-%d %H:%M")
            print(f"    {label} :{tag}{safe}, {s}, {e}")
        print("```")
 if __name__ == "__main__":
    main()