From 4118e6e4f0e66fd77b82954d34829648571b5982 Mon Sep 17 00:00:00 2001
From: James Kolpack <james.kolpack@gmail.com>
Date: Sun, 26 Apr 2026 20:56:52 -0400
Subject: [PATCH] Add sample_random_scans script and first-page list-scans
 option

- scripts/sample_random_scans.sh: pick a random scan per machine (default: first list page) and download mosaic and/or tiles
- --list-scans-first-page-only: one HTTP request for scan list (up to 320 IDs)
- scripts/machines.example.txt; .gitignore local machines.txt (copy from example)
- README: document usage
---
 .gitignore                     |   2 +
 README.md                      |   9 ++
 scripts/machines.example.txt   |  15 +++
 scripts/sample_random_scans.sh | 178 +++++++++++++++++++++++++++++++++
 spruce/cli.py                  |  17 +++-
 spruce/session.py              |  22 +++-
 6 files changed, 236 insertions(+), 7 deletions(-)
 create mode 100644 scripts/machines.example.txt
 create mode 100755 scripts/sample_random_scans.sh

diff --git a/.gitignore b/.gitignore
index d2f8359..cfcb0be 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,6 @@
 config.yaml
+# Local list of machine labels (copy from scripts/machines.example.txt)
+machines.txt
 archives/
 __pycache__/
 *.pyc
diff --git a/README.md b/README.md
index 87b94ab..cf79c3a 100644
--- a/README.md
+++ b/README.md
@@ -81,6 +81,9 @@ python scraper.py --list-machines
 # List all scans for a machine
 python scraper.py --list-scans --machine "BW3-20 [AMR-26]"
 
+# List only the first page of the scan table (one HTTP request; up to 320 scans, in the order the server returns them)
+python scraper.py --list-scans --list-scans-first-page-only --machine "BW3-20 [AMR-26]"
+
 # Preview what would be downloaded (dry run)
 python scraper.py --machine "BW3-20 [AMR-26]" --dry-run
 
@@ -94,6 +97,11 @@ python scraper.py --machine "BW3-20 [AMR-26]" --mosaic-only
 # Download mosaics for all machines
 python scraper.py --mosaic-only
 
+# One random completed scan per machine: mosaic + all tiles (from machines.txt; uses --list-scans + --scan-id)
+# MOSAIC_ONLY=1 ./scripts/sample_random_scans.sh machines.txt   # optional: mosaics only, no tiles
+# cp scripts/machines.example.txt machines.txt   # then edit: one label per line
+# ./scripts/sample_random_scans.sh machines.txt
+
 # Download all tiles for a specific scan
 python scraper.py --machine "BW3-20 [AMR-26]" --scan-id 158374 --workers 4
 
@@ -115,6 +123,7 @@ python scraper.py --machine "BW3-20 [AMR-26]" --scan-id 158374 --workers 4
 | `--recheck` | Scan archive for zero-byte/missing tiles and mosaics; remove bad entries from `.progress.json` so they re-download on next run |
 | `--list-machines` | Print all machines and exit |
 | `--list-scans` | Print all scans for `--machine` and exit |
+| `--list-scans-first-page-only` | With `--list-scans`: a single list request (up to 320 scans) instead of paginating the full history |
 | `--verbose` / `-v` | Debug logging |
 
 ### `config.yaml` (optional keys)
diff --git a/scripts/machines.example.txt b/scripts/machines.example.txt
new file mode 100644
index 0000000..f3571b5
--- /dev/null
+++ b/scripts/machines.example.txt
@@ -0,0 +1,15 @@
+# All RootView minirhizotron machine labels (same set as `machine_metadata` in config.example.yaml).
+# Copy to the repo root as machines.txt, or: cp scripts/machines.example.txt machines.txt
+# sample_random_scans.sh: by default one random scan per line = mosaic + tiles; use MOSAIC_ONLY=1 for mosaics only
+BW1-4 [AMR-15]
+BW1-6 [AMR-19]
+BW1-7 [AMR-18]
+BW2-8 [AMR-25]
+BW2-10 [AMR-22]
+BW2-11 [AMR-23]
+BW2-13 [AMR-24]
+BW3-16 [AMR-16]
+BW3-17 [AMR-20]
+BW3-19 [AMR-21]
+BW3-20 [AMR-26]
+BW3-21 [AMR-17]
diff --git a/scripts/sample_random_scans.sh b/scripts/sample_random_scans.sh
new file mode 100755
index 0000000..dab2acd
--- /dev/null
+++ b/scripts/sample_random_scans.sh
@@ -0,0 +1,178 @@
+#!/usr/bin/env bash
+# For each machine label in a text file, pick one random scan from its scan list (the pick
+# does not filter on status) and download it: by default the mosaic and all tiles (same as: --machine "…" --scan-id N).
+# For mosaic only (faster, no tile downloads), set: MOSAIC_ONLY=1
+#
+# Usage:
+#   ./scripts/sample_random_scans.sh [PATH_TO_machines.txt]
+# Config path defaults to config.yaml in the repo root. Override with:
+#   CONFIG=/path/to/config.yaml ./scripts/sample_random_scans.sh machines.txt
+# Dry-run the download step (listing still does real HTTP to fetch scan list):
+#   DRY_RUN=1 ./scripts/sample_random_scans.sh machines.txt
+# Verbose / debug (extra per-step lines, scan counts from the list step):
+#   DEBUG=1 ./scripts/sample_random_scans.sh machines.txt
+# By default, --list-scans fetches only the first page (one HTTP request, up to
+# 320 scans). To paginate the full archive for the random pick (slower when many scans exist), set:
+#   LIST_SCANS_ALL_PAGES=1 ./scripts/sample_random_scans.sh machines.txt
+#
+# machines.txt: one machine label per line (same as --machine and config machine names).
+# See scripts/machines.example.txt
+
+set -euo pipefail
+
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+CONFIG="${CONFIG:-$REPO_ROOT/config.yaml}"
+MACHINES_FILE="${1:-$REPO_ROOT/machines.txt}"
+SCRAPER=(python3 "$REPO_ROOT/scraper.py" --config "$CONFIG")
+
+log() { echo "[sample_random_scans] $*" >&2; }
+log_debug() {
+  if [[ -n "${DEBUG:-}" ]]; then
+    echo "[sample_random_scans] debug: $*" >&2
+  fi
+}
+
+if [[ ! -f "$MACHINES_FILE" ]]; then
+  log "error: file not found: $MACHINES_FILE"
+  log "Create it with one machine label per line, or: cp scripts/machines.example.txt machines.txt"
+  exit 1
+fi
+
+if [[ ! -f "$CONFIG" ]]; then
+  log "error: config not found: $CONFIG"
+  exit 1
+fi
+
+# Non-empty, non-comment lines (same rules as the main loop)
+TOTAL_MACHINES="$(
+  grep -v '^[[:space:]]*#' "$MACHINES_FILE" | grep -c -v '^[[:space:]]*$' || true
+)"
+if [[ -z "$TOTAL_MACHINES" || "$TOTAL_MACHINES" -eq 0 ]]; then
+  log "error: no machine lines in: $MACHINES_FILE"
+  exit 1
+fi
+
+log "starting  repo=$REPO_ROOT"
+log "         config=$CONFIG"
+log "         machines_file=$MACHINES_FILE  (${TOTAL_MACHINES} machine(s) in file)"
+# Announce the run mode up front. DRY_RUN is passed only to the download step
+# (as --dry-run); the list step always performs a real HTTP request, so the
+# banner must not claim that listing is dry-run too.
+# MOSAIC_ONLY adds --mosaic-only to the download step.
+if [[ -n "${MOSAIC_ONLY:-}" && -n "${DRY_RUN:-}" ]]; then
+  log "         mode: MOSAIC_ONLY + DRY_RUN (mosaic only, --dry-run on download step)"
+elif [[ -n "${MOSAIC_ONLY:-}" ]]; then
+  log "         mode: MOSAIC_ONLY=1 (mosaics only, no tiles; use for a lighter sample)"
+elif [[ -n "${DRY_RUN:-}" ]]; then
+  log "         mode: DRY_RUN (full scan, --dry-run on download step; list step still does real HTTP)"
+else
+  log "         mode: full scan — mosaic + all tiles (workers from config)"
+fi
+if [[ -n "${DEBUG:-}" ]]; then
+  log "         DEBUG=1 (extra diagnostics enabled)"
+fi
+if [[ -n "${LIST_SCANS_ALL_PAGES:-}" ]]; then
+  log "         list step: list-scans = full archive (all pages, slower)"
+else
+  log "         list step: list-scans --list-scans-first-page-only (one page, up to 320 IDs)"
+fi
+log "────────────────────────────────────────"
+
+export REPO_ROOT CONFIG
+[[ -n "${DEBUG:-}" ]] && export DEBUG
+[[ -n "${LIST_SCANS_ALL_PAGES:-}" ]] && export LIST_SCANS_ALL_PAGES
+
+PROCESSED=0
+SKIPPED=0
+IDX=0
+
+while IFS= read -r line || [[ -n "${line-}" ]]; do
+  # Strip CRs (CRLF files), trim surrounding whitespace, skip blanks / comments.
+  line="${line//$'\r'/}"
+  label="${line#"${line%%[![:space:]]*}"}"
+  label="${label%"${label##*[![:space:]]}"}"
+  [[ -z "$label" || "$label" == \#* ]] && continue
+
+  IDX=$((IDX + 1))
+  log "[$IDX/$TOTAL_MACHINES] machine: $label"
+  log "         status: listing scans (--list-scans) …"
+
+  random_id="$(
+    REPO_ROOT="$REPO_ROOT" CONFIG="$CONFIG" LABEL="$label" python3 - <<'PY'
+import os, random, subprocess, sys
+
+label = os.environ["LABEL"]
+repo = os.environ["REPO_ROOT"]
+cfg = os.environ["CONFIG"]
+debug = bool(os.environ.get("DEBUG"))
+full = bool(os.environ.get("LIST_SCANS_ALL_PAGES"))
+scraper = os.path.join(repo, "scraper.py")
+if debug:
+    print(
+        f"[sample_random_scans] debug: running list-scans for {label!r} "
+        f"({'all pages' if full else 'first page only'})",
+        file=sys.stderr,
+    )
+cmd = [sys.executable, scraper, "--list-scans", "--machine", label, "--config", cfg]
+if not full:
+    cmd.insert(3, "--list-scans-first-page-only")
+# Capture stdout (the table) apart from stderr so stderr lines are never parsed as rows.
+proc = subprocess.run(cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+if proc.returncode != 0:
+    # Show the scraper's own diagnostics rather than a bare traceback.
+    print(proc.stderr or proc.stdout, file=sys.stderr, end="")
+    sys.exit(1)
+# Data rows start with a numeric scan ID; the header, rule and "Total:" footer do not.
+ids = []
+for row in proc.stdout.splitlines():
+    fields = row.split()
+    if fields and fields[0].isdigit():
+        ids.append(fields[0])
+if not ids:
+    print(f"no scans parsed for {label!r} — check login and output", file=sys.stderr)
+    sys.exit(1)
+if debug:
+    print(
+        f"[sample_random_scans] debug: parsed {len(ids)} scan id(s) for {label!r}",
+        file=sys.stderr,
+    )
+print(random.choice(ids), end="")
+PY
+  )" || {
+    log "         status: SKIPPED (could not get scan list or pick id)"
+    SKIPPED=$((SKIPPED + 1))
+    continue
+  }
+
+  log "         status: picked random scan_id=$random_id (uniform among IDs from this list step — first page by default, see start banner)"
+  if [[ -n "${MOSAIC_ONLY:-}" ]]; then
+    log "         status: running scraper: --mosaic-only --scan-id (mosaic only) …"
+  else
+    log "         status: running scraper: --scan-id (mosaic + tiles) …"
+  fi
+  if [[ -n "${DRY_RUN:-}" ]]; then
+    log "         status: (dry-run — no files written for this scan)"
+  fi
+
+  if [[ -n "${MOSAIC_ONLY:-}" ]]; then
+    run_cmd=("${SCRAPER[@]}" --mosaic-only --machine "$label" --scan-id "$random_id")
+  else
+    run_cmd=("${SCRAPER[@]}" --machine "$label" --scan-id "$random_id")
+  fi
+  if [[ -n "${DRY_RUN:-}" ]]; then
+    run_cmd+=(--dry-run)
+  fi
+  # Detach stdin: the loop reads MACHINES_FILE on stdin, and a child that reads stdin would eat lines.
+  if "${run_cmd[@]}" </dev/null; then
+    log "         status: OK — finished this machine (exit 0)"
+    PROCESSED=$((PROCESSED + 1))
+  else
+    rc=$?
+    log "         status: FAILED — scraper exit code $rc (stopping; fix or remove this machine and re-run)"
+    exit "$rc"
+  fi
+  log "────────────────────────────────────────"
+done < "$MACHINES_FILE"
+
+log "done. summary: $PROCESSED machine(s) with sampled scan download completed, $SKIPPED skipped, $IDX line(s) processed out of $TOTAL_MACHINES in file."
+exit 0
diff --git a/spruce/cli.py b/spruce/cli.py
index 3795bea..81a3746 100644
--- a/spruce/cli.py
+++ b/spruce/cli.py
@@ -105,6 +105,14 @@ def parse_args() -> argparse.Namespace:
         action="store_true",
         help="Print all scans for --machine and exit",
     )
+    p.add_argument(
+        "--list-scans-first-page-only",
+        action="store_true",
+        help=(
+            "With --list-scans: only fetch the first list page (up to 320 scans) "
+            "— one HTTP request, no pagination"
+        ),
+    )
     p.add_argument(
         "--recheck",
         action="store_true",
@@ -134,6 +142,9 @@ def main() -> None:
     if args.verbose:
         logging.getLogger().setLevel(logging.DEBUG)
 
+    if args.list_scans_first_page_only and not args.list_scans:
+        sys.exit("--list-scans-first-page-only requires --list-scans")
+
     # --list-machines doesn't need credentials
     if args.list_machines:
         base_url = "http://205.149.147.131:8010/"
@@ -213,7 +224,8 @@ def main() -> None:
         sess = MachineSession(machines[0], config)
         if not sess.login():
             sys.exit("Login failed.")
-        scans = sess.get_all_scans()
+        first_only = bool(args.list_scans_first_page_only)
+        scans = sess.get_all_scans(first_page_only=first_only)
         print(f"{'ID':>8}  {'Date':<22}  {'Name':<40}  {'Status'}")
         print("-" * 85)
         for sc in scans:
@@ -221,7 +233,8 @@ def main() -> None:
                 f"{sc['scan_id']:>8}  {sc.get('scan_time', ''):<22}  "
                 f"{sc.get('name', ''):<40}  {sc.get('status', '')}"
             )
-        print(f"\nTotal: {len(scans)} scans")
+        total_note = " (first page only — not full archive)" if first_only else ""
+        print(f"\nTotal: {len(scans)} scans{total_note}")
         return
 
     log.info(
diff --git a/spruce/session.py b/spruce/session.py
index aecef2b..922fdf3 100644
--- a/spruce/session.py
+++ b/spruce/session.py
@@ -77,16 +77,28 @@ class MachineSession:
     # Scan list (paginated)
     # ------------------------------------------------------------------
 
-    def get_all_scans(self) -> list[dict[str, Any]]:
+    def get_all_scans(
+        self, first_page_only: bool = False
+    ) -> list[dict[str, Any]]:
         """
-        Fetch the complete scan list across all pages.
+        Fetch the scan list from the RootView table.
 
-        Uses a large FilterCount (320) to minimise round-trips.
-        Falls back to repeated pages if the list is longer.
+        By default, walks all pages. With first_page_only=True, only the first
+        request is made (FilterCount 320) — enough for a random pick without
+        paginating a large history.
         """
+        page_size = 320
+        if first_page_only:
+            all_scans = self._fetch_scan_page(0, page_size)
+            log.info(
+                "[%s] First page only: %d scan(s) (not paginating).",
+                self.machine["label"],
+                len(all_scans),
+            )
+            return all_scans
+
         all_scans: list[dict[str, Any]] = []
         start = 0
-        page_size = 320
 
         while True:
             page_scans = self._fetch_scan_page(start, page_size)