From 4118e6e4f0e66fd77b82954d34829648571b5982 Mon Sep 17 00:00:00 2001 From: James Kolpack Date: Sun, 26 Apr 2026 20:56:52 -0400 Subject: [PATCH] Add sample_random_scans script and first-page list-scans option - scripts/sample_random_scans.sh: pick a random scan per machine (default: first list page) and download mosaic and/or tiles - --list-scans-first-page-only: one HTTP request for scan list (up to 320 IDs) - scripts/machines.example.txt; .gitignore local machines.txt (copy from example) - README: document usage --- .gitignore | 2 + README.md | 9 ++ scripts/machines.example.txt | 15 +++ scripts/sample_random_scans.sh | 178 +++++++++++++++++++++++++++++++++ spruce/cli.py | 17 +++- spruce/session.py | 22 +++- 6 files changed, 236 insertions(+), 7 deletions(-) create mode 100644 scripts/machines.example.txt create mode 100755 scripts/sample_random_scans.sh diff --git a/.gitignore b/.gitignore index d2f8359..cfcb0be 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,6 @@ config.yaml +# Local list of machine labels (copy from scripts/machines.example.txt) +machines.txt archives/ __pycache__/ *.pyc diff --git a/README.md b/README.md index 87b94ab..cf79c3a 100644 --- a/README.md +++ b/README.md @@ -81,6 +81,9 @@ python scraper.py --list-machines # List all scans for a machine python scraper.py --list-scans --machine "BW3-20 [AMR-26]" +# List only the first table page (one HTTP call; up to 320 — newest/first per server order) +python scraper.py --list-scans --list-scans-first-page-only --machine "BW3-20 [AMR-26]" + # Preview what would be downloaded (dry run) python scraper.py --machine "BW3-20 [AMR-26]" --dry-run @@ -94,6 +97,11 @@ python scraper.py --machine "BW3-20 [AMR-26]" --mosaic-only # Download mosaics for all machines python scraper.py --mosaic-only +# One random completed scan per machine: mosaic + all tiles (from machines.txt; uses --list-scans + --scan-id) +# MOSAIC_ONLY=1 ./scripts/sample_random_scans.sh machines.txt # optional: mosaics only, no tiles +# 
cp scripts/machines.example.txt machines.txt # then edit: one label per line +# ./scripts/sample_random_scans.sh machines.txt + # Download all tiles for a specific scan python scraper.py --machine "BW3-20 [AMR-26]" --scan-id 158374 --workers 4 @@ -115,6 +123,7 @@ python scraper.py --machine "BW3-20 [AMR-26]" --scan-id 158374 --workers 4 | `--recheck` | Scan archive for zero-byte/missing tiles and mosaics; remove bad entries from `.progress.json` so they re-download on next run | | `--list-machines` | Print all machines and exit | | `--list-scans` | Print all scans for `--machine` and exit | +| `--list-scans-first-page-only` | With `--list-scans`: a single list request (up to 320 scans) instead of paginating the full history | | `--verbose` / `-v` | Debug logging | ### `config.yaml` (optional keys) diff --git a/scripts/machines.example.txt b/scripts/machines.example.txt new file mode 100644 index 0000000..f3571b5 --- /dev/null +++ b/scripts/machines.example.txt @@ -0,0 +1,15 @@ +# All RootView minirhizotron machine labels (same set as `machine_metadata` in config.example.yaml). +# Copy to the repo root as machines.txt, or: cp scripts/machines.example.txt machines.txt +# sample_random_scans.sh: by default one random scan per line = mosaic + tiles; use MOSAIC_ONLY=1 for mosaics only +BW1-4 [AMR-15] +BW1-6 [AMR-19] +BW1-7 [AMR-18] +BW2-8 [AMR-25] +BW2-10 [AMR-22] +BW2-11 [AMR-23] +BW2-13 [AMR-24] +BW3-16 [AMR-16] +BW3-17 [AMR-20] +BW3-19 [AMR-21] +BW3-20 [AMR-26] +BW3-21 [AMR-17] diff --git a/scripts/sample_random_scans.sh b/scripts/sample_random_scans.sh new file mode 100755 index 0000000..dab2acd --- /dev/null +++ b/scripts/sample_random_scans.sh @@ -0,0 +1,178 @@ +#!/usr/bin/env bash +# For each machine label in a text file, pick one random completed scan and download +# it: by default the mosaic and all tiles (same as: --machine "…" --scan-id N). 
+# For mosaic only (faster, no tile downloads), set: MOSAIC_ONLY=1 +# +# Usage: +# ./scripts/sample_random_scans.sh [PATH_TO_machines.txt] +# Config path defaults to config.yaml in the repo root. Override with: +# CONFIG=/path/to/config.yaml ./scripts/sample_random_scans.sh machines.txt +# Dry-run the download step (listing still does real HTTP to fetch scan list): +# DRY_RUN=1 ./scripts/sample_random_scans.sh machines.txt +# Verbose / debug (extra per-step lines, scan counts from the list step): +# DEBUG=1 ./scripts/sample_random_scans.sh machines.txt +# By default, --list-scans fetches only the first page (one HTTP request, up to +# 320 scans). To paginate the full archive for the random pick (slower when many scans exist), set: +# LIST_SCANS_ALL_PAGES=1 ./scripts/sample_random_scans.sh machines.txt +# +# machines.txt: one machine label per line (same as --machine and config machine names). +# See scripts/machines.example.txt + +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +CONFIG="${CONFIG:-$REPO_ROOT/config.yaml}" +MACHINES_FILE="${1:-$REPO_ROOT/machines.txt}" +SCRAPER=(python3 "$REPO_ROOT/scraper.py" --config "$CONFIG") + +log() { echo "[sample_random_scans] $*" >&2; } +log_debug() { + if [[ -n "${DEBUG:-}" ]]; then + echo "[sample_random_scans] debug: $*" >&2 + fi +} + +if [[ ! -f "$MACHINES_FILE" ]]; then + log "error: file not found: $MACHINES_FILE" + log "Create it with one machine label per line, or: cp scripts/machines.example.txt machines.txt" + exit 1 +fi + +if [[ ! 
-f "$CONFIG" ]]; then + log "error: config not found: $CONFIG" + exit 1 +fi + +# Non-empty, non-comment lines (same rules as the main loop) +TOTAL_MACHINES="$( + grep -v '^[[:space:]]*#' "$MACHINES_FILE" | grep -c -v '^[[:space:]]*$' || true +)" +if [[ -z "$TOTAL_MACHINES" || "$TOTAL_MACHINES" -eq 0 ]]; then + log "error: no machine lines in: $MACHINES_FILE" + exit 1 +fi + +log "starting repo=$REPO_ROOT" +log " config=$CONFIG" +log " machines_file=$MACHINES_FILE (${TOTAL_MACHINES} machine(s) in file)" +if [[ -n "${MOSAIC_ONLY:-}" ]]; then + if [[ -n "${DRY_RUN:-}" ]]; then + log " mode: MOSAIC_ONLY + DRY_RUN (mosaic only, --dry-run on download step)" + else + log " mode: MOSAIC_ONLY=1 (mosaics only, no tiles; use for a lighter sample)" + fi +else + if [[ -n "${DRY_RUN:-}" ]]; then + log " mode: DRY_RUN (list + full scan download use --dry-run; no files written)" + else + log " mode: full scan — mosaic + all tiles (workers from config)" + fi +fi +if [[ -n "${DEBUG:-}" ]]; then + log " DEBUG=1 (extra diagnostics enabled)" +fi +if [[ -n "${LIST_SCANS_ALL_PAGES:-}" ]]; then + log " list step: list-scans = full archive (all pages, slower)" +else + log " list step: list-scans --list-scans-first-page-only (one page, up to 320 IDs)" +fi +log "────────────────────────────────────────" + +export REPO_ROOT CONFIG +[[ -n "${DEBUG:-}" ]] && export DEBUG +[[ -n "${LIST_SCANS_ALL_PAGES:-}" ]] && export LIST_SCANS_ALL_PAGES + +PROCESSED=0 +SKIPPED=0 +IDX=0 + +while IFS= read -r line || [[ -n "${line-}" ]]; do + # trim, strip CR, skip blanks / comments + line="${line//$'\r'/}" + label="${line#"${line%%[![:space:]]*}"}" + label="${label%"${label##*[![:space:]]}"}" + [[ -z "$label" || "$label" == \#* ]] && continue + + IDX=$((IDX + 1)) + log "[$IDX/$TOTAL_MACHINES] machine: $label" + log " status: listing scans (--list-scans) …" + + random_id="$( + REPO_ROOT="$REPO_ROOT" CONFIG="$CONFIG" LABEL="$label" python3 - <<'PY' +import os, random, subprocess, sys + +label = 
os.environ["LABEL"] +repo = os.environ["REPO_ROOT"] +cfg = os.environ["CONFIG"] +debug = bool(os.environ.get("DEBUG")) +full = bool(os.environ.get("LIST_SCANS_ALL_PAGES")) +scraper = os.path.join(repo, "scraper.py") +if debug: + print( + f"[sample_random_scans] debug: running list-scans for {label!r} " + f"({'all pages' if full else 'first page only'})", + file=sys.stderr, + ) +cmd = [sys.executable, scraper, "--list-scans", "--machine", label, "--config", cfg] +if not full: + cmd.insert(3, "--list-scans-first-page-only") +out = subprocess.check_output( + cmd, + text=True, + stderr=subprocess.STDOUT, +) +ids = [] +for line in out.splitlines(): + line = line.rstrip() + if not line or line.startswith("---") or "Total" in line: + continue + parts = line.split() + if parts and parts[0].isdigit(): + ids.append(parts[0]) +if not ids: + print(f"no scans parsed for {label!r} — check login and output", file=sys.stderr) + sys.exit(1) +if debug: + print( + f"[sample_random_scans] debug: parsed {len(ids)} scan id(s) for {label!r}", + file=sys.stderr, + ) +print(random.choice(ids), end="") +PY + )" || { + log " status: SKIPPED (could not get scan list or pick id)" + SKIPPED=$((SKIPPED + 1)) + continue + } + + log " status: picked random scan_id=$random_id (uniform among IDs from this list step — first page by default, see start banner)" + if [[ -n "${MOSAIC_ONLY:-}" ]]; then + log " status: running scraper: --mosaic-only --scan-id (mosaic only) …" + else + log " status: running scraper: --scan-id (mosaic + tiles) …" + fi + if [[ -n "${DRY_RUN:-}" ]]; then + log " status: (dry-run — no files written for this scan)" + fi + + if [[ -n "${MOSAIC_ONLY:-}" ]]; then + run_cmd=("${SCRAPER[@]}" --mosaic-only --machine "$label" --scan-id "$random_id") + else + run_cmd=("${SCRAPER[@]}" --machine "$label" --scan-id "$random_id") + fi + if [[ -n "${DRY_RUN:-}" ]]; then + run_cmd+=(--dry-run) + fi + if "${run_cmd[@]}"; then + log " status: OK — finished this machine (exit 0)" + 
PROCESSED=$((PROCESSED + 1)) + else + rc=$? + log " status: FAILED — scraper exit code $rc (stopping; fix or remove this machine and re-run)" + exit "$rc" + fi + log "────────────────────────────────────────" +done < "$MACHINES_FILE" + +log "done. summary: $PROCESSED machine(s) with sampled scan download completed, $SKIPPED skipped, $IDX line(s) processed out of $TOTAL_MACHINES in file." +exit 0 diff --git a/spruce/cli.py b/spruce/cli.py index 3795bea..81a3746 100644 --- a/spruce/cli.py +++ b/spruce/cli.py @@ -105,6 +105,14 @@ def parse_args() -> argparse.Namespace: action="store_true", help="Print all scans for --machine and exit", ) + p.add_argument( + "--list-scans-first-page-only", + action="store_true", + help=( + "With --list-scans: only fetch the first list page (up to 320 scans) " + "— one HTTP request, no pagination" + ), + ) p.add_argument( "--recheck", action="store_true", @@ -134,6 +142,9 @@ def main() -> None: if args.verbose: logging.getLogger().setLevel(logging.DEBUG) + if args.list_scans_first_page_only and not args.list_scans: + sys.exit("--list-scans-first-page-only requires --list-scans") + # --list-machines doesn't need credentials if args.list_machines: base_url = "http://205.149.147.131:8010/" @@ -213,7 +224,8 @@ def main() -> None: sess = MachineSession(machines[0], config) if not sess.login(): sys.exit("Login failed.") - scans = sess.get_all_scans() + first_only = bool(args.list_scans_first_page_only) + scans = sess.get_all_scans(first_page_only=first_only) print(f"{'ID':>8} {'Date':<22} {'Name':<40} {'Status'}") print("-" * 85) for sc in scans: @@ -221,7 +233,8 @@ def main() -> None: f"{sc['scan_id']:>8} {sc.get('scan_time', ''):<22} " f"{sc.get('name', ''):<40} {sc.get('status', '')}" ) - print(f"\nTotal: {len(scans)} scans") + total_note = " (first page only — not full archive)" if first_only else "" + print(f"\nTotal: {len(scans)} scans{total_note}") return log.info( diff --git a/spruce/session.py b/spruce/session.py index 
aecef2b..922fdf3 100644 --- a/spruce/session.py +++ b/spruce/session.py @@ -77,16 +77,28 @@ class MachineSession: # Scan list (paginated) # ------------------------------------------------------------------ - def get_all_scans(self) -> list[dict[str, Any]]: + def get_all_scans( + self, first_page_only: bool = False + ) -> list[dict[str, Any]]: """ - Fetch the complete scan list across all pages. + Fetch the scan list from the RootView table. - Uses a large FilterCount (320) to minimise round-trips. - Falls back to repeated pages if the list is longer. + By default, walks all pages. With first_page_only=True, only the first + request is made (FilterCount 320) — enough for a random pick without + paginating a large history. """ + page_size = 320 + if first_page_only: + all_scans = self._fetch_scan_page(0, page_size) + log.info( + "[%s] First page only: %d scan(s) (not paginating).", + self.machine["label"], + len(all_scans), + ) + return all_scans + all_scans: list[dict[str, Any]] = [] start = 0 - page_size = 320 while True: page_scans = self._fetch_scan_page(start, page_size)