Add sample_random_scans script and first-page list-scans option

- scripts/sample_random_scans.sh: pick a random scan per machine (default: first list page) and download mosaic and/or tiles
- --list-scans-first-page-only: one HTTP request for scan list (up to 320 IDs)
- scripts/machines.example.txt; .gitignore local machines.txt (copy from example)
- README: document usage
This commit is contained in:
2026-04-26 20:56:52 -04:00
parent 08a29d124a
commit 4118e6e4f0
6 changed files with 236 additions and 7 deletions
+2
View File
@@ -1,4 +1,6 @@
config.yaml
# Local list of machine labels (copy from scripts/machines.example.txt)
machines.txt
archives/
__pycache__/
*.pyc
+9
View File
@@ -81,6 +81,9 @@ python scraper.py --list-machines
# List all scans for a machine
python scraper.py --list-scans --machine "BW3-20 [AMR-26]"
# List only the first table page (one HTTP call; up to 320 scans — the newest/first ones, per server order)
python scraper.py --list-scans --list-scans-first-page-only --machine "BW3-20 [AMR-26]"
# Preview what would be downloaded (dry run)
python scraper.py --machine "BW3-20 [AMR-26]" --dry-run
@@ -94,6 +97,11 @@ python scraper.py --machine "BW3-20 [AMR-26]" --mosaic-only
# Download mosaics for all machines
python scraper.py --mosaic-only
# One random completed scan per machine: mosaic + all tiles (from machines.txt; uses --list-scans + --scan-id)
# MOSAIC_ONLY=1 ./scripts/sample_random_scans.sh machines.txt # optional: mosaics only, no tiles
# cp scripts/machines.example.txt machines.txt # then edit: one label per line
# ./scripts/sample_random_scans.sh machines.txt
# Download all tiles for a specific scan
python scraper.py --machine "BW3-20 [AMR-26]" --scan-id 158374 --workers 4
@@ -115,6 +123,7 @@ python scraper.py --machine "BW3-20 [AMR-26]" --scan-id 158374 --workers 4
| `--recheck` | Scan archive for zero-byte/missing tiles and mosaics; remove bad entries from `.progress.json` so they re-download on next run |
| `--list-machines` | Print all machines and exit |
| `--list-scans` | Print all scans for `--machine` and exit |
| `--list-scans-first-page-only` | With `--list-scans`: a single list request (up to 320 scans) instead of paginating the full history |
| `--verbose` / `-v` | Debug logging |
### `config.yaml` (optional keys)
+15
View File
@@ -0,0 +1,15 @@
# All RootView minirhizotron machine labels (same set as `machine_metadata` in config.example.yaml).
# Copy to the repo root as machines.txt, e.g.: cp scripts/machines.example.txt machines.txt
# sample_random_scans.sh: by default one random scan per line = mosaic + tiles; use MOSAIC_ONLY=1 for mosaics only
BW1-4 [AMR-15]
BW1-6 [AMR-19]
BW1-7 [AMR-18]
BW2-8 [AMR-25]
BW2-10 [AMR-22]
BW2-11 [AMR-23]
BW2-13 [AMR-24]
BW3-16 [AMR-16]
BW3-17 [AMR-20]
BW3-19 [AMR-21]
BW3-20 [AMR-26]
BW3-21 [AMR-17]
+178
View File
@@ -0,0 +1,178 @@
#!/usr/bin/env bash
# For each machine label in a text file, pick one random completed scan and download
# it: by default the mosaic and all tiles (same as: --machine "…" --scan-id N).
# For mosaic only (faster, no tile downloads), set: MOSAIC_ONLY=1
#
# Usage:
# ./scripts/sample_random_scans.sh [PATH_TO_machines.txt]
# Config path defaults to config.yaml in the repo root. Override with:
# CONFIG=/path/to/config.yaml ./scripts/sample_random_scans.sh machines.txt
# Dry-run the download step (listing still does real HTTP to fetch scan list):
# DRY_RUN=1 ./scripts/sample_random_scans.sh machines.txt
# Verbose / debug (extra per-step lines, scan counts from the list step):
# DEBUG=1 ./scripts/sample_random_scans.sh machines.txt
# By default, --list-scans fetches only the first page (one HTTP request, up to
# 320 scans). To paginate the full archive for the random pick (slower when a
# machine has many scans), set:
# LIST_SCANS_ALL_PAGES=1 ./scripts/sample_random_scans.sh machines.txt
#
# machines.txt: one machine label per line (same as --machine and config machine names).
# See scripts/machines.example.txt
# Strict mode: abort on command failure, on use of an unset variable, and on a
# failure anywhere inside a pipeline.
set -euo pipefail

# The repo root is the parent of the directory that contains this script.
script_path="${BASH_SOURCE[0]}"
REPO_ROOT="$(cd "$(dirname "$script_path")/.." && pwd)"

# CONFIG may be supplied through the environment; otherwise use the repo default.
: "${CONFIG:=$REPO_ROOT/config.yaml}"

# The first positional argument (if non-empty) names the machines file.
if [[ -n "${1:-}" ]]; then
  MACHINES_FILE="$1"
else
  MACHINES_FILE="$REPO_ROOT/machines.txt"
fi

# Base scraper invocation, kept as an array so paths with spaces stay intact.
SCRAPER=(
  python3
  "$REPO_ROOT/scraper.py"
  --config "$CONFIG"
)
# log MESSAGE...
# Write a tagged status line to stderr, leaving stdout free for captured values.
log() {
  printf '%s\n' "[sample_random_scans] $*" >&2
}

# log_debug MESSAGE...
# Same as log, with a "debug:" marker; silent unless DEBUG is set to a
# non-empty value.
log_debug() {
  [[ -z "${DEBUG:-}" ]] && return 0
  printf '%s\n' "[sample_random_scans] debug: $*" >&2
}
# Both input files must exist before any listing or download work starts.
if ! [[ -f "$MACHINES_FILE" ]]; then
  log "error: file not found: $MACHINES_FILE"
  log "Create it with one machine label per line, or: cp scripts/machines.example.txt machines.txt"
  exit 1
fi

if ! [[ -f "$CONFIG" ]]; then
  log "error: config not found: $CONFIG"
  exit 1
fi

# Count the usable lines: everything that is neither a comment nor blank
# (the same rules the main loop applies). grep -c exits non-zero when the count
# is 0, hence the `|| true` so strict mode does not abort here.
TOTAL_MACHINES="$(grep -c -v -E '^[[:space:]]*(#|$)' "$MACHINES_FILE" || true)"

if [[ -z "$TOTAL_MACHINES" ]] || [[ "$TOTAL_MACHINES" -eq 0 ]]; then
  log "error: no machine lines in: $MACHINES_FILE"
  exit 1
fi
# Start-up banner: report the resolved paths and which environment switches are
# in effect, so the log of a run records how it was configured.
log "starting repo=$REPO_ROOT"
log " config=$CONFIG"
log " machines_file=$MACHINES_FILE (${TOTAL_MACHINES} machine(s) in file)"

# Download mode. DRY_RUN only affects the download step: the list step is never
# passed --dry-run and always performs a real HTTP request.
if [[ -n "${MOSAIC_ONLY:-}" ]]; then
  if [[ -n "${DRY_RUN:-}" ]]; then
    log " mode: MOSAIC_ONLY + DRY_RUN (mosaic only, --dry-run on download step)"
  else
    log " mode: MOSAIC_ONLY=1 (mosaics only, no tiles; use for a lighter sample)"
  fi
else
  if [[ -n "${DRY_RUN:-}" ]]; then
    log " mode: DRY_RUN (list step does real HTTP; full scan download uses --dry-run; no files written)"
  else
    log " mode: full scan — mosaic + all tiles (workers from config)"
  fi
fi

if [[ -n "${DEBUG:-}" ]]; then
  log " DEBUG=1 (extra diagnostics enabled)"
fi

# List mode: first page only (default) or the whole paginated archive.
if [[ -n "${LIST_SCANS_ALL_PAGES:-}" ]]; then
  log " list step: list-scans = full archive (all pages, slower)"
else
  log " list step: list-scans --list-scans-first-page-only (one page, up to 320 IDs)"
fi
log "────────────────────────────────────────"

# Make the settings visible to child processes. DEBUG and LIST_SCANS_ALL_PAGES
# are read from the environment by the embedded Python list helper; they are
# exported only when set, so they stay absent otherwise.
export REPO_ROOT CONFIG
[[ -n "${DEBUG:-}" ]] && export DEBUG
[[ -n "${LIST_SCANS_ALL_PAGES:-}" ]] && export LIST_SCANS_ALL_PAGES
# Counters for the final summary line.
PROCESSED=0
SKIPPED=0
IDX=0

# Main loop: one iteration per usable line of the machines file.
#
# The file is read on fd 3 rather than stdin, so child processes (the scraper in
# particular) keep the script's own stdin and cannot swallow the remaining
# machine lines if they ever read from it.
# `|| [[ -n "${line-}" ]]` keeps a final line that has no trailing newline.
while IFS= read -r -u 3 line || [[ -n "${line-}" ]]; do
  # Strip CRs (Windows line endings), then trim leading and trailing whitespace.
  line="${line//$'\r'/}"
  label="${line#"${line%%[![:space:]]*}"}"
  label="${label%"${label##*[![:space:]]}"}"
  # Skip blank lines and comment lines.
  [[ -z "$label" || "$label" == \#* ]] && continue

  IDX=$((IDX + 1))
  log "[$IDX/$TOTAL_MACHINES] machine: $label"
  log " status: listing scans (--list-scans) …"

  # Run `scraper.py --list-scans` through a small Python helper that parses the
  # table and prints exactly one randomly chosen scan ID on stdout. Everything
  # else the helper says goes to stderr so it cannot pollute $random_id.
  random_id="$(
    REPO_ROOT="$REPO_ROOT" CONFIG="$CONFIG" LABEL="$label" python3 - <<'PY'
import os
import random
import subprocess
import sys

label = os.environ["LABEL"]
repo = os.environ["REPO_ROOT"]
cfg = os.environ["CONFIG"]
debug = bool(os.environ.get("DEBUG"))
full = bool(os.environ.get("LIST_SCANS_ALL_PAGES"))
scraper = os.path.join(repo, "scraper.py")

if debug:
    print(
        f"[sample_random_scans] debug: running list-scans for {label!r} "
        f"({'all pages' if full else 'first page only'})",
        file=sys.stderr,
    )

cmd = [sys.executable, scraper, "--list-scans"]
if not full:
    cmd.append("--list-scans-first-page-only")
cmd += ["--machine", label, "--config", cfg]

try:
    out = subprocess.check_output(cmd, text=True, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as exc:
    # Show what the scraper printed (e.g. a login failure); otherwise the only
    # diagnostic would be a traceback with the captured output thrown away.
    sys.stderr.write(exc.output or "")
    print(
        f"list-scans failed for {label!r} (exit code {exc.returncode})",
        file=sys.stderr,
    )
    sys.exit(1)

# Scan rows are the lines whose first column is a bare integer (the scan ID).
# The header, the dashed separator and the "Total: …" line all fail that test,
# so no substring filter is needed (one on "Total" would also drop scans whose
# name happens to contain that word).
ids = []
for row in out.splitlines():
    fields = row.split()
    if fields and fields[0].isdigit():
        ids.append(fields[0])

if not ids:
    # Print the scraper output so the hint below is actionable.
    sys.stderr.write(out)
    print(f"no scans parsed for {label!r} — check login and output", file=sys.stderr)
    sys.exit(1)

if debug:
    print(
        f"[sample_random_scans] debug: parsed {len(ids)} scan id(s) for {label!r}",
        file=sys.stderr,
    )

print(random.choice(ids), end="")
PY
  )" || {
    # A listing failure is not fatal: note it and move on to the next machine.
    log " status: SKIPPED (could not get scan list or pick id)"
    SKIPPED=$((SKIPPED + 1))
    continue
  }
  log " status: picked random scan_id=$random_id (uniform among IDs from this list step — first page by default, see start banner)"

  # Assemble the download command; argument order matches a manual invocation.
  run_cmd=("${SCRAPER[@]}")
  if [[ -n "${MOSAIC_ONLY:-}" ]]; then
    log " status: running scraper: --mosaic-only --scan-id (mosaic only) …"
    run_cmd+=(--mosaic-only)
  else
    log " status: running scraper: --scan-id (mosaic + tiles) …"
  fi
  run_cmd+=(--machine "$label" --scan-id "$random_id")
  if [[ -n "${DRY_RUN:-}" ]]; then
    log " status: (dry-run — no files written for this scan)"
    run_cmd+=(--dry-run)
  fi

  # Unlike a listing failure, a download failure stops the whole run and the
  # scraper's exit code is propagated.
  if "${run_cmd[@]}"; then
    log " status: OK — finished this machine (exit 0)"
    PROCESSED=$((PROCESSED + 1))
  else
    rc=$?
    log " status: FAILED — scraper exit code $rc (stopping; fix or remove this machine and re-run)"
    exit "$rc"
  fi
  log "────────────────────────────────────────"
done 3< "$MACHINES_FILE"

log "done. summary: $PROCESSED machine(s) with sampled scan download completed, $SKIPPED skipped, $IDX line(s) processed out of $TOTAL_MACHINES in file."
exit 0
+15 -2
View File
@@ -105,6 +105,14 @@ def parse_args() -> argparse.Namespace:
action="store_true",
help="Print all scans for --machine and exit",
)
p.add_argument(
"--list-scans-first-page-only",
action="store_true",
help=(
"With --list-scans: only fetch the first list page (up to 320 scans) "
"— one HTTP request, no pagination"
),
)
p.add_argument(
"--recheck",
action="store_true",
@@ -134,6 +142,9 @@ def main() -> None:
if args.verbose:
logging.getLogger().setLevel(logging.DEBUG)
if args.list_scans_first_page_only and not args.list_scans:
sys.exit("--list-scans-first-page-only requires --list-scans")
# --list-machines doesn't need credentials
if args.list_machines:
base_url = "http://205.149.147.131:8010/"
@@ -213,7 +224,8 @@ def main() -> None:
sess = MachineSession(machines[0], config)
if not sess.login():
sys.exit("Login failed.")
scans = sess.get_all_scans()
first_only = bool(args.list_scans_first_page_only)
scans = sess.get_all_scans(first_page_only=first_only)
print(f"{'ID':>8} {'Date':<22} {'Name':<40} {'Status'}")
print("-" * 85)
for sc in scans:
@@ -221,7 +233,8 @@ def main() -> None:
f"{sc['scan_id']:>8} {sc.get('scan_time', ''):<22} "
f"{sc.get('name', ''):<40} {sc.get('status', '')}"
)
print(f"\nTotal: {len(scans)} scans")
total_note = " (first page only — not full archive)" if first_only else ""
print(f"\nTotal: {len(scans)} scans{total_note}")
return
log.info(
+17 -5
View File
@@ -77,16 +77,28 @@ class MachineSession:
# Scan list (paginated)
# ------------------------------------------------------------------
def get_all_scans(self) -> list[dict[str, Any]]:
def get_all_scans(
self, first_page_only: bool = False
) -> list[dict[str, Any]]:
"""
Fetch the complete scan list across all pages.
Fetch the scan list from the RootView table.
Uses a large FilterCount (320) to minimise round-trips.
Falls back to repeated pages if the list is longer.
By default, walks all pages. With first_page_only=True, only the first
request is made (FilterCount 320) — enough for a random pick without
paginating a large history.
"""
page_size = 320
if first_page_only:
all_scans = self._fetch_scan_page(0, page_size)
log.info(
"[%s] First page only: %d scan(s) (not paginating).",
self.machine["label"],
len(all_scans),
)
return all_scans
all_scans: list[dict[str, Any]] = []
start = 0
page_size = 320
while True:
page_scans = self._fetch_scan_page(start, page_size)