diff --git a/README.md b/README.md index cf79c3a..9687890 100644 --- a/README.md +++ b/README.md @@ -97,10 +97,8 @@ python scraper.py --machine "BW3-20 [AMR-26]" --mosaic-only # Download mosaics for all machines python scraper.py --mosaic-only -# One random completed scan per machine: mosaic + all tiles (from machines.txt; uses --list-scans + --scan-id) -# MOSAIC_ONLY=1 ./scripts/sample_random_scans.sh machines.txt # optional: mosaics only, no tiles -# cp scripts/machines.example.txt machines.txt # then edit: one label per line -# ./scripts/sample_random_scans.sh machines.txt +# One random completed scan per machine (helper script): check out branch `testing/sample-runs`, +# then see `scripts/sample_random_scans.sh` and `docs/sample_random_scans_run_progress.md`. # Download all tiles for a specific scan python scraper.py --machine "BW3-20 [AMR-26]" --scan-id 158374 --workers 4 diff --git a/scripts/machines.example.txt b/scripts/machines.example.txt index f3571b5..53c91f7 100644 --- a/scripts/machines.example.txt +++ b/scripts/machines.example.txt @@ -1,6 +1,6 @@ # All RootView minirhizotron machine labels (same set as `machine_metadata` in config.example.yaml). # Copy to the repo root as machines.txt, or: cp scripts/machines.example.txt machines.txt -# sample_random_scans.sh: by default one random scan per line = mosaic + tiles; use MOSAIC_ONLY=1 for mosaics only +# Random-sample helper `scripts/sample_random_scans.sh` lives on branch `testing/sample-runs` only. BW1-4 [AMR-15] BW1-6 [AMR-19] BW1-7 [AMR-18] diff --git a/scripts/sample_random_scans.sh b/scripts/sample_random_scans.sh deleted file mode 100755 index dab2acd..0000000 --- a/scripts/sample_random_scans.sh +++ /dev/null @@ -1,178 +0,0 @@ -#!/usr/bin/env bash -# For each machine label in a text file, pick one random completed scan and download -# it: by default the mosaic and all tiles (same as: --machine "…" --scan-id N). -# For mosaic only (faster, no tile downloads), set: MOSAIC_ONLY=1 -# -# Usage: -# ./scripts/sample_random_scans.sh [PATH_TO_machines.txt] -# Config path defaults to config.yaml in the repo root. Override with: -# CONFIG=/path/to/config.yaml ./scripts/sample_random_scans.sh machines.txt -# Dry-run the download step (listing still does real HTTP to fetch scan list): -# DRY_RUN=1 ./scripts/sample_random_scans.sh machines.txt -# Verbose / debug (extra per-step lines, scan counts from the list step): -# DEBUG=1 ./scripts/sample_random_scans.sh machines.txt -# By default, --list-scans fetches only the first page (one HTTP request, up to -# 320 scans). To paginate the full archive for the random pick (slower when many -# LIST_SCANS_ALL_PAGES=1 ./scripts/sample_random_scans.sh machines.txt -# -# machines.txt: one machine label per line (same as --machine and config machine names). -# See scripts/machines.example.txt - -set -euo pipefail - -REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" -CONFIG="${CONFIG:-$REPO_ROOT/config.yaml}" -MACHINES_FILE="${1:-$REPO_ROOT/machines.txt}" -SCRAPER=(python3 "$REPO_ROOT/scraper.py" --config "$CONFIG") - -log() { echo "[sample_random_scans] $*" >&2; } -log_debug() { - if [[ -n "${DEBUG:-}" ]]; then - echo "[sample_random_scans] debug: $*" >&2 - fi -} - -if [[ ! -f "$MACHINES_FILE" ]]; then - log "error: file not found: $MACHINES_FILE" - log "Create it with one machine label per line, or: cp scripts/machines.example.txt machines.txt" - exit 1 -fi - -if [[ ! -f "$CONFIG" ]]; then - log "error: config not found: $CONFIG" - exit 1 -fi - -# Non-empty, non-comment lines (same rules as the main loop) -TOTAL_MACHINES="$( - grep -v '^[[:space:]]*#' "$MACHINES_FILE" | grep -c -v '^[[:space:]]*$' || true -)" -if [[ -z "$TOTAL_MACHINES" || "$TOTAL_MACHINES" -eq 0 ]]; then - log "error: no machine lines in: $MACHINES_FILE" - exit 1 -fi - -log "starting repo=$REPO_ROOT" -log " config=$CONFIG" -log " machines_file=$MACHINES_FILE (${TOTAL_MACHINES} machine(s) in file)" -if [[ -n "${MOSAIC_ONLY:-}" ]]; then - if [[ -n "${DRY_RUN:-}" ]]; then - log " mode: MOSAIC_ONLY + DRY_RUN (mosaic only, --dry-run on download step)" - else - log " mode: MOSAIC_ONLY=1 (mosaics only, no tiles; use for a lighter sample)" - fi -else - if [[ -n "${DRY_RUN:-}" ]]; then - log " mode: DRY_RUN (list + full scan download use --dry-run; no files written)" - else - log " mode: full scan — mosaic + all tiles (workers from config)" - fi -fi -if [[ -n "${DEBUG:-}" ]]; then - log " DEBUG=1 (extra diagnostics enabled)" -fi -if [[ -n "${LIST_SCANS_ALL_PAGES:-}" ]]; then - log " list step: list-scans = full archive (all pages, slower)" -else - log " list step: list-scans --list-scans-first-page-only (one page, up to 320 IDs)" -fi -log "────────────────────────────────────────" - -export REPO_ROOT CONFIG -[[ -n "${DEBUG:-}" ]] && export DEBUG -[[ -n "${LIST_SCANS_ALL_PAGES:-}" ]] && export LIST_SCANS_ALL_PAGES - -PROCESSED=0 -SKIPPED=0 -IDX=0 - -while IFS= read -r line || [[ -n "${line-}" ]]; do - # trim, strip CR, skip blanks / comments - line="${line//$'\r'/}" - label="${line#"${line%%[![:space:]]*}"}" - label="${label%"${label##*[![:space:]]}"}" - [[ -z "$label" || "$label" == \#* ]] && continue - - IDX=$((IDX + 1)) - log "[$IDX/$TOTAL_MACHINES] machine: $label" - log " status: listing scans (--list-scans) …" - - random_id="$( - REPO_ROOT="$REPO_ROOT" CONFIG="$CONFIG" LABEL="$label" python3 - <<'PY' -import os, random, subprocess, sys - -label = os.environ["LABEL"] -repo = os.environ["REPO_ROOT"] -cfg = os.environ["CONFIG"] -debug = bool(os.environ.get("DEBUG")) -full = bool(os.environ.get("LIST_SCANS_ALL_PAGES")) -scraper = os.path.join(repo, "scraper.py") -if debug: - print( - f"[sample_random_scans] debug: running list-scans for {label!r} " - f"({'all pages' if full else 'first page only'})", - file=sys.stderr, - ) -cmd = [sys.executable, scraper, "--list-scans", "--machine", label, "--config", cfg] -if not full: - cmd.insert(3, "--list-scans-first-page-only") -out = subprocess.check_output( - cmd, - text=True, - stderr=subprocess.STDOUT, -) -ids = [] -for line in out.splitlines(): - line = line.rstrip() - if not line or line.startswith("---") or "Total" in line: - continue - parts = line.split() - if parts and parts[0].isdigit(): - ids.append(parts[0]) -if not ids: - print(f"no scans parsed for {label!r} — check login and output", file=sys.stderr) - sys.exit(1) -if debug: - print( - f"[sample_random_scans] debug: parsed {len(ids)} scan id(s) for {label!r}", - file=sys.stderr, - ) -print(random.choice(ids), end="") -PY - )" || { - log " status: SKIPPED (could not get scan list or pick id)" - SKIPPED=$((SKIPPED + 1)) - continue - } - - log " status: picked random scan_id=$random_id (uniform among IDs from this list step — first page by default, see start banner)" - if [[ -n "${MOSAIC_ONLY:-}" ]]; then - log " status: running scraper: --mosaic-only --scan-id (mosaic only) …" - else - log " status: running scraper: --scan-id (mosaic + tiles) …" - fi - if [[ -n "${DRY_RUN:-}" ]]; then - log " status: (dry-run — no files written for this scan)" - fi - - if [[ -n "${MOSAIC_ONLY:-}" ]]; then - run_cmd=("${SCRAPER[@]}" --mosaic-only --machine "$label" --scan-id "$random_id") - else - run_cmd=("${SCRAPER[@]}" --machine "$label" --scan-id "$random_id") - fi - if [[ -n "${DRY_RUN:-}" ]]; then - run_cmd+=(--dry-run) - fi - if "${run_cmd[@]}"; then - log " status: OK — finished this machine (exit 0)" - PROCESSED=$((PROCESSED + 1)) - else - rc=$? - log " status: FAILED — scraper exit code $rc (stopping; fix or remove this machine and re-run)" - exit "$rc" - fi - log "────────────────────────────────────────" -done < "$MACHINES_FILE" - -log "done. summary: $PROCESSED machine(s) with sampled scan download completed, $SKIPPED skipped, $IDX line(s) processed out of $TOTAL_MACHINES in file." -exit 0