Move sample_random_scans.sh to testing/sample-runs branch only
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@@ -97,10 +97,8 @@ python scraper.py --machine "BW3-20 [AMR-26]" --mosaic-only
|
|||||||
# Download mosaics for all machines
|
# Download mosaics for all machines
|
||||||
python scraper.py --mosaic-only
|
python scraper.py --mosaic-only
|
||||||
|
|
||||||
# One random completed scan per machine: mosaic + all tiles (from machines.txt; uses --list-scans + --scan-id)
|
# One random completed scan per machine (helper script): check out branch `testing/sample-runs`,
|
||||||
# MOSAIC_ONLY=1 ./scripts/sample_random_scans.sh machines.txt # optional: mosaics only, no tiles
|
# then see `scripts/sample_random_scans.sh` and `docs/sample_random_scans_run_progress.md`.
|
||||||
# cp scripts/machines.example.txt machines.txt # then edit: one label per line
|
|
||||||
# ./scripts/sample_random_scans.sh machines.txt
|
|
||||||
|
|
||||||
# Download all tiles for a specific scan
|
# Download all tiles for a specific scan
|
||||||
python scraper.py --machine "BW3-20 [AMR-26]" --scan-id 158374 --workers 4
|
python scraper.py --machine "BW3-20 [AMR-26]" --scan-id 158374 --workers 4
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
# All RootView minirhizotron machine labels (same set as `machine_metadata` in config.example.yaml).
|
# All RootView minirhizotron machine labels (same set as `machine_metadata` in config.example.yaml).
|
||||||
# Copy to the repo root as machines.txt, or: cp scripts/machines.example.txt machines.txt
|
# Copy to the repo root as machines.txt, or: cp scripts/machines.example.txt machines.txt
|
||||||
# sample_random_scans.sh: by default one random scan per line = mosaic + tiles; use MOSAIC_ONLY=1 for mosaics only
|
# Random-sample helper `scripts/sample_random_scans.sh` lives on branch `testing/sample-runs` only.
|
||||||
BW1-4 [AMR-15]
|
BW1-4 [AMR-15]
|
||||||
BW1-6 [AMR-19]
|
BW1-6 [AMR-19]
|
||||||
BW1-7 [AMR-18]
|
BW1-7 [AMR-18]
|
||||||
|
|||||||
@@ -1,178 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
# For each machine label in a text file, pick one random completed scan and download
|
|
||||||
# it: by default the mosaic and all tiles (same as: --machine "…" --scan-id N).
|
|
||||||
# For mosaic only (faster, no tile downloads), set: MOSAIC_ONLY=1
|
|
||||||
#
|
|
||||||
# Usage:
|
|
||||||
# ./scripts/sample_random_scans.sh [PATH_TO_machines.txt]
|
|
||||||
# Config path defaults to config.yaml in the repo root. Override with:
|
|
||||||
# CONFIG=/path/to/config.yaml ./scripts/sample_random_scans.sh machines.txt
|
|
||||||
# Dry-run the download step (listing still does real HTTP to fetch scan list):
|
|
||||||
# DRY_RUN=1 ./scripts/sample_random_scans.sh machines.txt
|
|
||||||
# Verbose / debug (extra per-step lines, scan counts from the list step):
|
|
||||||
# DEBUG=1 ./scripts/sample_random_scans.sh machines.txt
|
|
||||||
# By default, --list-scans fetches only the first page (one HTTP request, up to
|
|
||||||
# 320 scans). To paginate the full archive for the random pick (slower when many scans exist), set:
|
|
||||||
# LIST_SCANS_ALL_PAGES=1 ./scripts/sample_random_scans.sh machines.txt
|
|
||||||
#
|
|
||||||
# machines.txt: one machine label per line (same as --machine and config machine names).
|
|
||||||
# See scripts/machines.example.txt
|
|
||||||
|
|
||||||
# Strict mode: abort on unhandled errors, on unset variables, and on a failure
# in any stage of a pipeline.
set -euo pipefail

# Repo root is the parent of the directory holding this script.
# CDPATH is cleared for the cd: when the script is invoked through a relative
# path and CDPATH is set, cd may resolve the path via CDPATH and print the
# directory on stdout, which would end up inside the captured value.
# `--` protects against a script directory whose name starts with '-'.
REPO_ROOT="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)"

# CONFIG may come from the environment; otherwise config.yaml in the repo root.
CONFIG="${CONFIG:-$REPO_ROOT/config.yaml}"

# First positional argument, or machines.txt in the repo root when omitted.
MACHINES_FILE="${1:-$REPO_ROOT/machines.txt}"

# Base scraper command kept as an array so paths containing spaces stay intact.
SCRAPER=(python3 "$REPO_ROOT/scraper.py" --config "$CONFIG")
|
|
||||||
|
|
||||||
# Write one tagged message line to stderr; all arguments are joined with spaces.
log() {
  printf '%s\n' "[sample_random_scans] $*" >&2
}
|
|
||||||
# Write one tagged debug line to stderr, but only when DEBUG is set to a
# non-empty value; otherwise do nothing and return success.
log_debug() {
  [[ -n "${DEBUG:-}" ]] || return 0
  printf '%s\n' "[sample_random_scans] debug: $*" >&2
}
|
|
||||||
|
|
||||||
# Preflight: the machines list must exist as a regular file; otherwise explain
# how to create it and stop.
[[ -f "$MACHINES_FILE" ]] || {
  log "error: file not found: $MACHINES_FILE"
  log "Create it with one machine label per line, or: cp scripts/machines.example.txt machines.txt"
  exit 1
}

# Preflight: the scraper config must exist as a regular file.
[[ -f "$CONFIG" ]] || {
  log "error: config not found: $CONFIG"
  exit 1
}
|
|
||||||
|
|
||||||
# Count the lines that hold a machine label: first drop comment lines (optional
# leading whitespace, then '#'), then count what is not blank.
# `|| true` is needed because grep exits non-zero when it selects no lines,
# which under errexit + pipefail would otherwise abort before the check below.
TOTAL_MACHINES="$(grep -v '^[[:space:]]*#' "$MACHINES_FILE" | grep -cv '^[[:space:]]*$' || true)"

# An empty or zero count means there is nothing to sample.
[[ -n "$TOTAL_MACHINES" && "$TOTAL_MACHINES" -ne 0 ]] || {
  log "error: no machine lines in: $MACHINES_FILE"
  exit 1
}
|
|
||||||
|
|
||||||
# Start-up banner: echo the resolved paths and the number of machine lines.
log "starting repo=$REPO_ROOT"
log " config=$CONFIG"
log " machines_file=$MACHINES_FILE (${TOTAL_MACHINES} machine(s) in file)"

# Report the download mode. Each half of the selector is non-empty only when
# the corresponding variable is set to a non-empty value.
case "${MOSAIC_ONLY:+mosaic}/${DRY_RUN:+dry}" in
  mosaic/dry)
    log " mode: MOSAIC_ONLY + DRY_RUN (mosaic only, --dry-run on download step)"
    ;;
  mosaic/)
    log " mode: MOSAIC_ONLY=1 (mosaics only, no tiles; use for a lighter sample)"
    ;;
  /dry)
    log " mode: DRY_RUN (list + full scan download use --dry-run; no files written)"
    ;;
  *)
    log " mode: full scan — mosaic + all tiles (workers from config)"
    ;;
esac

# Mention extra diagnostics only when they are switched on.
[[ -z "${DEBUG:-}" ]] || log " DEBUG=1 (extra diagnostics enabled)"

# Report how the scan list will be fetched.
if [[ -z "${LIST_SCANS_ALL_PAGES:-}" ]]; then
  log " list step: list-scans --list-scans-first-page-only (one page, up to 320 IDs)"
else
  log " list step: list-scans = full archive (all pages, slower)"
fi
log "────────────────────────────────────────"
|
|
||||||
|
|
||||||
# Expose the resolved paths to child processes.
export REPO_ROOT CONFIG

# Forward the optional switches to child processes only when they are non-empty.
if [[ -n "${DEBUG:-}" ]]; then
  export DEBUG
fi
if [[ -n "${LIST_SCANS_ALL_PAGES:-}" ]]; then
  export LIST_SCANS_ALL_PAGES
fi

# Run counters: machine lines seen, machines skipped, machines completed.
IDX=0
SKIPPED=0
PROCESSED=0
|
|
||||||
|
|
||||||
# Main loop: for every machine label in MACHINES_FILE, list its scans, pick one
# scan id at random, and run the scraper for that scan.
#
# The file is read on file descriptor 3 rather than stdin. With a plain
# `done < file`, every command inside the loop inherits the file as its stdin,
# so a scraper that reads stdin would consume the remaining machine labels.
# The `|| [[ -n ... ]]` part still processes a last line with no trailing newline.
while IFS= read -r -u 3 line || [[ -n "${line-}" ]]; do
  # Drop carriage returns, then trim leading and trailing whitespace.
  line="${line//$'\r'/}"
  label="${line#"${line%%[![:space:]]*}"}"
  label="${label%"${label##*[![:space:]]}"}"
  # Skip blank lines and comment lines.
  if [[ -z "$label" || "$label" == \#* ]]; then
    continue
  fi

  IDX=$((IDX + 1))
  log "[$IDX/$TOTAL_MACHINES] machine: $label"
  log " status: listing scans (--list-scans) …"

  # List step: the helper prints exactly one scan id on stdout and sends all
  # diagnostics to stderr. A non-zero exit marks this machine as skipped.
  random_id="$(
    REPO_ROOT="$REPO_ROOT" CONFIG="$CONFIG" LABEL="$label" python3 - <<'PY'
import os, random, subprocess, sys

label = os.environ["LABEL"]
repo = os.environ["REPO_ROOT"]
cfg = os.environ["CONFIG"]
debug = bool(os.environ.get("DEBUG"))
full = bool(os.environ.get("LIST_SCANS_ALL_PAGES"))
scraper = os.path.join(repo, "scraper.py")
if debug:
    print(
        f"[sample_random_scans] debug: running list-scans for {label!r} "
        f"({'all pages' if full else 'first page only'})",
        file=sys.stderr,
    )
cmd = [sys.executable, scraper, "--list-scans", "--machine", label, "--config", cfg]
if not full:
    # Placed directly after --list-scans.
    cmd.insert(3, "--list-scans-first-page-only")
try:
    out = subprocess.check_output(
        cmd,
        text=True,
        stderr=subprocess.STDOUT,
    )
except subprocess.CalledProcessError as exc:
    # The scraper's output was captured; replay it so the failure is diagnosable.
    if exc.output:
        sys.stderr.write(exc.output)
        if not exc.output.endswith("\n"):
            sys.stderr.write("\n")
    print(
        f"list-scans failed for {label!r} (exit code {exc.returncode})",
        file=sys.stderr,
    )
    sys.exit(1)
ids = []
for line in out.splitlines():
    line = line.rstrip()
    # Ignore blank lines, separator rows and the totals line.
    if not line or line.startswith("---") or "Total" in line:
        continue
    parts = line.split()
    # A row whose first field is all digits is taken as a scan id.
    if parts and parts[0].isdigit():
        ids.append(parts[0])
if not ids:
    print(f"no scans parsed for {label!r} — check login and output", file=sys.stderr)
    sys.exit(1)
if debug:
    print(
        f"[sample_random_scans] debug: parsed {len(ids)} scan id(s) for {label!r}",
        file=sys.stderr,
    )
print(random.choice(ids), end="")
PY
  )" || {
    log " status: SKIPPED (could not get scan list or pick id)"
    SKIPPED=$((SKIPPED + 1))
    continue
  }

  log " status: picked random scan_id=$random_id (uniform among IDs from this list step — first page by default, see start banner)"

  # Build the download command as an array; MOSAIC_ONLY selects mosaic-only mode.
  if [[ -n "${MOSAIC_ONLY:-}" ]]; then
    log " status: running scraper: --mosaic-only --scan-id (mosaic only) …"
    run_cmd=("${SCRAPER[@]}" --mosaic-only --machine "$label" --scan-id "$random_id")
  else
    log " status: running scraper: --scan-id (mosaic + tiles) …"
    run_cmd=("${SCRAPER[@]}" --machine "$label" --scan-id "$random_id")
  fi
  if [[ -n "${DRY_RUN:-}" ]]; then
    log " status: (dry-run — no files written for this scan)"
    run_cmd+=(--dry-run)
  fi

  # Download step: a scraper failure stops the whole run with the scraper's
  # exit code. Running it as an `if` condition keeps errexit from firing first.
  if "${run_cmd[@]}"; then
    log " status: OK — finished this machine (exit 0)"
    PROCESSED=$((PROCESSED + 1))
  else
    rc=$?
    log " status: FAILED — scraper exit code $rc (stopping; fix or remove this machine and re-run)"
    exit "$rc"
  fi
  log "────────────────────────────────────────"
done 3< "$MACHINES_FILE"
|
|
||||||
|
|
||||||
# Final tally: completed downloads, skipped machines, and machine lines handled
# versus the count taken before the loop. Reaching this point is a success.
summary="done. summary: $PROCESSED machine(s) with sampled scan download completed, $SKIPPED skipped,"
summary+=" $IDX line(s) processed out of $TOTAL_MACHINES in file."
log "$summary"
exit 0
|
|
||||||
Reference in New Issue
Block a user