#!/usr/bin/env bash
# For each machine label in a text file, pick one random completed scan and download
# it: by default the mosaic and all tiles (same as: --machine "…" --scan-id N).
# For mosaic only (faster, no tile downloads), set: MOSAIC_ONLY=1
#
# Usage:
#   ./scripts/sample_random_scans.sh [PATH_TO_machines.txt]
# The machines file defaults to machines.txt in the repo root.
# Config path defaults to config.yaml in the repo root. Override with:
#   CONFIG=/path/to/config.yaml ./scripts/sample_random_scans.sh machines.txt
# Dry-run the download step (listing still does real HTTP to fetch scan list):
#   DRY_RUN=1 ./scripts/sample_random_scans.sh machines.txt
# Verbose / debug (extra per-step lines, scan counts from the list step):
#   DEBUG=1 ./scripts/sample_random_scans.sh machines.txt
# By default, --list-scans fetches only the first page (one HTTP request, up to
# 320 scans). To paginate the full archive for the random pick (slower when many
# scans exist), set:
#   LIST_SCANS_ALL_PAGES=1 ./scripts/sample_random_scans.sh machines.txt
#
# machines.txt: one machine label per line (same as --machine and config machine names).
# Blank lines and lines starting with '#' are ignored.
# See scripts/machines.example.txt
#
# Exit status: 0 when every non-skipped machine downloaded successfully; 1 on a
# missing/empty machines file or missing config; otherwise the exit code of the
# first failing download (the run stops there). Machines whose scan list cannot
# be fetched or parsed are counted as skipped and do not fail the run.

set -euo pipefail

REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
CONFIG="${CONFIG:-$REPO_ROOT/config.yaml}"
MACHINES_FILE="${1:-$REPO_ROOT/machines.txt}"
SCRAPER=(python3 "$REPO_ROOT/scraper.py" --config "$CONFIG")

# Write a prefixed status line to stderr (stdout is left free for captured data).
log() { printf '%s\n' "[sample_random_scans] $*" >&2; }

# Same as log, but only when DEBUG is set to a non-empty value.
log_debug() {
  if [[ -n "${DEBUG:-}" ]]; then
    printf '%s\n' "[sample_random_scans] debug: $*" >&2
  fi
}

if [[ ! -f "$MACHINES_FILE" ]]; then
  log "error: file not found: $MACHINES_FILE"
  log "Create it with one machine label per line, or: cp scripts/machines.example.txt machines.txt"
  exit 1
fi
if [[ ! -f "$CONFIG" ]]; then
  log "error: config not found: $CONFIG"
  exit 1
fi

# Non-empty, non-comment lines (same rules as the main loop).
# grep exits 1 when nothing matches; "|| true" keeps that from tripping
# set -e / pipefail so the explicit zero check below can report it.
TOTAL_MACHINES="$(
  grep -v '^[[:space:]]*#' "$MACHINES_FILE" | grep -c -v '^[[:space:]]*$' || true
)"
if [[ -z "$TOTAL_MACHINES" || "$TOTAL_MACHINES" -eq 0 ]]; then
  log "error: no machine lines in: $MACHINES_FILE"
  exit 1
fi

log "starting repo=$REPO_ROOT"
log " config=$CONFIG"
log " machines_file=$MACHINES_FILE (${TOTAL_MACHINES} machine(s) in file)"
if [[ -n "${MOSAIC_ONLY:-}" ]]; then
  if [[ -n "${DRY_RUN:-}" ]]; then
    log " mode: MOSAIC_ONLY + DRY_RUN (mosaic only, --dry-run on download step)"
  else
    log " mode: MOSAIC_ONLY=1 (mosaics only, no tiles; use for a lighter sample)"
  fi
else
  if [[ -n "${DRY_RUN:-}" ]]; then
    # Only the download command receives --dry-run; the list step always runs for real.
    log " mode: DRY_RUN (full scan download uses --dry-run; list step still does real HTTP; no files written)"
  else
    log " mode: full scan — mosaic + all tiles (workers from config)"
  fi
fi
if [[ -n "${DEBUG:-}" ]]; then
  log " DEBUG=1 (extra diagnostics enabled)"
fi
if [[ -n "${LIST_SCANS_ALL_PAGES:-}" ]]; then
  log " list step: list-scans = full archive (all pages, slower)"
else
  log " list step: list-scans --list-scans-first-page-only (one page, up to 320 IDs)"
fi
log "────────────────────────────────────────"

# The embedded Python helper reads these from its environment.
export REPO_ROOT CONFIG
if [[ -n "${DEBUG:-}" ]]; then
  export DEBUG
fi
if [[ -n "${LIST_SCANS_ALL_PAGES:-}" ]]; then
  export LIST_SCANS_ALL_PAGES
fi

PROCESSED=0
SKIPPED=0
IDX=0

# The machines file is read on fd 3 rather than stdin so that the scraper
# processes started inside the loop cannot consume the remaining machine lines
# and keep the caller's stdin (e.g. a terminal) for themselves.
# "|| [[ -n ... ]]" handles a final line with no trailing newline.
while IFS= read -r line <&3 || [[ -n "${line-}" ]]; do
  # trim, strip CR, skip blanks / comments
  line="${line//$'\r'/}"
  label="${line#"${line%%[![:space:]]*}"}"
  label="${label%"${label##*[![:space:]]}"}"
  if [[ -z "$label" || "$label" == \#* ]]; then
    continue
  fi

  IDX=$((IDX + 1))
  log "[$IDX/$TOTAL_MACHINES] machine: $label"
  log " status: listing scans (--list-scans) …"

  # The helper prints exactly one scan id on stdout (captured here); all of its
  # diagnostics go to stderr. Any non-zero exit marks this machine as skipped.
  random_id="$(
    REPO_ROOT="$REPO_ROOT" CONFIG="$CONFIG" LABEL="$label" python3 - <<'PY'
import os
import random
import subprocess
import sys

label = os.environ["LABEL"]
repo = os.environ["REPO_ROOT"]
cfg = os.environ["CONFIG"]
debug = bool(os.environ.get("DEBUG"))
full = bool(os.environ.get("LIST_SCANS_ALL_PAGES"))
scraper = os.path.join(repo, "scraper.py")

if debug:
    scope = "all pages" if full else "first page only"
    print(
        f"[sample_random_scans] debug: running list-scans for {label!r} ({scope})",
        file=sys.stderr,
    )

cmd = [sys.executable, scraper, "--list-scans"]
if not full:
    cmd.append("--list-scans-first-page-only")
cmd += ["--machine", label, "--config", cfg]

try:
    # stderr is merged into stdout, so every line the scraper emits is scanned below.
    out = subprocess.check_output(cmd, text=True, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as exc:
    # Replay the captured scraper output; otherwise the reason for the failure is lost.
    if exc.output:
        sys.stderr.write(exc.output)
        if not exc.output.endswith("\n"):
            sys.stderr.write("\n")
    print(
        f"list-scans failed for {label!r} (exit code {exc.returncode})",
        file=sys.stderr,
    )
    sys.exit(1)

# Keep the first token of every line that starts with an all-digit field,
# ignoring separator lines and any line mentioning "Total".
ids = []
for row in out.splitlines():
    row = row.rstrip()
    if not row or row.startswith("---") or "Total" in row:
        continue
    fields = row.split()
    if fields and fields[0].isdigit():
        ids.append(fields[0])

if not ids:
    print(f"no scans parsed for {label!r} — check login and output", file=sys.stderr)
    sys.exit(1)

if debug:
    print(
        f"[sample_random_scans] debug: parsed {len(ids)} scan id(s) for {label!r}",
        file=sys.stderr,
    )

print(random.choice(ids), end="")
PY
  )" || {
    log " status: SKIPPED (could not get scan list or pick id)"
    SKIPPED=$((SKIPPED + 1))
    continue
  }

  log " status: picked random scan_id=$random_id (uniform among IDs from this list step — first page by default, see start banner)"

  if [[ -n "${MOSAIC_ONLY:-}" ]]; then
    log " status: running scraper: --mosaic-only --scan-id (mosaic only) …"
    run_cmd=("${SCRAPER[@]}" --mosaic-only --machine "$label" --scan-id "$random_id")
  else
    log " status: running scraper: --scan-id (mosaic + tiles) …"
    run_cmd=("${SCRAPER[@]}" --machine "$label" --scan-id "$random_id")
  fi
  if [[ -n "${DRY_RUN:-}" ]]; then
    log " status: (dry-run — no files written for this scan)"
    run_cmd+=(--dry-run)
  fi
  log_debug "download command: $(printf '%q ' "${run_cmd[@]}")"

  if "${run_cmd[@]}"; then
    log " status: OK — finished this machine (exit 0)"
    PROCESSED=$((PROCESSED + 1))
  else
    # In the else branch $? still holds the exit status of the tested command.
    rc=$?
    log " status: FAILED — scraper exit code $rc (stopping; fix or remove this machine and re-run)"
    exit "$rc"
  fi
  log "────────────────────────────────────────"
done 3< "$MACHINES_FILE"

log "done. summary: $PROCESSED machine(s) with sampled scan download completed, $SKIPPED skipped, $IDX line(s) processed out of $TOTAL_MACHINES in file."
exit 0