#!/usr/bin/env bash
# For each machine label in a text file, pick one random completed scan and download
# it: by default the mosaic and all tiles (same as: --machine "…" --scan-id N).
# For mosaic only (faster, no tile downloads), set: MOSAIC_ONLY=1
#
# Usage:
#   ./scripts/sample_random_scans.sh [PATH_TO_machines.txt]
# The machines file defaults to machines.txt in the repo root.
# Config path defaults to config.yaml in the repo root. Override with:
#   CONFIG=/path/to/config.yaml ./scripts/sample_random_scans.sh machines.txt
# Dry-run the download step (listing still does real HTTP to fetch scan list):
#   DRY_RUN=1 ./scripts/sample_random_scans.sh machines.txt
# Verbose / debug (extra per-step lines, scan counts from the list step):
#   DEBUG=1 ./scripts/sample_random_scans.sh machines.txt
# By default, --list-scans fetches only the first page (one HTTP request, up to
# 320 scans). To paginate the full archive for the random pick (slower when
# there are many scans), set:
#   LIST_SCANS_ALL_PAGES=1 ./scripts/sample_random_scans.sh machines.txt
#
# machines.txt: one machine label per line (same as --machine and config machine names).
# Blank lines and lines whose first non-blank character is '#' are ignored.
# See scripts/machines.example.txt
#
# Exit status:
#   0   every machine was either downloaded or skipped (list step failed)
#   1   machines file or config missing, or no machine lines in the file
#   N   exit code of the scraper when a download step fails (the run stops there)

set -euo pipefail

REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
CONFIG="${CONFIG:-$REPO_ROOT/config.yaml}"
MACHINES_FILE="${1:-$REPO_ROOT/machines.txt}"
SCRAPER=(python3 "$REPO_ROOT/scraper.py" --config "$CONFIG")
readonly SEPARATOR="────────────────────────────────────────"

# Write a prefixed status line to stderr (stdout is left free for data).
log() { printf '%s\n' "[sample_random_scans] $*" >&2; }

# Same as log, but only when DEBUG is set to a non-empty value.
log_debug() {
  if [[ -n "${DEBUG:-}" ]]; then
    printf '%s\n' "[sample_random_scans] debug: $*" >&2
  fi
}

#######################################
# Run the scraper's --list-scans step for one machine and pick one scan id
# uniformly at random from the ids found in its output.
# Globals:   REPO_ROOT, CONFIG (read); DEBUG and LIST_SCANS_ALL_PAGES are read
#            from the environment by the embedded Python program.
# Arguments: $1 - machine label (passed to the scraper as --machine)
# Outputs:   the chosen scan id on stdout (no trailing newline);
#            diagnostics, including the captured scraper output on failure,
#            on stderr
# Returns:   0 on success; 1 if the list step fails or no scan id is parsed
#######################################
pick_random_scan_id() {
  local label=$1
  REPO_ROOT="$REPO_ROOT" CONFIG="$CONFIG" LABEL="$label" python3 - <<'PY'
import os
import random
import subprocess
import sys

label = os.environ["LABEL"]
repo = os.environ["REPO_ROOT"]
cfg = os.environ["CONFIG"]
debug = bool(os.environ.get("DEBUG"))
full = bool(os.environ.get("LIST_SCANS_ALL_PAGES"))
scraper = os.path.join(repo, "scraper.py")


def relay(text):
    """Copy captured scraper output to stderr, newline-terminated."""
    if text:
        sys.stderr.write(text if text.endswith("\n") else text + "\n")


if debug:
    print(
        f"[sample_random_scans] debug: running list-scans for {label!r} "
        f"({'all pages' if full else 'first page only'})",
        file=sys.stderr,
    )

# Build the argument list in order instead of inserting by index.
cmd = [sys.executable, scraper, "--list-scans"]
if not full:
    cmd.append("--list-scans-first-page-only")
cmd += ["--machine", label, "--config", cfg]

try:
    out = subprocess.check_output(
        cmd,
        text=True,
        # This process reads its own source from stdin; do not let the
        # scraper inherit that stream.
        stdin=subprocess.DEVNULL,
        stderr=subprocess.STDOUT,
    )
except subprocess.CalledProcessError as exc:
    # The output was captured, so without this the user would never see why
    # the list step failed.
    relay(exc.output)
    print(
        f"list-scans failed for {label!r} (exit code {exc.returncode})",
        file=sys.stderr,
    )
    sys.exit(1)

ids = []
for line in out.splitlines():
    line = line.rstrip()
    # Skip blanks, separator rows and the summary row.
    if not line or line.startswith("---") or "Total" in line:
        continue
    parts = line.split()
    # A scan row is any line whose first whitespace-separated field is numeric.
    if parts and parts[0].isdigit():
        ids.append(parts[0])

if not ids:
    relay(out)
    print(f"no scans parsed for {label!r} — check login and output", file=sys.stderr)
    sys.exit(1)

if debug:
    print(
        f"[sample_random_scans] debug: parsed {len(ids)} scan id(s) for {label!r}",
        file=sys.stderr,
    )
print(random.choice(ids), end="")
PY
}

if [[ ! -f "$MACHINES_FILE" ]]; then
  log "error: file not found: $MACHINES_FILE"
  log "Create it with one machine label per line, or: cp scripts/machines.example.txt machines.txt"
  exit 1
fi

if [[ ! -f "$CONFIG" ]]; then
  log "error: config not found: $CONFIG"
  exit 1
fi

# Non-empty, non-comment lines (same rules as the main loop).
# grep exits 1 when nothing matches, which pipefail would turn into a failure;
# "|| true" keeps the count (possibly 0) so the check below can report it.
TOTAL_MACHINES="$(
  grep -v '^[[:space:]]*#' "$MACHINES_FILE" | grep -c -v '^[[:space:]]*$' || true
)"
if [[ -z "$TOTAL_MACHINES" || "$TOTAL_MACHINES" -eq 0 ]]; then
  log "error: no machine lines in: $MACHINES_FILE"
  exit 1
fi

log "starting repo=$REPO_ROOT"
log " config=$CONFIG"
log " machines_file=$MACHINES_FILE (${TOTAL_MACHINES} machine(s) in file)"
if [[ -n "${MOSAIC_ONLY:-}" ]]; then
  if [[ -n "${DRY_RUN:-}" ]]; then
    log " mode: MOSAIC_ONLY + DRY_RUN (mosaic only, --dry-run on download step)"
  else
    log " mode: MOSAIC_ONLY=1 (mosaics only, no tiles; use for a lighter sample)"
  fi
else
  if [[ -n "${DRY_RUN:-}" ]]; then
    log " mode: DRY_RUN (list + full scan download use --dry-run; no files written)"
  else
    log " mode: full scan — mosaic + all tiles (workers from config)"
  fi
fi
if [[ -n "${DEBUG:-}" ]]; then
  log " DEBUG=1 (extra diagnostics enabled)"
fi
if [[ -n "${LIST_SCANS_ALL_PAGES:-}" ]]; then
  log " list step: list-scans = full archive (all pages, slower)"
else
  log " list step: list-scans --list-scans-first-page-only (one page, up to 320 IDs)"
fi
log "$SEPARATOR"

# The embedded Python program reads these from its environment.
export REPO_ROOT CONFIG
if [[ -n "${DEBUG:-}" ]]; then
  export DEBUG
fi
if [[ -n "${LIST_SCANS_ALL_PAGES:-}" ]]; then
  export LIST_SCANS_ALL_PAGES
fi

PROCESSED=0
SKIPPED=0
IDX=0

# The machines file is read on fd 3, not stdin: commands run inside the loop
# inherit the loop's stdin, and anything that read from it would silently
# consume the remaining machine labels. The "|| [[ -n ... ]]" clause handles a
# last line without a trailing newline.
while IFS= read -r -u 3 line || [[ -n "${line-}" ]]; do
  # trim, strip CR, skip blanks / comments
  line="${line//$'\r'/}"
  label="${line#"${line%%[![:space:]]*}"}"
  label="${label%"${label##*[![:space:]]}"}"
  if [[ -z "$label" || "$label" == \#* ]]; then
    continue
  fi

  IDX=$((IDX + 1))
  log "[$IDX/$TOTAL_MACHINES] machine: $label"
  log " status: listing scans (--list-scans) …"

  # A failed list step skips this machine instead of aborting the whole run.
  if ! random_id="$(pick_random_scan_id "$label" 3<&-)"; then
    log " status: SKIPPED (could not get scan list or pick id)"
    SKIPPED=$((SKIPPED + 1))
    continue
  fi

  log " status: picked random scan_id=$random_id (uniform among IDs from this list step — first page by default, see start banner)"
  if [[ -n "${MOSAIC_ONLY:-}" ]]; then
    log " status: running scraper: --mosaic-only --scan-id (mosaic only) …"
  else
    log " status: running scraper: --scan-id (mosaic + tiles) …"
  fi
  if [[ -n "${DRY_RUN:-}" ]]; then
    log " status: (dry-run — no files written for this scan)"
  fi

  if [[ -n "${MOSAIC_ONLY:-}" ]]; then
    run_cmd=("${SCRAPER[@]}" --mosaic-only --machine "$label" --scan-id "$random_id")
  else
    run_cmd=("${SCRAPER[@]}" --machine "$label" --scan-id "$random_id")
  fi
  if [[ -n "${DRY_RUN:-}" ]]; then
    run_cmd+=(--dry-run)
  fi

  # 3<&- keeps the machines-file descriptor out of the scraper process.
  if "${run_cmd[@]}" 3<&-; then
    log " status: OK — finished this machine (exit 0)"
    PROCESSED=$((PROCESSED + 1))
  else
    rc=$?
    log " status: FAILED — scraper exit code $rc (stopping; fix or remove this machine and re-run)"
    exit "$rc"
  fi
  log "$SEPARATOR"
done 3< "$MACHINES_FILE"

log "done. summary: $PROCESSED machine(s) with sampled scan download completed, $SKIPPED skipped, $IDX line(s) processed out of $TOTAL_MACHINES in file."
exit 0