diff --git a/.gitignore b/.gitignore index cfcb0be..28c541b 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,6 @@ __pycache__/ .DS_Store explore_dumps/ .venv/ +scripts/sync_to_nas.sh +backup/ +.claude/ diff --git a/README.md b/README.md index cf79c3a..9687890 100644 --- a/README.md +++ b/README.md @@ -97,10 +97,8 @@ python scraper.py --machine "BW3-20 [AMR-26]" --mosaic-only # Download mosaics for all machines python scraper.py --mosaic-only -# One random completed scan per machine: mosaic + all tiles (from machines.txt; uses --list-scans + --scan-id) -# MOSAIC_ONLY=1 ./scripts/sample_random_scans.sh machines.txt # optional: mosaics only, no tiles -# cp scripts/machines.example.txt machines.txt # then edit: one label per line -# ./scripts/sample_random_scans.sh machines.txt +# One random completed scan per machine (helper script): check out branch `testing/sample-runs`, +# then see `scripts/sample_random_scans.sh` and `docs/sample_random_scans_run_progress.md`. # Download all tiles for a specific scan python scraper.py --machine "BW3-20 [AMR-26]" --scan-id 158374 --workers 4 diff --git a/docs/mosaic_reconstruction_sample_images/146368_mosaic.jpg b/docs/mosaic_reconstruction_sample_images/146368_mosaic.jpg new file mode 100644 index 0000000..9df60c4 Binary files /dev/null and b/docs/mosaic_reconstruction_sample_images/146368_mosaic.jpg differ diff --git a/docs/mosaic_reconstruction_sample_images/146368_mosaic_reconstructed.jpg b/docs/mosaic_reconstruction_sample_images/146368_mosaic_reconstructed.jpg new file mode 100644 index 0000000..1fc6e67 Binary files /dev/null and b/docs/mosaic_reconstruction_sample_images/146368_mosaic_reconstructed.jpg differ diff --git a/docs/mosaic_reconstruction_sample_images/146368_tile_s1.jpg b/docs/mosaic_reconstruction_sample_images/146368_tile_s1.jpg new file mode 100644 index 0000000..0d4a87b Binary files /dev/null and b/docs/mosaic_reconstruction_sample_images/146368_tile_s1.jpg differ diff --git a/docs/mosaic_reconstruction_sample_images/156875_mosaic.jpg b/docs/mosaic_reconstruction_sample_images/156875_mosaic.jpg new file mode 100644 index 0000000..bbc559b Binary files /dev/null and b/docs/mosaic_reconstruction_sample_images/156875_mosaic.jpg differ diff --git a/docs/mosaic_reconstruction_sample_images/156875_mosaic_reconstructed.jpg b/docs/mosaic_reconstruction_sample_images/156875_mosaic_reconstructed.jpg new file mode 100644 index 0000000..aebcd72 Binary files /dev/null and b/docs/mosaic_reconstruction_sample_images/156875_mosaic_reconstructed.jpg differ diff --git a/docs/mosaic_reconstruction_sample_images/156875_tile_s1.jpg b/docs/mosaic_reconstruction_sample_images/156875_tile_s1.jpg new file mode 100644 index 0000000..b5a18e6 Binary files /dev/null and b/docs/mosaic_reconstruction_sample_images/156875_tile_s1.jpg differ diff --git a/docs/mosaic_reconstruction_sample_images/156957_mosaic.jpg b/docs/mosaic_reconstruction_sample_images/156957_mosaic.jpg new file mode 100644 index 0000000..87694a9 Binary files /dev/null and b/docs/mosaic_reconstruction_sample_images/156957_mosaic.jpg differ diff --git a/docs/mosaic_reconstruction_sample_images/156957_mosaic_reconstructed.jpg b/docs/mosaic_reconstruction_sample_images/156957_mosaic_reconstructed.jpg new file mode 100644 index 0000000..dda903a Binary files /dev/null and b/docs/mosaic_reconstruction_sample_images/156957_mosaic_reconstructed.jpg differ diff --git a/docs/mosaic_reconstruction_sample_images/156957_tile_s1.jpg b/docs/mosaic_reconstruction_sample_images/156957_tile_s1.jpg new file mode 100644 index 0000000..86ff192 Binary files /dev/null and b/docs/mosaic_reconstruction_sample_images/156957_tile_s1.jpg differ diff --git a/docs/mosaic_reconstruction_sample_images/160022_mosaic.jpg b/docs/mosaic_reconstruction_sample_images/160022_mosaic.jpg new file mode 100644 index 0000000..da61577 Binary files /dev/null and b/docs/mosaic_reconstruction_sample_images/160022_mosaic.jpg differ diff --git a/docs/mosaic_reconstruction_sample_images/160022_mosaic_reconstructed.jpg b/docs/mosaic_reconstruction_sample_images/160022_mosaic_reconstructed.jpg new file mode 100644 index 0000000..a020167 Binary files /dev/null and b/docs/mosaic_reconstruction_sample_images/160022_mosaic_reconstructed.jpg differ diff --git a/docs/mosaic_reconstruction_sample_images/160022_tile_s1.jpg b/docs/mosaic_reconstruction_sample_images/160022_tile_s1.jpg new file mode 100644 index 0000000..fdce342 Binary files /dev/null and b/docs/mosaic_reconstruction_sample_images/160022_tile_s1.jpg differ diff --git a/run_metadata_scan.sh b/run_metadata_scan.sh new file mode 100644 index 0000000..bbfe1b0 --- /dev/null +++ b/run_metadata_scan.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +VENV="/tmp/spruce_venv" + +if [[ ! -x "$VENV/bin/python" ]]; then + echo "Setting up venv at $VENV..." + python3 -m venv "$VENV" + "$VENV/bin/python" -m ensurepip --upgrade + "$VENV/bin/pip" install -q -r "$SCRIPT_DIR/requirements.txt" +fi + +echo "Starting metadata-only scan of all machines..." +"$VENV/bin/python" "$SCRIPT_DIR/scraper.py" --metadata-only "$@" diff --git a/scripts/export_machine_metadata.py b/scripts/export_machine_metadata.py new file mode 100644 index 0000000..e5b2e4d --- /dev/null +++ b/scripts/export_machine_metadata.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python3 +""" +Split scans.csv into per-machine metadata CSVs. + +Reads the combined scans.csv produced by the scraper and writes one CSV per +machine containing only the website-sourced metadata columns (no mosaic paths, +download status, or error fields). + +Usage: + python scripts/export_machine_metadata.py + python scripts/export_machine_metadata.py --input archives/scans.csv --output-dir archives/by_machine +""" + +from __future__ import annotations + +import argparse +import csv +import sys +from collections import defaultdict +from pathlib import Path + +METADATA_COLUMNS = [ + "machine", + "machine_id", + "scan_id", + "name", + "scan_time", + "start_x", + "start_y", + "end_x", + "end_y", + "dx", + "dy", + "nx", + "ny", + "total_tiles", + "scan_lines", + "scan_mode", + "start_datetime", + "end_datetime", + "status", + "user", + "disk_space_mb", +] + + +def sanitize_machine_label(label: str) -> str: + return label.replace("[", "").replace("]", "").replace(" ", "_").strip("_") + + +def parse_args() -> argparse.Namespace: + p = argparse.ArgumentParser(description="Split scans.csv into per-machine metadata CSVs.") + p.add_argument( + "--input", + default="archives/scans.csv", + metavar="FILE", + help="Path to scans.csv (default: archives/scans.csv)", + ) + p.add_argument( + "--output-dir", + default="archives/by_machine", + metavar="DIR", + help="Directory for output CSVs (default: archives/by_machine)", + ) + return p.parse_args() + + +def main() -> None: + args = parse_args() + input_path = Path(args.input) + output_dir = Path(args.output_dir) + + if not input_path.exists(): + sys.exit(f"Input file not found: {input_path}") + + with input_path.open(newline="") as fh: + reader = csv.DictReader(fh) + if reader.fieldnames is None: + sys.exit(f"{input_path} appears to be empty.") + + missing = [c for c in METADATA_COLUMNS if c not in reader.fieldnames] + if missing: + sys.exit(f"Expected columns not found in {input_path}: {missing}") + + rows_by_machine: dict[str, list[dict]] = defaultdict(list) + for row in reader: + rows_by_machine[row["machine"]].append(row) + + output_dir.mkdir(parents=True, exist_ok=True) + + for machine_label, rows in sorted(rows_by_machine.items()): + safe_name = sanitize_machine_label(machine_label) + out_path = output_dir / f"{safe_name}_scans_metadata.csv" + with out_path.open("w", newline="") as fh: + writer = csv.DictWriter(fh, fieldnames=METADATA_COLUMNS, extrasaction="ignore") + writer.writeheader() + writer.writerows(rows) + print(f" {out_path} ({len(rows)} rows)") + + total = sum(len(r) for r in rows_by_machine.values()) + print(f"\n{len(rows_by_machine)} machine(s), {total} total rows → {output_dir}/") + + +if __name__ == "__main__": + main() diff --git a/scripts/machines.example.txt b/scripts/machines.example.txt index f3571b5..53c91f7 100644 --- a/scripts/machines.example.txt +++ b/scripts/machines.example.txt @@ -1,6 +1,6 @@ # All RootView minirhizotron machine labels (same set as `machine_metadata` in config.example.yaml). # Copy to the repo root as machines.txt, or: cp scripts/machines.example.txt machines.txt -# sample_random_scans.sh: by default one random scan per line = mosaic + tiles; use MOSAIC_ONLY=1 for mosaics only +# Random-sample helper `scripts/sample_random_scans.sh` lives on branch `testing/sample-runs` only. BW1-4 [AMR-15] BW1-6 [AMR-19] BW1-7 [AMR-18] diff --git a/scripts/mosaic_progress_report.py b/scripts/mosaic_progress_report.py new file mode 100644 index 0000000..4730308 --- /dev/null +++ b/scripts/mosaic_progress_report.py @@ -0,0 +1,381 @@ +#!/usr/bin/env python3 +""" +Report mosaic download progress from archives/scans.csv. + +Output is formatted as Markdown. Add --by-year for a per-machine × +per-year breakdown table. + +Rate/ETA require two calls at least 60 s apart. Mean mosaic size is +sampled from up to 100 already-downloaded files and cached for 1 hour. + +Usage: + python scripts/mosaic_progress_report.py [--archive DIR] [--by-year] +""" + +import argparse +import csv +import json +import os +import random +import sys +from collections import Counter +from datetime import datetime, timezone +from pathlib import Path + +# Year-based viability model derived from BW1-4 observations: +# pre-2019 → kept on server long-term (~100 %) +# 2019-2022 → purged (~ 0 %) +# 2023+ → recent, mostly available (~82 %) +_R_PRE19 = 1.00 +_R_PURGED = 0.00 +_R_RECENT = 0.82 + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _parse_dt(s: str) -> datetime | None: + try: + return datetime.fromisoformat(s.replace("Z", "+00:00")) + except Exception: + return None + + +def _fmt_duration(seconds: float) -> str: + if seconds < 0: + return "?" + h = int(seconds // 3600) + m = int((seconds % 3600) // 60) + if h >= 48: + return f"{h // 24}d {h % 24}h" + if h > 0: + return f"{h}h {m:02d}m" + return f"{m}m {int(seconds % 60):02d}s" + + +def _fmt_size(b: float) -> str: + if b >= 1e12: + return f"{b/1e12:.2f} TB" + if b >= 1e9: + return f"{b/1e9:.2f} GB" + if b >= 1e6: + return f"{b/1e6:.1f} MB" + return f"{b/1e3:.0f} KB" + + +def _md_table(headers: list[str], rows: list[list[str]], *, align: list[str] | None = None) -> str: + """Render a Markdown table. align values: 'l', 'r', 'c' (default 'l').""" + if align is None: + align = ["l"] * len(headers) + sep_map = {"l": ":---", "r": "---:", "c": ":---:"} + + def row_str(cells: list[str]) -> str: + return "| " + " | ".join(cells) + " |" + + lines = [ + row_str(headers), + row_str([sep_map.get(a, ":---") for a in align]), + ] + for r in rows: + lines.append(row_str(r)) + return "\n".join(lines) + + +def _sample_mean_bytes(rows: list[dict], cache: dict, max_sample: int = 100) -> float | None: + cached_mean = cache.get("mean_bytes") + cached_n = cache.get("sample_n", 0) + cached_ts = _parse_dt(cache.get("size_ts", "")) + now = datetime.now(timezone.utc) + if ( + cached_mean and cached_ts + and (now - cached_ts).total_seconds() < 3600 + and cached_n >= min(max_sample, len(rows)) + ): + return float(cached_mean) + + sample = random.sample(rows, min(max_sample, len(rows))) + sizes = [] + for row in sample: + p = row.get("mosaic_local_path", "") + if p: + try: + sz = os.path.getsize(p) + if sz > 0: + sizes.append(sz) + except OSError: + pass + if not sizes: + return None + mean = sum(sizes) / len(sizes) + cache["mean_bytes"] = mean + cache["sample_n"] = len(sizes) + cache["size_ts"] = now.isoformat() + return mean + + +def _expected_remaining(pending_rows: list[dict]) -> float: + count = 0.0 + for row in pending_rows: + yr = row.get("scan_time", "")[:4] + if yr < "2019": + count += _R_PRE19 + elif yr <= "2022": + count += _R_PURGED + else: + count += _R_RECENT + return count + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--archive", default="archives") + parser.add_argument( + "--by-year", action="store_true", + help="Add a per-machine × per-year done/failed breakdown table", + ) + args = parser.parse_args() + + archive = Path(args.archive) + scans_csv = archive / "scans.csv" + progress_json = archive / ".progress.json" + rate_cache_path = archive / ".mosaic_rate_cache.json" + + if not scans_csv.exists(): + sys.exit(f"scans.csv not found: {scans_csv}") + + # --- Load & deduplicate (last row per machine+scan_id) --- + latest: dict[tuple[str, str], dict] = {} + with open(scans_csv, newline="", encoding="utf-8") as f: + for row in csv.DictReader(f): + key = (row.get("machine", ""), row.get("scan_id", "")) + latest[key] = row + + by_machine: dict[str, Counter] = {} + # machine -> year -> Counter(status -> count) + by_machine_year: dict[str, dict[str, Counter]] = {} + total_counts: Counter = Counter() + downloaded_rows: list[dict] = [] + pending_rows: list[dict] = [] + + for (_m, _sid), row in latest.items(): + status = row.get("mosaic_download_status", "") + m = row.get("machine", "") + yr = (row.get("scan_time") or "")[:4] or "????" + by_machine.setdefault(m, Counter())[status] += 1 + by_machine_year.setdefault(m, {}).setdefault(yr, Counter())[status] += 1 + total_counts[status] += 1 + if status == "downloaded": + downloaded_rows.append(row) + elif status == "skipped_metadata_only": + pending_rows.append(row) + + total = sum(total_counts.values()) + downloaded = total_counts["downloaded"] + failed = total_counts["failed"] + zero_skipped = total_counts["skipped_zero_disk_space"] + pending = total_counts["skipped_metadata_only"] + processed = downloaded + failed + zero_skipped + attempted = downloaded + failed + now = datetime.now(timezone.utc) + + # --- Elapsed --- + elapsed_str = "" + if progress_json.exists(): + try: + data = json.loads(progress_json.read_text()) + started_at = _parse_dt(data.get("started_at", "")) + if started_at: + elapsed_str = _fmt_duration((now - started_at).total_seconds()) + except Exception: + pass + + # --- Rate cache --- + cache: dict = {} + if rate_cache_path.exists(): + try: + cache = json.loads(rate_cache_path.read_text()) + except Exception: + pass + + # Rolling rate: keep up to 60 snapshots; compute rate from the oldest + # snapshot within the last 30 minutes for a smoothed estimate. + snapshots: list[dict] = cache.get("snapshots", []) + # Prune snapshots older than 30 minutes, but keep at least one + cutoff = now.timestamp() - 1800 + recent = [s for s in snapshots if s.get("ts", 0) >= cutoff] + if not recent and snapshots: + recent = [snapshots[-1]] # always keep one for continuity + + rate_per_sec: float | None = None + rate_window_str = "" + if recent: + oldest = recent[0] + dt = now.timestamp() - oldest["ts"] + dp = processed - oldest["proc"] + if dt >= 60 and dp > 0: + rate_per_sec = dp / dt + window_min = dt / 60 + rate_window_str = f"{window_min:.0f}-min avg" + + # --- Disk space --- + mean_bytes: float | None = None + size_note = "" + if downloaded_rows: + mean_bytes = _sample_mean_bytes(downloaded_rows, cache) + if mean_bytes and cache.get("sample_n"): + size_note = f"mean {_fmt_size(mean_bytes)} × {cache['sample_n']} sampled files" + + dl_bytes: float | None = None + rem_bytes: float | None = None + if mean_bytes: + dl_bytes = downloaded * mean_bytes + rem_bytes = _expected_remaining(pending_rows) * mean_bytes + + # Update cache: append new snapshot, keep last 60 + recent.append({"ts": now.timestamp(), "proc": processed}) + cache["snapshots"] = recent[-60:] + # Keep legacy keys for backwards compat + cache["timestamp"] = now.isoformat() + cache["processed"] = processed + try: + rate_cache_path.write_text(json.dumps(cache)) + except Exception: + pass + + rate_str = eta_str = "" + if rate_per_sec and rate_per_sec > 0: + rate_str = f"{rate_per_sec * 3600:,.0f} scans/hr ({rate_window_str})" + eta_str = _fmt_duration(pending / rate_per_sec) + + # ----------------------------------------------------------------------- + # Output — Markdown + # ----------------------------------------------------------------------- + + ts = datetime.now().strftime("%Y-%m-%d %H:%M") + print(f"# Mosaic download progress — {ts}\n") + print(f"**Archive:** `{archive.resolve()}` ") + meta_parts = [] + if elapsed_str: + meta_parts.append(f"**Elapsed:** {elapsed_str}") + if rate_str: + meta_parts.append(f"**Rate:** {rate_str}") + if eta_str: + meta_parts.append(f"**ETA:** {eta_str}") + if meta_parts: + print(" ".join(meta_parts) + " ") + print() + + # Summary table + summary_rows = [ + ["Downloaded", f"{downloaded:,}", f"{100*downloaded/total:.1f}%"], + ["Failed", f"{failed:,}", f"{100*failed/total:.1f}%"], + ["Skipped (disk=0)",f"{zero_skipped:,}", f"{100*zero_skipped/total:.1f}%"], + ["Pending", f"{pending:,}", f"{100*pending/total:.1f}%"], + ["**Total**", f"**{total:,}**", ""], + ] + if attempted: + summary_rows.append(["**Success rate**", f"**{100*downloaded/attempted:.1f}%**", "*(of attempted)*"]) + print(_md_table(["Metric", "Count", ""], summary_rows, align=["l", "r", "l"])) + print() + + # Disk space + if dl_bytes is not None and rem_bytes is not None: + total_bytes = dl_bytes + rem_bytes + print(f"### Disk space\n") + print(f"_{size_note}_\n") + ds_rows = [ + ["Downloaded so far", _fmt_size(dl_bytes), ""], + ["Estimated remaining", _fmt_size(rem_bytes), "*(model-based)*"], + ["**Grand total**", f"**{_fmt_size(total_bytes)}**", ""], + ] + print(_md_table(["", "Size", ""], ds_rows, align=["l", "r", "l"])) + print() + + # Per-machine breakdown + print("### Per-machine breakdown\n") + machines = sorted(by_machine) + mc_rows = [] + for m in machines: + mc = by_machine[m] + mt = sum(mc.values()) + mc_rows.append([ + m, + f"{mc['downloaded']:,}", + f"{mc['failed']:,}", + f"{mc['skipped_zero_disk_space']:,}", + f"{mc['skipped_metadata_only']:,}", + f"{mt:,}", + ]) + mc_rows.append([ + "**TOTAL**", + f"**{downloaded:,}**", + f"**{failed:,}**", + f"**{zero_skipped:,}**", + f"**{pending:,}**", + f"**{total:,}**", + ]) + print(_md_table( + ["Machine", "Done", "Failed", "Skip0", "Pending", "Total"], + mc_rows, + align=["l", "r", "r", "r", "r", "r"], + )) + + # ----------------------------------------------------------------------- + # --by-year table + # ----------------------------------------------------------------------- + if args.by_year: + print() + print("### Downloads by machine and year\n") + print("*Format: done / failed*\n") + + # Only include years that have at least one downloaded or failed scan + all_years = sorted( + yr for yr in set( + yr for m_data in by_machine_year.values() for yr in m_data + ) + if any( + by_machine_year[m].get(yr, Counter()).get("downloaded", 0) + + by_machine_year[m].get(yr, Counter()).get("failed", 0) > 0 + for m in by_machine_year + ) + ) + + # Totals per year + yr_totals: dict[str, Counter] = {} + for yr in all_years: + yr_totals[yr] = Counter() + for m in machines: + yr_totals[yr] += by_machine_year.get(m, {}).get(yr, Counter()) + + year_rows = [] + for m in machines: + row_cells = [m] + for yr in all_years: + c = by_machine_year.get(m, {}).get(yr, Counter()) + d = c.get("downloaded", 0) + f = c.get("failed", 0) + row_cells.append(f"{d:,} / {f:,}" if (d or f) else "—") + year_rows.append(row_cells) + + # Totals row + total_cells = ["**TOTAL**"] + for yr in all_years: + d = yr_totals[yr].get("downloaded", 0) + f = yr_totals[yr].get("failed", 0) + total_cells.append(f"**{d:,} / {f:,}**" if (d or f) else "—") + year_rows.append(total_cells) + + print(_md_table( + ["Machine"] + all_years, + year_rows, + align=["l"] + ["r"] * len(all_years), + )) + + +if __name__ == "__main__": + main() diff --git a/scripts/sample_random_scans.sh b/scripts/sample_random_scans.sh deleted file mode 100755 index dab2acd..0000000 --- a/scripts/sample_random_scans.sh +++ /dev/null @@ -1,178 +0,0 @@ -#!/usr/bin/env bash -# For each machine label in a text file, pick one random completed scan and download -# it: by default the mosaic and all tiles (same as: --machine "…" --scan-id N). -# For mosaic only (faster, no tile downloads), set: MOSAIC_ONLY=1 -# -# Usage: -# ./scripts/sample_random_scans.sh [PATH_TO_machines.txt] -# Config path defaults to config.yaml in the repo root. Override with: -# CONFIG=/path/to/config.yaml ./scripts/sample_random_scans.sh machines.txt -# Dry-run the download step (listing still does real HTTP to fetch scan list): -# DRY_RUN=1 ./scripts/sample_random_scans.sh machines.txt -# Verbose / debug (extra per-step lines, scan counts from the list step): -# DEBUG=1 ./scripts/sample_random_scans.sh machines.txt -# By default, --list-scans fetches only the first page (one HTTP request, up to -# 320 scans). To paginate the full archive for the random pick (slower when many -# LIST_SCANS_ALL_PAGES=1 ./scripts/sample_random_scans.sh machines.txt -# -# machines.txt: one machine label per line (same as --machine and config machine names). -# See scripts/machines.example.txt - -set -euo pipefail - -REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" -CONFIG="${CONFIG:-$REPO_ROOT/config.yaml}" -MACHINES_FILE="${1:-$REPO_ROOT/machines.txt}" -SCRAPER=(python3 "$REPO_ROOT/scraper.py" --config "$CONFIG") - -log() { echo "[sample_random_scans] $*" >&2; } -log_debug() { - if [[ -n "${DEBUG:-}" ]]; then - echo "[sample_random_scans] debug: $*" >&2 - fi -} - -if [[ ! -f "$MACHINES_FILE" ]]; then - log "error: file not found: $MACHINES_FILE" - log "Create it with one machine label per line, or: cp scripts/machines.example.txt machines.txt" - exit 1 -fi - -if [[ ! -f "$CONFIG" ]]; then - log "error: config not found: $CONFIG" - exit 1 -fi - -# Non-empty, non-comment lines (same rules as the main loop) -TOTAL_MACHINES="$( - grep -v '^[[:space:]]*#' "$MACHINES_FILE" | grep -c -v '^[[:space:]]*$' || true -)" -if [[ -z "$TOTAL_MACHINES" || "$TOTAL_MACHINES" -eq 0 ]]; then - log "error: no machine lines in: $MACHINES_FILE" - exit 1 -fi - -log "starting repo=$REPO_ROOT" -log " config=$CONFIG" -log " machines_file=$MACHINES_FILE (${TOTAL_MACHINES} machine(s) in file)" -if [[ -n "${MOSAIC_ONLY:-}" ]]; then - if [[ -n "${DRY_RUN:-}" ]]; then - log " mode: MOSAIC_ONLY + DRY_RUN (mosaic only, --dry-run on download step)" - else - log " mode: MOSAIC_ONLY=1 (mosaics only, no tiles; use for a lighter sample)" - fi -else - if [[ -n "${DRY_RUN:-}" ]]; then - log " mode: DRY_RUN (list + full scan download use --dry-run; no files written)" - else - log " mode: full scan — mosaic + all tiles (workers from config)" - fi -fi -if [[ -n "${DEBUG:-}" ]]; then - log " DEBUG=1 (extra diagnostics enabled)" -fi -if [[ -n "${LIST_SCANS_ALL_PAGES:-}" ]]; then - log " list step: list-scans = full archive (all pages, slower)" -else - log " list step: list-scans --list-scans-first-page-only (one page, up to 320 IDs)" -fi -log "────────────────────────────────────────" - -export REPO_ROOT CONFIG -[[ -n "${DEBUG:-}" ]] && export DEBUG -[[ -n "${LIST_SCANS_ALL_PAGES:-}" ]] && export LIST_SCANS_ALL_PAGES - -PROCESSED=0 -SKIPPED=0 -IDX=0 - -while IFS= read -r line || [[ -n "${line-}" ]]; do - # trim, strip CR, skip blanks / comments - line="${line//$'\r'/}" - label="${line#"${line%%[![:space:]]*}"}" - label="${label%"${label##*[![:space:]]}"}" - [[ -z "$label" || "$label" == \#* ]] && continue - - IDX=$((IDX + 1)) - log "[$IDX/$TOTAL_MACHINES] machine: $label" - log " status: listing scans (--list-scans) …" - - random_id="$( - REPO_ROOT="$REPO_ROOT" CONFIG="$CONFIG" LABEL="$label" python3 - <<'PY' -import os, random, subprocess, sys - -label = os.environ["LABEL"] -repo = os.environ["REPO_ROOT"] -cfg = os.environ["CONFIG"] -debug = bool(os.environ.get("DEBUG")) -full = bool(os.environ.get("LIST_SCANS_ALL_PAGES")) -scraper = os.path.join(repo, "scraper.py") -if debug: - print( - f"[sample_random_scans] debug: running list-scans for {label!r} " - f"({'all pages' if full else 'first page only'})", - file=sys.stderr, - ) -cmd = [sys.executable, scraper, "--list-scans", "--machine", label, "--config", cfg] -if not full: - cmd.insert(3, "--list-scans-first-page-only") -out = subprocess.check_output( - cmd, - text=True, - stderr=subprocess.STDOUT, -) -ids = [] -for line in out.splitlines(): - line = line.rstrip() - if not line or line.startswith("---") or "Total" in line: - continue - parts = line.split() - if parts and parts[0].isdigit(): - ids.append(parts[0]) -if not ids: - print(f"no scans parsed for {label!r} — check login and output", file=sys.stderr) - sys.exit(1) -if debug: - print( - f"[sample_random_scans] debug: parsed {len(ids)} scan id(s) for {label!r}", - file=sys.stderr, - ) -print(random.choice(ids), end="") -PY - )" || { - log " status: SKIPPED (could not get scan list or pick id)" - SKIPPED=$((SKIPPED + 1)) - continue - } - - log " status: picked random scan_id=$random_id (uniform among IDs from this list step — first page by default, see start banner)" - if [[ -n "${MOSAIC_ONLY:-}" ]]; then - log " status: running scraper: --mosaic-only --scan-id (mosaic only) …" - else - log " status: running scraper: --scan-id (mosaic + tiles) …" - fi - if [[ -n "${DRY_RUN:-}" ]]; then - log " status: (dry-run — no files written for this scan)" - fi - - if [[ -n "${MOSAIC_ONLY:-}" ]]; then - run_cmd=("${SCRAPER[@]}" --mosaic-only --machine "$label" --scan-id "$random_id") - else - run_cmd=("${SCRAPER[@]}" --machine "$label" --scan-id "$random_id") - fi - if [[ -n "${DRY_RUN:-}" ]]; then - run_cmd+=(--dry-run) - fi - if "${run_cmd[@]}"; then - log " status: OK — finished this machine (exit 0)" - PROCESSED=$((PROCESSED + 1)) - else - rc=$? - log " status: FAILED — scraper exit code $rc (stopping; fix or remove this machine and re-run)" - exit "$rc" - fi - log "────────────────────────────────────────" -done < "$MACHINES_FILE" - -log "done. summary: $PROCESSED machine(s) with sampled scan download completed, $SKIPPED skipped, $IDX line(s) processed out of $TOTAL_MACHINES in file." -exit 0 diff --git a/scripts/scan_progress_report.py b/scripts/scan_progress_report.py new file mode 100644 index 0000000..7d07ca3 --- /dev/null +++ b/scripts/scan_progress_report.py @@ -0,0 +1,218 @@ +#!/usr/bin/env python3 +""" +Report metadata-scan progress and projected completion times for all machines. + +Usage: + python scripts/scan_progress_report.py [--archive ARCHIVE_DIR] [--recent N] [--mermaid] [--rate-chart] + +Options: + --archive DIR Path to archives directory (default: archives) + --recent N Number of recent files used to compute current rate (default: 500) + --mermaid Also print a Mermaid Gantt chart + --rate-chart Also print a Mermaid XY chart of s/scan rate by hour +""" + +import argparse +import glob +import os +import re +import sys +from collections import defaultdict +from datetime import datetime, timedelta +from pathlib import Path + + +# Canonical machine order and total scan counts (from README inventory, April 2026) +MACHINES = [ + ("BW1-4 [AMR-15]", 6121), + ("BW1-6 [AMR-19]", 18198), + ("BW1-7 [AMR-18]", 430), + ("BW2-8 [AMR-25]", 8191), + ("BW2-10 [AMR-22]", 16537), + ("BW2-11 [AMR-23]", 26763), + ("BW2-13 [AMR-24]", 13537), + ("BW3-16 [AMR-16]", 7325), + ("BW3-17 [AMR-20]", 471), + ("BW3-19 [AMR-21]", 15186), + ("BW3-20 [AMR-26]", 23052), + ("BW3-21 [AMR-17]", 10115), +] +TOTAL_SCANS = sum(t for _, t in MACHINES) + + +def dir_name(label: str) -> str: + return re.sub(r"[^\w\-.]", "_", label).strip("_") + + +def get_timestamps(machine_dir: Path) -> list[float]: + files = glob.glob(str(machine_dir / "**" / "metadata.json"), recursive=True) + return sorted(os.path.getmtime(f) for f in files) + + +def print_rate_chart(all_timestamps: list[float]) -> None: + """Print a Mermaid xychart-beta of average s/scan per hour.""" + # One avg rate per hour + bins: dict[str, list[float]] = defaultdict(list) + start_hour: datetime | None = None + for i in range(1, len(all_timestamps)): + gap = all_timestamps[i] - all_timestamps[i - 1] + if gap < 300: # ignore inter-machine gaps + dt = datetime.fromtimestamp(all_timestamps[i]) + hour_key = dt.strftime("%m-%d %Hh") + bins[hour_key].append(gap) + if start_hour is None: + start_hour = dt.replace(minute=0, second=0, microsecond=0) + + # Drop the last (partial) hour + hours = sorted(bins.keys()) + if hours: + hours = hours[:-1] + + if not hours or start_hour is None: + print("(not enough data for rate chart)") + return + + values = [f"{sum(bins[h])/len(bins[h]):.2f}" for h in hours] + y_max = max(float(v) for v in values) + y_ceil = int(y_max) + 3 + n = len(hours) + + # Numeric x-axis: Mermaid auto-picks readable tick positions + start_label = start_hour.strftime("%b %d %H:%M") + print("```mermaid") + print("xychart-beta") + print(f' title "Metadata scan rate (s/scan) — hourly, starting {start_label}"') + print(f' x-axis "Hours elapsed" 0 --> {n}') + print(f' y-axis "s / scan" 0 --> {y_ceil}') + print(f" line [{', '.join(values)}]") + print("```") + + +def fmt_dt(dt: datetime) -> str: + return dt.strftime("%a %b %d %H:%M") + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--archive", default="archives", help="Archives directory") + parser.add_argument("--recent", type=int, default=500, + help="Files used to compute recent rate (default: 500)") + parser.add_argument("--mermaid", action="store_true", help="Print Mermaid Gantt chart") + parser.add_argument("--rate-chart", action="store_true", help="Print Mermaid XY rate-over-time chart") + args = parser.parse_args() + + archive = Path(args.archive) + if not archive.is_dir(): + sys.exit(f"Archive directory not found: {archive}") + + now = datetime.now() + + # --- Gather per-machine data --- + machine_data = [] # (label, total, done, first_ts, last_ts) + all_timestamps: list[float] = [] + + for label, total in MACHINES: + mdir = archive / dir_name(label) + if mdir.is_dir(): + times = get_timestamps(mdir) + else: + times = [] + done = len(times) + first_ts = datetime.fromtimestamp(times[0]) if times else None + last_ts = datetime.fromtimestamp(times[-1]) if times else None + machine_data.append((label, total, done, first_ts, last_ts, times)) + all_timestamps.extend(times) + + all_timestamps.sort() + total_done = sum(d for _, _, d, *_ in machine_data) + + # --- Rate calculation --- + recent_times = all_timestamps[-args.recent:] if len(all_timestamps) >= 2 else all_timestamps + if len(recent_times) >= 2: + recent_rate = (recent_times[-1] - recent_times[0]) / len(recent_times) + else: + recent_rate = None + + if len(all_timestamps) >= 2: + overall_rate = (all_timestamps[-1] - all_timestamps[0]) / len(all_timestamps) + else: + overall_rate = None + + rate = recent_rate or overall_rate or 5.0 # fallback + + # --- Print timetable --- + print(f"Metadata scan progress — {now.strftime('%Y-%m-%d %H:%M')}") + print(f"Overall rate : {overall_rate:.2f} s/scan" if overall_rate else "Overall rate : n/a") + print(f"Recent rate : {recent_rate:.2f} s/scan (last {args.recent} files)" if recent_rate else "Recent rate : n/a") + print(f"Rate used : {rate:.2f} s/scan") + print(f"Done : {total_done:,} / {TOTAL_SCANS:,} ({100*total_done/TOTAL_SCANS:.1f}%)") + print() + print(f"{'Machine':<20} {'Total':>7} {'Done':>7} {'Pct':>6} {'Completion'}") + print("-" * 68) + + cursor = now + gantt_rows: list[tuple[str, datetime, datetime, str]] = [] # label, start, end, status + + for label, total, done, first_ts, last_ts, times in machine_data: + pct = 100 * done / total if total else 0 + + complete = done >= total or (done > 0 and done / total >= 0.999) + + if done == 0: + # Not started yet + start = cursor + finish = cursor + timedelta(seconds=total * rate) + status = "pending" + print(f"{label:<20} {total:>7,} {'—':>7} {'—':>5} {fmt_dt(finish)}") + elif complete: + # Complete — use actual timestamps + start = first_ts + finish = last_ts + status = "done" + print(f"{label:<20} {total:>7,} {done:>7,} {pct:>5.1f}% complete") + else: + # In progress + remaining = total - done + start = first_ts + finish = cursor + timedelta(seconds=remaining * rate) + status = "active" + print(f"{label:<20} {total:>7,} {done:>7,} {pct:>5.1f}% {fmt_dt(finish)} ← in progress") + + gantt_rows.append((label, start or cursor, finish, status)) + if status != "done": + cursor = finish + + print("-" * 68) + print(f"{'All done':<20} {TOTAL_SCANS:>7,} {total_done:>7,} {100*total_done/TOTAL_SCANS:>5.1f}% {fmt_dt(cursor)}") + + # --- Mermaid rate chart --- + if args.rate_chart: + print() + print_rate_chart(all_timestamps) + + # --- Mermaid Gantt --- + if args.mermaid: + print() + print("```mermaid") + print("gantt") + print(" title Metadata scan progress — all 12 machines") + print(" dateFormat YYYY-MM-DD HH:mm") + print(" axisFormat %b %d") + print() + + section = None + for label, start, finish, status in gantt_rows: + prefix = label.split()[0][:3] # BW1, BW2, BW3 + if prefix != section: + section = prefix + print(f" section {section}") + safe = label.replace("[", "").replace("]", "").replace(" ", "-") + tag = f"done, " if status == "done" else ("active, " if status == "active" else "") + s = start.strftime("%Y-%m-%d %H:%M") + e = finish.strftime("%Y-%m-%d %H:%M") + print(f" {label} :{tag}{safe}, {s}, {e}") + print("```") + + +if __name__ == "__main__": + main() diff --git a/spruce/orchestrator.py b/spruce/orchestrator.py index 369f2ce..067f533 100644 --- a/spruce/orchestrator.py +++ b/spruce/orchestrator.py @@ -241,6 +241,7 @@ def process_scan( mosaic_only: bool, metadata_only: bool = False, max_tiles: int | None = None, + scans_csv_existing_ids: set[int] | None = None, ) -> RunStats: """ Process one scan: fetch metadata, download mosaic and (optionally) tiles. @@ -379,9 +380,16 @@ def process_scan( mds, mer, mco, mcl = "", "", "", "" # Write scan-level CSV row only if this scan hasn't been recorded before. - if mosaic_already_done and not metadata_only: + # Skip if: (1) mosaic URL already in .progress.json, or (2) scan already + # has a non-pending row in scans.csv from a prior run. + already_recorded = (mosaic_already_done and not metadata_only) or ( + not metadata_only + and scans_csv_existing_ids is not None + and scan_id in scans_csv_existing_ids + ) + if already_recorded: log.debug( - "[%s] Scan %d: already in scans.csv (mosaic was previously downloaded), skipping CSV row.", + "[%s] Scan %d: already in scans.csv, skipping CSV row.", machine["label"], scan_id, ) @@ -494,7 +502,20 @@ def scrape_machine( ) -> RunStats: """Login, fetch scans, and download all content for one machine.""" sess = MachineSession(machine, config) - if not sess.login(): + login_ok = False + for attempt in range(1, 4): + if sess.login(): + login_ok = True + break + if attempt < 3: + log.warning( + "[%s] Login failed (attempt %d/3) — retrying in 10s.", + machine["label"], + attempt, + ) + time.sleep(10) + if not login_ok: + log.error("[%s] Login failed after 3 attempts — skipping machine.", machine["label"]) return RunStats() if scan_id_filter is not None: @@ -508,8 +529,33 @@ def scrape_machine( log.warning("[%s] No scans found.", machine["label"]) return RunStats() + # Build a set of scan_ids already fully processed in a prior run so we can + # skip them entirely (no metadata fetch, no mosaic request). + # Only scans with a definitive non-pending status count; skipped_metadata_only + # rows still need to be processed in mosaic mode. + PENDING_STATUSES = {"skipped_metadata_only", ""} + existing_ids: set[int] = set() + if not metadata_only and scans_csv._fh.name: + existing_path = Path(scans_csv._fh.name) + if existing_path.exists(): + import csv as _csv + with open(existing_path, newline="", encoding="utf-8") as _f: + for _row in _csv.DictReader(_f): + if _row.get("machine") == machine["label"]: + if _row.get("mosaic_download_status", "") not in PENDING_STATUSES: + existing_ids.add(int(_row["scan_id"])) + stats = RunStats() for scan in scans: + # Skip scans already fully processed in a prior run — avoids redundant + # metadata fetches and mosaic requests for known-failed / known-done scans. + if not metadata_only and scan["scan_id"] in existing_ids: + log.debug( + "[%s] Scan %d: already processed, skipping.", + machine["label"], + scan["scan_id"], + ) + continue stats.merge(process_scan( sess=sess, scan=scan, @@ -523,5 +569,6 @@ def scrape_machine( mosaic_only=mosaic_only, metadata_only=metadata_only, max_tiles=max_tiles, + scans_csv_existing_ids=existing_ids, )) return stats diff --git a/spruce/progress.py b/spruce/progress.py index 4d7fa67..997b884 100644 --- a/spruce/progress.py +++ b/spruce/progress.py @@ -5,6 +5,7 @@ Progress tracking (JSON) and CSV writing. import csv import json import logging +from datetime import datetime, timezone from pathlib import Path from typing import Iterator @@ -27,6 +28,7 @@ class ProgressTracker: def __init__(self, path: Path) -> None: self.path = path self._done: set[str] = set() + self.started_at: str = datetime.now(timezone.utc).isoformat() self._load() def _load(self) -> None: @@ -34,6 +36,8 @@ class ProgressTracker: try: data = json.loads(self.path.read_text()) self._done = set(data.get("completed_urls", [])) + if "started_at" in data: + self.started_at = data["started_at"] log.info("Resuming: %d URLs already downloaded.", len(self._done)) except Exception: log.warning("Could not read progress file; starting fresh.") @@ -59,7 +63,10 @@ class ProgressTracker: self.path.parent.mkdir(parents=True, exist_ok=True) tmp = self.path.with_suffix(".json.tmp") tmp.write_text( - json.dumps({"completed_urls": sorted(self._done)}, indent=2) + json.dumps( + {"started_at": self.started_at, "completed_urls": sorted(self._done)}, + indent=2, + ) ) tmp.replace(self.path) # atomic on POSIX; avoids corrupt JSON on crash diff --git a/spruce/session.py b/spruce/session.py index 52fc50f..ba2444d 100644 --- a/spruce/session.py +++ b/spruce/session.py @@ -14,6 +14,7 @@ from bs4 import BeautifulSoup from spruce.download_result import ( OK, + PERMANENT_MISSING, UNKNOWN, DownloadResult, classify_http_error, @@ -263,6 +264,10 @@ class MachineSession: and exc.response is not None ): sc = exc.response.status_code + cl = classify_http_error(sc, exc) + if cl == PERMANENT_MISSING: + # 404/410 will never succeed — don't waste time retrying. + return DownloadResult(0, sc, str(exc), cl) if attempt < retries: log.warning( "Attempt %d/%d failed %s: %s — retrying in %.0fs", @@ -281,7 +286,6 @@ class MachineSession: url, exc, ) - cl = classify_http_error(sc, exc) return DownloadResult(0, sc, str(exc), cl) return DownloadResult(0, None, "download_file: exhausted", UNKNOWN)