diff --git a/.gitignore b/.gitignore index cfcb0be..28c541b 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,6 @@ __pycache__/ .DS_Store explore_dumps/ .venv/ +scripts/sync_to_nas.sh +backup/ +.claude/ diff --git a/docs/mosaic_reconstruction_sample_images/146368_mosaic.jpg b/docs/mosaic_reconstruction_sample_images/146368_mosaic.jpg new file mode 100644 index 0000000..9df60c4 Binary files /dev/null and b/docs/mosaic_reconstruction_sample_images/146368_mosaic.jpg differ diff --git a/docs/mosaic_reconstruction_sample_images/146368_mosaic_reconstructed.jpg b/docs/mosaic_reconstruction_sample_images/146368_mosaic_reconstructed.jpg new file mode 100644 index 0000000..1fc6e67 Binary files /dev/null and b/docs/mosaic_reconstruction_sample_images/146368_mosaic_reconstructed.jpg differ diff --git a/docs/mosaic_reconstruction_sample_images/146368_tile_s1.jpg b/docs/mosaic_reconstruction_sample_images/146368_tile_s1.jpg new file mode 100644 index 0000000..0d4a87b Binary files /dev/null and b/docs/mosaic_reconstruction_sample_images/146368_tile_s1.jpg differ diff --git a/docs/mosaic_reconstruction_sample_images/156875_mosaic.jpg b/docs/mosaic_reconstruction_sample_images/156875_mosaic.jpg new file mode 100644 index 0000000..bbc559b Binary files /dev/null and b/docs/mosaic_reconstruction_sample_images/156875_mosaic.jpg differ diff --git a/docs/mosaic_reconstruction_sample_images/156875_mosaic_reconstructed.jpg b/docs/mosaic_reconstruction_sample_images/156875_mosaic_reconstructed.jpg new file mode 100644 index 0000000..aebcd72 Binary files /dev/null and b/docs/mosaic_reconstruction_sample_images/156875_mosaic_reconstructed.jpg differ diff --git a/docs/mosaic_reconstruction_sample_images/156875_tile_s1.jpg b/docs/mosaic_reconstruction_sample_images/156875_tile_s1.jpg new file mode 100644 index 0000000..b5a18e6 Binary files /dev/null and b/docs/mosaic_reconstruction_sample_images/156875_tile_s1.jpg differ diff --git a/docs/mosaic_reconstruction_sample_images/156957_mosaic.jpg 
b/docs/mosaic_reconstruction_sample_images/156957_mosaic.jpg new file mode 100644 index 0000000..87694a9 Binary files /dev/null and b/docs/mosaic_reconstruction_sample_images/156957_mosaic.jpg differ diff --git a/docs/mosaic_reconstruction_sample_images/156957_mosaic_reconstructed.jpg b/docs/mosaic_reconstruction_sample_images/156957_mosaic_reconstructed.jpg new file mode 100644 index 0000000..dda903a Binary files /dev/null and b/docs/mosaic_reconstruction_sample_images/156957_mosaic_reconstructed.jpg differ diff --git a/docs/mosaic_reconstruction_sample_images/156957_tile_s1.jpg b/docs/mosaic_reconstruction_sample_images/156957_tile_s1.jpg new file mode 100644 index 0000000..86ff192 Binary files /dev/null and b/docs/mosaic_reconstruction_sample_images/156957_tile_s1.jpg differ diff --git a/docs/mosaic_reconstruction_sample_images/160022_mosaic.jpg b/docs/mosaic_reconstruction_sample_images/160022_mosaic.jpg new file mode 100644 index 0000000..da61577 Binary files /dev/null and b/docs/mosaic_reconstruction_sample_images/160022_mosaic.jpg differ diff --git a/docs/mosaic_reconstruction_sample_images/160022_mosaic_reconstructed.jpg b/docs/mosaic_reconstruction_sample_images/160022_mosaic_reconstructed.jpg new file mode 100644 index 0000000..a020167 Binary files /dev/null and b/docs/mosaic_reconstruction_sample_images/160022_mosaic_reconstructed.jpg differ diff --git a/docs/mosaic_reconstruction_sample_images/160022_tile_s1.jpg b/docs/mosaic_reconstruction_sample_images/160022_tile_s1.jpg new file mode 100644 index 0000000..fdce342 Binary files /dev/null and b/docs/mosaic_reconstruction_sample_images/160022_tile_s1.jpg differ diff --git a/run_metadata_scan.sh b/run_metadata_scan.sh new file mode 100644 index 0000000..bbfe1b0 --- /dev/null +++ b/run_metadata_scan.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +VENV="/tmp/spruce_venv" + +if [[ ! 
# Website-sourced metadata columns, in output order. Every other column found in
# scans.csv is dropped from the per-machine exports.
METADATA_COLUMNS = [
    "machine",
    "machine_id",
    "scan_id",
    "name",
    "scan_time",
    "start_x",
    "start_y",
    "end_x",
    "end_y",
    "dx",
    "dy",
    "nx",
    "ny",
    "total_tiles",
    "scan_lines",
    "scan_mode",
    "start_datetime",
    "end_datetime",
    "status",
    "user",
    "disk_space_mb",
]

# Characters that are path separators or otherwise illegal in filenames on
# common filesystems; they are replaced so a label can never escape output_dir.
_UNSAFE_FILENAME_CHARS = frozenset('/\\:*?"<>|')

# Filename stem used when a machine label is blank or sanitizes to nothing.
_UNKNOWN_MACHINE = "unknown"


def sanitize_machine_label(label: str) -> str:
    """Turn a machine label into a safe filename stem.

    Square brackets are removed and spaces become underscores, so
    ``"BW1-4 [AMR-15]"`` becomes ``"BW1-4_AMR-15"``. In addition, path
    separators, other filename-hostile characters, and any remaining
    whitespace/control characters are replaced with ``_``. A label that is
    empty (or ``None``) or sanitizes to nothing yields ``"unknown"`` rather
    than an empty stem.
    """
    cleaned = (label or "").replace("[", "").replace("]", "").replace(" ", "_")
    cleaned = "".join(
        "_" if (ch in _UNSAFE_FILENAME_CHARS or ch.isspace() or ord(ch) < 32) else ch
        for ch in cleaned
    )
    return cleaned.strip("_") or _UNKNOWN_MACHINE


def parse_args(argv=None) -> argparse.Namespace:
    """Parse command-line options.

    Args:
        argv: Argument list to parse; ``None`` (the default) means
            ``sys.argv[1:]``.

    Returns:
        Namespace with ``input`` and ``output_dir`` attributes.
    """
    p = argparse.ArgumentParser(description="Split scans.csv into per-machine metadata CSVs.")
    p.add_argument(
        "--input",
        default="archives/scans.csv",
        metavar="FILE",
        help="Path to scans.csv (default: archives/scans.csv)",
    )
    p.add_argument(
        "--output-dir",
        default="archives/by_machine",
        metavar="DIR",
        help="Directory for output CSVs (default: archives/by_machine)",
    )
    return p.parse_args(argv)


def main(argv=None) -> None:
    """Split the combined scans CSV into one metadata CSV per machine.

    Rows are grouped by the sanitized ``machine`` label, and each group is
    written to ``<output-dir>/<stem>_scans_metadata.csv`` containing only
    ``METADATA_COLUMNS``.

    Args:
        argv: Argument list; ``None`` (the default) means ``sys.argv[1:]``.

    Raises:
        SystemExit: If the input file is missing, empty, or lacks any of the
            expected metadata columns.
    """
    args = parse_args(argv)
    input_path = Path(args.input)
    output_dir = Path(args.output_dir)

    if not input_path.is_file():
        sys.exit(f"Input file not found: {input_path}")

    # Group by output stem (not raw label) so two labels that sanitize to the
    # same filename are merged into one file instead of overwriting each other.
    rows_by_stem: dict[str, list[dict]] = defaultdict(list)
    labels_by_stem: dict[str, set] = defaultdict(set)

    # utf-8-sig reads plain UTF-8 and also tolerates a BOM, which would
    # otherwise turn the first header into "\ufeffmachine".
    with input_path.open(newline="", encoding="utf-8-sig") as fh:
        reader = csv.DictReader(fh)
        if reader.fieldnames is None:
            sys.exit(f"{input_path} appears to be empty.")

        missing = [c for c in METADATA_COLUMNS if c not in reader.fieldnames]
        if missing:
            sys.exit(f"Expected columns not found in {input_path}: {missing}")

        for row in reader:
            stem = sanitize_machine_label(row["machine"])
            rows_by_stem[stem].append(row)
            labels_by_stem[stem].add(row["machine"])

    output_dir.mkdir(parents=True, exist_ok=True)

    for stem in sorted(rows_by_stem):
        rows = rows_by_stem[stem]
        if len(labels_by_stem[stem]) > 1:
            merged = sorted(str(lbl) for lbl in labels_by_stem[stem])
            print(f"  note: labels {merged} share output file stem {stem!r}", file=sys.stderr)
        out_path = output_dir / f"{stem}_scans_metadata.csv"
        with out_path.open("w", newline="", encoding="utf-8") as fh:
            # extrasaction="ignore" drops every non-metadata column from the row dicts.
            writer = csv.DictWriter(fh, fieldnames=METADATA_COLUMNS, extrasaction="ignore")
            writer.writeheader()
            writer.writerows(rows)
        print(f"  {out_path}  ({len(rows)} rows)")

    total = sum(len(r) for r in rows_by_stem.values())
    print(f"\n{len(rows_by_stem)} machine(s), {total} total rows → {output_dir}/")


if __name__ == "__main__":
    main()
# Canonical machine order and total scan counts (from README inventory, April 2026)
MACHINES = [
    ("BW1-4 [AMR-15]", 6121),
    ("BW1-6 [AMR-19]", 18198),
    ("BW1-7 [AMR-18]", 430),
    ("BW2-8 [AMR-25]", 8191),
    ("BW2-10 [AMR-22]", 16537),
    ("BW2-11 [AMR-23]", 26763),
    ("BW2-13 [AMR-24]", 13537),
    ("BW3-16 [AMR-16]", 7325),
    ("BW3-17 [AMR-20]", 471),
    ("BW3-19 [AMR-21]", 15186),
    ("BW3-20 [AMR-26]", 23052),
    ("BW3-21 [AMR-17]", 10115),
]
TOTAL_SCANS = sum(total for _, total in MACHINES)

# Gaps of at least this many seconds between consecutive files are treated as
# pauses (e.g. switching machines) and excluded from the hourly rate chart.
_MAX_SCAN_GAP_S = 300

# Seconds per scan assumed when no rate can be measured yet.
_FALLBACK_RATE_S = 5.0

# A machine whose done/total ratio reaches this fraction is reported complete.
_COMPLETE_FRACTION = 0.999


def dir_name(label: str) -> str:
    """Map a machine label to its archive directory name.

    Every character other than word characters, ``-`` and ``.`` becomes
    ``_``; leading/trailing underscores are stripped.
    """
    return re.sub(r"[^\w\-.]", "_", label).strip("_")


def get_timestamps(machine_dir: Path) -> list[float]:
    """Return the sorted mtimes of every ``metadata.json`` under *machine_dir*.

    The directory part of the pattern is glob-escaped so that special
    characters in the archive path (``[``, ``]``, ``*``, ``?``) are matched
    literally instead of being interpreted as wildcards.
    """
    pattern = os.path.join(glob.escape(str(machine_dir)), "**", "metadata.json")
    stamps = []
    for path in glob.glob(pattern, recursive=True):
        try:
            stamps.append(os.path.getmtime(path))
        except OSError:
            # The file disappeared or became unreadable between the glob and
            # the stat (the scraper may be running concurrently); skip it.
            continue
    return sorted(stamps)


def _seconds_per_scan(stamps):
    """Return the mean interval between consecutive timestamps, or None.

    *stamps* must be sorted ascending. N timestamps span N-1 intervals, so
    fewer than two timestamps give no measurable rate.
    """
    if len(stamps) < 2:
        return None
    return (stamps[-1] - stamps[0]) / (len(stamps) - 1)


def print_rate_chart(all_timestamps: list[float]) -> None:
    """Print a Mermaid xychart-beta of average s/scan per hour.

    Consecutive-file gaps shorter than ``_MAX_SCAN_GAP_S`` are averaged per
    local clock hour. The most recent hour is dropped because it is assumed
    to be partial. Prints a placeholder message when fewer than two hours of
    data are available.
    """
    # Key bins by the hour as a datetime (not a formatted string) so they sort
    # chronologically even across a year boundary.
    gaps_by_hour = defaultdict(list)
    ordered = sorted(all_timestamps)
    for prev, cur in zip(ordered, ordered[1:]):
        gap = cur - prev
        if gap < _MAX_SCAN_GAP_S:  # ignore inter-machine gaps
            hour = datetime.fromtimestamp(cur).replace(minute=0, second=0, microsecond=0)
            gaps_by_hour[hour].append(gap)

    # Drop the last (partial) hour
    hours = sorted(gaps_by_hour)[:-1]
    if not hours:
        print("(not enough data for rate chart)")
        return

    averages = [sum(gaps_by_hour[h]) / len(gaps_by_hour[h]) for h in hours]
    values = [f"{avg:.2f}" for avg in averages]
    y_ceil = int(max(float(v) for v in values)) + 3
    n = len(hours)

    # Numeric x-axis: Mermaid auto-picks readable tick positions.
    # NOTE(review): hours with no qualifying gaps have no bin, so the x-axis
    # counts hours *with data*, not wall-clock hours, when the scan paused.
    start_label = hours[0].strftime("%b %d %H:%M")
    print("```mermaid")
    print("xychart-beta")
    print(f'    title "Metadata scan rate (s/scan) — hourly, starting {start_label}"')
    print(f'    x-axis "Hours elapsed" 0 --> {n}')
    print(f'    y-axis "s / scan" 0 --> {y_ceil}')
    print(f"    line [{', '.join(values)}]")
    print("```")


def fmt_dt(dt: datetime) -> str:
    """Format *dt* like ``Mon Jan 05 14:30`` for the timetable."""
    return dt.strftime("%a %b %d %H:%M")


def main(argv=None) -> None:
    """Print the scan-progress timetable and optional Mermaid charts.

    Args:
        argv: Argument list; ``None`` (the default) means ``sys.argv[1:]``.

    Raises:
        SystemExit: If the archive directory does not exist.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--archive", default="archives", help="Archives directory")
    parser.add_argument("--recent", type=int, default=500,
                        help="Files used to compute recent rate (default: 500)")
    parser.add_argument("--mermaid", action="store_true", help="Print Mermaid Gantt chart")
    parser.add_argument("--rate-chart", action="store_true", help="Print Mermaid XY rate-over-time chart")
    args = parser.parse_args(argv)

    archive = Path(args.archive)
    if not archive.is_dir():
        sys.exit(f"Archive directory not found: {archive}")

    now = datetime.now()

    # --- Gather per-machine data ---
    machine_data = []  # (label, total, done, first_ts, last_ts)
    all_timestamps: list[float] = []

    for label, total in MACHINES:
        mdir = archive / dir_name(label)
        times = get_timestamps(mdir) if mdir.is_dir() else []
        first_ts = datetime.fromtimestamp(times[0]) if times else None
        last_ts = datetime.fromtimestamp(times[-1]) if times else None
        machine_data.append((label, total, len(times), first_ts, last_ts))
        all_timestamps.extend(times)

    all_timestamps.sort()
    total_done = len(all_timestamps)

    # --- Rate calculation ---
    # A rate needs at least two files; clamping also keeps a zero or negative
    # --recent from slicing the list in surprising ways.
    window = max(2, args.recent)
    recent_times = all_timestamps[-window:]
    recent_rate = _seconds_per_scan(recent_times)
    overall_rate = _seconds_per_scan(all_timestamps)

    # A measured rate of exactly 0 (all files share one mtime) is useless for
    # projection, so it falls through to the next candidate like a missing rate.
    rate = recent_rate or overall_rate or _FALLBACK_RATE_S

    # --- Print timetable ---
    print(f"Metadata scan progress — {now.strftime('%Y-%m-%d %H:%M')}")
    print(f"Overall rate : {overall_rate:.2f} s/scan" if overall_rate else "Overall rate : n/a")
    print(f"Recent rate  : {recent_rate:.2f} s/scan (last {len(recent_times)} files)"
          if recent_rate else "Recent rate  : n/a")
    print(f"Rate used    : {rate:.2f} s/scan")
    print(f"Done         : {total_done:,} / {TOTAL_SCANS:,} ({100*total_done/TOTAL_SCANS:.1f}%)")
    print()
    print(f"{'Machine':<20}  {'Total':>7}  {'Done':>7}  {'Pct':>6}  {'Completion'}")
    print("-" * 68)

    # Projection cursor: machines are assumed to be scanned one after another,
    # so each unfinished machine starts where the previous one is projected to end.
    cursor = now
    gantt_rows: list[tuple[str, datetime, datetime, str]] = []  # label, start, end, status

    for label, total, done, first_ts, last_ts in machine_data:
        pct = 100 * done / total if total else 0
        complete = done >= total or (done > 0 and done / total >= _COMPLETE_FRACTION)

        if done == 0:
            # Not started yet
            start = cursor
            finish = cursor + timedelta(seconds=total * rate)
            status = "pending"
            print(f"{label:<20}  {total:>7,}  {'—':>7}  {'—':>5}   {fmt_dt(finish)}")
        elif complete:
            # Complete — use actual timestamps
            start = first_ts
            finish = last_ts
            status = "done"
            print(f"{label:<20}  {total:>7,}  {done:>7,}  {pct:>5.1f}%  complete")
        else:
            # In progress
            remaining = total - done
            start = first_ts
            finish = cursor + timedelta(seconds=remaining * rate)
            status = "active"
            print(f"{label:<20}  {total:>7,}  {done:>7,}  {pct:>5.1f}%  {fmt_dt(finish)}  ← in progress")

        gantt_rows.append((label, start or cursor, finish, status))
        if status != "done":
            cursor = finish

    print("-" * 68)
    print(f"{'All done':<20}  {TOTAL_SCANS:>7,}  {total_done:>7,}  "
          f"{100*total_done/TOTAL_SCANS:>5.1f}%  {fmt_dt(cursor)}")

    # --- Mermaid rate chart ---
    if args.rate_chart:
        print()
        print_rate_chart(all_timestamps)

    # --- Mermaid Gantt ---
    if args.mermaid:
        print()
        print("```mermaid")
        print("gantt")
        print(f"    title Metadata scan progress — all {len(MACHINES)} machines")
        print("    dateFormat YYYY-MM-DD HH:mm")
        print("    axisFormat %b %d")
        print()

        section = None
        for label, start, finish, status in gantt_rows:
            prefix = label.split()[0][:3]  # BW1, BW2, BW3
            if prefix != section:
                section = prefix
                print(f"    section {section}")
            # Task id: the label without brackets or spaces.
            safe = label.replace("[", "").replace("]", "").replace(" ", "-")
            tag = f"{status}, " if status in ("done", "active") else ""
            s = start.strftime("%Y-%m-%d %H:%M")
            e = finish.strftime("%Y-%m-%d %H:%M")
            print(f"    {label} :{tag}{safe}, {s}, {e}")
        print("```")


if __name__ == "__main__":
    main()