Merge main into testing/sample-runs
@@ -7,3 +7,6 @@ __pycache__/
|
|||||||
.DS_Store
|
.DS_Store
|
||||||
explore_dumps/
|
explore_dumps/
|
||||||
.venv/
|
.venv/
|
||||||
|
scripts/sync_to_nas.sh
|
||||||
|
backup/
|
||||||
|
.claude/
|
||||||
|
|||||||
@@ -97,10 +97,8 @@ python scraper.py --machine "BW3-20 [AMR-26]" --mosaic-only
|
|||||||
# Download mosaics for all machines
|
# Download mosaics for all machines
|
||||||
python scraper.py --mosaic-only
|
python scraper.py --mosaic-only
|
||||||
|
|
||||||
# One random completed scan per machine: mosaic + all tiles (from machines.txt; uses --list-scans + --scan-id)
|
# One random completed scan per machine (helper script): check out branch `testing/sample-runs`,
|
||||||
# MOSAIC_ONLY=1 ./scripts/sample_random_scans.sh machines.txt # optional: mosaics only, no tiles
|
# then see `scripts/sample_random_scans.sh` and `docs/sample_random_scans_run_progress.md`.
|
||||||
# cp scripts/machines.example.txt machines.txt # then edit: one label per line
|
|
||||||
# ./scripts/sample_random_scans.sh machines.txt
|
|
||||||
|
|
||||||
# Download all tiles for a specific scan
|
# Download all tiles for a specific scan
|
||||||
python scraper.py --machine "BW3-20 [AMR-26]" --scan-id 158374 --workers 4
|
python scraper.py --machine "BW3-20 [AMR-26]" --scan-id 158374 --workers 4
|
||||||
|
|||||||
|
After Width: | Height: | Size: 53 KiB |
|
After Width: | Height: | Size: 13 MiB |
|
After Width: | Height: | Size: 41 KiB |
|
After Width: | Height: | Size: 37 KiB |
|
After Width: | Height: | Size: 7.6 MiB |
|
After Width: | Height: | Size: 50 KiB |
|
After Width: | Height: | Size: 58 KiB |
|
After Width: | Height: | Size: 15 MiB |
|
After Width: | Height: | Size: 52 KiB |
|
After Width: | Height: | Size: 218 KiB |
|
After Width: | Height: | Size: 50 MiB |
|
After Width: | Height: | Size: 62 KiB |
@@ -0,0 +1,15 @@
|
|||||||
|
#!/usr/bin/env bash
# Run scraper.py in --metadata-only mode from a dedicated virtualenv.
#
# The venv is created (and requirements.txt installed into it) on first use.
# Any extra command-line arguments are forwarded to scraper.py.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
VENV="/tmp/spruce_venv"
# Marker written only after the requirements installed successfully.
# Checking for bin/python alone is not enough: if `pip install` fails,
# `set -e` aborts the script but the interpreter already exists, so every
# later run would skip setup and start the scraper without its dependencies.
VENV_READY="$VENV/.requirements_installed"

if [[ ! -x "$VENV/bin/python" || ! -f "$VENV_READY" ]]; then
    echo "Setting up venv at $VENV..."
    python3 -m venv "$VENV"
    "$VENV/bin/python" -m ensurepip --upgrade
    "$VENV/bin/pip" install -q -r "$SCRIPT_DIR/requirements.txt"
    touch "$VENV_READY"
fi

echo "Starting metadata-only scan of all machines..."
"$VENV/bin/python" "$SCRIPT_DIR/scraper.py" --metadata-only "$@"
|
||||||
@@ -0,0 +1,105 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Split scans.csv into per-machine metadata CSVs.
|
||||||
|
|
||||||
|
Reads the combined scans.csv produced by the scraper and writes one CSV per
|
||||||
|
machine containing only the website-sourced metadata columns (no mosaic paths,
|
||||||
|
download status, or error fields).
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python scripts/export_machine_metadata.py
|
||||||
|
python scripts/export_machine_metadata.py --input archives/scans.csv --output-dir archives/by_machine
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import csv
|
||||||
|
import sys
|
||||||
|
from collections import defaultdict
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Columns copied into each per-machine CSV, in output order. Any other
# column present in scans.csv is dropped when the rows are written.
METADATA_COLUMNS = (
    "machine machine_id scan_id name scan_time "
    "start_x start_y end_x end_y "
    "dx dy nx ny "
    "total_tiles scan_lines scan_mode "
    "start_datetime end_datetime "
    "status user disk_space_mb"
).split()
|
||||||
|
|
||||||
|
|
||||||
|
def sanitize_machine_label(label: str) -> str:
    """Turn a machine label into a string that is safe to use in a filename.

    Square brackets are removed and spaces become underscores, so
    ``"BW3-20 [AMR-26]"`` yields ``"BW3-20_AMR-26"``. Path separators, other
    characters that are invalid in filenames on common platforms, and control
    characters are replaced with ``_`` so a label can never introduce extra
    path components. Leading and trailing underscores are stripped.

    Args:
        label: Machine label as it appears in the ``machine`` column.

    Returns:
        The sanitized label (may be empty if nothing usable remains).
    """
    unsafe = set('/\\:*?"<>|')
    cleaned = label.replace("[", "").replace("]", "").replace(" ", "_")
    cleaned = "".join(
        "_" if ch in unsafe or ord(ch) < 32 else ch for ch in cleaned
    )
    return cleaned.strip("_")
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args() -> argparse.Namespace:
    """Parse the command line and return the ``--input`` / ``--output-dir`` options."""
    parser = argparse.ArgumentParser(
        description="Split scans.csv into per-machine metadata CSVs."
    )
    # (flag, default, metavar, help) for each path option.
    option_specs = (
        (
            "--input",
            "archives/scans.csv",
            "FILE",
            "Path to scans.csv (default: archives/scans.csv)",
        ),
        (
            "--output-dir",
            "archives/by_machine",
            "DIR",
            "Directory for output CSVs (default: archives/by_machine)",
        ),
    )
    for flag, default, metavar, help_text in option_specs:
        parser.add_argument(flag, default=default, metavar=metavar, help=help_text)
    return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Split the combined scans CSV into one metadata CSV per machine.

    Reads the file given by ``--input``, groups its rows by the ``machine``
    column, and writes ``<sanitized label>_scans_metadata.csv`` into
    ``--output-dir`` containing only ``METADATA_COLUMNS``.

    Exits with an error message when the input is missing or empty, when a
    required column is absent, or when two machine labels would map to the
    same output file.
    """
    args = parse_args()
    input_path = Path(args.input)
    output_dir = Path(args.output_dir)

    # is_file() rather than exists(): a directory passes exists() and would
    # only fail later, inside open(), with a less helpful error.
    if not input_path.is_file():
        sys.exit(f"Input file not found: {input_path}")

    with input_path.open(newline="") as fh:
        reader = csv.DictReader(fh)
        if reader.fieldnames is None:
            sys.exit(f"{input_path} appears to be empty.")

        missing = [c for c in METADATA_COLUMNS if c not in reader.fieldnames]
        if missing:
            sys.exit(f"Expected columns not found in {input_path}: {missing}")

        rows_by_machine: dict[str, list[dict]] = defaultdict(list)
        for row in reader:
            rows_by_machine[row["machine"]].append(row)

    # Resolve every output path before writing anything: two labels that
    # sanitize to the same name would otherwise silently overwrite each other.
    label_by_path: dict[Path, str] = {}
    for machine_label in sorted(rows_by_machine):
        safe_name = sanitize_machine_label(machine_label)
        out_path = output_dir / f"{safe_name}_scans_metadata.csv"
        if out_path in label_by_path:
            sys.exit(
                f"Machine labels {label_by_path[out_path]!r} and {machine_label!r} "
                f"both map to {out_path}; refusing to overwrite."
            )
        label_by_path[out_path] = machine_label

    output_dir.mkdir(parents=True, exist_ok=True)

    for out_path, machine_label in label_by_path.items():
        rows = rows_by_machine[machine_label]
        with out_path.open("w", newline="") as fh:
            # extrasaction="ignore" drops the non-metadata columns of each row.
            writer = csv.DictWriter(fh, fieldnames=METADATA_COLUMNS, extrasaction="ignore")
            writer.writeheader()
            writer.writerows(rows)
        print(f" {out_path} ({len(rows)} rows)")

    total = sum(len(r) for r in rows_by_machine.values())
    print(f"\n{len(rows_by_machine)} machine(s), {total} total rows → {output_dir}/")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
# All RootView minirhizotron machine labels (same set as `machine_metadata` in config.example.yaml).
|
# All RootView minirhizotron machine labels (same set as `machine_metadata` in config.example.yaml).
|
||||||
# Copy to the repo root as machines.txt, or: cp scripts/machines.example.txt machines.txt
|
# Copy to the repo root as machines.txt, or: cp scripts/machines.example.txt machines.txt
|
||||||
# sample_random_scans.sh: by default one random scan per line = mosaic + tiles; use MOSAIC_ONLY=1 for mosaics only
|
# Random-sample helper `scripts/sample_random_scans.sh` lives on branch `testing/sample-runs` only.
|
||||||
BW1-4 [AMR-15]
|
BW1-4 [AMR-15]
|
||||||
BW1-6 [AMR-19]
|
BW1-6 [AMR-19]
|
||||||
BW1-7 [AMR-18]
|
BW1-7 [AMR-18]
|
||||||
|
|||||||
@@ -1,178 +0,0 @@
|
|||||||
#!/usr/bin/env bash
# For each machine label in a text file, pick one random completed scan and download
# it: by default the mosaic and all tiles (same as: --machine "…" --scan-id N).
# For mosaic only (faster, no tile downloads), set: MOSAIC_ONLY=1
#
# Usage:
#   ./scripts/sample_random_scans.sh [PATH_TO_machines.txt]
# Config path defaults to config.yaml in the repo root. Override with:
#   CONFIG=/path/to/config.yaml ./scripts/sample_random_scans.sh machines.txt
# Dry-run the download step (listing still does real HTTP to fetch scan list):
#   DRY_RUN=1 ./scripts/sample_random_scans.sh machines.txt
# Verbose / debug (extra per-step lines, scan counts from the list step):
#   DEBUG=1 ./scripts/sample_random_scans.sh machines.txt
# By default, --list-scans fetches only the first page (one HTTP request, up to
# 320 scans). To paginate the full archive for the random pick (slower when many
# scans exist), set:
#   LIST_SCANS_ALL_PAGES=1 ./scripts/sample_random_scans.sh machines.txt
#
# machines.txt: one machine label per line (same as --machine and config machine names).
# See scripts/machines.example.txt

set -euo pipefail

REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
CONFIG="${CONFIG:-$REPO_ROOT/config.yaml}"
MACHINES_FILE="${1:-$REPO_ROOT/machines.txt}"
SCRAPER=(python3 "$REPO_ROOT/scraper.py" --config "$CONFIG")

# All progress output goes to stderr.
log() { echo "[sample_random_scans] $*" >&2; }
log_debug() {
    if [[ -n "${DEBUG:-}" ]]; then
        echo "[sample_random_scans] debug: $*" >&2
    fi
}

if [[ ! -f "$MACHINES_FILE" ]]; then
    log "error: file not found: $MACHINES_FILE"
    log "Create it with one machine label per line, or: cp scripts/machines.example.txt machines.txt"
    exit 1
fi

if [[ ! -f "$CONFIG" ]]; then
    log "error: config not found: $CONFIG"
    exit 1
fi

# Non-empty, non-comment lines (same rules as the main loop)
# `|| true`: grep -c exits non-zero when the count is 0, which would otherwise
# trip `set -e` / pipefail before the friendlier error below is printed.
TOTAL_MACHINES="$(
    grep -v '^[[:space:]]*#' "$MACHINES_FILE" | grep -c -v '^[[:space:]]*$' || true
)"
if [[ -z "$TOTAL_MACHINES" || "$TOTAL_MACHINES" -eq 0 ]]; then
    log "error: no machine lines in: $MACHINES_FILE"
    exit 1
fi

log "starting repo=$REPO_ROOT"
log " config=$CONFIG"
log " machines_file=$MACHINES_FILE (${TOTAL_MACHINES} machine(s) in file)"
if [[ -n "${MOSAIC_ONLY:-}" ]]; then
    if [[ -n "${DRY_RUN:-}" ]]; then
        log " mode: MOSAIC_ONLY + DRY_RUN (mosaic only, --dry-run on download step)"
    else
        log " mode: MOSAIC_ONLY=1 (mosaics only, no tiles; use for a lighter sample)"
    fi
else
    if [[ -n "${DRY_RUN:-}" ]]; then
        log " mode: DRY_RUN (list + full scan download use --dry-run; no files written)"
    else
        log " mode: full scan — mosaic + all tiles (workers from config)"
    fi
fi
if [[ -n "${DEBUG:-}" ]]; then
    log " DEBUG=1 (extra diagnostics enabled)"
fi
if [[ -n "${LIST_SCANS_ALL_PAGES:-}" ]]; then
    log " list step: list-scans = full archive (all pages, slower)"
else
    log " list step: list-scans --list-scans-first-page-only (one page, up to 320 IDs)"
fi
log "────────────────────────────────────────"

# The embedded Python helper reads these from its environment.
export REPO_ROOT CONFIG
[[ -n "${DEBUG:-}" ]] && export DEBUG
[[ -n "${LIST_SCANS_ALL_PAGES:-}" ]] && export LIST_SCANS_ALL_PAGES

PROCESSED=0
SKIPPED=0
IDX=0

# The machine list is read on fd 3, not stdin: commands run inside the loop
# inherit stdin, and a child that reads it would swallow the remaining labels.
# `|| [[ -n ... ]]` keeps a final line that has no trailing newline.
while IFS= read -r -u 3 line || [[ -n "${line-}" ]]; do
    # trim, strip CR, skip blanks / comments
    line="${line//$'\r'/}"
    label="${line#"${line%%[![:space:]]*}"}"
    label="${label%"${label##*[![:space:]]}"}"
    [[ -z "$label" || "$label" == \#* ]] && continue

    IDX=$((IDX + 1))
    log "[$IDX/$TOTAL_MACHINES] machine: $label"
    log " status: listing scans (--list-scans) …"

    # Run `scraper.py --list-scans`, collect every output line whose first
    # field is all digits as a scan id, and print one of them at random.
    random_id="$(
        REPO_ROOT="$REPO_ROOT" CONFIG="$CONFIG" LABEL="$label" python3 - <<'PY'
import os, random, subprocess, sys

label = os.environ["LABEL"]
repo = os.environ["REPO_ROOT"]
cfg = os.environ["CONFIG"]
debug = bool(os.environ.get("DEBUG"))
full = bool(os.environ.get("LIST_SCANS_ALL_PAGES"))
scraper = os.path.join(repo, "scraper.py")
if debug:
    print(
        f"[sample_random_scans] debug: running list-scans for {label!r} "
        f"({'all pages' if full else 'first page only'})",
        file=sys.stderr,
    )
cmd = [sys.executable, scraper, "--list-scans", "--machine", label, "--config", cfg]
if not full:
    cmd.insert(3, "--list-scans-first-page-only")
out = subprocess.check_output(
    cmd,
    text=True,
    stderr=subprocess.STDOUT,
)
ids = []
for line in out.splitlines():
    line = line.rstrip()
    if not line or line.startswith("---") or "Total" in line:
        continue
    parts = line.split()
    if parts and parts[0].isdigit():
        ids.append(parts[0])
if not ids:
    print(f"no scans parsed for {label!r} — check login and output", file=sys.stderr)
    sys.exit(1)
if debug:
    print(
        f"[sample_random_scans] debug: parsed {len(ids)} scan id(s) for {label!r}",
        file=sys.stderr,
    )
print(random.choice(ids), end="")
PY
    )" || {
        log " status: SKIPPED (could not get scan list or pick id)"
        SKIPPED=$((SKIPPED + 1))
        continue
    }

    log " status: picked random scan_id=$random_id (uniform among IDs from this list step — first page by default, see start banner)"
    if [[ -n "${MOSAIC_ONLY:-}" ]]; then
        log " status: running scraper: --mosaic-only --scan-id (mosaic only) …"
    else
        log " status: running scraper: --scan-id (mosaic + tiles) …"
    fi
    if [[ -n "${DRY_RUN:-}" ]]; then
        log " status: (dry-run — no files written for this scan)"
    fi

    if [[ -n "${MOSAIC_ONLY:-}" ]]; then
        run_cmd=("${SCRAPER[@]}" --mosaic-only --machine "$label" --scan-id "$random_id")
    else
        run_cmd=("${SCRAPER[@]}" --machine "$label" --scan-id "$random_id")
    fi
    if [[ -n "${DRY_RUN:-}" ]]; then
        run_cmd+=(--dry-run)
    fi
    if "${run_cmd[@]}"; then
        log " status: OK — finished this machine (exit 0)"
        PROCESSED=$((PROCESSED + 1))
    else
        rc=$?
        log " status: FAILED — scraper exit code $rc (stopping; fix or remove this machine and re-run)"
        exit "$rc"
    fi
    log "────────────────────────────────────────"
done 3< "$MACHINES_FILE"

log "done. summary: $PROCESSED machine(s) with sampled scan download completed, $SKIPPED skipped, $IDX line(s) processed out of $TOTAL_MACHINES in file."
exit 0
|
|
||||||
@@ -0,0 +1,218 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Report metadata-scan progress and projected completion times for all machines.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python scripts/scan_progress_report.py [--archive ARCHIVE_DIR] [--recent N] [--mermaid] [--rate-chart]
|
||||||
|
|
||||||
|
Options:
|
||||||
|
--archive DIR Path to archives directory (default: archives)
|
||||||
|
--recent N Number of recent files used to compute current rate (default: 500)
|
||||||
|
--mermaid Also print a Mermaid Gantt chart
|
||||||
|
--rate-chart Also print a Mermaid XY chart of s/scan rate by hour
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import glob
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from collections import defaultdict
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
# Machines in canonical order, each paired with its expected total number of
# scans (figures from the README inventory, April 2026).
MACHINES = list(
    {
        "BW1-4 [AMR-15]": 6121,
        "BW1-6 [AMR-19]": 18198,
        "BW1-7 [AMR-18]": 430,
        "BW2-8 [AMR-25]": 8191,
        "BW2-10 [AMR-22]": 16537,
        "BW2-11 [AMR-23]": 26763,
        "BW2-13 [AMR-24]": 13537,
        "BW3-16 [AMR-16]": 7325,
        "BW3-17 [AMR-20]": 471,
        "BW3-19 [AMR-21]": 15186,
        "BW3-20 [AMR-26]": 23052,
        "BW3-21 [AMR-17]": 10115,
    }.items()
)
# Expected scan count summed over every machine.
TOTAL_SCANS = sum(count for _label, count in MACHINES)
|
||||||
|
|
||||||
|
|
||||||
|
def dir_name(label: str) -> str:
    """Convert a machine label into the name of its archive sub-directory.

    Every character that is not a word character, hyphen, or dot is replaced
    with ``_``; leading and trailing underscores are then removed.
    """
    unsafe_char = re.compile(r"[^\w\-.]")
    underscored = unsafe_char.sub("_", label)
    return underscored.strip("_")
|
||||||
|
|
||||||
|
|
||||||
|
def get_timestamps(machine_dir: Path) -> list[float]:
    """Return the modification times of all ``metadata.json`` files under *machine_dir*.

    The directory is searched recursively; the result is sorted ascending.
    """
    pattern = str(machine_dir / "**" / "metadata.json")
    mtimes = [os.path.getmtime(match) for match in glob.glob(pattern, recursive=True)]
    mtimes.sort()
    return mtimes
|
||||||
|
|
||||||
|
|
||||||
|
def print_rate_chart(all_timestamps: list[float]) -> None:
    """Print a Mermaid xychart-beta of average s/scan per hour.

    Consecutive timestamps less than 300 s apart are treated as one scan
    interval and are averaged per clock hour of the later timestamp. The most
    recent hour is dropped because it is assumed to be incomplete. Prints a
    short notice instead of a chart when fewer than two hours have data.

    Args:
        all_timestamps: File modification times (epoch seconds), expected in
            ascending order.
    """
    # Bucket on the hour-truncated datetime rather than a "%m-%d %Hh" string:
    # sorting the strings puts January before December, so data that crosses
    # a year boundary would be charted out of order and the wrong bucket
    # would be discarded as the "last" hour.
    bins: dict[datetime, list[float]] = defaultdict(list)
    for prev, cur in zip(all_timestamps, all_timestamps[1:]):
        gap = cur - prev
        if gap < 300:  # ignore inter-machine gaps
            hour = datetime.fromtimestamp(cur).replace(minute=0, second=0, microsecond=0)
            bins[hour].append(gap)

    # Drop the last (partial) hour
    hours = sorted(bins)[:-1]

    if not hours:
        print("(not enough data for rate chart)")
        return

    start_hour = hours[0]
    values = [f"{sum(bins[h]) / len(bins[h]):.2f}" for h in hours]
    # The axis ceiling is derived from the rounded values that get plotted.
    y_max = max(float(v) for v in values)
    y_ceil = int(y_max) + 3
    n = len(hours)

    # Numeric x-axis: Mermaid auto-picks readable tick positions
    start_label = start_hour.strftime("%b %d %H:%M")
    print("```mermaid")
    print("xychart-beta")
    print(f' title "Metadata scan rate (s/scan) — hourly, starting {start_label}"')
    print(f' x-axis "Hours elapsed" 0 --> {n}')
    print(f' y-axis "s / scan" 0 --> {y_ceil}')
    print(f" line [{', '.join(values)}]")
    print("```")
|
||||||
|
|
||||||
|
|
||||||
|
def fmt_dt(dt: datetime) -> str:
    """Format *dt* as abbreviated weekday, month, day and HH:MM (e.g. ``Wed Apr 01 13:05``)."""
    return f"{dt:%a %b %d %H:%M}"
|
||||||
|
|
||||||
|
|
||||||
|
def _seconds_per_scan(stamps: list[float]):
    """Return the mean seconds between consecutive timestamps, or None if fewer than two."""
    if len(stamps) < 2:
        return None
    # n timestamps bound n - 1 intervals; dividing by n understates the rate.
    return (stamps[-1] - stamps[0]) / (len(stamps) - 1)


def main() -> None:
    """Print a progress table with projected completion times for every machine.

    Progress is measured by counting ``metadata.json`` files under each
    machine's archive directory, and the scan rate is estimated from their
    modification times. Machines that are not complete are projected one
    after another, starting from the current time. ``--rate-chart`` and
    ``--mermaid`` additionally print Mermaid charts.

    Exits with an error message if the archive directory does not exist.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--archive", default="archives", help="Archives directory")
    parser.add_argument("--recent", type=int, default=500,
                        help="Files used to compute recent rate (default: 500)")
    parser.add_argument("--mermaid", action="store_true", help="Print Mermaid Gantt chart")
    parser.add_argument("--rate-chart", action="store_true", help="Print Mermaid XY rate-over-time chart")
    args = parser.parse_args()

    archive = Path(args.archive)
    if not archive.is_dir():
        sys.exit(f"Archive directory not found: {archive}")

    now = datetime.now()

    # --- Gather per-machine data ---
    machine_data = []  # (label, total, done, first_ts, last_ts, times)
    all_timestamps: list[float] = []

    for label, total in MACHINES:
        mdir = archive / dir_name(label)
        times = get_timestamps(mdir) if mdir.is_dir() else []
        done = len(times)
        first_ts = datetime.fromtimestamp(times[0]) if times else None
        last_ts = datetime.fromtimestamp(times[-1]) if times else None
        machine_data.append((label, total, done, first_ts, last_ts, times))
        all_timestamps.extend(times)

    all_timestamps.sort()
    total_done = sum(d for _, _, d, *_ in machine_data)

    # --- Rate calculation ---
    recent_times = all_timestamps[-args.recent:] if len(all_timestamps) >= 2 else all_timestamps
    recent_rate = _seconds_per_scan(recent_times)
    overall_rate = _seconds_per_scan(all_timestamps)

    rate = recent_rate or overall_rate or 5.0  # fallback

    # --- Print timetable ---
    print(f"Metadata scan progress — {now.strftime('%Y-%m-%d %H:%M')}")
    print(f"Overall rate : {overall_rate:.2f} s/scan" if overall_rate else "Overall rate : n/a")
    print(f"Recent rate : {recent_rate:.2f} s/scan (last {args.recent} files)" if recent_rate else "Recent rate : n/a")
    print(f"Rate used : {rate:.2f} s/scan")
    print(f"Done : {total_done:,} / {TOTAL_SCANS:,} ({100*total_done/TOTAL_SCANS:.1f}%)")
    print()
    print(f"{'Machine':<20} {'Total':>7} {'Done':>7} {'Pct':>6} {'Completion'}")
    print("-" * 68)

    # `cursor` is the projected moment the next unfinished machine can start.
    cursor = now
    gantt_rows: list[tuple[str, datetime, datetime, str]] = []  # label, start, end, status

    for label, total, done, first_ts, last_ts, times in machine_data:
        pct = 100 * done / total if total else 0

        # Treat a machine as complete once at least 99.9% of its scans exist.
        complete = done >= total or (done > 0 and done / total >= 0.999)

        if done == 0:
            # Not started yet
            start = cursor
            finish = cursor + timedelta(seconds=total * rate)
            status = "pending"
            print(f"{label:<20} {total:>7,} {'—':>7} {'—':>5} {fmt_dt(finish)}")
        elif complete:
            # Complete — use actual timestamps
            start = first_ts
            finish = last_ts
            status = "done"
            print(f"{label:<20} {total:>7,} {done:>7,} {pct:>5.1f}% complete")
        else:
            # In progress
            remaining = total - done
            start = first_ts
            finish = cursor + timedelta(seconds=remaining * rate)
            status = "active"
            print(f"{label:<20} {total:>7,} {done:>7,} {pct:>5.1f}% {fmt_dt(finish)} ← in progress")

        gantt_rows.append((label, start or cursor, finish, status))
        # Finished machines take no future time, so they do not move the cursor.
        if status != "done":
            cursor = finish

    print("-" * 68)
    print(f"{'All done':<20} {TOTAL_SCANS:>7,} {total_done:>7,} {100*total_done/TOTAL_SCANS:>5.1f}% {fmt_dt(cursor)}")

    # --- Mermaid rate chart ---
    if args.rate_chart:
        print()
        print_rate_chart(all_timestamps)

    # --- Mermaid Gantt ---
    if args.mermaid:
        print()
        print("```mermaid")
        print("gantt")
        # Machine count comes from MACHINES so the title cannot go stale.
        print(f" title Metadata scan progress — all {len(MACHINES)} machines")
        print(" dateFormat YYYY-MM-DD HH:mm")
        print(" axisFormat %b %d")
        print()

        section = None
        for label, start, finish, status in gantt_rows:
            prefix = label.split()[0][:3]  # BW1, BW2, BW3
            if prefix != section:
                section = prefix
                print(f" section {section}")
            safe = label.replace("[", "").replace("]", "").replace(" ", "-")
            tag = "done, " if status == "done" else ("active, " if status == "active" else "")
            s = start.strftime("%Y-%m-%d %H:%M")
            e = finish.strftime("%Y-%m-%d %H:%M")
            print(f" {label} :{tag}{safe}, {s}, {e}")
        print("```")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||