#!/usr/bin/env python3
"""
Split scans.csv into per-machine metadata CSVs.

Reads the combined scans.csv produced by the scraper and writes one CSV per
machine containing only the website-sourced metadata columns (no mosaic paths,
download status, or error fields).

Usage:
    python scripts/export_machine_metadata.py
    python scripts/export_machine_metadata.py --input archives/scans.csv --output-dir archives/by_machine
"""

from __future__ import annotations

import argparse
import csv
import sys
from collections import defaultdict
from pathlib import Path

# Columns copied into every per-machine CSV, in output order.  Any other
# column present in the input file is dropped on write.
METADATA_COLUMNS = [
    "machine",
    "machine_id",
    "scan_id",
    "name",
    "scan_time",
    "start_x",
    "start_y",
    "end_x",
    "end_y",
    "dx",
    "dy",
    "nx",
    "ny",
    "total_tiles",
    "scan_lines",
    "scan_mode",
    "start_datetime",
    "end_datetime",
    "status",
    "user",
    "disk_space_mb",
]

# Character rewrites applied to a machine label to obtain a file-name stem:
# square brackets are removed; spaces and path separators become underscores.
# Path separators are included so a label can never address a different
# directory than the chosen output directory.
_LABEL_TRANSLATION = str.maketrans(
    {"[": None, "]": None, " ": "_", "/": "_", "\\": "_"}
)


def sanitize_machine_label(label: str) -> str:
    """Return *label* rewritten for use as a file-name stem.

    Square brackets are dropped, spaces and path separators (``/`` and
    ``\\``) are replaced by underscores, and leading/trailing underscores are
    stripped.  An empty label yields an empty string.

    >>> sanitize_machine_label("[Machine 1]")
    'Machine_1'
    >>> sanitize_machine_label("lab/scope 2")
    'lab_scope_2'
    """
    return label.translate(_LABEL_TRANSLATION).strip("_")


def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options.

    Args:
        argv: Argument list to parse.  ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Returns:
        Namespace with ``input`` and ``output_dir`` string attributes.
    """
    p = argparse.ArgumentParser(
        description="Split scans.csv into per-machine metadata CSVs."
    )
    p.add_argument(
        "--input",
        default="archives/scans.csv",
        metavar="FILE",
        help="Path to scans.csv (default: archives/scans.csv)",
    )
    p.add_argument(
        "--output-dir",
        default="archives/by_machine",
        metavar="DIR",
        help="Directory for output CSVs (default: archives/by_machine)",
    )
    return p.parse_args(argv)


def _load_rows_by_stem(input_path: Path) -> tuple[dict[str, list[dict]], int]:
    """Read *input_path* and group its rows by output file-name stem.

    Exits the program with a message when the file has no header row or when
    any column in ``METADATA_COLUMNS`` is absent from the header.

    Returns:
        A ``(rows_by_stem, label_count)`` pair: the rows keyed by sanitized
        machine label (input order preserved within each group) and the
        number of distinct raw machine labels seen.
    """
    rows_by_stem: dict[str, list[dict]] = defaultdict(list)
    stem_by_label: dict[str, str] = {}

    with input_path.open(newline="") as fh:
        reader = csv.DictReader(fh)
        if reader.fieldnames is None:
            sys.exit(f"{input_path} appears to be empty.")

        missing = [c for c in METADATA_COLUMNS if c not in reader.fieldnames]
        if missing:
            sys.exit(f"Expected columns not found in {input_path}: {missing}")

        for row in reader:
            # DictReader fills fields missing from a short row with None;
            # treat that like an empty label so grouping/sorting never mixes
            # None with str.
            label = row["machine"] or ""
            stem = stem_by_label.get(label)
            if stem is None:
                stem = stem_by_label[label] = sanitize_machine_label(label)
            # Group by stem rather than raw label: two labels that sanitize
            # to the same stem share one output file, and grouping here keeps
            # the second from overwriting the first.
            rows_by_stem[stem].append(row)

    return rows_by_stem, len(stem_by_label)


def _write_machine_csvs(rows_by_stem: dict[str, list[dict]], output_dir: Path) -> None:
    """Write one ``<stem>_scans_metadata.csv`` per group into *output_dir*.

    The directory is created if needed.  Only ``METADATA_COLUMNS`` are
    written; a line reporting the path and row count is printed per file.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    for stem, rows in sorted(rows_by_stem.items()):
        out_path = output_dir / f"{stem}_scans_metadata.csv"
        with out_path.open("w", newline="") as fh:
            # extrasaction="ignore" drops every input column that is not in
            # METADATA_COLUMNS instead of raising.
            writer = csv.DictWriter(
                fh, fieldnames=METADATA_COLUMNS, extrasaction="ignore"
            )
            writer.writeheader()
            writer.writerows(rows)
        print(f" {out_path} ({len(rows)} rows)")


def main(argv: list[str] | None = None) -> None:
    """Split the input CSV into per-machine metadata CSVs.

    Args:
        argv: Optional argument list forwarded to :func:`parse_args`;
            ``None`` uses the process command line.

    Exits with an error message when the input path is not an existing file,
    has no header, or lacks any of the expected metadata columns.
    """
    args = parse_args(argv)
    input_path = Path(args.input)
    output_dir = Path(args.output_dir)

    # is_file() rather than exists(): a directory would pass exists() and
    # then fail at open() with an unhelpful traceback.
    if not input_path.is_file():
        sys.exit(f"Input file not found: {input_path}")

    rows_by_stem, label_count = _load_rows_by_stem(input_path)
    _write_machine_csvs(rows_by_stem, output_dir)

    if label_count > len(rows_by_stem):
        print(
            "Note: some machine labels map to the same file name; "
            "their rows were merged into one file.",
            file=sys.stderr,
        )

    total = sum(len(r) for r in rows_by_stem.values())
    print(f"\n{label_count} machine(s), {total} total rows → {output_dir}/")


if __name__ == "__main__":
    main()