1334cfaf92
Co-authored-by: Cursor <cursoragent@cursor.com>
106 lines
2.9 KiB
Python
106 lines
2.9 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Split scans.csv into per-machine metadata CSVs.

Reads the combined scans.csv produced by the scraper and writes one CSV per
machine containing only the website-sourced metadata columns (no mosaic paths,
download status, or error fields).

Usage:
    python scripts/export_machine_metadata.py
    python scripts/export_machine_metadata.py --input archives/scans.csv --output-dir archives/by_machine
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import csv
|
|
import sys
|
|
from collections import defaultdict
|
|
from pathlib import Path
|
|
|
|
# Columns kept in every per-machine CSV, in output order. The same list is
# used to validate the input header: every name here must be present in it.
METADATA_COLUMNS = [
    "machine", "machine_id", "scan_id", "name", "scan_time",
    "start_x", "start_y", "end_x", "end_y",
    "dx", "dy", "nx", "ny",
    "total_tiles", "scan_lines", "scan_mode",
    "start_datetime", "end_datetime",
    "status", "user", "disk_space_mb",
]
|
|
|
|
|
|
# Single-pass character mapping: brackets are dropped; spaces and path
# separators become underscores so the result is always one path component.
_LABEL_TRANSLATION = str.maketrans(
    {"[": None, "]": None, " ": "_", "/": "_", "\\": "_"}
)


def sanitize_machine_label(label: str) -> str:
    """Turn a machine label into a string usable as a file-name stem.

    Square brackets are removed, spaces and path separators (``/`` and
    ``\\``) are replaced by underscores, and leading/trailing underscores
    are stripped. Replacing the separators keeps a label such as ``"a/b"``
    from redirecting the output file into another directory.

    Args:
        label: Machine label as it appears in the ``machine`` column.

    Returns:
        The sanitized label; an empty string if nothing but removed or
        stripped characters was present.
    """
    return label.translate(_LABEL_TRANSLATION).strip("_")
|
|
|
|
|
|
def parse_args() -> argparse.Namespace:
    """Parse the command line and return the resulting namespace.

    Recognised options:
        --input FILE       combined scans CSV to read
                           (default: archives/scans.csv)
        --output-dir DIR   directory that receives the per-machine CSVs
                           (default: archives/by_machine)
    """
    parser = argparse.ArgumentParser(
        description="Split scans.csv into per-machine metadata CSVs."
    )

    # (flag, default, metavar, help) for each supported option.
    option_specs = (
        (
            "--input",
            "archives/scans.csv",
            "FILE",
            "Path to scans.csv (default: archives/scans.csv)",
        ),
        (
            "--output-dir",
            "archives/by_machine",
            "DIR",
            "Directory for output CSVs (default: archives/by_machine)",
        ),
    )
    for flag, default_value, placeholder, help_text in option_specs:
        parser.add_argument(
            flag, default=default_value, metavar=placeholder, help=help_text
        )

    return parser.parse_args()
|
|
|
|
|
|
def main() -> None:
    """Split the combined scans CSV into one metadata CSV per machine.

    Reads the file given by ``--input``, groups its rows by the ``machine``
    column, and writes ``<label>_scans_metadata.csv`` into ``--output-dir``
    for each group, keeping only the columns listed in ``METADATA_COLUMNS``.
    One report line is printed per written file, followed by a summary.

    Raises:
        SystemExit: if the input file does not exist, is empty, or lacks any
            of the columns listed in ``METADATA_COLUMNS``.
    """
    args = parse_args()
    input_path = Path(args.input)
    output_dir = Path(args.output_dir)

    if not input_path.exists():
        sys.exit(f"Input file not found: {input_path}")

    rows_by_machine: dict[str, list[dict]] = defaultdict(list)
    with input_path.open(newline="") as fh:
        reader = csv.DictReader(fh)
        if reader.fieldnames is None:
            sys.exit(f"{input_path} appears to be empty.")

        missing = [c for c in METADATA_COLUMNS if c not in reader.fieldnames]
        if missing:
            sys.exit(f"Expected columns not found in {input_path}: {missing}")

        for row in reader:
            # DictReader fills the missing cells of a short row with None;
            # fold that into the empty label so every key is a sortable str
            # that sanitize_machine_label() can handle.
            rows_by_machine[row["machine"] or ""].append(row)

    # Distinct labels may sanitize to the same file name. Merge their rows
    # instead of letting the later label silently overwrite the earlier
    # file; the "machine" column still tells the rows apart.
    rows_by_file: dict[str, list[dict]] = defaultdict(list)
    for machine_label, rows in sorted(rows_by_machine.items()):
        rows_by_file[sanitize_machine_label(machine_label)].extend(rows)

    output_dir.mkdir(parents=True, exist_ok=True)

    # Insertion order follows the sorted labels, so files are written and
    # reported in a stable order.
    for safe_name, rows in rows_by_file.items():
        out_path = output_dir / f"{safe_name}_scans_metadata.csv"
        with out_path.open("w", newline="") as fh:
            # extrasaction="ignore" drops every input column that is not
            # part of METADATA_COLUMNS.
            writer = csv.DictWriter(fh, fieldnames=METADATA_COLUMNS, extrasaction="ignore")
            writer.writeheader()
            writer.writerows(rows)
        print(f"  {out_path}  ({len(rows)} rows)")

    total = sum(len(r) for r in rows_by_machine.values())
    print(f"\n{len(rows_by_machine)} machine(s), {total} total rows → {output_dir}/")
|
|
|
|
|
|
# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|