Files
SPRUCE-scraper/scripts/export_machine_metadata.py
T

106 lines
2.9 KiB
Python

#!/usr/bin/env python3
"""
Split scans.csv into per-machine metadata CSVs.
Reads the combined scans.csv produced by the scraper and writes one CSV per
machine containing only the website-sourced metadata columns (no mosaic paths,
download status, or error fields).
Usage:
python scripts/export_machine_metadata.py
python scripts/export_machine_metadata.py --input archives/scans.csv --output-dir archives/by_machine
"""
from __future__ import annotations
import argparse
import csv
import sys
from collections import defaultdict
from pathlib import Path
METADATA_COLUMNS = [
"machine",
"machine_id",
"scan_id",
"name",
"scan_time",
"start_x",
"start_y",
"end_x",
"end_y",
"dx",
"dy",
"nx",
"ny",
"total_tiles",
"scan_lines",
"scan_mode",
"start_datetime",
"end_datetime",
"status",
"user",
"disk_space_mb",
]
def sanitize_machine_label(label: str) -> str:
return label.replace("[", "").replace("]", "").replace(" ", "_").strip("_")
def parse_args() -> argparse.Namespace:
p = argparse.ArgumentParser(description="Split scans.csv into per-machine metadata CSVs.")
p.add_argument(
"--input",
default="archives/scans.csv",
metavar="FILE",
help="Path to scans.csv (default: archives/scans.csv)",
)
p.add_argument(
"--output-dir",
default="archives/by_machine",
metavar="DIR",
help="Directory for output CSVs (default: archives/by_machine)",
)
return p.parse_args()
def main() -> None:
args = parse_args()
input_path = Path(args.input)
output_dir = Path(args.output_dir)
if not input_path.exists():
sys.exit(f"Input file not found: {input_path}")
with input_path.open(newline="") as fh:
reader = csv.DictReader(fh)
if reader.fieldnames is None:
sys.exit(f"{input_path} appears to be empty.")
missing = [c for c in METADATA_COLUMNS if c not in reader.fieldnames]
if missing:
sys.exit(f"Expected columns not found in {input_path}: {missing}")
rows_by_machine: dict[str, list[dict]] = defaultdict(list)
for row in reader:
rows_by_machine[row["machine"]].append(row)
output_dir.mkdir(parents=True, exist_ok=True)
for machine_label, rows in sorted(rows_by_machine.items()):
safe_name = sanitize_machine_label(machine_label)
out_path = output_dir / f"{safe_name}_scans_metadata.csv"
with out_path.open("w", newline="") as fh:
writer = csv.DictWriter(fh, fieldnames=METADATA_COLUMNS, extrasaction="ignore")
writer.writeheader()
writer.writerows(rows)
print(f" {out_path} ({len(rows)} rows)")
total = sum(len(r) for r in rows_by_machine.values())
print(f"\n{len(rows_by_machine)} machine(s), {total} total rows → {output_dir}/")
if __name__ == "__main__":
main()