Scraping resilience, metadata tooling, and repository hygiene
Consolidates mosaic and session hardening (login retry, skip processed scans, no retry on 404, started_at), progress reporting (Markdown tables, by-year rollup, rolling-window rate/ETA), and metadata workflow scripts (run_metadata_scan.sh, scan_progress_report.py, export_machine_metadata.py). Adds mosaic reconstruction sample JPEGs referenced by the report. Updates .gitignore for backup/ and .claude/; sample_random_scans helper is documented for branch testing/sample-runs only (see README).
This commit is contained in:
@@ -0,0 +1,105 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Split scans.csv into per-machine metadata CSVs.
|
||||
|
||||
Reads the combined scans.csv produced by the scraper and writes one CSV per
|
||||
machine containing only the website-sourced metadata columns (no mosaic paths,
|
||||
download status, or error fields).
|
||||
|
||||
Usage:
|
||||
python scripts/export_machine_metadata.py
|
||||
python scripts/export_machine_metadata.py --input archives/scans.csv --output-dir archives/by_machine
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
# Columns copied from scans.csv into every per-machine CSV, in output order.
# main() also treats each of them as required in the input header.
METADATA_COLUMNS = [
    "machine", "machine_id", "scan_id", "name", "scan_time",
    "start_x", "start_y", "end_x", "end_y",
    "dx", "dy", "nx", "ny",
    "total_tiles", "scan_lines", "scan_mode",
    "start_datetime", "end_datetime",
    "status", "user", "disk_space_mb",
]
|
||||
|
||||
|
||||
def sanitize_machine_label(label: str) -> str:
    """Turn a machine label into a string usable as a file-name stem.

    Square brackets are dropped; spaces and path separators (forward and
    back slashes) become underscores; leading and trailing underscores are
    then stripped. Labels without path separators produce exactly the same
    result as before separators were handled.

    Args:
        label: Machine label as it appears in the ``machine`` column.

    Returns:
        The sanitized label. May be empty if *label* held only removed
        characters.

    >>> sanitize_machine_label("[SEM 1]")
    'SEM_1'
    """
    # The result is embedded in a file name, so a path separator must never
    # survive: it would redirect the output into another directory.
    replacements = str.maketrans(
        {"[": None, "]": None, " ": "_", "/": "_", "\\": "_"}
    )
    return label.translate(replacements).strip("_")
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the export script.

    Returns:
        A namespace with ``input`` (path of the combined scans CSV) and
        ``output_dir`` (directory receiving the per-machine CSVs), both as
        strings.
    """
    parser = argparse.ArgumentParser(
        description="Split scans.csv into per-machine metadata CSVs."
    )

    # (flag, default, metavar, help) for every supported option.
    option_specs = (
        (
            "--input",
            "archives/scans.csv",
            "FILE",
            "Path to scans.csv (default: archives/scans.csv)",
        ),
        (
            "--output-dir",
            "archives/by_machine",
            "DIR",
            "Directory for output CSVs (default: archives/by_machine)",
        ),
    )
    for flag, default, metavar, help_text in option_specs:
        parser.add_argument(flag, default=default, metavar=metavar, help=help_text)

    return parser.parse_args()
|
||||
|
||||
|
||||
def _load_rows_by_machine(input_path: Path) -> dict[str, list[dict]]:
    """Read *input_path* and group its rows by their ``machine`` value.

    Exits the process with a message when the file has no header row or when
    any of the METADATA_COLUMNS is absent from the header.
    """
    # NOTE(review): the file is decoded with the platform default encoding;
    # confirm what the scraper writes before pinning an explicit one.
    with input_path.open(newline="") as fh:
        reader = csv.DictReader(fh)
        if reader.fieldnames is None:
            sys.exit(f"{input_path} appears to be empty.")

        missing = [c for c in METADATA_COLUMNS if c not in reader.fieldnames]
        if missing:
            sys.exit(f"Expected columns not found in {input_path}: {missing}")

        rows_by_machine: dict[str, list[dict]] = defaultdict(list)
        for row in reader:
            # DictReader fills the missing fields of a truncated row with
            # None; fold that into the empty label so the later sort and
            # sanitize steps only ever see strings.
            rows_by_machine[row["machine"] or ""].append(row)
    return rows_by_machine


def _write_metadata_csv(out_path: Path, rows: list[dict]) -> None:
    """Write *rows* to *out_path*, keeping only the METADATA_COLUMNS fields."""
    with out_path.open("w", newline="") as fh:
        writer = csv.DictWriter(fh, fieldnames=METADATA_COLUMNS, extrasaction="ignore")
        writer.writeheader()
        writer.writerows(rows)


def main() -> None:
    """Split the combined scans CSV into one metadata CSV per machine.

    Reads the file named by ``--input``, groups its rows by the ``machine``
    column and writes ``<sanitized label>_scans_metadata.csv`` files into
    ``--output-dir`` (created if needed), printing one line per file and a
    final summary.

    Raises:
        SystemExit: if the input is not an existing file, has no header, or
            lacks any of the METADATA_COLUMNS.
    """
    args = parse_args()
    input_path = Path(args.input)
    output_dir = Path(args.output_dir)

    # is_file() rather than exists(): a directory passes exists() and would
    # only fail later, with a traceback, inside open().
    if not input_path.is_file():
        sys.exit(f"Input file not found: {input_path}")

    rows_by_machine = _load_rows_by_machine(input_path)

    # Distinct labels can sanitize to the same file name (e.g. "[A 1]" and
    # "A_1"). Group labels by target file so a later label never silently
    # overwrites the rows written for an earlier one.
    labels_by_file: dict[str, list[str]] = defaultdict(list)
    for machine_label in sorted(rows_by_machine):
        file_name = f"{sanitize_machine_label(machine_label)}_scans_metadata.csv"
        labels_by_file[file_name].append(machine_label)

    output_dir.mkdir(parents=True, exist_ok=True)

    for file_name, labels in labels_by_file.items():
        if len(labels) > 1:
            print(
                f"Warning: labels {labels} map to the same file {file_name}; "
                "merging their rows",
                file=sys.stderr,
            )
        rows: list[dict] = []
        for machine_label in labels:
            rows.extend(rows_by_machine[machine_label])

        out_path = output_dir / file_name
        _write_metadata_csv(out_path, rows)
        print(f" {out_path} ({len(rows)} rows)")

    total = sum(len(r) for r in rows_by_machine.values())
    print(f"\n{len(rows_by_machine)} machine(s), {total} total rows → {output_dir}/")
|
||||
|
||||
|
||||
# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user