Initial commit

Add spruce scraper with CLI, session management, parsers, progress tracking, recheck logic, and test suite. Includes example config and README.
2026-04-22 10:41:18 -04:00
commit e122f6435a
23 changed files with 3789 additions and 0 deletions
@@ -0,0 +1,259 @@
+"""
+Command-line interface for the spruce scraper.
+"""
+
+import argparse
+import logging
+import os
+import sys
+from pathlib import Path
+
+import yaml
+
+from spruce.orchestrator import scrape_machine
+from spruce.parsers import parse_machine_option
+from spruce.progress import ProgressTracker, CsvWriter
+from spruce.recheck import recheck_archive, recheck_tile_files
+from spruce.settings import (
+    DEFAULT_CONFIG,
+    MAX_SAFE_WORKERS,
+    PROGRESS_FILENAME,
+    SCANS_CSV_FIELDS,
+    SCANS_CSV_FILENAME,
+    TILES_CSV_FIELDS,
+    TILES_CSV_FILENAME,
+    _clamp_workers,
+    load_config,
+)
+from spruce.session import MachineSession
+
+import requests
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin
+
+log = logging.getLogger(__name__)
+
+
+def discover_machines(base_url: str, timeout: int = 30) -> list[dict]:
+    resp = requests.get(urljoin(base_url, "index.php"), timeout=timeout)
+    resp.raise_for_status()
+    soup = BeautifulSoup(resp.text, "lxml")
+    select = soup.find("select", {"name": "RTLNAME"})
+    if not select:
+        log.warning("Could not find machine selector on login page.")
+        return []
+    return [
+        parse_machine_option(opt.get_text(strip=True), opt["value"])
+        for opt in select.find_all("option")
+    ]
+
+
+def parse_args() -> argparse.Namespace:
+    p = argparse.ArgumentParser(
+        description="Archive minirhizotron image tiles from RootView.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    p.add_argument(
+        "--config",
+        default=DEFAULT_CONFIG,
+        metavar="FILE",
+        help=f"YAML config file (default: {DEFAULT_CONFIG})",
+    )
+    p.add_argument(
+        "--machine",
+        metavar="LABEL",
+        help='Scrape only this machine label, e.g. "BW3-20 [AMR-26]"',
+    )
+    p.add_argument(
+        "--scan-id",
+        type=int,
+        metavar="ID",
+        help="Download only this specific scan ID (use with --machine)",
+    )
+    p.add_argument(
+        "--mosaic-only",
+        action="store_true",
+        help="Download mosaics only; skip individual tiles",
+    )
+    p.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Preview what would be downloaded without saving any files",
+    )
+    p.add_argument(
+        "--workers",
+        type=int,
+        metavar="N",
+        help="Override parallel download threads from config",
+    )
+    p.add_argument(
+        "--list-machines",
+        action="store_true",
+        help="Print available machines and exit (no credentials needed)",
+    )
+    p.add_argument(
+        "--list-scans",
+        action="store_true",
+        help="Print all scans for --machine and exit",
+    )
+    p.add_argument(
+        "--recheck",
+        action="store_true",
+        help=(
+            "Scan the archive for zero-byte or missing tile files whose URLs "
+            "are marked complete in .progress.json, remove them from progress, "
+            "and report how many were re-queued. Run before resuming after a crash."
+        ),
+    )
+    p.add_argument(
+        "--verbose",
+        "-v",
+        action="store_true",
+        help="Enable debug logging",
+    )
+    return p.parse_args()
+
+
+def main() -> None:
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s  %(levelname)-8s  %(message)s",
+        datefmt="%H:%M:%S",
+    )
+    args = parse_args()
+
+    if args.verbose:
+        logging.getLogger().setLevel(logging.DEBUG)
+
+    # --list-machines doesn't need credentials
+    if args.list_machines:
+        base_url = "http://205.149.147.131:8010/"
+        timeout = 30
+        if os.path.exists(args.config):
+            cfg = yaml.safe_load(open(args.config))
+            base_url = cfg.get("base_url", base_url)
+            timeout = cfg.get("timeout", timeout)
+        machines = discover_machines(base_url, timeout)
+        print(f"{'Label':<25}  {'ID':>4}  {'IP':<17}  {'Version'}")
+        print("-" * 62)
+        for m in machines:
+            print(
+                f"{m['label']:<25}  {m['machine_id']:>4}  {m['ip']:<17}  {m['version']}"
+            )
+        return
+
+    if not os.path.exists(args.config):
+        sys.exit(
+            f"Config file '{args.config}' not found.\n"
+            f"Copy config.example.yaml to {args.config} and fill in your credentials."
+        )
+
+    config = load_config(args.config)
+    if args.workers:
+        config["workers"] = _clamp_workers(args.workers)
+
+    output_dir = Path(config["output_dir"])
+
+    # --recheck: validate archive integrity and re-queue bad tiles
+    if args.recheck:
+        progress = ProgressTracker(output_dir / PROGRESS_FILENAME)
+        n_bad = recheck_tile_files(output_dir, progress)
+        n_requeued = recheck_archive(output_dir, progress)
+        if n_bad == 0 and n_requeued == 0:
+            log.info("Archive looks clean. No action needed.")
+        else:
+            log.info(
+                "Recheck complete: %d zero-byte file(s) deleted, "
+                "%d URL(s) re-queued for download.",
+                n_bad,
+                n_requeued,
+            )
+        return
+
+    # Build machine list
+    all_machines = discover_machines(config["base_url"], config["timeout"])
+    if not all_machines:
+        sys.exit("Could not retrieve machine list from server.")
+
+    # Apply --machine / config machines filter
+    filter_labels: list[str] | None = None
+    if args.machine:
+        filter_labels = [args.machine]
+    elif config.get("machines"):
+        filter_labels = list(config["machines"])
+
+    if filter_labels:
+        machines = [m for m in all_machines if m["label"] in filter_labels]
+        not_found = [
+            label
+            for label in filter_labels
+            if label not in {m["label"] for m in machines}
+        ]
+        if not_found:
+            log.warning("Unknown machine label(s): %s", not_found)
+    else:
+        machines = all_machines
+
+    if not machines:
+        sys.exit("No machines selected.")
+
+    # --list-scans: print and exit
+    if args.list_scans:
+        if len(machines) != 1:
+            sys.exit("--list-scans requires exactly one machine (use --machine).")
+        sess = MachineSession(machines[0], config)
+        if not sess.login():
+            sys.exit("Login failed.")
+        scans = sess.get_all_scans()
+        print(f"{'ID':>8}  {'Date':<22}  {'Name':<40}  {'Status'}")
+        print("-" * 85)
+        for sc in scans:
+            print(
+                f"{sc['scan_id']:>8}  {sc.get('scan_time', ''):<22}  "
+                f"{sc.get('name', ''):<40}  {sc.get('status', '')}"
+            )
+        print(f"\nTotal: {len(scans)} scans")
+        return
+
+    log.info(
+        "Scraping %d machine(s): %s",
+        len(machines),
+        ", ".join(m["label"] for m in machines),
+    )
+    if args.mosaic_only:
+        log.info("Mode: mosaics only (individual tiles skipped)")
+    if args.dry_run:
+        log.info("Mode: dry-run (no files will be written)")
+
+    # Shared progress and CSV writers
+    progress = ProgressTracker(output_dir / PROGRESS_FILENAME)
+    tiles_csv = CsvWriter(output_dir / TILES_CSV_FILENAME, TILES_CSV_FIELDS)
+    scans_csv = CsvWriter(output_dir / SCANS_CSV_FILENAME, SCANS_CSV_FIELDS)
+
+    total = 0
+    try:
+        for machine in machines:
+            count = scrape_machine(
+                machine=machine,
+                config=config,
+                output_dir=output_dir,
+                progress=progress,
+                tiles_csv=tiles_csv,
+                scans_csv=scans_csv,
+                dry_run=args.dry_run,
+                mosaic_only=args.mosaic_only,
+                scan_id_filter=args.scan_id,
+            )
+            total += count
+    finally:
+        tiles_csv.close()
+        scans_csv.close()
+        progress.save()
+
+    if args.dry_run:
+        log.info("Dry run complete.")
+    else:
+        log.info("Done. Total files downloaded: %d", total)
+        log.info("Tiles CSV : %s", output_dir / TILES_CSV_FILENAME)
+        log.info("Scans CSV : %s", output_dir / SCANS_CSV_FILENAME)
+        log.info("Progress  : %s", output_dir / PROGRESS_FILENAME)