""" Command-line interface for the spruce scraper. """ import argparse import logging import os import sys from pathlib import Path import yaml from spruce.orchestrator import scrape_machine, RunStats from spruce.parsers import parse_machine_option from spruce.progress import ProgressTracker, CsvWriter from spruce.recheck import recheck_archive, recheck_tile_files from spruce.settings import ( DEFAULT_CONFIG, MAX_SAFE_WORKERS, PROGRESS_FILENAME, SCANS_CSV_FIELDS, SCANS_CSV_FILENAME, TILES_CSV_FIELDS, TILES_CSV_FILENAME, _clamp_workers, load_config, ) from spruce.session import MachineSession import requests from bs4 import BeautifulSoup from urllib.parse import urljoin log = logging.getLogger(__name__) def discover_machines(base_url: str, timeout: int = 30) -> list[dict]: resp = requests.get(urljoin(base_url, "index.php"), timeout=timeout) resp.raise_for_status() soup = BeautifulSoup(resp.text, "lxml") select = soup.find("select", {"name": "RTLNAME"}) if not select: log.warning("Could not find machine selector on login page.") return [] return [ parse_machine_option(opt.get_text(strip=True), opt["value"]) for opt in select.find_all("option") ] def parse_args() -> argparse.Namespace: p = argparse.ArgumentParser( description="Archive minirhizotron image tiles from RootView.", formatter_class=argparse.RawDescriptionHelpFormatter, ) p.add_argument( "--config", default=DEFAULT_CONFIG, metavar="FILE", help=f"YAML config file (default: {DEFAULT_CONFIG})", ) p.add_argument( "--machine", metavar="LABEL", help='Scrape only this machine label, e.g. "BW3-20 [AMR-26]"', ) p.add_argument( "--scan-id", type=int, metavar="ID", help="Download only this specific scan ID (use with --machine)", ) p.add_argument( "--mosaic-only", action="store_true", help="Download mosaics only; skip individual tiles", ) p.add_argument( "--metadata-only", action="store_true", help=( "Fetch scan parameters only; write metadata.json and scans.csv " "rows but skip mosaics and tiles. Very fast — suitable for " "inventorying all scans across all machines." ), ) p.add_argument( "--dry-run", action="store_true", help="Preview what would be downloaded without saving any files", ) p.add_argument( "--workers", type=int, metavar="N", help="Override parallel download threads from config", ) p.add_argument( "--list-machines", action="store_true", help="Print available machines and exit (no credentials needed)", ) p.add_argument( "--list-scans", action="store_true", help="Print all scans for --machine and exit", ) p.add_argument( "--recheck", action="store_true", help=( "Scan the archive for zero-byte or missing tile files whose URLs " "are marked complete in .progress.json, remove them from progress, " "and report how many were re-queued. Run before resuming after a crash." ), ) p.add_argument( "--verbose", "-v", action="store_true", help="Enable debug logging", ) return p.parse_args() def main() -> None: logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)-8s %(message)s", datefmt="%H:%M:%S", ) args = parse_args() if args.verbose: logging.getLogger().setLevel(logging.DEBUG) # --list-machines doesn't need credentials if args.list_machines: base_url = "http://205.149.147.131:8010/" timeout = 30 if os.path.exists(args.config): cfg = yaml.safe_load(open(args.config)) base_url = cfg.get("base_url", base_url) timeout = cfg.get("timeout", timeout) machines = discover_machines(base_url, timeout) print(f"{'Label':<25} {'ID':>4} {'IP':<17} {'Version'}") print("-" * 62) for m in machines: print( f"{m['label']:<25} {m['machine_id']:>4} {m['ip']:<17} {m['version']}" ) return if not os.path.exists(args.config): sys.exit( f"Config file '{args.config}' not found.\n" f"Copy config.example.yaml to {args.config} and fill in your credentials." ) config = load_config(args.config) if args.workers: config["workers"] = _clamp_workers(args.workers) output_dir = Path(config["output_dir"]) # --recheck: validate archive integrity and re-queue bad tiles if args.recheck: progress = ProgressTracker(output_dir / PROGRESS_FILENAME) n_bad = recheck_tile_files(output_dir, progress) n_requeued = recheck_archive(output_dir, progress) if n_bad == 0 and n_requeued == 0: log.info("Archive looks clean. No action needed.") else: log.info( "Recheck complete: %d zero-byte file(s) deleted, " "%d URL(s) re-queued for download.", n_bad, n_requeued, ) return # Build machine list all_machines = discover_machines(config["base_url"], config["timeout"]) if not all_machines: sys.exit("Could not retrieve machine list from server.") # Apply --machine / config machines filter filter_labels: list[str] | None = None if args.machine: filter_labels = [args.machine] elif config.get("machines"): filter_labels = list(config["machines"]) if filter_labels: machines = [m for m in all_machines if m["label"] in filter_labels] not_found = [ label for label in filter_labels if label not in {m["label"] for m in machines} ] if not_found: log.warning("Unknown machine label(s): %s", not_found) else: machines = all_machines if not machines: sys.exit("No machines selected.") # --list-scans: print and exit if args.list_scans: if len(machines) != 1: sys.exit("--list-scans requires exactly one machine (use --machine).") sess = MachineSession(machines[0], config) if not sess.login(): sys.exit("Login failed.") scans = sess.get_all_scans() print(f"{'ID':>8} {'Date':<22} {'Name':<40} {'Status'}") print("-" * 85) for sc in scans: print( f"{sc['scan_id']:>8} {sc.get('scan_time', ''):<22} " f"{sc.get('name', ''):<40} {sc.get('status', '')}" ) print(f"\nTotal: {len(scans)} scans") return log.info( "Scraping %d machine(s): %s", len(machines), ", ".join(m["label"] for m in machines), ) if args.mosaic_only and args.metadata_only: sys.exit("--mosaic-only and --metadata-only are mutually exclusive.") if args.metadata_only: log.info("Mode: metadata only (mosaics and tiles skipped)") elif args.mosaic_only: log.info("Mode: mosaics only (individual tiles skipped)") if args.dry_run: log.info("Mode: dry-run (no files will be written)") # Shared progress and CSV writers progress = ProgressTracker(output_dir / PROGRESS_FILENAME) tiles_csv = CsvWriter(output_dir / TILES_CSV_FILENAME, TILES_CSV_FIELDS) scans_csv = CsvWriter(output_dir / SCANS_CSV_FILENAME, SCANS_CSV_FIELDS) totals = RunStats() try: for machine in machines: stats = scrape_machine( machine=machine, config=config, output_dir=output_dir, progress=progress, tiles_csv=tiles_csv, scans_csv=scans_csv, dry_run=args.dry_run, mosaic_only=args.mosaic_only, metadata_only=args.metadata_only, scan_id_filter=args.scan_id, ) totals.merge(stats) finally: tiles_csv.close() scans_csv.close() progress.save() _print_summary( totals=totals, machines=machines, output_dir=output_dir, dry_run=args.dry_run, metadata_only=args.metadata_only, mosaic_only=args.mosaic_only, ) def _print_summary( totals: RunStats, machines: list[dict], output_dir: Path, dry_run: bool, metadata_only: bool, mosaic_only: bool, ) -> None: W = 46 sep = "─" * W def row(label: str, value: str, note: str = "") -> str: note_str = f" ({note})" if note else "" return f" {label:<22}{value}{note_str}" log.info(sep) if dry_run: log.info(" Dry run complete — no files written.") else: log.info(" Run complete") log.info(sep) log.info(row("Machines:", str(len(machines)))) log.info( row("Scans fetched:", str(totals.scans_fetched), f"{totals.scans_skipped} already cached, " f"{totals.scans_failed} failed" if totals.scans_skipped or totals.scans_failed else "") ) if not metadata_only: log.info(row("Mosaics downloaded:", str(totals.mosaics_downloaded))) if not metadata_only and not mosaic_only: log.info(row("Tiles downloaded:", str(totals.tiles_downloaded))) if metadata_only: log.info(row("Metadata written:", str(totals.metadata_written), "new JSON files")) log.info(sep) log.info(row("Scans CSV:", str(output_dir / SCANS_CSV_FILENAME))) if not metadata_only: log.info(row("Tiles CSV:", str(output_dir / TILES_CSV_FILENAME))) log.info(row("Progress:", str(output_dir / PROGRESS_FILENAME))) log.info(sep)