"""Command-line interface for the spruce scraper."""

import argparse
import logging
import os
import sys
from pathlib import Path
from urllib.parse import urljoin

import requests
import yaml
from bs4 import BeautifulSoup

from spruce.orchestrator import scrape_machine
from spruce.parsers import parse_machine_option
from spruce.progress import ProgressTracker, CsvWriter
from spruce.recheck import recheck_archive, recheck_tile_files
from spruce.settings import (
    DEFAULT_CONFIG,
    MAX_SAFE_WORKERS,
    PROGRESS_FILENAME,
    SCANS_CSV_FIELDS,
    SCANS_CSV_FILENAME,
    TILES_CSV_FIELDS,
    TILES_CSV_FILENAME,
    _clamp_workers,
    load_config,
)
from spruce.session import MachineSession

log = logging.getLogger(__name__)

# Fallbacks used by --list-machines when the config file is absent or does
# not define these keys.
_FALLBACK_BASE_URL = "http://205.149.147.131:8010/"
_FALLBACK_TIMEOUT = 30


def discover_machines(base_url: str, timeout: int = 30) -> list[dict]:
    """Fetch the login page and return one entry per machine option.

    Requests ``index.php`` relative to *base_url*, looks for the
    ``<select name="RTLNAME">`` element, and passes each ``<option>``'s
    stripped text and ``value`` attribute to ``parse_machine_option``.

    Args:
        base_url: Root URL of the server; ``index.php`` is joined onto it.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        The list of ``parse_machine_option`` results, or an empty list
        (after logging a warning) when the selector is not on the page.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (via ``raise_for_status``). Other ``requests`` exceptions
            propagate unchanged.
        KeyError: If an ``<option>`` has no ``value`` attribute.
    """
    resp = requests.get(urljoin(base_url, "index.php"), timeout=timeout)
    resp.raise_for_status()

    soup = BeautifulSoup(resp.text, "lxml")
    select = soup.find("select", {"name": "RTLNAME"})
    if not select:
        log.warning("Could not find machine selector on login page.")
        return []

    return [
        parse_machine_option(opt.get_text(strip=True), opt["value"])
        for opt in select.find_all("option")
    ]


def parse_args() -> argparse.Namespace:
    """Build the argument parser and parse ``sys.argv``.

    Returns:
        The parsed namespace with attributes ``config``, ``machine``,
        ``scan_id``, ``mosaic_only``, ``dry_run``, ``workers``,
        ``list_machines``, ``list_scans``, ``recheck`` and ``verbose``.
    """
    p = argparse.ArgumentParser(
        description="Archive minirhizotron image tiles from RootView.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p.add_argument(
        "--config",
        default=DEFAULT_CONFIG,
        metavar="FILE",
        help=f"YAML config file (default: {DEFAULT_CONFIG})",
    )
    p.add_argument(
        "--machine",
        metavar="LABEL",
        help='Scrape only this machine label, e.g. "BW3-20 [AMR-26]"',
    )
    p.add_argument(
        "--scan-id",
        type=int,
        metavar="ID",
        help="Download only this specific scan ID (use with --machine)",
    )
    p.add_argument(
        "--mosaic-only",
        action="store_true",
        help="Download mosaics only; skip individual tiles",
    )
    p.add_argument(
        "--dry-run",
        action="store_true",
        help="Preview what would be downloaded without saving any files",
    )
    p.add_argument(
        "--workers",
        type=int,
        metavar="N",
        help="Override parallel download threads from config",
    )
    p.add_argument(
        "--list-machines",
        action="store_true",
        help="Print available machines and exit (no credentials needed)",
    )
    p.add_argument(
        "--list-scans",
        action="store_true",
        help="Print all scans for --machine and exit",
    )
    p.add_argument(
        "--recheck",
        action="store_true",
        help=(
            "Scan the archive for zero-byte or missing tile files whose URLs "
            "are marked complete in .progress.json, remove them from progress, "
            "and report how many were re-queued. Run before resuming after a crash."
        ),
    )
    p.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable debug logging",
    )
    return p.parse_args()


def _list_machines(config_path: str) -> None:
    """Print the table of machines advertised by the server (--list-machines)."""
    base_url = _FALLBACK_BASE_URL
    timeout = _FALLBACK_TIMEOUT
    if os.path.exists(config_path):
        # Read the raw YAML rather than calling load_config: this mode is
        # documented as not needing credentials.
        with open(config_path, encoding="utf-8") as fh:
            loaded = yaml.safe_load(fh)
        # An empty file yields None and a top-level list yields a list;
        # neither has .get, so fall back to the defaults in both cases.
        cfg = loaded if isinstance(loaded, dict) else {}
        base_url = cfg.get("base_url", base_url)
        timeout = cfg.get("timeout", timeout)

    machines = discover_machines(base_url, timeout)
    print(f"{'Label':<25} {'ID':>4} {'IP':<17} {'Version'}")
    print("-" * 62)
    for m in machines:
        print(
            f"{m['label']:<25} {m['machine_id']:>4} {m['ip']:<17} {m['version']}"
        )


def _run_recheck(output_dir: Path) -> None:
    """Run both archive recheck passes and log a summary (--recheck)."""
    progress = ProgressTracker(output_dir / PROGRESS_FILENAME)
    n_bad = recheck_tile_files(output_dir, progress)
    n_requeued = recheck_archive(output_dir, progress)
    if n_bad == 0 and n_requeued == 0:
        log.info("Archive looks clean. No action needed.")
    else:
        log.info(
            "Recheck complete: %d zero-byte file(s) deleted, "
            "%d URL(s) re-queued for download.",
            n_bad,
            n_requeued,
        )


def _select_machines(
    all_machines: list[dict], filter_labels: list[str] | None
) -> list[dict]:
    """Return the machines whose label is in *filter_labels*, warning on unknown labels."""
    if not filter_labels:
        return all_machines

    selected = [m for m in all_machines if m["label"] in filter_labels]
    # Build the lookup set once instead of once per requested label.
    found = {m["label"] for m in selected}
    not_found = [label for label in filter_labels if label not in found]
    if not_found:
        log.warning("Unknown machine label(s): %s", not_found)
    return selected


def _list_scans(machines: list[dict], config: dict) -> None:
    """Log in to the single selected machine and print its scans (--list-scans)."""
    if len(machines) != 1:
        sys.exit("--list-scans requires exactly one machine (use --machine).")

    sess = MachineSession(machines[0], config)
    if not sess.login():
        sys.exit("Login failed.")

    scans = sess.get_all_scans()
    print(f"{'ID':>8} {'Date':<22} {'Name':<40} {'Status'}")
    print("-" * 85)
    for sc in scans:
        print(
            f"{sc['scan_id']:>8} {sc.get('scan_time', ''):<22} "
            f"{sc.get('name', ''):<40} {sc.get('status', '')}"
        )
    print(f"\nTotal: {len(scans)} scans")


def main() -> None:
    """Entry point: parse arguments and dispatch to the requested mode.

    Modes are checked in this order, and each of the first three returns
    without scraping:

    1. ``--list-machines``: print the server's machine list.
    2. ``--recheck``: run the archive recheck passes.
    3. ``--list-scans``: print the scans of the one selected machine.
    4. Otherwise: call ``scrape_machine`` for every selected machine.

    Exits via ``sys.exit`` with a message when the config file is missing,
    the machine list is empty, no machine matches the filter, or login
    fails in ``--list-scans`` mode.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)-8s %(message)s",
        datefmt="%H:%M:%S",
    )
    args = parse_args()
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # --list-machines doesn't need credentials
    if args.list_machines:
        _list_machines(args.config)
        return

    if not os.path.exists(args.config):
        sys.exit(
            f"Config file '{args.config}' not found.\n"
            f"Copy config.example.yaml to {args.config} and fill in your credentials."
        )

    config = load_config(args.config)
    # Truthiness test: an omitted flag (None) and an explicit 0 both leave
    # the configured worker count untouched.
    if args.workers:
        config["workers"] = _clamp_workers(args.workers)
    output_dir = Path(config["output_dir"])

    # --recheck: validate archive integrity and re-queue bad tiles
    if args.recheck:
        _run_recheck(output_dir)
        return

    # Build machine list
    all_machines = discover_machines(config["base_url"], config["timeout"])
    if not all_machines:
        sys.exit("Could not retrieve machine list from server.")

    # Apply --machine / config machines filter; the command line wins.
    filter_labels: list[str] | None = None
    if args.machine:
        filter_labels = [args.machine]
    elif config.get("machines"):
        filter_labels = list(config["machines"])

    machines = _select_machines(all_machines, filter_labels)
    if not machines:
        sys.exit("No machines selected.")

    # --list-scans: print and exit
    if args.list_scans:
        _list_scans(machines, config)
        return

    log.info(
        "Scraping %d machine(s): %s",
        len(machines),
        ", ".join(m["label"] for m in machines),
    )
    if args.mosaic_only:
        log.info("Mode: mosaics only (individual tiles skipped)")
    if args.dry_run:
        log.info("Mode: dry-run (no files will be written)")

    # Shared progress and CSV writers
    progress = ProgressTracker(output_dir / PROGRESS_FILENAME)
    tiles_csv = CsvWriter(output_dir / TILES_CSV_FILENAME, TILES_CSV_FIELDS)
    scans_csv = CsvWriter(output_dir / SCANS_CSV_FILENAME, SCANS_CSV_FIELDS)

    total = 0
    try:
        for machine in machines:
            count = scrape_machine(
                machine=machine,
                config=config,
                output_dir=output_dir,
                progress=progress,
                tiles_csv=tiles_csv,
                scans_csv=scans_csv,
                dry_run=args.dry_run,
                mosaic_only=args.mosaic_only,
                scan_id_filter=args.scan_id,
            )
            total += count
    finally:
        # Nested so that a failure in one cleanup step cannot skip the
        # remaining ones; the order matches the original sequence.
        try:
            tiles_csv.close()
        finally:
            try:
                scans_csv.close()
            finally:
                progress.save()

    if args.dry_run:
        log.info("Dry run complete.")
    else:
        log.info("Done. Total files downloaded: %d", total)
        log.info("Tiles CSV : %s", output_dir / TILES_CSV_FILENAME)
        log.info("Scans CSV : %s", output_dir / SCANS_CSV_FILENAME)
        log.info("Progress : %s", output_dir / PROGRESS_FILENAME)