# SPRUCE-scraper/spruce/cli.py

"""
Command-line interface for the spruce scraper.
"""

import argparse
import logging
import os
import sys
from pathlib import Path

import yaml

from spruce.orchestrator import scrape_machine, RunStats
from spruce.parsers import parse_machine_option
from spruce.progress import ProgressTracker, CsvWriter
from spruce.recheck import recheck_archive, recheck_tile_files
from spruce.settings import (
    DEFAULT_CONFIG,
    MAX_SAFE_WORKERS,
    PROGRESS_FILENAME,
    SCANS_CSV_FIELDS,
    SCANS_CSV_FILENAME,
    TILES_CSV_FIELDS,
    TILES_CSV_FILENAME,
    _clamp_workers,
    load_config,
)
from spruce.session import MachineSession

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

log = logging.getLogger(__name__)


def discover_machines(base_url: str, timeout: int = 30) -> list[dict]:
    """Return the machines offered by the login page's machine selector.

    Fetches ``index.php`` relative to *base_url*, looks for the ``<select>``
    element named ``RTLNAME`` and converts each of its ``<option>`` entries
    with ``parse_machine_option``.

    Args:
        base_url: Root URL of the server; ``index.php`` is resolved against it.
        timeout: Timeout handed to ``requests.get``.

    Returns:
        One parsed entry per ``<option>``, in page order, or an empty list
        (after logging a warning) when the selector is not present.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    login_url = urljoin(base_url, "index.php")
    response = requests.get(login_url, timeout=timeout)
    response.raise_for_status()

    page = BeautifulSoup(response.text, "lxml")
    selector = page.find("select", {"name": "RTLNAME"})
    if selector:
        machines = []
        for option in selector.find_all("option"):
            option_text = option.get_text(strip=True)
            machines.append(parse_machine_option(option_text, option["value"]))
        return machines

    log.warning("Could not find machine selector on login page.")
    return []


def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Build the command-line parser and parse the given arguments.

    Args:
        argv: Argument strings to parse. ``None`` (the default) lets argparse
            read ``sys.argv[1:]``, so existing ``parse_args()`` callers behave
            exactly as before; passing a list allows the parser to be driven
            programmatically.

    Returns:
        Namespace with the attributes ``config``, ``machine``, ``scan_id``,
        ``mosaic_only``, ``metadata_only``, ``dry_run``, ``workers``,
        ``list_machines``, ``list_scans``, ``recheck`` and ``verbose``.
    """
    p = argparse.ArgumentParser(
        description="Archive minirhizotron image tiles from RootView.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p.add_argument(
        "--config",
        default=DEFAULT_CONFIG,
        metavar="FILE",
        help=f"YAML config file (default: {DEFAULT_CONFIG})",
    )
    p.add_argument(
        "--machine",
        metavar="LABEL",
        help='Scrape only this machine label, e.g. "BW3-20 [AMR-26]"',
    )
    p.add_argument(
        "--scan-id",
        type=int,
        metavar="ID",
        help="Download only this specific scan ID (use with --machine)",
    )
    p.add_argument(
        "--mosaic-only",
        action="store_true",
        help="Download mosaics only; skip individual tiles",
    )
    p.add_argument(
        "--metadata-only",
        action="store_true",
        help=(
            "Fetch scan parameters only; write metadata.json and scans.csv "
            "rows but skip mosaics and tiles. Very fast — suitable for "
            "inventorying all scans across all machines."
        ),
    )
    p.add_argument(
        "--dry-run",
        action="store_true",
        help="Preview what would be downloaded without saving any files",
    )
    p.add_argument(
        "--workers",
        type=int,
        metavar="N",
        help="Override parallel download threads from config",
    )
    p.add_argument(
        "--list-machines",
        action="store_true",
        help="Print available machines and exit (no credentials needed)",
    )
    p.add_argument(
        "--list-scans",
        action="store_true",
        help="Print all scans for --machine and exit",
    )
    p.add_argument(
        "--recheck",
        action="store_true",
        help=(
            "Scan the archive for zero-byte or missing tile files whose URLs "
            "are marked complete in .progress.json, remove them from progress, "
            "and report how many were re-queued. Run before resuming after a crash."
        ),
    )
    p.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable debug logging",
    )
    # argv=None makes argparse fall back to sys.argv[1:].
    return p.parse_args(argv)


def main() -> None:
    """Entry point for the spruce command-line scraper.

    Configures logging, parses the command line and then runs exactly one of
    the following modes:

    * ``--list-machines``: print the machine table and return. Works without
      a config file; if one exists, only ``base_url`` and ``timeout`` are
      read from it.
    * ``--recheck``: run ``recheck_tile_files`` and ``recheck_archive``
      against the progress file, log the counts and return.
    * ``--list-scans``: log in to the single selected machine, print its
      scans and return.
    * default: scrape every selected machine, then log a summary.

    Exits through ``sys.exit`` with a message when the config file is
    missing, the machine list cannot be retrieved, no machine is selected,
    ``--list-scans`` does not resolve to exactly one machine, login fails, or
    ``--mosaic-only`` and ``--metadata-only`` are both given.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s  %(levelname)-8s  %(message)s",
        datefmt="%H:%M:%S",
    )
    args = parse_args()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # --list-machines doesn't need credentials
    if args.list_machines:
        base_url = "http://205.149.147.131:8010/"
        timeout = 30
        if os.path.exists(args.config):
            # Close the file deterministically instead of leaking the handle.
            with open(args.config, encoding="utf-8") as fh:
                loaded = yaml.safe_load(fh)
            # An empty file parses to None and a non-mapping document has no
            # .get(); in both cases fall back to the built-in defaults.
            cfg = loaded if isinstance(loaded, dict) else {}
            base_url = cfg.get("base_url", base_url)
            timeout = cfg.get("timeout", timeout)
        machines = discover_machines(base_url, timeout)
        print(f"{'Label':<25}  {'ID':>4}  {'IP':<17}  {'Version'}")
        print("-" * 62)
        for m in machines:
            print(
                f"{m['label']:<25}  {m['machine_id']:>4}  {m['ip']:<17}  {m['version']}"
            )
        return

    if not os.path.exists(args.config):
        sys.exit(
            f"Config file '{args.config}' not found.\n"
            f"Copy config.example.yaml to {args.config} and fill in your credentials."
        )

    config = load_config(args.config)
    if args.workers:
        config["workers"] = _clamp_workers(args.workers)

    output_dir = Path(config["output_dir"])

    # --recheck: validate archive integrity and re-queue bad tiles
    if args.recheck:
        progress = ProgressTracker(output_dir / PROGRESS_FILENAME)
        n_bad = recheck_tile_files(output_dir, progress)
        n_requeued = recheck_archive(output_dir, progress)
        if n_bad == 0 and n_requeued == 0:
            log.info("Archive looks clean. No action needed.")
        else:
            log.info(
                "Recheck complete: %d zero-byte file(s) deleted, "
                "%d URL(s) re-queued for download.",
                n_bad,
                n_requeued,
            )
        return

    # Build machine list
    all_machines = discover_machines(config["base_url"], config["timeout"])
    if not all_machines:
        sys.exit("Could not retrieve machine list from server.")

    # Apply --machine / config machines filter (the CLI flag wins)
    filter_labels: list[str] | None = None
    if args.machine:
        filter_labels = [args.machine]
    elif config.get("machines"):
        filter_labels = list(config["machines"])

    if filter_labels:
        machines = [m for m in all_machines if m["label"] in filter_labels]
        not_found = [
            label
            for label in filter_labels
            if label not in {m["label"] for m in machines}
        ]
        if not_found:
            log.warning("Unknown machine label(s): %s", not_found)
    else:
        machines = all_machines

    if not machines:
        sys.exit("No machines selected.")

    # --list-scans: print and exit
    if args.list_scans:
        if len(machines) != 1:
            sys.exit("--list-scans requires exactly one machine (use --machine).")
        sess = MachineSession(machines[0], config)
        if not sess.login():
            sys.exit("Login failed.")
        scans = sess.get_all_scans()
        print(f"{'ID':>8}  {'Date':<22}  {'Name':<40}  {'Status'}")
        print("-" * 85)
        for sc in scans:
            print(
                f"{sc['scan_id']:>8}  {sc.get('scan_time', ''):<22}  "
                f"{sc.get('name', ''):<40}  {sc.get('status', '')}"
            )
        print(f"\nTotal: {len(scans)} scans")
        return

    # Reject the conflicting mode flags before announcing a scrape that will
    # never start.
    if args.mosaic_only and args.metadata_only:
        sys.exit("--mosaic-only and --metadata-only are mutually exclusive.")

    log.info(
        "Scraping %d machine(s): %s",
        len(machines),
        ", ".join(m["label"] for m in machines),
    )
    if args.metadata_only:
        log.info("Mode: metadata only (mosaics and tiles skipped)")
    elif args.mosaic_only:
        log.info("Mode: mosaics only (individual tiles skipped)")
    if args.dry_run:
        log.info("Mode: dry-run (no files will be written)")

    # Shared progress and CSV writers
    progress = ProgressTracker(output_dir / PROGRESS_FILENAME)
    tiles_csv = CsvWriter(output_dir / TILES_CSV_FILENAME, TILES_CSV_FIELDS)
    scans_csv = CsvWriter(output_dir / SCANS_CSV_FILENAME, SCANS_CSV_FIELDS)

    totals = RunStats()
    try:
        for machine in machines:
            stats = scrape_machine(
                machine=machine,
                config=config,
                output_dir=output_dir,
                progress=progress,
                tiles_csv=tiles_csv,
                scans_csv=scans_csv,
                dry_run=args.dry_run,
                mosaic_only=args.mosaic_only,
                metadata_only=args.metadata_only,
                scan_id_filter=args.scan_id,
            )
            totals.merge(stats)
    finally:
        # Nested so that a failure while closing one CSV cannot prevent the
        # other from being closed or the progress file from being saved.
        try:
            tiles_csv.close()
        finally:
            try:
                scans_csv.close()
            finally:
                progress.save()

    _print_summary(
        totals=totals,
        machines=machines,
        output_dir=output_dir,
        dry_run=args.dry_run,
        metadata_only=args.metadata_only,
        mosaic_only=args.mosaic_only,
    )


def _print_summary(
    totals: RunStats,
    machines: list[dict],
    output_dir: Path,
    dry_run: bool,
    metadata_only: bool,
    mosaic_only: bool,
) -> None:
    """Log the end-of-run summary table at INFO level.

    Args:
        totals: Accumulated counters for the run; the scan, mosaic, tile and
            metadata counts are read from it.
        machines: Machines that were processed; only the count is reported.
        output_dir: Directory used to build the CSV and progress paths shown.
        dry_run: When true, a dry-run notice replaces all counters.
        metadata_only: When true, mosaic/tile rows and the tiles CSV path are
            omitted and the metadata-written row is shown instead.
        mosaic_only: When true, the tiles-downloaded row is omitted.
    """
    rule_width = 46
    # The longest label, "Scans (metadata) fetched:", is 25 characters. The
    # column must be wider than that, otherwise the value is printed directly
    # against the colon and falls out of line with the other rows.
    label_width = 27
    sep = "─" * rule_width

    def row(label: str, value: str, note: str = "") -> str:
        """Format one aligned ``label value  (note)`` line."""
        note_str = f"  ({note})" if note else ""
        return f"  {label:<{label_width}}{value}{note_str}"

    log.info(sep)
    if dry_run:
        log.info("  Dry run complete — no files written.")
    else:
        log.info("  Run complete")
        log.info(sep)
        log.info(row("Machines:", str(len(machines))))

        # Only annotate the scan count when something was skipped or failed.
        if totals.scans_skipped or totals.scans_failed:
            scans_note = (
                f"{totals.scans_skipped} already cached, "
                f"{totals.scans_failed} metadata failed"
            )
        else:
            scans_note = ""
        log.info(
            row("Scans (metadata) fetched:", str(totals.scans_fetched), scans_note)
        )

        if not metadata_only:
            log.info(row("Mosaics downloaded:", str(totals.mosaics_downloaded)))
            if totals.mosaics_failed:
                log.info(
                    row(
                        "Mosaics failed:",
                        str(totals.mosaics_failed),
                        "0 bytes or HTTP error; see log above",
                    )
                )
        if not metadata_only and not mosaic_only:
            log.info(row("Tiles downloaded:", str(totals.tiles_downloaded)))
        if metadata_only:
            log.info(row("Metadata written:", str(totals.metadata_written), "new JSON files"))
    log.info(sep)
    log.info(row("Scans CSV:", str(output_dir / SCANS_CSV_FILENAME)))
    if not metadata_only:
        log.info(row("Tiles CSV:", str(output_dir / TILES_CSV_FILENAME)))
    log.info(row("Progress:", str(output_dir / PROGRESS_FILENAME)))
    log.info(sep)