"""
Command-line interface for the spruce scraper.
"""
|
|
|
|
import argparse
|
|
import logging
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import yaml
|
|
|
|
from spruce.orchestrator import scrape_machine, RunStats
|
|
from spruce.parsers import parse_machine_option
|
|
from spruce.progress import ProgressTracker, CsvWriter
|
|
from spruce.recheck import recheck_archive, recheck_tile_files
|
|
from spruce.settings import (
|
|
DEFAULT_CONFIG,
|
|
MAX_SAFE_WORKERS,
|
|
PROGRESS_FILENAME,
|
|
SCANS_CSV_FIELDS,
|
|
SCANS_CSV_FILENAME,
|
|
TILES_CSV_FIELDS,
|
|
TILES_CSV_FILENAME,
|
|
_clamp_workers,
|
|
load_config,
|
|
)
|
|
from spruce.session import MachineSession
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from urllib.parse import urljoin
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
|
|
def discover_machines(base_url: str, timeout: int = 30) -> list[dict]:
    """Fetch the login page and return one parsed entry per listed machine.

    Requests ``index.php`` relative to *base_url*, looks up the
    ``<select name="RTLNAME">`` element, and hands the text and ``value``
    attribute of each of its ``<option>`` children to
    ``parse_machine_option``.

    Args:
        base_url: URL that ``index.php`` is resolved against.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        The parsed options in document order, or an empty list (after
        logging a warning) when the page has no machine selector.

    Raises:
        requests.HTTPError: Propagated from ``raise_for_status()`` when the
            server answers with an error status.
    """
    login_url = urljoin(base_url, "index.php")
    response = requests.get(login_url, timeout=timeout)
    response.raise_for_status()

    page = BeautifulSoup(response.text, "lxml")
    selector = page.find("select", {"name": "RTLNAME"})
    if not selector:
        log.warning("Could not find machine selector on login page.")
        return []

    machines = []
    for option in selector.find_all("option"):
        option_text = option.get_text(strip=True)
        machines.append(parse_machine_option(option_text, option["value"]))
    return machines
|
|
|
|
|
|
def parse_args() -> argparse.Namespace:
    """Define the command-line options and parse the process arguments.

    Returns:
        The namespace from ``ArgumentParser.parse_args()``. It carries the
        attributes ``config``, ``machine``, ``scan_id``, ``mosaic_only``,
        ``metadata_only``, ``dry_run``, ``workers``, ``list_machines``,
        ``list_scans``, ``recheck`` and ``verbose``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="Archive minirhizotron image tiles from RootView.",
    )

    def add_switch(*names: str, text: str) -> None:
        """Register a boolean option that is False unless given."""
        parser.add_argument(*names, action="store_true", help=text)

    # Options are registered in the order they should appear in --help.
    parser.add_argument(
        "--config",
        metavar="FILE",
        default=DEFAULT_CONFIG,
        help=f"YAML config file (default: {DEFAULT_CONFIG})",
    )
    parser.add_argument(
        "--machine",
        metavar="LABEL",
        help='Scrape only this machine label, e.g. "BW3-20 [AMR-26]"',
    )
    parser.add_argument(
        "--scan-id",
        metavar="ID",
        type=int,
        help="Download only this specific scan ID (use with --machine)",
    )
    add_switch(
        "--mosaic-only",
        text="Download mosaics only; skip individual tiles",
    )
    add_switch(
        "--metadata-only",
        text=(
            "Fetch scan parameters only; write metadata.json and scans.csv "
            "rows but skip mosaics and tiles. Very fast — suitable for "
            "inventorying all scans across all machines."
        ),
    )
    add_switch(
        "--dry-run",
        text="Preview what would be downloaded without saving any files",
    )
    parser.add_argument(
        "--workers",
        metavar="N",
        type=int,
        help="Override parallel download threads from config",
    )
    add_switch(
        "--list-machines",
        text="Print available machines and exit (no credentials needed)",
    )
    add_switch(
        "--list-scans",
        text="Print all scans for --machine and exit",
    )
    add_switch(
        "--recheck",
        text=(
            "Scan the archive for zero-byte or missing tile files whose URLs "
            "are marked complete in .progress.json, remove them from progress, "
            "and report how many were re-queued. Run before resuming after a crash."
        ),
    )
    add_switch("--verbose", "-v", text="Enable debug logging")

    return parser.parse_args()
|
|
|
|
|
|
def main() -> None:
    """Parse the command line and run the requested mode.

    Modes are checked in this order; the first three return early:

    1. ``--list-machines``: print the machine table. The config file is
       optional here; built-in defaults are used when it is absent.
    2. ``--recheck``: validate the archive against the progress file and
       log how many entries were deleted / re-queued.
    3. ``--list-scans``: log in to the single selected machine and print
       its scans.
    4. Otherwise: scrape every selected machine, then log a summary.

    Exits through ``sys.exit`` with a message when the config file is
    missing, when ``--mosaic-only`` and ``--metadata-only`` are combined,
    when no machine list can be retrieved, when the machine filter selects
    nothing, when ``--list-scans`` does not resolve to exactly one machine,
    or when login fails.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)-8s %(message)s",
        datefmt="%H:%M:%S",
    )
    args = parse_args()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # --list-machines doesn't need credentials
    if args.list_machines:
        _list_machines(args.config)
        return

    if not os.path.exists(args.config):
        sys.exit(
            f"Config file '{args.config}' not found.\n"
            f"Copy config.example.yaml to {args.config} and fill in your credentials."
        )

    config = load_config(args.config)
    # Truthiness test: an omitted flag (None) and an explicit 0 both keep the
    # worker count from the config file.
    if args.workers:
        config["workers"] = _clamp_workers(args.workers)

    output_dir = Path(config["output_dir"])

    # --recheck: validate archive integrity and re-queue bad tiles
    if args.recheck:
        progress = ProgressTracker(output_dir / PROGRESS_FILENAME)
        n_bad = recheck_tile_files(output_dir, progress)
        n_requeued = recheck_archive(output_dir, progress)
        if n_bad == 0 and n_requeued == 0:
            log.info("Archive looks clean. No action needed.")
        else:
            log.info(
                "Recheck complete: %d zero-byte file(s) deleted, "
                "%d URL(s) re-queued for download.",
                n_bad,
                n_requeued,
            )
        return

    # Reject contradictory flags before any network traffic, so the user is
    # not shown a "Scraping ..." line for a run that never starts.
    if args.mosaic_only and args.metadata_only:
        sys.exit("--mosaic-only and --metadata-only are mutually exclusive.")

    # Build machine list
    all_machines = discover_machines(config["base_url"], config["timeout"])
    if not all_machines:
        sys.exit("Could not retrieve machine list from server.")

    # Apply --machine / config machines filter (the CLI flag wins)
    filter_labels = None
    if args.machine:
        filter_labels = [args.machine]
    elif config.get("machines"):
        filter_labels = list(config["machines"])

    if filter_labels:
        machines = _select_machines(all_machines, filter_labels)
    else:
        machines = all_machines

    if not machines:
        sys.exit("No machines selected.")

    # --list-scans: print and exit
    if args.list_scans:
        if len(machines) != 1:
            sys.exit("--list-scans requires exactly one machine (use --machine).")
        _list_scans(machines[0], config)
        return

    log.info(
        "Scraping %d machine(s): %s",
        len(machines),
        ", ".join(m["label"] for m in machines),
    )
    if args.metadata_only:
        log.info("Mode: metadata only (mosaics and tiles skipped)")
    elif args.mosaic_only:
        log.info("Mode: mosaics only (individual tiles skipped)")
    if args.dry_run:
        log.info("Mode: dry-run (no files will be written)")

    # Shared progress and CSV writers
    progress = ProgressTracker(output_dir / PROGRESS_FILENAME)
    tiles_csv = CsvWriter(output_dir / TILES_CSV_FILENAME, TILES_CSV_FIELDS)
    scans_csv = CsvWriter(output_dir / SCANS_CSV_FILENAME, SCANS_CSV_FIELDS)

    totals = RunStats()
    try:
        for machine in machines:
            stats = scrape_machine(
                machine=machine,
                config=config,
                output_dir=output_dir,
                progress=progress,
                tiles_csv=tiles_csv,
                scans_csv=scans_csv,
                dry_run=args.dry_run,
                mosaic_only=args.mosaic_only,
                metadata_only=args.metadata_only,
                scan_id_filter=args.scan_id,
            )
            totals.merge(stats)
    finally:
        # Progress must be saved even if closing a CSV writer raises;
        # otherwise an interrupted run could not be resumed.
        try:
            tiles_csv.close()
            scans_csv.close()
        finally:
            progress.save()

    _print_summary(
        totals=totals,
        machines=machines,
        output_dir=output_dir,
        dry_run=args.dry_run,
        metadata_only=args.metadata_only,
        mosaic_only=args.mosaic_only,
    )


def _list_machines(config_path: str) -> None:
    """Print the table of machines offered by the server (``--list-machines``).

    ``base_url`` and ``timeout`` are read from *config_path* when that file
    exists; otherwise the defaults below are used.
    """
    base_url = "http://205.149.147.131:8010/"
    timeout = 30
    if os.path.exists(config_path):
        # ``with`` closes the handle promptly; an empty YAML document loads
        # as None, so fall back to an empty mapping instead of crashing.
        with open(config_path) as fh:
            cfg = yaml.safe_load(fh) or {}
        base_url = cfg.get("base_url", base_url)
        timeout = cfg.get("timeout", timeout)

    machines = discover_machines(base_url, timeout)
    print(f"{'Label':<25} {'ID':>4} {'IP':<17} {'Version'}")
    print("-" * 62)
    for m in machines:
        print(
            f"{m['label']:<25} {m['machine_id']:>4} {m['ip']:<17} {m['version']}"
        )


def _select_machines(all_machines: list[dict], filter_labels: list[str]) -> list[dict]:
    """Return the machines whose label is in *filter_labels*.

    Labels that match no machine are reported with a single warning.
    """
    machines = [m for m in all_machines if m["label"] in filter_labels]
    # Build the lookup set once rather than once per requested label.
    found = {m["label"] for m in machines}
    not_found = [label for label in filter_labels if label not in found]
    if not_found:
        log.warning("Unknown machine label(s): %s", not_found)
    return machines


def _list_scans(machine: dict, config) -> None:
    """Log in to *machine* and print all of its scans (``--list-scans``).

    Exits through ``sys.exit`` when the login is rejected.
    """
    sess = MachineSession(machine, config)
    if not sess.login():
        sys.exit("Login failed.")
    scans = sess.get_all_scans()
    print(f"{'ID':>8} {'Date':<22} {'Name':<40} {'Status'}")
    print("-" * 85)
    for sc in scans:
        print(
            f"{sc['scan_id']:>8} {sc.get('scan_time', ''):<22} "
            f"{sc.get('name', ''):<40} {sc.get('status', '')}"
        )
    print(f"\nTotal: {len(scans)} scans")
|
|
|
|
|
|
def _print_summary(
    totals: RunStats,
    machines: list[dict],
    output_dir: Path,
    dry_run: bool,
    metadata_only: bool,
    mosaic_only: bool,
) -> None:
    """Log an end-of-run summary table at INFO level.

    Args:
        totals: Accumulated counters for the run. The attributes read here
            are ``scans_fetched``, ``scans_skipped``, ``scans_failed``,
            ``mosaics_downloaded``, ``mosaics_failed``, ``tiles_downloaded``
            and ``metadata_written``.
        machines: The machines that were processed; only the count is shown.
        output_dir: Directory used to build the displayed file paths.
        dry_run: Selects the "dry run" headline instead of "Run complete".
        metadata_only: Hides the mosaic, tile and tiles-CSV rows and shows
            the metadata row instead.
        mosaic_only: Hides the tiles-downloaded row.
    """
    sep = "─" * 46
    # The widest label, "Scans (metadata) fetched:", is 25 characters. A
    # narrower column would print its value flush against the colon, so the
    # column leaves room for it plus one separating space.
    label_width = 26

    def row(label: str, value: str, note: str = "") -> str:
        """Format one ``label value (note)`` line of the table."""
        note_str = f" ({note})" if note else ""
        return f" {label:<{label_width}}{value}{note_str}"

    log.info(sep)
    if dry_run:
        log.info(" Dry run complete — no files written.")
    else:
        log.info(" Run complete")
    log.info(sep)

    log.info(row("Machines:", str(len(machines))))

    # Only annotate the scan count when something was skipped or failed.
    scans_note = ""
    if totals.scans_skipped or totals.scans_failed:
        scans_note = (
            f"{totals.scans_skipped} already cached, "
            f"{totals.scans_failed} metadata failed"
        )
    log.info(row("Scans (metadata) fetched:", str(totals.scans_fetched), scans_note))

    if not metadata_only:
        log.info(row("Mosaics downloaded:", str(totals.mosaics_downloaded)))
        if totals.mosaics_failed:
            log.info(
                row(
                    "Mosaics failed:",
                    str(totals.mosaics_failed),
                    "0 bytes or HTTP error; see log above",
                )
            )
    if not metadata_only and not mosaic_only:
        log.info(row("Tiles downloaded:", str(totals.tiles_downloaded)))
    if metadata_only:
        log.info(row("Metadata written:", str(totals.metadata_written), "new JSON files"))
    log.info(sep)

    log.info(row("Scans CSV:", str(output_dir / SCANS_CSV_FILENAME)))
    if not metadata_only:
        log.info(row("Tiles CSV:", str(output_dir / TILES_CSV_FILENAME)))
    log.info(row("Progress:", str(output_dir / PROGRESS_FILENAME)))
    log.info(sep)
|