e122f6435a
Add spruce scraper with CLI, session management, parsers, progress tracking, recheck logic, and test suite. Includes example config and README.
260 lines
7.9 KiB
Python
260 lines
7.9 KiB
Python
"""
|
|
Command-line interface for the spruce scraper.
|
|
"""
|
|
|
|
import argparse
|
|
import logging
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import yaml
|
|
|
|
from spruce.orchestrator import scrape_machine
|
|
from spruce.parsers import parse_machine_option
|
|
from spruce.progress import ProgressTracker, CsvWriter
|
|
from spruce.recheck import recheck_archive, recheck_tile_files
|
|
from spruce.settings import (
|
|
DEFAULT_CONFIG,
|
|
MAX_SAFE_WORKERS,
|
|
PROGRESS_FILENAME,
|
|
SCANS_CSV_FIELDS,
|
|
SCANS_CSV_FILENAME,
|
|
TILES_CSV_FIELDS,
|
|
TILES_CSV_FILENAME,
|
|
_clamp_workers,
|
|
load_config,
|
|
)
|
|
from spruce.session import MachineSession
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from urllib.parse import urljoin
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
|
|
def discover_machines(base_url: str, timeout: int = 30) -> list[dict]:
    """Return the machines offered by the server's login page.

    Requests ``index.php`` resolved against *base_url*, locates the
    ``<select name="RTLNAME">`` element and hands each option's visible text
    and ``value`` attribute to ``parse_machine_option``.

    Args:
        base_url: Root URL of the server; ``index.php`` is joined onto it.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        One parsed entry per usable ``<option>``, in page order, or an empty
        list when the page contains no machine selector.

    Raises:
        requests.HTTPError: If the server responds with an error status
            (raised by ``raise_for_status``).
    """
    resp = requests.get(urljoin(base_url, "index.php"), timeout=timeout)
    resp.raise_for_status()

    soup = BeautifulSoup(resp.text, "lxml")
    select = soup.find("select", {"name": "RTLNAME"})
    if not select:
        log.warning("Could not find machine selector on login page.")
        return []

    machines = []
    for opt in select.find_all("option"):
        value = opt.get("value")
        if value is None:
            # An <option> without a value attribute (e.g. a placeholder entry)
            # cannot identify a machine; indexing opt["value"] would raise
            # KeyError and abort discovery of every other machine.
            log.debug(
                "Skipping option without a value attribute: %r",
                opt.get_text(strip=True),
            )
            continue
        machines.append(parse_machine_option(opt.get_text(strip=True), value))
    return machines
|
|
|
|
|
|
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Build the command-line parser and parse the given arguments.

    Args:
        argv: Argument strings to parse. When ``None`` (the default) argparse
            reads ``sys.argv[1:]``, so existing ``parse_args()`` callers are
            unaffected; passing a list allows the parser to be driven
            programmatically.

    Returns:
        The parsed namespace with attributes ``config``, ``machine``,
        ``scan_id``, ``mosaic_only``, ``dry_run``, ``workers``,
        ``list_machines``, ``list_scans``, ``recheck`` and ``verbose``.
    """
    p = argparse.ArgumentParser(
        description="Archive minirhizotron image tiles from RootView.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p.add_argument(
        "--config",
        default=DEFAULT_CONFIG,
        metavar="FILE",
        help=f"YAML config file (default: {DEFAULT_CONFIG})",
    )
    p.add_argument(
        "--machine",
        metavar="LABEL",
        help='Scrape only this machine label, e.g. "BW3-20 [AMR-26]"',
    )
    p.add_argument(
        "--scan-id",
        type=int,
        metavar="ID",
        help="Download only this specific scan ID (use with --machine)",
    )
    p.add_argument(
        "--mosaic-only",
        action="store_true",
        help="Download mosaics only; skip individual tiles",
    )
    p.add_argument(
        "--dry-run",
        action="store_true",
        help="Preview what would be downloaded without saving any files",
    )
    p.add_argument(
        "--workers",
        type=int,
        metavar="N",
        help="Override parallel download threads from config",
    )
    p.add_argument(
        "--list-machines",
        action="store_true",
        help="Print available machines and exit (no credentials needed)",
    )
    p.add_argument(
        "--list-scans",
        action="store_true",
        help="Print all scans for --machine and exit",
    )
    p.add_argument(
        "--recheck",
        action="store_true",
        help=(
            "Scan the archive for zero-byte or missing tile files whose URLs "
            "are marked complete in .progress.json, remove them from progress, "
            "and report how many were re-queued. Run before resuming after a crash."
        ),
    )
    p.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable debug logging",
    )
    return p.parse_args(argv)
|
|
|
|
|
|
def _print_machine_table(config_path: str) -> None:
    """Print the machines advertised by the server (backs ``--list-machines``).

    Works without credentials: the config file is optional here and is only
    consulted for ``base_url`` and ``timeout`` overrides.
    """
    base_url = "http://205.149.147.131:8010/"
    timeout = 30
    if os.path.exists(config_path):
        # Close the handle deterministically, and tolerate an empty YAML file,
        # for which safe_load returns None rather than a mapping.
        with open(config_path) as fh:
            cfg = yaml.safe_load(fh) or {}
        base_url = cfg.get("base_url", base_url)
        timeout = cfg.get("timeout", timeout)

    machines = discover_machines(base_url, timeout)
    print(f"{'Label':<25} {'ID':>4} {'IP':<17} {'Version'}")
    print("-" * 62)
    for m in machines:
        print(
            f"{m['label']:<25} {m['machine_id']:>4} {m['ip']:<17} {m['version']}"
        )


def _run_recheck(output_dir: Path) -> None:
    """Run both recheck passes over the archive and log the outcome (``--recheck``)."""
    progress = ProgressTracker(output_dir / PROGRESS_FILENAME)
    n_bad = recheck_tile_files(output_dir, progress)
    n_requeued = recheck_archive(output_dir, progress)
    if n_bad == 0 and n_requeued == 0:
        log.info("Archive looks clean. No action needed.")
    else:
        log.info(
            "Recheck complete: %d zero-byte file(s) deleted, "
            "%d URL(s) re-queued for download.",
            n_bad,
            n_requeued,
        )


def _select_machines(
    all_machines: list[dict], filter_labels: list[str] | None
) -> list[dict]:
    """Return the machines whose label is in *filter_labels* (all if no filter).

    Labels that match no discovered machine are reported with a warning.
    """
    if not filter_labels:
        return all_machines

    machines = [m for m in all_machines if m["label"] in filter_labels]
    # Build the lookup set once instead of once per requested label.
    found = {m["label"] for m in machines}
    not_found = [label for label in filter_labels if label not in found]
    if not_found:
        log.warning("Unknown machine label(s): %s", not_found)
    return machines


def _print_scan_table(machine: dict, config: dict) -> None:
    """Log in to *machine* and print all of its scans (backs ``--list-scans``)."""
    sess = MachineSession(machine, config)
    if not sess.login():
        sys.exit("Login failed.")
    scans = sess.get_all_scans()
    print(f"{'ID':>8} {'Date':<22} {'Name':<40} {'Status'}")
    print("-" * 85)
    for sc in scans:
        print(
            f"{sc['scan_id']:>8} {sc.get('scan_time', ''):<22} "
            f"{sc.get('name', ''):<40} {sc.get('status', '')}"
        )
    print(f"\nTotal: {len(scans)} scans")


def main() -> None:
    """Entry point for the spruce command-line interface.

    Configures logging, parses the command line and dispatches to one of the
    modes: ``--list-machines``, ``--recheck``, ``--list-scans`` or the default
    scrape of every selected machine. Exits through ``sys.exit`` with a
    message when the config file is missing, the machine list cannot be
    retrieved, no machine is selected, or login fails for ``--list-scans``.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)-8s %(message)s",
        datefmt="%H:%M:%S",
    )
    args = parse_args()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # --list-machines doesn't need credentials, so it runs before the
    # mandatory config check below.
    if args.list_machines:
        _print_machine_table(args.config)
        return

    if not os.path.exists(args.config):
        sys.exit(
            f"Config file '{args.config}' not found.\n"
            f"Copy config.example.yaml to {args.config} and fill in your credentials."
        )

    config = load_config(args.config)
    if args.workers:
        config["workers"] = _clamp_workers(args.workers)

    output_dir = Path(config["output_dir"])

    # --recheck: validate archive integrity and re-queue bad tiles
    if args.recheck:
        _run_recheck(output_dir)
        return

    # Build machine list
    all_machines = discover_machines(config["base_url"], config["timeout"])
    if not all_machines:
        sys.exit("Could not retrieve machine list from server.")

    # Apply --machine / config machines filter; the CLI flag takes precedence.
    filter_labels: list[str] | None = None
    if args.machine:
        filter_labels = [args.machine]
    elif config.get("machines"):
        filter_labels = list(config["machines"])

    machines = _select_machines(all_machines, filter_labels)
    if not machines:
        sys.exit("No machines selected.")

    # --list-scans: print and exit
    if args.list_scans:
        if len(machines) != 1:
            sys.exit("--list-scans requires exactly one machine (use --machine).")
        _print_scan_table(machines[0], config)
        return

    log.info(
        "Scraping %d machine(s): %s",
        len(machines),
        ", ".join(m["label"] for m in machines),
    )
    if args.mosaic_only:
        log.info("Mode: mosaics only (individual tiles skipped)")
    if args.dry_run:
        log.info("Mode: dry-run (no files will be written)")

    # Shared progress and CSV writers
    progress = ProgressTracker(output_dir / PROGRESS_FILENAME)
    tiles_csv = CsvWriter(output_dir / TILES_CSV_FILENAME, TILES_CSV_FIELDS)
    scans_csv = CsvWriter(output_dir / SCANS_CSV_FILENAME, SCANS_CSV_FIELDS)

    total = 0
    try:
        for machine in machines:
            count = scrape_machine(
                machine=machine,
                config=config,
                output_dir=output_dir,
                progress=progress,
                tiles_csv=tiles_csv,
                scans_csv=scans_csv,
                dry_run=args.dry_run,
                mosaic_only=args.mosaic_only,
                scan_id_filter=args.scan_id,
            )
            total += count
    finally:
        # Runs even when a scrape raises, so the CSV writers are closed and
        # the progress recorded so far is saved.
        tiles_csv.close()
        scans_csv.close()
        progress.save()

    if args.dry_run:
        log.info("Dry run complete.")
    else:
        log.info("Done. Total files downloaded: %d", total)
        log.info("Tiles CSV : %s", output_dir / TILES_CSV_FILENAME)
        log.info("Scans CSV : %s", output_dir / SCANS_CSV_FILENAME)
        log.info("Progress  : %s", output_dir / PROGRESS_FILENAME)
|