Initial commit
Add spruce scraper with CLI, session management, parsers, progress tracking, recheck logic, and test suite. Includes example config and README.
This commit is contained in:
+259
@@ -0,0 +1,259 @@
|
||||
"""
|
||||
Command-line interface for the spruce scraper.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
|
||||
from spruce.orchestrator import scrape_machine
|
||||
from spruce.parsers import parse_machine_option
|
||||
from spruce.progress import ProgressTracker, CsvWriter
|
||||
from spruce.recheck import recheck_archive, recheck_tile_files
|
||||
from spruce.settings import (
|
||||
DEFAULT_CONFIG,
|
||||
MAX_SAFE_WORKERS,
|
||||
PROGRESS_FILENAME,
|
||||
SCANS_CSV_FIELDS,
|
||||
SCANS_CSV_FILENAME,
|
||||
TILES_CSV_FIELDS,
|
||||
TILES_CSV_FILENAME,
|
||||
_clamp_workers,
|
||||
load_config,
|
||||
)
|
||||
from spruce.session import MachineSession
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urljoin
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def discover_machines(base_url: str, timeout: int = 30) -> list[dict]:
    """Return the machines offered by the server's login page.

    Requests ``index.php`` relative to *base_url*, looks for the
    ``<select name="RTLNAME">`` element and hands the visible text and the
    ``value`` attribute of each ``<option>`` to ``parse_machine_option``.

    Args:
        base_url: Root URL of the server; ``index.php`` is joined onto it.
        timeout: Timeout passed to ``requests.get``.

    Returns:
        One ``parse_machine_option`` result per ``<option>``, in page order,
        or an empty list (after logging a warning) when the selector is
        not present on the page.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (via ``raise_for_status``).
    """
    login_url = urljoin(base_url, "index.php")
    response = requests.get(login_url, timeout=timeout)
    response.raise_for_status()

    page = BeautifulSoup(response.text, "lxml")
    selector = page.find("select", {"name": "RTLNAME"})
    if not selector:
        log.warning("Could not find machine selector on login page.")
        return []

    machines = []
    for option in selector.find_all("option"):
        option_text = option.get_text(strip=True)
        machines.append(parse_machine_option(option_text, option["value"]))
    return machines
|
||||
|
||||
|
||||
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Build the command-line parser and parse the given arguments.

    Args:
        argv: Argument strings to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is what the CLI entry
            point relies on; passing an explicit list lets other callers
            drive the parser without touching ``sys.argv``.

    Returns:
        The populated namespace with attributes ``config``, ``machine``,
        ``scan_id``, ``mosaic_only``, ``dry_run``, ``workers``,
        ``list_machines``, ``list_scans``, ``recheck`` and ``verbose``.
    """
    p = argparse.ArgumentParser(
        description="Archive minirhizotron image tiles from RootView.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # --- input selection -------------------------------------------------
    p.add_argument(
        "--config",
        default=DEFAULT_CONFIG,
        metavar="FILE",
        help=f"YAML config file (default: {DEFAULT_CONFIG})",
    )
    p.add_argument(
        "--machine",
        metavar="LABEL",
        help='Scrape only this machine label, e.g. "BW3-20 [AMR-26]"',
    )
    p.add_argument(
        "--scan-id",
        type=int,
        metavar="ID",
        help="Download only this specific scan ID (use with --machine)",
    )

    # --- download behaviour ----------------------------------------------
    p.add_argument(
        "--mosaic-only",
        action="store_true",
        help="Download mosaics only; skip individual tiles",
    )
    p.add_argument(
        "--dry-run",
        action="store_true",
        help="Preview what would be downloaded without saving any files",
    )
    p.add_argument(
        "--workers",
        type=int,
        metavar="N",
        help="Override parallel download threads from config",
    )

    # --- alternative modes that print/repair and exit --------------------
    p.add_argument(
        "--list-machines",
        action="store_true",
        help="Print available machines and exit (no credentials needed)",
    )
    p.add_argument(
        "--list-scans",
        action="store_true",
        help="Print all scans for --machine and exit",
    )
    p.add_argument(
        "--recheck",
        action="store_true",
        help=(
            "Scan the archive for zero-byte or missing tile files whose URLs "
            "are marked complete in .progress.json, remove them from progress, "
            "and report how many were re-queued. Run before resuming after a crash."
        ),
    )

    # --- diagnostics -----------------------------------------------------
    p.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable debug logging",
    )
    return p.parse_args(argv)
|
||||
|
||||
|
||||
def _list_machines(config_path: str) -> None:
    """Print a table of the machines advertised by the server.

    The config file is optional here: when it exists, its ``base_url`` and
    ``timeout`` keys override the built-in fallbacks; nothing else is read
    from it.
    """
    base_url = "http://205.149.147.131:8010/"
    timeout = 30
    if os.path.exists(config_path):
        # Close the handle deterministically, and tolerate an empty YAML
        # file (safe_load returns None for it) instead of crashing on .get.
        with open(config_path, encoding="utf-8") as fh:
            cfg = yaml.safe_load(fh) or {}
        base_url = cfg.get("base_url", base_url)
        timeout = cfg.get("timeout", timeout)

    machines = discover_machines(base_url, timeout)
    print(f"{'Label':<25} {'ID':>4} {'IP':<17} {'Version'}")
    print("-" * 62)
    for m in machines:
        print(
            f"{m['label']:<25} {m['machine_id']:>4} {m['ip']:<17} {m['version']}"
        )


def _run_recheck(output_dir: Path) -> None:
    """Run both archive recheck passes against *output_dir* and log the outcome."""
    progress = ProgressTracker(output_dir / PROGRESS_FILENAME)
    n_bad = recheck_tile_files(output_dir, progress)
    n_requeued = recheck_archive(output_dir, progress)
    if n_bad == 0 and n_requeued == 0:
        log.info("Archive looks clean. No action needed.")
    else:
        log.info(
            "Recheck complete: %d zero-byte file(s) deleted, "
            "%d URL(s) re-queued for download.",
            n_bad,
            n_requeued,
        )


def _select_machines(
    all_machines: list[dict], filter_labels: list[str] | None
) -> list[dict]:
    """Return the machines whose label is in *filter_labels* (all if no filter).

    Labels that match no discovered machine are reported with a warning.
    """
    if not filter_labels:
        return all_machines

    selected = [m for m in all_machines if m["label"] in filter_labels]
    # Build the lookup set once rather than once per requested label.
    found = {m["label"] for m in selected}
    not_found = [label for label in filter_labels if label not in found]
    if not_found:
        log.warning("Unknown machine label(s): %s", not_found)
    return selected


def _list_scans(machine: dict, config: dict) -> None:
    """Log in to *machine* and print a table of all of its scans."""
    sess = MachineSession(machine, config)
    if not sess.login():
        sys.exit("Login failed.")
    scans = sess.get_all_scans()
    print(f"{'ID':>8} {'Date':<22} {'Name':<40} {'Status'}")
    print("-" * 85)
    for sc in scans:
        print(
            f"{sc['scan_id']:>8} {sc.get('scan_time', ''):<22} "
            f"{sc.get('name', ''):<40} {sc.get('status', '')}"
        )
    print(f"\nTotal: {len(scans)} scans")


def main() -> None:
    """Entry point for the spruce command-line interface.

    Modes are checked in this order, and the first three exit early:

    1. ``--list-machines``: print the server's machine list (works without
       a config file).
    2. ``--recheck``: run the archive recheck passes and report.
    3. ``--list-scans``: print the scans of exactly one selected machine.
    4. Otherwise scrape every selected machine, then close the CSV writers
       and save progress even if scraping fails part-way.

    Exits via ``sys.exit`` with a message when the config file is missing,
    the machine list cannot be retrieved, no machine matches the filter,
    ``--list-scans`` is used with more than one machine, or login fails.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)-8s %(message)s",
        datefmt="%H:%M:%S",
    )
    args = parse_args()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # --list-machines doesn't need credentials
    if args.list_machines:
        _list_machines(args.config)
        return

    if not os.path.exists(args.config):
        sys.exit(
            f"Config file '{args.config}' not found.\n"
            f"Copy config.example.yaml to {args.config} and fill in your credentials."
        )

    config = load_config(args.config)
    # `is not None` rather than truthiness: an explicit `--workers 0` must
    # reach _clamp_workers instead of being silently ignored.
    if args.workers is not None:
        config["workers"] = _clamp_workers(args.workers)

    output_dir = Path(config["output_dir"])

    # --recheck: validate archive integrity and re-queue bad tiles
    if args.recheck:
        _run_recheck(output_dir)
        return

    # Build machine list
    all_machines = discover_machines(config["base_url"], config["timeout"])
    if not all_machines:
        sys.exit("Could not retrieve machine list from server.")

    # Apply --machine / config machines filter (the CLI flag wins)
    filter_labels: list[str] | None = None
    if args.machine:
        filter_labels = [args.machine]
    elif config.get("machines"):
        filter_labels = list(config["machines"])

    machines = _select_machines(all_machines, filter_labels)
    if not machines:
        sys.exit("No machines selected.")

    # --list-scans: print and exit
    if args.list_scans:
        if len(machines) != 1:
            sys.exit("--list-scans requires exactly one machine (use --machine).")
        _list_scans(machines[0], config)
        return

    log.info(
        "Scraping %d machine(s): %s",
        len(machines),
        ", ".join(m["label"] for m in machines),
    )
    if args.mosaic_only:
        log.info("Mode: mosaics only (individual tiles skipped)")
    if args.dry_run:
        log.info("Mode: dry-run (no files will be written)")

    # Shared progress and CSV writers
    progress = ProgressTracker(output_dir / PROGRESS_FILENAME)
    tiles_csv = CsvWriter(output_dir / TILES_CSV_FILENAME, TILES_CSV_FIELDS)
    scans_csv = CsvWriter(output_dir / SCANS_CSV_FILENAME, SCANS_CSV_FIELDS)

    total = 0
    try:
        for machine in machines:
            total += scrape_machine(
                machine=machine,
                config=config,
                output_dir=output_dir,
                progress=progress,
                tiles_csv=tiles_csv,
                scans_csv=scans_csv,
                dry_run=args.dry_run,
                mosaic_only=args.mosaic_only,
                scan_id_filter=args.scan_id,
            )
    finally:
        # Nested so that a failure while closing one writer cannot prevent
        # the other from closing or the progress file from being saved.
        try:
            tiles_csv.close()
        finally:
            try:
                scans_csv.close()
            finally:
                progress.save()

    if args.dry_run:
        log.info("Dry run complete.")
    else:
        # NOTE(review): indentation was lost in the reviewed copy; the
        # output-path summary is assumed to belong to this branch — confirm.
        log.info("Done. Total files downloaded: %d", total)
        log.info("Tiles CSV : %s", output_dir / TILES_CSV_FILENAME)
        log.info("Scans CSV : %s", output_dir / SCANS_CSV_FILENAME)
        log.info("Progress : %s", output_dir / PROGRESS_FILENAME)
|
||||
Reference in New Issue
Block a user