Initial commit

Add spruce scraper with CLI, session management, parsers, progress tracking,
recheck logic, and test suite. Includes example config and README.
This commit is contained in:
2026-04-22 10:41:18 -04:00
commit e122f6435a
23 changed files with 3789 additions and 0 deletions
+259
View File
@@ -0,0 +1,259 @@
"""
Command-line interface for the spruce scraper.
"""
import argparse
import logging
import os
import sys
from pathlib import Path
import yaml
from spruce.orchestrator import scrape_machine
from spruce.parsers import parse_machine_option
from spruce.progress import ProgressTracker, CsvWriter
from spruce.recheck import recheck_archive, recheck_tile_files
from spruce.settings import (
DEFAULT_CONFIG,
MAX_SAFE_WORKERS,
PROGRESS_FILENAME,
SCANS_CSV_FIELDS,
SCANS_CSV_FILENAME,
TILES_CSV_FIELDS,
TILES_CSV_FILENAME,
_clamp_workers,
load_config,
)
from spruce.session import MachineSession
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
# Module-level logger; handlers and level are configured in main().
log = logging.getLogger(__name__)
def discover_machines(base_url: str, timeout: int = 30) -> list[dict]:
    """Fetch the login page and return one parsed entry per listed machine.

    The page ``index.php`` (resolved relative to *base_url*) is requested and
    its ``<select name="RTLNAME">`` element is scanned; each ``<option>`` is
    handed to ``parse_machine_option`` as ``(visible text, value attribute)``.

    Args:
        base_url: Root URL of the server.
        timeout: Timeout passed to ``requests.get``.

    Returns:
        A list with one ``parse_machine_option`` result per option, in page
        order, or an empty list (after logging a warning) when the selector
        is not present on the page.

    Raises:
        requests.HTTPError: via ``raise_for_status`` on an error response.
        KeyError: if an ``<option>`` carries no ``value`` attribute.
    """
    login_url = urljoin(base_url, "index.php")
    response = requests.get(login_url, timeout=timeout)
    response.raise_for_status()

    page = BeautifulSoup(response.text, "lxml")
    selector = page.find("select", {"name": "RTLNAME"})
    if not selector:
        log.warning("Could not find machine selector on login page.")
        return []

    machines = []
    for option in selector.find_all("option"):
        label_text = option.get_text(strip=True)
        machines.append(parse_machine_option(label_text, option["value"]))
    return machines
def parse_args(argv: "list[str] | None" = None) -> argparse.Namespace:
    """Build the command-line parser and parse the given arguments.

    Args:
        argv: Argument strings to parse (without the program name). When
            ``None`` (the default) argparse reads ``sys.argv[1:]``, which is
            the original behaviour; passing an explicit list lets callers
            drive the parser programmatically.

    Returns:
        The populated namespace with attributes ``config``, ``machine``,
        ``scan_id``, ``mosaic_only``, ``dry_run``, ``workers``,
        ``list_machines``, ``list_scans``, ``recheck`` and ``verbose``.
    """
    parser = argparse.ArgumentParser(
        description="Archive minirhizotron image tiles from RootView.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--config",
        default=DEFAULT_CONFIG,
        metavar="FILE",
        help=f"YAML config file (default: {DEFAULT_CONFIG})",
    )
    parser.add_argument(
        "--machine",
        metavar="LABEL",
        help='Scrape only this machine label, e.g. "BW3-20 [AMR-26]"',
    )
    parser.add_argument(
        "--scan-id",
        type=int,
        metavar="ID",
        help="Download only this specific scan ID (use with --machine)",
    )
    parser.add_argument(
        "--mosaic-only",
        action="store_true",
        help="Download mosaics only; skip individual tiles",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Preview what would be downloaded without saving any files",
    )
    parser.add_argument(
        "--workers",
        type=int,
        metavar="N",
        help="Override parallel download threads from config",
    )
    parser.add_argument(
        "--list-machines",
        action="store_true",
        help="Print available machines and exit (no credentials needed)",
    )
    parser.add_argument(
        "--list-scans",
        action="store_true",
        help="Print all scans for --machine and exit",
    )
    parser.add_argument(
        "--recheck",
        action="store_true",
        help=(
            "Scan the archive for zero-byte or missing tile files whose URLs "
            "are marked complete in .progress.json, remove them from progress, "
            "and report how many were re-queued. Run before resuming after a crash."
        ),
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable debug logging",
    )
    return parser.parse_args(argv)
def _print_machine_table(config_path: str) -> None:
    """Discover machines and print them as a table (``--list-machines``)."""
    # Built-in defaults so this mode works without any config file.
    base_url = "http://205.149.147.131:8010/"
    timeout = 30
    if os.path.exists(config_path):
        # Close the handle deterministically instead of leaking it.
        with open(config_path) as fh:
            loaded = yaml.safe_load(fh)
        # An empty file yields None and a non-mapping document has no .get();
        # fall back to the defaults in both cases.
        cfg = loaded if isinstance(loaded, dict) else {}
        base_url = cfg.get("base_url", base_url)
        timeout = cfg.get("timeout", timeout)
    machines = discover_machines(base_url, timeout)
    print(f"{'Label':<25} {'ID':>4} {'IP':<17} {'Version'}")
    print("-" * 62)
    for m in machines:
        print(
            f"{m['label']:<25} {m['machine_id']:>4} {m['ip']:<17} {m['version']}"
        )


def _run_recheck(output_dir: Path) -> None:
    """Run both archive recheck passes and log a summary (``--recheck``)."""
    progress = ProgressTracker(output_dir / PROGRESS_FILENAME)
    n_bad = recheck_tile_files(output_dir, progress)
    n_requeued = recheck_archive(output_dir, progress)
    if n_bad == 0 and n_requeued == 0:
        log.info("Archive looks clean. No action needed.")
    else:
        log.info(
            "Recheck complete: %d zero-byte file(s) deleted, "
            "%d URL(s) re-queued for download.",
            n_bad,
            n_requeued,
        )


def _select_machines(all_machines: list, filter_labels) -> list:
    """Return the machines whose label is in *filter_labels* (all if empty)."""
    if not filter_labels:
        return all_machines
    machines = [m for m in all_machines if m["label"] in filter_labels]
    # Build the set of matched labels once rather than once per filter label.
    matched = {m["label"] for m in machines}
    not_found = [label for label in filter_labels if label not in matched]
    if not_found:
        log.warning("Unknown machine label(s): %s", not_found)
    return machines


def _print_scan_table(machine: dict, config) -> None:
    """Log in to *machine* and print all of its scans (``--list-scans``)."""
    sess = MachineSession(machine, config)
    if not sess.login():
        sys.exit("Login failed.")
    scans = sess.get_all_scans()
    print(f"{'ID':>8} {'Date':<22} {'Name':<40} {'Status'}")
    print("-" * 85)
    for sc in scans:
        print(
            f"{sc['scan_id']:>8} {sc.get('scan_time', ''):<22} "
            f"{sc.get('name', ''):<40} {sc.get('status', '')}"
        )
    print(f"\nTotal: {len(scans)} scans")


def main() -> None:
    """Entry point for the spruce command-line interface.

    Configures logging, parses the command line, then runs exactly one mode:

    * ``--list-machines``: print the machine table and return (works with or
      without a config file).
    * ``--recheck``: run the archive recheck passes and return.
    * ``--list-scans``: print the scans of the single selected machine.
    * otherwise: scrape every selected machine via ``scrape_machine``.

    Exits through ``sys.exit`` with a message when the config file is missing,
    the machine list cannot be retrieved, no machine matches the filter,
    ``--list-scans`` does not resolve to exactly one machine, or login fails.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)-8s %(message)s",
        datefmt="%H:%M:%S",
    )
    args = parse_args()
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # --list-machines doesn't need credentials
    if args.list_machines:
        _print_machine_table(args.config)
        return

    if not os.path.exists(args.config):
        sys.exit(
            f"Config file '{args.config}' not found.\n"
            f"Copy config.example.yaml to {args.config} and fill in your credentials."
        )
    config = load_config(args.config)
    # A value of 0 is falsy and therefore treated like "not given".
    if args.workers:
        config["workers"] = _clamp_workers(args.workers)
    output_dir = Path(config["output_dir"])

    # --recheck: validate archive integrity and re-queue bad tiles
    if args.recheck:
        _run_recheck(output_dir)
        return

    # Build machine list
    all_machines = discover_machines(config["base_url"], config["timeout"])
    if not all_machines:
        sys.exit("Could not retrieve machine list from server.")

    # Apply --machine / config machines filter; the CLI flag takes precedence.
    filter_labels = None
    if args.machine:
        filter_labels = [args.machine]
    elif config.get("machines"):
        filter_labels = list(config["machines"])
    machines = _select_machines(all_machines, filter_labels)
    if not machines:
        sys.exit("No machines selected.")

    # --list-scans: print and exit
    if args.list_scans:
        if len(machines) != 1:
            sys.exit("--list-scans requires exactly one machine (use --machine).")
        _print_scan_table(machines[0], config)
        return

    log.info(
        "Scraping %d machine(s): %s",
        len(machines),
        ", ".join(m["label"] for m in machines),
    )
    if args.mosaic_only:
        log.info("Mode: mosaics only (individual tiles skipped)")
    if args.dry_run:
        log.info("Mode: dry-run (no files will be written)")

    # Shared progress and CSV writers
    progress = ProgressTracker(output_dir / PROGRESS_FILENAME)
    tiles_csv = CsvWriter(output_dir / TILES_CSV_FILENAME, TILES_CSV_FIELDS)
    scans_csv = CsvWriter(output_dir / SCANS_CSV_FILENAME, SCANS_CSV_FIELDS)
    total = 0
    try:
        for machine in machines:
            total += scrape_machine(
                machine=machine,
                config=config,
                output_dir=output_dir,
                progress=progress,
                tiles_csv=tiles_csv,
                scans_csv=scans_csv,
                dry_run=args.dry_run,
                mosaic_only=args.mosaic_only,
                scan_id_filter=args.scan_id,
            )
    finally:
        # Nest the cleanup so a failure in one step cannot skip the others;
        # in particular progress is still saved if closing a CSV raises.
        try:
            tiles_csv.close()
        finally:
            try:
                scans_csv.close()
            finally:
                progress.save()

    if args.dry_run:
        log.info("Dry run complete.")
    else:
        log.info("Done. Total files downloaded: %d", total)
    log.info("Tiles CSV : %s", output_dir / TILES_CSV_FILENAME)
    log.info("Scans CSV : %s", output_dir / SCANS_CSV_FILENAME)
    log.info("Progress : %s", output_dir / PROGRESS_FILENAME)