Files
SPRUCE-scraper/spruce/cli.py
T
poprhythm 4118e6e4f0 Add sample_random_scans script and first-page list-scans option
- scripts/sample_random_scans.sh: pick a random scan per machine (default: first list page) and download mosaic and/or tiles
- --list-scans-first-page-only: one HTTP request for scan list (up to 320 IDs)
- scripts/machines.example.txt; .gitignore local machines.txt (copy from example)
- README: document usage
2026-04-26 20:56:52 -04:00

350 lines
11 KiB
Python

"""
Command-line interface for the spruce scraper.
"""
import argparse
import logging
import os
import sys
from pathlib import Path
import yaml
from spruce.orchestrator import scrape_machine, RunStats
from spruce.parsers import parse_machine_option
from spruce.progress import ProgressTracker, CsvWriter
from spruce.recheck import recheck_archive, recheck_tile_files
from spruce.settings import (
DEFAULT_CONFIG,
MAX_SAFE_WORKERS,
PROGRESS_FILENAME,
SCANS_CSV_FIELDS,
SCANS_CSV_FILENAME,
TILES_CSV_FIELDS,
TILES_CSV_FILENAME,
_clamp_workers,
load_config,
)
from spruce.session import MachineSession
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
# Module-level logger shared by every function in this file; it is named
# after the module via ``__name__``.
log = logging.getLogger(__name__)
def discover_machines(base_url: str, timeout: int = 30) -> list[dict]:
    """Return the machines offered by the login page's machine selector.

    Fetches ``index.php`` relative to *base_url*, locates the
    ``<select name="RTLNAME">`` element and converts each of its
    ``<option>`` entries with ``parse_machine_option`` (called with the
    option's stripped text and its ``value`` attribute).

    Args:
        base_url: Root URL of the server; ``index.php`` is joined onto it.
        timeout: Timeout passed to ``requests.get``.

    Returns:
        One entry per ``<option>``, in document order, or an empty list
        (after logging a warning) when the selector is not on the page.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    login_url = urljoin(base_url, "index.php")
    response = requests.get(login_url, timeout=timeout)
    response.raise_for_status()

    page = BeautifulSoup(response.text, "lxml")
    selector = page.find("select", {"name": "RTLNAME"})
    if selector:
        machines = []
        for option in selector.find_all("option"):
            option_text = option.get_text(strip=True)
            machines.append(parse_machine_option(option_text, option["value"]))
        return machines

    log.warning("Could not find machine selector on login page.")
    return []
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the scraper.

    Args:
        argv: Argument strings to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is what the original
            zero-argument call did; passing an explicit list lets tests and
            other callers drive the parser without touching ``sys.argv``.

    Returns:
        The populated namespace. Cross-option consistency checks (for
        example ``--list-scans-first-page-only`` requiring ``--list-scans``)
        are not performed here; ``main`` does them.
    """
    p = argparse.ArgumentParser(
        description="Archive minirhizotron image tiles from RootView.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # --- what to scrape ---------------------------------------------------
    p.add_argument(
        "--config",
        default=DEFAULT_CONFIG,
        metavar="FILE",
        help=f"YAML config file (default: {DEFAULT_CONFIG})",
    )
    p.add_argument(
        "--machine",
        metavar="LABEL",
        help='Scrape only this machine label, e.g. "BW3-20 [AMR-26]"',
    )
    p.add_argument(
        "--scan-id",
        type=int,
        metavar="ID",
        help="Download only this specific scan ID (use with --machine)",
    )

    # --- how much to download ---------------------------------------------
    p.add_argument(
        "--mosaic-only",
        action="store_true",
        help="Download mosaics only; skip individual tiles",
    )
    p.add_argument(
        "--metadata-only",
        action="store_true",
        help=(
            "Fetch scan parameters only; write metadata.json and scans.csv "
            "rows but skip mosaics and tiles. Very fast — suitable for "
            "inventorying all scans across all machines."
        ),
    )
    p.add_argument(
        "--dry-run",
        action="store_true",
        help="Preview what would be downloaded without saving any files",
    )
    p.add_argument(
        "--workers",
        type=int,
        metavar="N",
        help="Override parallel download threads from config",
    )

    # --- informational / maintenance modes --------------------------------
    p.add_argument(
        "--list-machines",
        action="store_true",
        help="Print available machines and exit (no credentials needed)",
    )
    p.add_argument(
        "--list-scans",
        action="store_true",
        help="Print all scans for --machine and exit",
    )
    p.add_argument(
        "--list-scans-first-page-only",
        action="store_true",
        help=(
            "With --list-scans: only fetch the first list page (up to 320 scans) "
            "— one HTTP request, no pagination"
        ),
    )
    p.add_argument(
        "--recheck",
        action="store_true",
        help=(
            "Scan the archive for zero-byte or missing tile files whose URLs "
            "are marked complete in .progress.json, remove them from progress, "
            "and report how many were re-queued. Run before resuming after a crash."
        ),
    )
    p.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable debug logging",
    )
    return p.parse_args(argv)
def _cmd_list_machines(args: argparse.Namespace) -> None:
    """Handle ``--list-machines``: print the machine table (no credentials)."""
    base_url = "http://205.149.147.131:8010/"
    timeout = 30
    if os.path.exists(args.config):
        # Use a context manager so the handle is closed deterministically,
        # and treat an empty YAML document (safe_load returns None) as
        # "no overrides" instead of crashing on ``None.get``.
        with open(args.config, encoding="utf-8") as fh:
            cfg = yaml.safe_load(fh) or {}
        base_url = cfg.get("base_url", base_url)
        timeout = cfg.get("timeout", timeout)
    machines = discover_machines(base_url, timeout)
    print(f"{'Label':<25} {'ID':>4} {'IP':<17} {'Version'}")
    print("-" * 62)
    for m in machines:
        print(
            f"{m['label']:<25} {m['machine_id']:>4} {m['ip']:<17} {m['version']}"
        )


def _cmd_recheck(output_dir: Path) -> None:
    """Handle ``--recheck``: run both archive checks and log the outcome."""
    progress = ProgressTracker(output_dir / PROGRESS_FILENAME)
    n_bad = recheck_tile_files(output_dir, progress)
    n_requeued = recheck_archive(output_dir, progress)
    if n_bad == 0 and n_requeued == 0:
        log.info("Archive looks clean. No action needed.")
    else:
        log.info(
            "Recheck complete: %d zero-byte file(s) deleted, "
            "%d URL(s) re-queued for download.",
            n_bad,
            n_requeued,
        )


def _select_machines(
    all_machines: list[dict], filter_labels: list[str] | None
) -> list[dict]:
    """Return the machines whose label is in *filter_labels* (all if no filter)."""
    if not filter_labels:
        return all_machines
    wanted = set(filter_labels)
    machines = [m for m in all_machines if m["label"] in wanted]
    # Build the set of matched labels once rather than once per filter label.
    found = {m["label"] for m in machines}
    not_found = [label for label in filter_labels if label not in found]
    if not_found:
        log.warning("Unknown machine label(s): %s", not_found)
    return machines


def _cmd_list_scans(machines: list[dict], config: dict, first_only: bool) -> None:
    """Handle ``--list-scans``: log in to the single machine and print its scans."""
    if len(machines) != 1:
        sys.exit("--list-scans requires exactly one machine (use --machine).")
    sess = MachineSession(machines[0], config)
    if not sess.login():
        sys.exit("Login failed.")
    scans = sess.get_all_scans(first_page_only=first_only)
    print(f"{'ID':>8} {'Date':<22} {'Name':<40} {'Status'}")
    print("-" * 85)
    for sc in scans:
        print(
            f"{sc['scan_id']:>8} {sc.get('scan_time', ''):<22} "
            f"{sc.get('name', ''):<40} {sc.get('status', '')}"
        )
    total_note = " (first page only — not full archive)" if first_only else ""
    print(f"\nTotal: {len(scans)} scans{total_note}")


def main() -> None:
    """Entry point of the command-line interface.

    Configures logging, parses the arguments and dispatches to one of the
    modes, in this order of precedence:

    1. ``--list-machines`` - print the machine table and return; works
       without a config file.
    2. ``--recheck`` - check the archive against the progress file and return.
    3. ``--list-scans`` - print the scans of exactly one machine and return.
    4. Otherwise scrape every selected machine and log a summary.

    Exits through ``sys.exit`` with a message when the arguments are
    inconsistent, the config file is missing, the machine list cannot be
    retrieved, no machine matches the filter, or login fails.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)-8s %(message)s",
        datefmt="%H:%M:%S",
    )
    args = parse_args()
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)
    if args.list_scans_first_page_only and not args.list_scans:
        sys.exit("--list-scans-first-page-only requires --list-scans")

    # --list-machines doesn't need credentials
    if args.list_machines:
        _cmd_list_machines(args)
        return

    if not os.path.exists(args.config):
        sys.exit(
            f"Config file '{args.config}' not found.\n"
            f"Copy config.example.yaml to {args.config} and fill in your credentials."
        )
    config = load_config(args.config)
    if args.workers:
        config["workers"] = _clamp_workers(args.workers)
    output_dir = Path(config["output_dir"])

    # --recheck: validate archive integrity and re-queue bad tiles
    if args.recheck:
        _cmd_recheck(output_dir)
        return

    # Build machine list
    all_machines = discover_machines(config["base_url"], config["timeout"])
    if not all_machines:
        sys.exit("Could not retrieve machine list from server.")

    # Apply --machine / config machines filter; --machine takes precedence.
    filter_labels: list[str] | None = None
    if args.machine:
        filter_labels = [args.machine]
    elif config.get("machines"):
        filter_labels = list(config["machines"])
    machines = _select_machines(all_machines, filter_labels)
    if not machines:
        sys.exit("No machines selected.")

    # --list-scans: print and exit
    if args.list_scans:
        _cmd_list_scans(machines, config, bool(args.list_scans_first_page_only))
        return

    log.info(
        "Scraping %d machine(s): %s",
        len(machines),
        ", ".join(m["label"] for m in machines),
    )
    if args.mosaic_only and args.metadata_only:
        sys.exit("--mosaic-only and --metadata-only are mutually exclusive.")
    if args.metadata_only:
        log.info("Mode: metadata only (mosaics and tiles skipped)")
    elif args.mosaic_only:
        log.info("Mode: mosaics only (individual tiles skipped)")
    if args.dry_run:
        log.info("Mode: dry-run (no files will be written)")

    # Shared progress and CSV writers
    progress = ProgressTracker(output_dir / PROGRESS_FILENAME)
    tiles_csv = CsvWriter(output_dir / TILES_CSV_FILENAME, TILES_CSV_FIELDS)
    scans_csv = CsvWriter(output_dir / SCANS_CSV_FILENAME, SCANS_CSV_FIELDS)
    totals = RunStats()
    try:
        for machine in machines:
            stats = scrape_machine(
                machine=machine,
                config=config,
                output_dir=output_dir,
                progress=progress,
                tiles_csv=tiles_csv,
                scans_csv=scans_csv,
                dry_run=args.dry_run,
                mosaic_only=args.mosaic_only,
                metadata_only=args.metadata_only,
                scan_id_filter=args.scan_id,
            )
            totals.merge(stats)
    finally:
        # Runs on errors and interrupts too, so the CSVs are closed and the
        # progress file is saved even when a scrape aborts part-way.
        tiles_csv.close()
        scans_csv.close()
        progress.save()

    _print_summary(
        totals=totals,
        machines=machines,
        output_dir=output_dir,
        dry_run=args.dry_run,
        metadata_only=args.metadata_only,
        mosaic_only=args.mosaic_only,
    )
def _print_summary(
    totals: RunStats,
    machines: list[dict],
    output_dir: Path,
    dry_run: bool,
    metadata_only: bool,
    mosaic_only: bool,
) -> None:
    """Log the end-of-run summary table at INFO level.

    Args:
        totals: Accumulated counters for the run; the attributes read here
            are ``scans_fetched``, ``scans_skipped``, ``scans_failed``,
            ``mosaics_downloaded``, ``mosaics_failed``, ``tiles_downloaded``
            and ``metadata_written``.
        machines: The machines that were processed; only the count is shown.
        output_dir: Directory used to build the CSV and progress file paths.
        dry_run: Switches the headline to the dry-run wording and hides the
            error-column hint.
        metadata_only: Hides the mosaic/tile rows and shows the
            metadata-written row instead.
        mosaic_only: Hides the tiles-downloaded row.
    """
    W = 46
    sep = "─" * W
    # The label column must be wider than the longest label
    # ("Scans (metadata) fetched:", 25 characters). A format width is only a
    # minimum, so with the previous width of 22 that label was not padded
    # and its value was printed directly against the colon.
    label_width = 26

    def row(label: str, value: str, note: str = "") -> str:
        """Format one summary line: padded label, value, optional note."""
        note_str = f" ({note})" if note else ""
        return f" {label:<{label_width}}{value}{note_str}"

    log.info(sep)
    if dry_run:
        log.info(" Dry run complete — no files written.")
    else:
        log.info(" Run complete")
    log.info(sep)

    log.info(row("Machines:", str(len(machines))))

    # Only mention cached/failed scans when there is something to report.
    if totals.scans_skipped or totals.scans_failed:
        scans_note = (
            f"{totals.scans_skipped} already cached, "
            f"{totals.scans_failed} metadata failed"
        )
    else:
        scans_note = ""
    log.info(row("Scans (metadata) fetched:", str(totals.scans_fetched), scans_note))

    if not metadata_only:
        log.info(row("Mosaics downloaded:", str(totals.mosaics_downloaded)))
        if totals.mosaics_failed:
            log.info(
                row(
                    "Mosaics failed:",
                    str(totals.mosaics_failed),
                    "0 bytes or HTTP error; see scans.csv and logs",
                )
            )
    if not metadata_only and not mosaic_only:
        log.info(row("Tiles downloaded:", str(totals.tiles_downloaded)))
    if not dry_run and not metadata_only:
        # Pointer to where per-item error details live; no count is shown.
        log.info(
            row(
                "Mosaic & tile errors:",
                "",
                f"{SCANS_CSV_FILENAME} & {TILES_CSV_FILENAME} (error_class, error, error_code)",
            )
        )
    if metadata_only:
        log.info(row("Metadata written:", str(totals.metadata_written), "new JSON files"))

    log.info(sep)
    log.info(row("Scans CSV:", str(output_dir / SCANS_CSV_FILENAME)))
    if not metadata_only:
        log.info(row("Tiles CSV:", str(output_dir / TILES_CSV_FILENAME)))
    log.info(row("Progress:", str(output_dir / PROGRESS_FILENAME)))
    log.info(sep)