commit e122f6435a9752fa3605947fdfb9c0bca55def13 Author: James Kolpack Date: Wed Apr 22 10:41:18 2026 -0400 Initial commit Add spruce scraper with CLI, session management, parsers, progress tracking, recheck logic, and test suite. Includes example config and README. diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0022c39 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +config.yaml +archives/ +__pycache__/ +*.pyc +.DS_Store +explore_dumps/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..3e6c1a5 --- /dev/null +++ b/README.md @@ -0,0 +1,185 @@ +# Spruce Minirhizotron Scraper + +A Python tool for archiving image data collected by minirhizotron cameras at the Spruce experiment site. It authenticates against the RootView web interface, enumerates all scans across all 12 camera machines, and downloads image tiles and mosaics to a structured local archive with full metadata. + +--- + +## Background + +[Minirhizotron cameras](https://en.wikipedia.org/wiki/Minirhizotron) are inserted into clear tubes buried in the ground to image root systems non-destructively over time. This project archives data from the **SPRUCE** (Spruce and Peatland Responses Under Changing Environments) experiment, which monitors boreal peatland responses to warming and elevated CO₂. + +The 12 AMR camera machines (`BW1-4` through `BW3-21`) are managed by a **RootView** web application at `http://205.149.147.131:8010`. Each scan captures a grid of overlapping image tiles along a buried tube. The server also pre-renders a full stitched mosaic for each scan. 
+ +--- + +## Archive inventory (as of April 2026) + +| Machine | Scans | Scan type (sampled) | +|---|---:|---| +| BW1-4 [AMR-15] | 6,121 | Mixed (full-tube + partial) | +| BW1-6 [AMR-19] | 18,198 | Full-tube (~33,784 tiles, ~1.7 GB each) | +| BW1-7 [AMR-18] | 430 | Full-tube (~33,784 tiles, ~1.8 GB each) | +| BW2-8 [AMR-25] | 8,191 | Partial (~400 tiles, ~10 MB each) | +| BW2-10 [AMR-22] | 16,537 | Not yet sampled | +| BW2-11 [AMR-23] | 26,763 | Not yet sampled | +| BW2-13 [AMR-24] | 13,537 | Not yet sampled | +| BW3-16 [AMR-16] | 7,325 | Not yet sampled | +| BW3-17 [AMR-20] | 471 | Not yet sampled | +| BW3-19 [AMR-21] | 15,186 | Not yet sampled | +| BW3-20 [AMR-26] | 23,052 | Full-tube (~33,784 tiles, ~1.95 GB each) | +| BW3-21 [AMR-17] | 10,115 | Not yet sampled | +| **Total** | **145,926** | | + +### Storage estimates + +| What | Size | Notes | +|---|---|---| +| Mosaics only | ~2.4 TB | 145,926 × 16.6 MB per mosaic | +| Full tiles (mixed scans) | ~160 TB | Assumes 40% full-tube, 60% partial | +| Full tiles (worst case) | ~368 TB | If all scans are full-tube | + +A full-tube scan covers a 310 mm × 740 mm cylinder at 3.01 × 2.26 mm steps, producing a **103 × 328 = 33,784 tile grid**. Each tile is ~79 KB on average (JPEG, 137 KB at the tube surface). + +### Download speed + +Tile downloads are server-limited: the RootView PHP backend renders tiles on-demand, sustaining ~**0.67 tiles/sec** with 8 parallel workers regardless of local bandwidth. Mosaics are pre-rendered and download ~20× faster per MB. + +| Scenario | Estimated time | +|---|---| +| All mosaics (4 workers) | ~3 months | +| Full tiles for one scan (8 workers) | ~14 hours | +| All tiles, full-tube machines only | Years — not recommended | + +**Recommended approach:** archive mosaics first (`--mosaic-only`), then selectively download tiles for priority scans. + +--- + +## Setup + +```bash +# 1. Clone / download this repo +cd spruce_scrapper + +# 2. 
Install dependencies (Python 3.10+) +pip install -r requirements.txt + +# 3. Configure credentials +cp config.example.yaml config.yaml +# Edit config.yaml: set username and password +``` + +`config.yaml` is gitignored and never committed. + +--- + +## Usage + +```bash +# List all available machines (no login needed) +python scraper.py --list-machines + +# List all scans for a machine +python scraper.py --list-scans --machine "BW3-20 [AMR-26]" + +# Preview what would be downloaded (dry run) +python scraper.py --machine "BW3-20 [AMR-26]" --dry-run + +# Download mosaics only for one machine +python scraper.py --machine "BW3-20 [AMR-26]" --mosaic-only + +# Download mosaics for all machines +python scraper.py --mosaic-only + +# Download all tiles for a specific scan +python scraper.py --machine "BW3-20 [AMR-26]" --scan-id 158374 --workers 4 + +# Resume an interrupted download (automatically skips completed files) +python scraper.py --machine "BW3-20 [AMR-26]" --scan-id 158374 --workers 4 +``` + +### All options + +| Flag | Description | +|---|---| +| `--config FILE` | Config file path (default: `config.yaml`) | +| `--machine LABEL` | Restrict to one machine, e.g. 
`"BW3-20 [AMR-26]"` | +| `--scan-id ID` | Download only this scan (use with `--machine`) | +| `--mosaic-only` | Download mosaics only; skip individual tiles | +| `--dry-run` | Print what would be downloaded without saving | +| `--workers N` | Parallel download threads (default: 2, hard cap: 4) | +| `--recheck` | Scan archive for zero-byte/missing tiles and remove them from `.progress.json` so they re-download on next run | +| `--list-machines` | Print all machines and exit | +| `--list-scans` | Print all scans for `--machine` and exit | +| `--verbose` / `-v` | Debug logging | + +--- + +## Output layout + +``` +archives/ +├── .progress.json # tracks completed URLs for resume support +├── scans.csv # scan-level metadata for every processed scan +├── tiles.csv # tile-level metadata for every downloaded tile +│ +└── BW3-20__AMR-26/ + └── 2024-07-29/ + └── 158374/ + ├── metadata.json # full scan parameters (grid, timestamps, etc.) + ├── mosaic.jpg # pre-stitched full image (~16 MB) + └── tiles/ + ├── tile_r000_c000.jpg # row 0, column 0 + ├── tile_r000_c001.jpg + └── ... # 33,784 tiles total for a full-tube scan +``` + +Tile filenames encode position: `tile_r{row}_c{col}.jpg` where row increases with depth (Y in mm) and column increases along the tube circumference (X in mm). + +### Metadata files + +**`scans.csv`** columns: `machine`, `machine_id`, `scan_id`, `name`, `scan_time`, `start_x`, `start_y`, `end_x`, `end_y`, `dx`, `dy`, `nx`, `ny`, `total_tiles`, `scan_lines`, `scan_mode`, `start_datetime`, `end_datetime`, `status`, `user`, `disk_space_mb`, `mosaic_url`, `mosaic_local_path`, `mosaic_downloaded` + +**`tiles.csv`** columns: `machine`, `machine_id`, `scan_id`, `scan_time`, `row_index`, `col_index`, `x_mm`, `y_mm`, `url`, `local_path`, `downloaded_at`, `file_size_bytes` + +--- + +## Site structure (RootView) + +The RootView interface runs on a standard PHP stack. 
Key endpoints discovered: + +| Endpoint | Description | +|---|---| +| `POST index.php` | Login (`RTLLogin=1`, `RTLNAME`, `RTLUSER`, `RTLPWD`) | +| `POST index.php {cmd:scan, start:N, FilterCount:320}` | Paginated scan list | +| `GET index.php?cmd=scan&mode=view&id=ID` | Scan detail (grid params, disk usage) | +| `GET index.php?cmd=image&mode=image_scan&id=ID&s=1&x=X&y=Y` | Individual tile JPEG | +| `GET http://<host>:8011/RootView_Database/ID/mosaic.jpg` | Pre-stitched mosaic | + +Grid coordinates (X, Y) are in millimetres, starting from `(start_x, start_y)` with step `(dx, dy)`. + +--- + +## Resume and reliability + +- **Resumable**: `.progress.json` records every completed URL. Re-running the same command skips already-downloaded files. +- **Retry logic**: each tile download retries up to 3 times with exponential backoff (5 s → 10 s → 20 s) before logging a warning and moving on. +- **Worker cap**: the RootView server renders tiles on a single-threaded PHP process. Running more than 4 concurrent requests causes cascading read timeouts. The default is 2 workers; the scraper hard-caps at 4 and warns loudly if you try to exceed it. +- **Crash recovery**: if a run is killed mid-flight, some in-progress tiles may have been written as zero-byte files without being marked complete. Run `--recheck` before resuming — it deletes zero-byte files on disk and removes their URLs from `.progress.json` so they are cleanly re-downloaded. 
+ +```bash +# After any interrupted run, always do this first: +python scraper.py --recheck +# Then resume normally: +python scraper.py --machine "BW3-20 [AMR-26]" --scan-id 158374 +``` + +--- + +## Dependencies + +| Package | Purpose | +|---|---| +| `requests` | HTTP client | +| `beautifulsoup4` + `lxml` | HTML parsing | +| `pyyaml` | Config file | +| `tqdm` | Progress bars | diff --git a/config.example.yaml b/config.example.yaml new file mode 100644 index 0000000..7df15f2 --- /dev/null +++ b/config.example.yaml @@ -0,0 +1,29 @@ +# RootView scraper configuration +# Copy this to config.yaml and fill in your credentials. +# config.yaml is gitignored — never commit it. + +base_url: "http://205.149.147.131:8010/" + +# Login credentials (same for all machines) +username: "your_username_here" +password: "your_password_here" + +# Local directory where archives will be written +output_dir: "archives" + +# Number of parallel download threads. +# WARNING: The RootView server is single-threaded and will time out under heavy +# load. Measured safe limit is 2 workers. Values above 4 cause cascading +# timeouts and lost tiles. Do not exceed 4. 
+workers: 2 + +# Request timeout in seconds +timeout: 60 + +# Delay between requests to a single machine (seconds, float ok) +request_delay: 0.5 + +# Optional: limit to specific machines by label (comment out to scrape all) +# machines: +# - "BW1-4 [AMR-15]" +# - "BW1-6 [AMR-19]" diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..8093dd9 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +requests>=2.31.0 +beautifulsoup4>=4.12.0 +lxml>=5.0.0 +pyyaml>=6.0.1 +tqdm>=4.66.0 +pytest>=8.0 diff --git a/scraper.py b/scraper.py new file mode 100644 index 0000000..8980515 --- /dev/null +++ b/scraper.py @@ -0,0 +1,5 @@ +"""Entry point — delegates entirely to spruce.cli.""" +from spruce.cli import main + +if __name__ == "__main__": + main() diff --git a/spruce/__init__.py b/spruce/__init__.py new file mode 100644 index 0000000..4981bb9 --- /dev/null +++ b/spruce/__init__.py @@ -0,0 +1 @@ +# spruce — minirhizotron archive library diff --git a/spruce/cli.py b/spruce/cli.py new file mode 100644 index 0000000..6370ea9 --- /dev/null +++ b/spruce/cli.py @@ -0,0 +1,259 @@ +""" +Command-line interface for the spruce scraper. 
def discover_machines(base_url: str, timeout: int = 30) -> list[dict]:
    """Return one descriptor per machine offered on the RootView login page.

    Requests ``index.php`` relative to *base_url* (no credentials are sent),
    locates the ``<select name="RTLNAME">`` element and hands each
    ``<option>``'s visible text and ``value`` attribute to
    ``parse_machine_option``.

    Args:
        base_url: Root URL of the RootView site.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        The parsed machine descriptors, or an empty list (after logging a
        warning) when the page has no machine selector.

    Raises:
        requests.HTTPError: via ``raise_for_status`` on an error response.
    """
    login_page = requests.get(urljoin(base_url, "index.php"), timeout=timeout)
    login_page.raise_for_status()

    document = BeautifulSoup(login_page.text, "lxml")
    selector = document.find("select", {"name": "RTLNAME"})
    if not selector:
        log.warning("Could not find machine selector on login page.")
        return []

    machines = []
    for option in selector.find_all("option"):
        option_label = option.get_text(strip=True)
        machines.append(parse_machine_option(option_label, option["value"]))
    return machines
"BW3-20 [AMR-26]"', + ) + p.add_argument( + "--scan-id", + type=int, + metavar="ID", + help="Download only this specific scan ID (use with --machine)", + ) + p.add_argument( + "--mosaic-only", + action="store_true", + help="Download mosaics only; skip individual tiles", + ) + p.add_argument( + "--dry-run", + action="store_true", + help="Preview what would be downloaded without saving any files", + ) + p.add_argument( + "--workers", + type=int, + metavar="N", + help="Override parallel download threads from config", + ) + p.add_argument( + "--list-machines", + action="store_true", + help="Print available machines and exit (no credentials needed)", + ) + p.add_argument( + "--list-scans", + action="store_true", + help="Print all scans for --machine and exit", + ) + p.add_argument( + "--recheck", + action="store_true", + help=( + "Scan the archive for zero-byte or missing tile files whose URLs " + "are marked complete in .progress.json, remove them from progress, " + "and report how many were re-queued. Run before resuming after a crash." 
def _list_machines_settings(config_path: str) -> tuple[str, int]:
    """Return ``(base_url, timeout)`` to use for ``--list-machines``.

    Listing machines needs no credentials, so the config file is optional:
    the built-in defaults are used when the file is absent, empty, or does
    not define a key.
    """
    base_url = "http://205.149.147.131:8010/"
    timeout = 30
    if os.path.exists(config_path):
        # Binary mode lets the YAML loader detect the encoding itself, and
        # the context manager closes the handle deterministically.  An
        # empty file loads as None, which must not crash the lookups below.
        with open(config_path, "rb") as fh:
            cfg = yaml.safe_load(fh) or {}
        base_url = cfg.get("base_url", base_url)
        timeout = cfg.get("timeout", timeout)
    return base_url, timeout


def main() -> None:
    """Run the scraper command-line interface.

    Modes are checked in this order:

    1. ``--list-machines``: print the machine table and return (the config
       file is optional for this mode).
    2. ``--recheck``: run both recheck passes over the archive and return.
    3. ``--list-scans``: log in to exactly one machine, print its scans and
       return.
    4. Default: scrape every selected machine; the CSV writers are closed
       and progress is saved even if scraping raises part-way.

    Exits through ``sys.exit`` with a message when the config file is
    missing, ``--workers`` is not positive, the machine list cannot be
    retrieved, no machine is selected, ``--list-scans`` does not resolve to
    exactly one machine, or login fails for ``--list-scans``.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)-8s %(message)s",
        datefmt="%H:%M:%S",
    )
    args = parse_args()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # --list-machines doesn't need credentials
    if args.list_machines:
        base_url, timeout = _list_machines_settings(args.config)
        machines = discover_machines(base_url, timeout)
        print(f"{'Label':<25} {'ID':>4} {'IP':<17} {'Version'}")
        print("-" * 62)
        for m in machines:
            print(
                f"{m['label']:<25} {m['machine_id']:>4} {m['ip']:<17} {m['version']}"
            )
        return

    if not os.path.exists(args.config):
        sys.exit(
            f"Config file '{args.config}' not found.\n"
            f"Copy config.example.yaml to {args.config} and fill in your credentials."
        )

    config = load_config(args.config)
    if args.workers is not None:
        # A plain truthiness test would silently ignore `--workers 0`;
        # reject non-positive values at the CLI boundary instead.
        if args.workers < 1:
            sys.exit("--workers must be a positive integer.")
        config["workers"] = _clamp_workers(args.workers)

    output_dir = Path(config["output_dir"])

    # --recheck: validate archive integrity and re-queue bad tiles
    if args.recheck:
        progress = ProgressTracker(output_dir / PROGRESS_FILENAME)
        n_bad = recheck_tile_files(output_dir, progress)
        n_requeued = recheck_archive(output_dir, progress)
        if n_bad == 0 and n_requeued == 0:
            log.info("Archive looks clean. No action needed.")
        else:
            log.info(
                "Recheck complete: %d zero-byte file(s) deleted, "
                "%d URL(s) re-queued for download.",
                n_bad,
                n_requeued,
            )
        return

    # Build machine list
    all_machines = discover_machines(config["base_url"], config["timeout"])
    if not all_machines:
        sys.exit("Could not retrieve machine list from server.")

    # Apply --machine / config machines filter (--machine wins)
    filter_labels: list[str] | None = None
    if args.machine:
        filter_labels = [args.machine]
    elif config.get("machines"):
        configured = config["machines"]
        # A bare YAML string would otherwise be split into single characters.
        filter_labels = (
            [configured] if isinstance(configured, str) else list(configured)
        )

    if filter_labels:
        machines = [m for m in all_machines if m["label"] in filter_labels]
        # Build the lookup set once rather than once per requested label.
        found_labels = {m["label"] for m in machines}
        not_found = [label for label in filter_labels if label not in found_labels]
        if not_found:
            log.warning("Unknown machine label(s): %s", not_found)
    else:
        machines = all_machines

    if not machines:
        sys.exit("No machines selected.")

    # --list-scans: print and exit
    if args.list_scans:
        if len(machines) != 1:
            sys.exit("--list-scans requires exactly one machine (use --machine).")
        sess = MachineSession(machines[0], config)
        if not sess.login():
            sys.exit("Login failed.")
        scans = sess.get_all_scans()
        print(f"{'ID':>8} {'Date':<22} {'Name':<40} {'Status'}")
        print("-" * 85)
        for sc in scans:
            print(
                f"{sc['scan_id']:>8} {sc.get('scan_time', ''):<22} "
                f"{sc.get('name', ''):<40} {sc.get('status', '')}"
            )
        print(f"\nTotal: {len(scans)} scans")
        return

    log.info(
        "Scraping %d machine(s): %s",
        len(machines),
        ", ".join(m["label"] for m in machines),
    )
    if args.mosaic_only:
        log.info("Mode: mosaics only (individual tiles skipped)")
    if args.dry_run:
        log.info("Mode: dry-run (no files will be written)")

    # Shared progress and CSV writers
    progress = ProgressTracker(output_dir / PROGRESS_FILENAME)
    tiles_csv = CsvWriter(output_dir / TILES_CSV_FILENAME, TILES_CSV_FIELDS)
    scans_csv = CsvWriter(output_dir / SCANS_CSV_FILENAME, SCANS_CSV_FIELDS)

    total = 0
    try:
        for machine in machines:
            total += scrape_machine(
                machine=machine,
                config=config,
                output_dir=output_dir,
                progress=progress,
                tiles_csv=tiles_csv,
                scans_csv=scans_csv,
                dry_run=args.dry_run,
                mosaic_only=args.mosaic_only,
                scan_id_filter=args.scan_id,
            )
    finally:
        # Always persist what was completed, even when a machine fails.
        tiles_csv.close()
        scans_csv.close()
        progress.save()

    if args.dry_run:
        log.info("Dry run complete.")
    else:
        log.info("Done. Total files downloaded: %d", total)
        log.info("Tiles CSV : %s", output_dir / TILES_CSV_FILENAME)
        log.info("Scans CSV : %s", output_dir / SCANS_CSV_FILENAME)
        log.info("Progress : %s", output_dir / PROGRESS_FILENAME)
def _download_tiles_for_scan(
    sess: MachineSession,
    tiles: list[dict[str, Any]],
    scan_meta: dict[str, Any],
    scan_id: int,
    output_dir: Path,
    machine: dict[str, Any],
    config: dict[str, Any],
    progress: ProgressTracker,
    tiles_csv: CsvWriter,
    dry_run: bool,
) -> int:
    """Download every tile of one scan that is not yet marked done.

    Args:
        sess: Logged-in session whose ``download_tile`` fetches one tile.
        tiles: Tile descriptors; each must carry a ``"url"`` key.  Pending
            descriptors are mutated in place to add ``"scan_time"``.
        scan_meta: Scan metadata; ``"scan_time"`` is copied onto each tile.
        scan_id: Scan identifier, used for logging and the progress bar.
        output_dir: Archive root passed to ``tile_dest``.
        machine: Machine descriptor; ``"label"`` is used for logging.
        config: Settings; ``"workers"`` sets the thread-pool size.
        progress: Tracker consulted and updated per tile URL.
        tiles_csv: Writer that receives one row per downloaded tile.
        dry_run: When true, only log a preview and download nothing.

    Returns:
        Number of tiles downloaded by this call (always 0 for a dry run).
    """
    pending = [t for t in tiles if not progress.is_done(t["url"])]
    log.info(
        "[%s] Scan %d: %d tiles total, %d pending.",
        machine["label"],
        scan_id,
        len(tiles),
        len(pending),
    )

    if dry_run:
        for t in pending[:5]:
            log.info("[DRY-RUN] Tile: %s", t["url"])
        if len(pending) > 5:
            log.info("[DRY-RUN] … and %d more tiles.", len(pending) - 5)
        return 0

    if not pending:
        # Fully resumed scan: skip the thread pool and the empty progress bar.
        return 0

    # Attach scan_time for CSV rows
    scan_time = scan_meta.get("scan_time", "")
    for t in pending:
        t["scan_time"] = scan_time

    workers: int = config["workers"]
    save_every = max(50, workers * 4)
    batch: list[dict[str, Any]] = []
    downloaded = 0
    failed = 0

    def flush_batch() -> None:
        """Write buffered rows to the CSV, then persist progress."""
        for row in batch:
            tiles_csv.write(row)
        progress.save()
        batch.clear()

    try:
        with ThreadPoolExecutor(max_workers=workers) as pool:
            futures = {
                pool.submit(
                    sess.download_tile,
                    tile,
                    tile_dest(output_dir, machine, scan_meta, tile),
                    False,
                ): tile
                for tile in pending
            }
            try:
                with tqdm(
                    total=len(pending),
                    desc=f"{machine['label']} scan {scan_id}",
                    unit="tile",
                    leave=True,
                ) as pbar:
                    for future in as_completed(futures):
                        try:
                            result = future.result()
                        except Exception:
                            # One failing tile must not abort the whole scan;
                            # it is left unmarked so it stays pending.
                            log.exception(
                                "[%s] Scan %d: tile %s raised during download.",
                                machine["label"],
                                scan_id,
                                futures[future]["url"],
                            )
                            result = None

                        if result and result.get("file_size_bytes"):
                            batch.append(result)
                            progress.mark_done(result["url"])
                            downloaded += 1
                        else:
                            failed += 1
                        pbar.update(1)

                        if len(batch) >= save_every:
                            flush_batch()
            except BaseException:
                # On interrupt or fatal error, drop queued work so leaving
                # the pool does not wait for every remaining tile.
                for future in futures:
                    future.cancel()
                raise
    finally:
        # Rows buffered here already have their URLs marked done; write them
        # out even on the error path so the CSV and progress file agree.
        flush_batch()

    log.info(
        "[%s] Scan %d complete: %d tiles downloaded.",
        machine["label"],
        scan_id,
        downloaded,
    )
    if failed:
        log.warning(
            "[%s] Scan %d: %d tile(s) not downloaded; they remain pending.",
            machine["label"],
            scan_id,
            failed,
        )
    return downloaded
def scrape_machine(
    machine: dict[str, Any],
    config: dict[str, Any],
    output_dir: Path,
    progress: ProgressTracker,
    tiles_csv: CsvWriter,
    scans_csv: CsvWriter,
    dry_run: bool,
    mosaic_only: bool,
    scan_id_filter: int | None,
) -> int:
    """Log in to one machine and process its scans.

    When *scan_id_filter* is given, only that scan is processed and the scan
    list is not fetched from the server; otherwise every scan returned by
    ``get_all_scans`` is processed in order.

    Returns:
        The sum of the ``process_scan`` results, or 0 when login fails or
        the machine reports no scans.
    """
    session = MachineSession(machine, config)
    if not session.login():
        return 0

    if scan_id_filter is None:
        scans: list[dict[str, Any]] = session.get_all_scans()
        if not scans:
            log.warning("[%s] No scans found.", machine["label"])
            return 0
    else:
        # Minimal stand-in for a scan-list entry, built from the given ID.
        scans = [{"scan_id": scan_id_filter, "status": "Completed"}]
        log.info("[%s] Targeting scan ID %d.", machine["label"], scan_id_filter)

    return sum(
        process_scan(
            sess=session,
            scan=entry,
            output_dir=output_dir,
            machine=machine,
            config=config,
            progress=progress,
            scans_csv=scans_csv,
            tiles_csv=tiles_csv,
            dry_run=dry_run,
            mosaic_only=mosaic_only,
        )
        for entry in scans
    )