Initial commit

Add spruce scraper with CLI, session management, parsers, progress tracking, recheck logic, and test suite. Includes example config and README.
2026-04-22 10:41:18 -04:00
commit e122f6435a
23 changed files with 3789 additions and 0 deletions
@@ -0,0 +1 @@
+# spruce — minirhizotron archive library
@@ -0,0 +1,259 @@
+"""
+Command-line interface for the spruce scraper.
+"""
+
+import argparse
+import logging
+import os
+import sys
+from pathlib import Path
+
+import yaml
+
+from spruce.orchestrator import scrape_machine
+from spruce.parsers import parse_machine_option
+from spruce.progress import ProgressTracker, CsvWriter
+from spruce.recheck import recheck_archive, recheck_tile_files
+from spruce.settings import (
+    DEFAULT_CONFIG,
+    MAX_SAFE_WORKERS,
+    PROGRESS_FILENAME,
+    SCANS_CSV_FIELDS,
+    SCANS_CSV_FILENAME,
+    TILES_CSV_FIELDS,
+    TILES_CSV_FILENAME,
+    _clamp_workers,
+    load_config,
+)
+from spruce.session import MachineSession
+
+import requests
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin
+
+log = logging.getLogger(__name__)
+
+
+def discover_machines(base_url: str, timeout: int = 30) -> list[dict]:
+    resp = requests.get(urljoin(base_url, "index.php"), timeout=timeout)
+    resp.raise_for_status()
+    soup = BeautifulSoup(resp.text, "lxml")
+    select = soup.find("select", {"name": "RTLNAME"})
+    if not select:
+        log.warning("Could not find machine selector on login page.")
+        return []
+    return [
+        parse_machine_option(opt.get_text(strip=True), opt["value"])
+        for opt in select.find_all("option")
+    ]
+
+
+def parse_args() -> argparse.Namespace:
+    p = argparse.ArgumentParser(
+        description="Archive minirhizotron image tiles from RootView.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    p.add_argument(
+        "--config",
+        default=DEFAULT_CONFIG,
+        metavar="FILE",
+        help=f"YAML config file (default: {DEFAULT_CONFIG})",
+    )
+    p.add_argument(
+        "--machine",
+        metavar="LABEL",
+        help='Scrape only this machine label, e.g. "BW3-20 [AMR-26]"',
+    )
+    p.add_argument(
+        "--scan-id",
+        type=int,
+        metavar="ID",
+        help="Download only this specific scan ID (use with --machine)",
+    )
+    p.add_argument(
+        "--mosaic-only",
+        action="store_true",
+        help="Download mosaics only; skip individual tiles",
+    )
+    p.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Preview what would be downloaded without saving any files",
+    )
+    p.add_argument(
+        "--workers",
+        type=int,
+        metavar="N",
+        help="Override parallel download threads from config",
+    )
+    p.add_argument(
+        "--list-machines",
+        action="store_true",
+        help="Print available machines and exit (no credentials needed)",
+    )
+    p.add_argument(
+        "--list-scans",
+        action="store_true",
+        help="Print all scans for --machine and exit",
+    )
+    p.add_argument(
+        "--recheck",
+        action="store_true",
+        help=(
+            "Scan the archive for zero-byte or missing tile files whose URLs "
+            "are marked complete in .progress.json, remove them from progress, "
+            "and report how many were re-queued. Run before resuming after a crash."
+        ),
+    )
+    p.add_argument(
+        "--verbose",
+        "-v",
+        action="store_true",
+        help="Enable debug logging",
+    )
+    return p.parse_args()
+
+
+def main() -> None:
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s  %(levelname)-8s  %(message)s",
+        datefmt="%H:%M:%S",
+    )
+    args = parse_args()
+
+    if args.verbose:
+        logging.getLogger().setLevel(logging.DEBUG)
+
+    # --list-machines doesn't need credentials
+    if args.list_machines:
+        base_url = "http://205.149.147.131:8010/"
+        timeout = 30
+        if os.path.exists(args.config):
+            cfg = yaml.safe_load(open(args.config))
+            base_url = cfg.get("base_url", base_url)
+            timeout = cfg.get("timeout", timeout)
+        machines = discover_machines(base_url, timeout)
+        print(f"{'Label':<25}  {'ID':>4}  {'IP':<17}  {'Version'}")
+        print("-" * 62)
+        for m in machines:
+            print(
+                f"{m['label']:<25}  {m['machine_id']:>4}  {m['ip']:<17}  {m['version']}"
+            )
+        return
+
+    if not os.path.exists(args.config):
+        sys.exit(
+            f"Config file '{args.config}' not found.\n"
+            f"Copy config.example.yaml to {args.config} and fill in your credentials."
+        )
+
+    config = load_config(args.config)
+    if args.workers:
+        config["workers"] = _clamp_workers(args.workers)
+
+    output_dir = Path(config["output_dir"])
+
+    # --recheck: validate archive integrity and re-queue bad tiles
+    if args.recheck:
+        progress = ProgressTracker(output_dir / PROGRESS_FILENAME)
+        n_bad = recheck_tile_files(output_dir, progress)
+        n_requeued = recheck_archive(output_dir, progress)
+        if n_bad == 0 and n_requeued == 0:
+            log.info("Archive looks clean. No action needed.")
+        else:
+            log.info(
+                "Recheck complete: %d zero-byte file(s) deleted, "
+                "%d URL(s) re-queued for download.",
+                n_bad,
+                n_requeued,
+            )
+        return
+
+    # Build machine list
+    all_machines = discover_machines(config["base_url"], config["timeout"])
+    if not all_machines:
+        sys.exit("Could not retrieve machine list from server.")
+
+    # Apply --machine / config machines filter
+    filter_labels: list[str] | None = None
+    if args.machine:
+        filter_labels = [args.machine]
+    elif config.get("machines"):
+        filter_labels = list(config["machines"])
+
+    if filter_labels:
+        machines = [m for m in all_machines if m["label"] in filter_labels]
+        not_found = [
+            label
+            for label in filter_labels
+            if label not in {m["label"] for m in machines}
+        ]
+        if not_found:
+            log.warning("Unknown machine label(s): %s", not_found)
+    else:
+        machines = all_machines
+
+    if not machines:
+        sys.exit("No machines selected.")
+
+    # --list-scans: print and exit
+    if args.list_scans:
+        if len(machines) != 1:
+            sys.exit("--list-scans requires exactly one machine (use --machine).")
+        sess = MachineSession(machines[0], config)
+        if not sess.login():
+            sys.exit("Login failed.")
+        scans = sess.get_all_scans()
+        print(f"{'ID':>8}  {'Date':<22}  {'Name':<40}  {'Status'}")
+        print("-" * 85)
+        for sc in scans:
+            print(
+                f"{sc['scan_id']:>8}  {sc.get('scan_time', ''):<22}  "
+                f"{sc.get('name', ''):<40}  {sc.get('status', '')}"
+            )
+        print(f"\nTotal: {len(scans)} scans")
+        return
+
+    log.info(
+        "Scraping %d machine(s): %s",
+        len(machines),
+        ", ".join(m["label"] for m in machines),
+    )
+    if args.mosaic_only:
+        log.info("Mode: mosaics only (individual tiles skipped)")
+    if args.dry_run:
+        log.info("Mode: dry-run (no files will be written)")
+
+    # Shared progress and CSV writers
+    progress = ProgressTracker(output_dir / PROGRESS_FILENAME)
+    tiles_csv = CsvWriter(output_dir / TILES_CSV_FILENAME, TILES_CSV_FIELDS)
+    scans_csv = CsvWriter(output_dir / SCANS_CSV_FILENAME, SCANS_CSV_FIELDS)
+
+    total = 0
+    try:
+        for machine in machines:
+            count = scrape_machine(
+                machine=machine,
+                config=config,
+                output_dir=output_dir,
+                progress=progress,
+                tiles_csv=tiles_csv,
+                scans_csv=scans_csv,
+                dry_run=args.dry_run,
+                mosaic_only=args.mosaic_only,
+                scan_id_filter=args.scan_id,
+            )
+            total += count
+    finally:
+        tiles_csv.close()
+        scans_csv.close()
+        progress.save()
+
+    if args.dry_run:
+        log.info("Dry run complete.")
+    else:
+        log.info("Done. Total files downloaded: %d", total)
+        log.info("Tiles CSV : %s", output_dir / TILES_CSV_FILENAME)
+        log.info("Scans CSV : %s", output_dir / SCANS_CSV_FILENAME)
+        log.info("Progress  : %s", output_dir / PROGRESS_FILENAME)
@@ -0,0 +1,307 @@
+"""
+High-level scrape orchestration: drives the per-machine and per-scan loops.
+"""
+
+import json
+import logging
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+from typing import Any
+
+from tqdm import tqdm
+
+from spruce.paths import machine_dir_name, tile_dest, mosaic_dest, _extract_date
+from spruce.progress import ProgressTracker, CsvWriter
+from spruce.session import MachineSession
+
+log = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Per-scan helpers
+# ---------------------------------------------------------------------------
+
+
+def _download_mosaic(
+    sess: MachineSession,
+    scan_meta: dict[str, Any],
+    scan_id: int,
+    mosaic_path: Path,
+    progress: ProgressTracker,
+    machine: dict[str, Any],
+    dry_run: bool,
+) -> bool:
+    """Download the scan mosaic if not already done. Returns True if downloaded."""
+    url = sess.mosaic_url(scan_id)
+    if progress.is_done(url):
+        return False
+    if dry_run:
+        log.info("[DRY-RUN] Mosaic: %s → %s", url, mosaic_path)
+        return False
+    log.info("[%s] Downloading mosaic for scan %d …", machine["label"], scan_id)
+    size = sess.download_file(url, mosaic_path)
+    if size:
+        progress.mark_done(url)
+        progress.save()
+        log.info(
+            "[%s] Mosaic saved: %s (%.1f MB)",
+            machine["label"],
+            mosaic_path,
+            size / 1e6,
+        )
+        return True
+    return False
+
+
+def _download_tiles_for_scan(
+    sess: MachineSession,
+    tiles: list[dict[str, Any]],
+    scan_meta: dict[str, Any],
+    scan_id: int,
+    output_dir: Path,
+    machine: dict[str, Any],
+    config: dict[str, Any],
+    progress: ProgressTracker,
+    tiles_csv: CsvWriter,
+    dry_run: bool,
+) -> int:
+    """Download all pending tiles for a scan. Returns count of tiles downloaded."""
+    pending = [t for t in tiles if not progress.is_done(t["url"])]
+    log.info(
+        "[%s] Scan %d: %d tiles total, %d pending.",
+        machine["label"],
+        scan_id,
+        len(tiles),
+        len(pending),
+    )
+
+    if dry_run:
+        for t in pending[:5]:
+            log.info("[DRY-RUN] Tile: %s", t["url"])
+        if len(pending) > 5:
+            log.info("[DRY-RUN] … and %d more tiles.", len(pending) - 5)
+        return 0
+
+    # Attach scan_time for CSV rows
+    for t in pending:
+        t["scan_time"] = scan_meta.get("scan_time", "")
+
+    workers: int = config["workers"]
+    downloaded = 0
+
+    with ThreadPoolExecutor(max_workers=workers) as pool:
+        futures = {
+            pool.submit(
+                sess.download_tile,
+                tile,
+                tile_dest(output_dir, machine, scan_meta, tile),
+                False,
+            ): tile
+            for tile in pending
+        }
+
+        save_every = max(50, workers * 4)
+        batch: list[dict[str, Any]] = []
+
+        with tqdm(
+            total=len(pending),
+            desc=f"{machine['label']} scan {scan_id}",
+            unit="tile",
+            leave=True,
+        ) as pbar:
+            for future in as_completed(futures):
+                result = future.result()
+                if result.get("file_size_bytes"):
+                    batch.append(result)
+                    progress.mark_done(result["url"])
+                    downloaded += 1
+                pbar.update(1)
+
+                if len(batch) >= save_every:
+                    for row in batch:
+                        tiles_csv.write(row)
+                    progress.save()
+                    batch.clear()
+
+        for row in batch:
+            tiles_csv.write(row)
+        progress.save()
+
+    log.info(
+        "[%s] Scan %d complete: %d tiles downloaded.",
+        machine["label"],
+        scan_id,
+        downloaded,
+    )
+    return downloaded
+
+
+# ---------------------------------------------------------------------------
+# Per-scan driver
+# ---------------------------------------------------------------------------
+
+
+def process_scan(
+    sess: MachineSession,
+    scan: dict[str, Any],
+    output_dir: Path,
+    machine: dict[str, Any],
+    config: dict[str, Any],
+    progress: ProgressTracker,
+    scans_csv: CsvWriter,
+    tiles_csv: CsvWriter,
+    dry_run: bool,
+    mosaic_only: bool,
+) -> int:
+    """
+    Process one scan: fetch metadata, download mosaic and (optionally) tiles.
+    Returns total files downloaded for this scan.
+    """
+    scan_id: int = scan["scan_id"]
+    log.info("[%s] Processing scan %d …", machine["label"], scan_id)
+
+    try:
+        scan_meta = sess.get_scan_metadata(scan_id)
+    except Exception as exc:
+        log.error(
+            "[%s] Cannot fetch scan %d metadata: %s", machine["label"], scan_id, exc
+        )
+        return 0
+
+    if not scan_meta.get("nx") or not scan_meta.get("ny"):
+        log.warning(
+            "[%s] Scan %d: missing grid params, skipping.",
+            machine["label"],
+            scan_id,
+        )
+        return 0
+
+    # Merge list-level metadata into scan_meta (detail page takes precedence)
+    for k in (
+        "name",
+        "scan_time",
+        "start_datetime",
+        "end_datetime",
+        "status",
+        "user",
+        "scan_lines",
+        "scan_mode",
+    ):
+        scan_meta.setdefault(k, scan.get(k, ""))
+
+    # Save per-scan metadata.json
+    scan_date = _extract_date(scan_meta.get("scan_time", ""))
+    scan_dir = output_dir / machine_dir_name(machine) / scan_date / str(scan_id)
+    if not dry_run:
+        scan_dir.mkdir(parents=True, exist_ok=True)
+        meta_file = scan_dir / "metadata.json"
+        if not meta_file.exists():
+            meta_file.write_text(
+                json.dumps(scan_meta, indent=2, default=str), encoding="utf-8"
+            )
+
+    # Mosaic
+    mosaic_path = mosaic_dest(output_dir, machine, scan_meta, scan_id)
+    mosaic_url = sess.mosaic_url(scan_id)
+    mosaic_downloaded = _download_mosaic(
+        sess, scan_meta, scan_id, mosaic_path, progress, machine, dry_run
+    )
+    total = 1 if mosaic_downloaded else 0
+
+    # Write scan-level CSV row
+    scans_csv.write(
+        {
+            "machine": machine["label"],
+            "machine_id": machine["machine_id"],
+            "scan_id": scan_id,
+            "name": scan_meta.get("name", ""),
+            "scan_time": scan_meta.get("scan_time", ""),
+            "start_x": scan_meta.get("start_x", ""),
+            "start_y": scan_meta.get("start_y", ""),
+            "end_x": scan_meta.get("end_x", ""),
+            "end_y": scan_meta.get("end_y", ""),
+            "dx": scan_meta.get("dx", ""),
+            "dy": scan_meta.get("dy", ""),
+            "nx": scan_meta.get("nx", ""),
+            "ny": scan_meta.get("ny", ""),
+            "total_tiles": scan_meta.get("total_tiles", ""),
+            "scan_lines": scan_meta.get("scan_lines", ""),
+            "scan_mode": scan_meta.get("scan_mode", ""),
+            "start_datetime": scan_meta.get("start_datetime", ""),
+            "end_datetime": scan_meta.get("end_datetime", ""),
+            "status": scan_meta.get("status", ""),
+            "user": scan_meta.get("user", ""),
+            "disk_space_mb": scan_meta.get("disk_space_mb", ""),
+            "mosaic_url": mosaic_url,
+            "mosaic_local_path": str(mosaic_path),
+            "mosaic_downloaded": mosaic_downloaded,
+        }
+    )
+
+    if mosaic_only:
+        return total
+
+    # Tiles
+    tiles = sess.enumerate_tiles(scan_meta)
+    total += _download_tiles_for_scan(
+        sess,
+        tiles,
+        scan_meta,
+        scan_id,
+        output_dir,
+        machine,
+        config,
+        progress,
+        tiles_csv,
+        dry_run,
+    )
+    return total
+
+
+# ---------------------------------------------------------------------------
+# Per-machine driver
+# ---------------------------------------------------------------------------
+
+
+def scrape_machine(
+    machine: dict[str, Any],
+    config: dict[str, Any],
+    output_dir: Path,
+    progress: ProgressTracker,
+    tiles_csv: CsvWriter,
+    scans_csv: CsvWriter,
+    dry_run: bool,
+    mosaic_only: bool,
+    scan_id_filter: int | None,
+) -> int:
+    """Login, fetch scans, and download all content for one machine."""
+    sess = MachineSession(machine, config)
+    if not sess.login():
+        return 0
+
+    if scan_id_filter is not None:
+        scans: list[dict[str, Any]] = [
+            {"scan_id": scan_id_filter, "status": "Completed"}
+        ]
+        log.info("[%s] Targeting scan ID %d.", machine["label"], scan_id_filter)
+    else:
+        scans = sess.get_all_scans()
+        if not scans:
+            log.warning("[%s] No scans found.", machine["label"])
+            return 0
+
+    total = 0
+    for scan in scans:
+        total += process_scan(
+            sess=sess,
+            scan=scan,
+            output_dir=output_dir,
+            machine=machine,
+            config=config,
+            progress=progress,
+            scans_csv=scans_csv,
+            tiles_csv=tiles_csv,
+            dry_run=dry_run,
+            mosaic_only=mosaic_only,
+        )
+    return total
@@ -0,0 +1,213 @@
+"""
+Pure HTML / text parsing functions for the RootView web application.
+
+All functions are side-effect-free: string (or list[str]) in, dict/list out.
+No network access, no filesystem access.
+"""
+
+import math
+import re
+from typing import Any
+from urllib.parse import unquote
+
+from bs4 import BeautifulSoup
+
+
+# ---------------------------------------------------------------------------
+# Machine descriptor
+# ---------------------------------------------------------------------------
+
+
+def parse_machine_option(label: str, raw_value: str) -> dict[str, Any]:
+    """Decode the pipe-delimited <option> value for a machine."""
+    decoded = unquote(raw_value)
+    parts = decoded.split("|")
+    return {
+        "label": label,
+        "option_value": raw_value,
+        "name": parts[0] if len(parts) > 0 else label,
+        "ip": parts[1] if len(parts) > 1 else "",
+        "port1": parts[2] if len(parts) > 2 else "",
+        "machine_id": parts[3] if len(parts) > 3 else "",
+        "port2": parts[4] if len(parts) > 4 else "",
+        "version": parts[5] if len(parts) > 5 else "",
+    }
+
+
+# ---------------------------------------------------------------------------
+# Scan list row
+# ---------------------------------------------------------------------------
+
+
+def parse_scan_row(cells: list[str]) -> dict[str, Any] | None:
+    """
+    Parse one table row from the scan list into a scan dict.
+
+    Expected columns (from the observed HTML):
+      ID, Name, Scan Time, Step Units, (X,Y)-(X,Y)-(DX,DY),
+      Dwell Time ms, Scan Lines, Scan Mode, Start Time, End Time,
+      Cancelled, User, Scan Status, Archived, [View link]
+
+    Returns None for header rows or rows whose first cell is not a digit.
+    """
+    if not cells or not cells[0].strip().isdigit():
+        return None
+    try:
+        scan_id = int(cells[0].strip())
+        return {
+            "scan_id": scan_id,
+            "name": cells[1].strip() if len(cells) > 1 else "",
+            "scan_time": cells[2].strip() if len(cells) > 2 else "",
+            "step_units": cells[3].strip() if len(cells) > 3 else "",
+            "coord_str": cells[4].strip() if len(cells) > 4 else "",
+            "dwell_ms": cells[5].strip() if len(cells) > 5 else "",
+            "scan_lines": cells[6].strip() if len(cells) > 6 else "",
+            "scan_mode": cells[7].strip() if len(cells) > 7 else "",
+            "start_datetime": cells[8].strip() if len(cells) > 8 else "",
+            "end_datetime": cells[9].strip() if len(cells) > 9 else "",
+            "cancelled": cells[10].strip() if len(cells) > 10 else "",
+            "user": cells[11].strip() if len(cells) > 11 else "",
+            "status": cells[12].strip() if len(cells) > 12 else "",
+            "archived": cells[13].strip() if len(cells) > 13 else "",
+        }
+    except (ValueError, IndexError):
+        return None
+
+
+# ---------------------------------------------------------------------------
+# Scan view page
+# ---------------------------------------------------------------------------
+
+
+def parse_scan_view(html: str) -> dict[str, Any]:
+    """
+    Extract grid parameters from a scan view page.
+
+    Returns a dict with keys:
+      scan_id, name, scan_time, start_x, start_y, end_x, end_y,
+      dx, dy, nx, ny, total_tiles, disk_space_mb, ...
+    """
+    result: dict[str, Any] = {}
+
+    # Extract grid params from the show_tile() URL inside the page JS.
+    # The scan view embeds them as query params in a JS string, e.g.:
+    #   "include/tile_view.php?...&sX=0&sY=0&eX=310&eY=740&dX=3.01&dY=2.26&..."
+    tile_url_m = re.search(r'tile_view\.php\?([^"\']+)', html)
+    if tile_url_m:
+        qs = tile_url_m.group(1)
+        param_map = {
+            "sX": "startX",
+            "sY": "startY",
+            "eX": "endX",
+            "eY": "endY",
+            "dX": "deltaX",
+            "dY": "deltaY",
+        }
+        for qs_key, result_key in param_map.items():
+            m = re.search(rf"(?:^|&){re.escape(qs_key)}=([\d.]+)", qs)
+            if m:
+                result[result_key] = float(m.group(1))
+
+    # Fallback: look for standalone JS var declarations (present in tile_view.php)
+    js_var_patterns = {
+        "startX": r"var\s+startX\s*=\s*([\d.]+)",
+        "startY": r"var\s+startY\s*=\s*([\d.]+)",
+        "endX": r"var\s+endX\s*=\s*([\d.]+)",
+        "endY": r"var\s+endY\s*=\s*([\d.]+)",
+        "deltaX": r"var\s+deltaX\s*=\s*([\d.]+)",
+        "deltaY": r"var\s+deltaY\s*=\s*([\d.]+)",
+    }
+    for key, pattern in js_var_patterns.items():
+        if key not in result:
+            m = re.search(pattern, html)
+            if m:
+                result[key] = float(m.group(1))
+
+    # Extract from the data table
+    soup = BeautifulSoup(html, "lxml")
+    for row in soup.find_all("tr"):
+        cells = [td.get_text(strip=True) for td in row.find_all("td")]
+        for i, cell in enumerate(cells):
+            if cell == "Scan ID:" and i + 1 < len(cells):
+                try:
+                    result["scan_id"] = int(cells[i + 1])
+                except ValueError:
+                    pass
+            elif cell == "Name:" and i + 1 < len(cells):
+                result["name"] = cells[i + 1]
+            elif cell == "Scan Time:" and i + 1 < len(cells):
+                result["scan_time"] = cells[i + 1]
+            elif cell == "Starting X:" and i + 1 < len(cells):
+                result["start_x_label"] = cells[i + 1]
+            elif cell == "Starting Y:" and i + 1 < len(cells):
+                result["start_y_label"] = cells[i + 1]
+            elif cell == "Ending X:" and i + 1 < len(cells):
+                result["end_x_label"] = cells[i + 1]
+            elif cell == "Ending Y:" and i + 1 < len(cells):
+                result["end_y_label"] = cells[i + 1]
+            elif cell == "DX:" and i + 1 < len(cells):
+                result["dx_label"] = cells[i + 1]
+            elif cell == "DY:" and i + 1 < len(cells):
+                result["dy_label"] = cells[i + 1]
+            elif cell == "Scan Lines:" and i + 1 < len(cells):
+                result["scan_lines"] = cells[i + 1]
+            elif cell == "Scan Mode:" and i + 1 < len(cells):
+                result["scan_mode"] = cells[i + 1]
+            elif cell == "Start Time:" and i + 1 < len(cells):
+                result["start_datetime"] = cells[i + 1]
+            elif cell == "End Time:" and i + 1 < len(cells):
+                result["end_datetime"] = cells[i + 1]
+            elif cell == "Scan Status:" and i + 1 < len(cells):
+                result["status"] = cells[i + 1]
+            elif cell == "User:" and i + 1 < len(cells):
+                result["user"] = cells[i + 1]
+            elif cell == "Total number of images:" and i + 1 < len(cells):
+                # Format: "33784 (103x328)"
+                m = re.match(r"(\d+)\s*\((\d+)x(\d+)\)", cells[i + 1])
+                if m:
+                    result["total_tiles"] = int(m.group(1))
+                    result["nx"] = int(m.group(2))
+                    result["ny"] = int(m.group(3))
+            elif cell == "Total Disk Space:" and i + 1 < len(cells):
+                m = re.search(r"([\d.]+)\s*Mb", cells[i + 1])
+                if m:
+                    result["disk_space_mb"] = float(m.group(1))
+
+    # Promote JS/URL grid param names to canonical keys
+    for raw, canon in (
+        ("startX", "start_x"),
+        ("startY", "start_y"),
+        ("endX", "end_x"),
+        ("endY", "end_y"),
+        ("deltaX", "dx"),
+        ("deltaY", "dy"),
+    ):
+        if raw in result:
+            result[canon] = result.pop(raw)
+
+    # Compute nx/ny from grid params if not parsed from table
+    if "nx" not in result and all(k in result for k in ("start_x", "end_x", "dx")):
+        result["nx"] = _grid_count(result["start_x"], result["end_x"], result["dx"])
+    if "ny" not in result and all(k in result for k in ("start_y", "end_y", "dy")):
+        result["ny"] = _grid_count(result["start_y"], result["end_y"], result["dy"])
+    if "total_tiles" not in result and "nx" in result and "ny" in result:
+        result["total_tiles"] = result["nx"] * result["ny"]
+
+    return result
+
+
+# ---------------------------------------------------------------------------
+# Grid helpers
+# ---------------------------------------------------------------------------
+
+
+def _grid_count(start: float, end: float, step: float) -> int:
+    """Number of grid positions from start up to (but not including) end."""
+    if step <= 0:
+        return 0
+    return math.ceil((end - start) / step)
+
+
+def _grid_values(start: float, count: int, step: float) -> list[float]:
+    """Generate `count` evenly-spaced grid positions, rounded to 2 dp."""
+    return [round(start + i * step, 2) for i in range(count)]
@@ -0,0 +1,62 @@
+"""
+Pure path-helper functions — no network, no JSON, no progress state.
+"""
+
+import re
+from pathlib import Path
+from typing import Any
+
+
+def machine_dir_name(machine: dict[str, Any]) -> str:
+    """Sanitise machine label for use as a directory name."""
+    return re.sub(r"[^\w\-.]", "_", machine["label"]).strip("_")
+
+
+def _extract_date(dt_str: str) -> str:
+    """Pull YYYY-MM-DD from a datetime string, fall back to 'unknown'."""
+    m = re.search(r"(\d{4}-\d{2}-\d{2})", dt_str)
+    return m.group(1) if m else "unknown"
+
+
+def tile_dest(
+    output_dir: Path,
+    machine: dict[str, Any],
+    scan_meta: dict[str, Any],
+    tile: dict[str, Any],
+) -> Path:
+    """Return the local path for a single tile file."""
+    scan_date = _extract_date(scan_meta.get("scan_time", ""))
+    scan_id = tile["scan_id"]
+    ny = scan_meta.get("ny", 1)
+    nx = scan_meta.get("nx", 1)
+    row_width = len(str(ny - 1)) if ny > 1 else 1
+    col_width = len(str(nx - 1)) if nx > 1 else 1
+    filename = (
+        f"tile_r{tile['row_index']:0{row_width}d}"
+        f"_c{tile['col_index']:0{col_width}d}.jpg"
+    )
+    return (
+        output_dir
+        / machine_dir_name(machine)
+        / scan_date
+        / str(scan_id)
+        / "tiles"
+        / filename
+    )
+
+
+def mosaic_dest(
+    output_dir: Path,
+    machine: dict[str, Any],
+    scan_meta: dict[str, Any],
+    scan_id: int,
+) -> Path:
+    """Return the local path for a scan's mosaic file."""
+    scan_date = _extract_date(scan_meta.get("scan_time", ""))
+    return (
+        output_dir
+        / machine_dir_name(machine)
+        / scan_date
+        / str(scan_id)
+        / "mosaic.jpg"
+    )
@@ -0,0 +1,82 @@
+"""
+Progress tracking (JSON) and CSV writing.
+"""
+
+import csv
+import json
+import logging
+from pathlib import Path
+from typing import Iterator
+
+log = logging.getLogger(__name__)
+
+
+class ProgressTracker:
+    """
+    Tracks successfully downloaded URLs in a JSON file so runs can be resumed.
+
+    Public API (all external code should use only these methods):
+      is_done(url)    — True if url has been downloaded
+      mark_done(url)  — Record url as complete (call save() to persist)
+      discard(url)    — Remove url from the completed set
+      iter_urls()     — Iterate over all completed URLs
+      __len__()       — Number of completed URLs
+      save()          — Flush state to disk
+    """
+
+    def __init__(self, path: Path) -> None:
+        self.path = path
+        self._done: set[str] = set()
+        self._load()
+
+    def _load(self) -> None:
+        if self.path.exists():
+            try:
+                data = json.loads(self.path.read_text())
+                self._done = set(data.get("completed_urls", []))
+                log.info("Resuming: %d URLs already downloaded.", len(self._done))
+            except Exception:
+                log.warning("Could not read progress file; starting fresh.")
+
+    def is_done(self, url: str) -> bool:
+        return url in self._done
+
+    def mark_done(self, url: str) -> None:
+        self._done.add(url)
+
+    def discard(self, url: str) -> None:
+        """Remove a URL from the completed set (re-queues it for download)."""
+        self._done.discard(url)
+
+    def iter_urls(self) -> Iterator[str]:
+        """Iterate over all completed URLs."""
+        return iter(self._done)
+
+    def __len__(self) -> int:
+        return len(self._done)
+
+    def save(self) -> None:
+        self.path.parent.mkdir(parents=True, exist_ok=True)
+        self.path.write_text(
+            json.dumps({"completed_urls": sorted(self._done)}, indent=2)
+        )
+
+
+class CsvWriter:
+    """Append-mode CSV writer that writes a header on first creation."""
+
+    def __init__(self, path: Path, fields: list[str]) -> None:
+        is_new = not path.exists()
+        path.parent.mkdir(parents=True, exist_ok=True)
+        self._fh = open(path, "a", newline="", encoding="utf-8")
+        self._writer = csv.DictWriter(self._fh, fieldnames=fields)
+        if is_new:
+            self._writer.writeheader()
+        self._fields = fields
+
+    def write(self, row: dict) -> None:
+        self._writer.writerow({f: row.get(f, "") for f in self._fields})
+        self._fh.flush()
+
+    def close(self) -> None:
+        self._fh.close()
@@ -0,0 +1,156 @@
+"""
+Archive integrity checks — find corrupt / missing tiles and remove them
+from the progress tracker so they are re-downloaded on the next run.
+"""
+
+import logging
+import urllib.parse
+from pathlib import Path
+from typing import Any
+
+from spruce.progress import ProgressTracker
+
+# Module-level logger, named after this module.
+log = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Private helpers
+# ---------------------------------------------------------------------------
+
+
+def _parse_tile_url(url: str) -> dict[str, str]:
+    """Extract scan_id, x, y from a tile URL query string."""
+    qs = dict(urllib.parse.parse_qsl(urllib.parse.urlparse(url).query))
+    return {
+        "scan_id": qs.get("id", ""),
+        "x": qs.get("x", ""),
+        "y": qs.get("y", ""),
+    }
+
+
+def _build_disk_index(output_dir: Path) -> dict[Path, int]:
+    """Return {tile_path: size_bytes} for every tile file found on disk."""
+    return {p: p.stat().st_size for p in output_dir.rglob("tile_r*.jpg")}
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+
+def recheck_tile_files(output_dir: Path, progress: ProgressTracker) -> int:
+    """
+    Walk every tile file on disk and delete any that are zero bytes.
+    Also removes the corresponding URL from progress in the same pass,
+    so a single --recheck call is sufficient before resuming.
+
+    Returns the count of files deleted.
+    """
+    # Build a reverse map: (scan_id, x, y) -> url for all completed tile URLs
+    coord_to_url: dict[tuple[str, str, str], str] = {}
+    for url in progress.iter_urls():
+        if "cmd=image" in url:
+            p = _parse_tile_url(url)
+            key = (p["scan_id"], p["x"], p["y"])
+            coord_to_url[key] = url
+
+    deleted = 0
+    for tile_path in output_dir.rglob("tile_r*.jpg"):
+        if tile_path.stat().st_size == 0:
+            log.warning("Deleting zero-byte tile: %s", tile_path)
+            tile_path.unlink()
+            deleted += 1
+
+            # Try to find the matching URL from progress and discard it
+            scan_id = _scan_id_from_path(tile_path)
+            if scan_id:
+                # Discard any URL for this scan_id — precise x/y matching
+                # requires metadata.json; scan-level discard is safe because
+                # recheck_archive will clean up any remaining stale URLs.
+                for key, url in list(coord_to_url.items()):
+                    if key[0] == scan_id:
+                        progress.discard(url)
+                        del coord_to_url[key]
+
+    if deleted:
+        log.info("Deleted %d zero-byte tile file(s).", deleted)
+        progress.save()
+    else:
+        log.info("No zero-byte tile files found on disk.")
+    return deleted
+
+
+def recheck_archive(output_dir: Path, progress: ProgressTracker) -> int:
+    """
+    Walk every URL in .progress.json and verify its local file exists and is
+    non-empty. Removes bad entries from progress so the next run re-downloads
+    them. Returns the count of entries removed.
+
+    Only tile URLs are checked (mosaic URLs are skipped — mosaics are large
+    single files and are unlikely to be partially written due to streaming).
+    """
+    if len(progress) == 0:
+        log.info("Progress file is empty — nothing to recheck.")
+        return 0
+
+    tile_urls = [u for u in progress.iter_urls() if "cmd=image" in u]
+    mosaic_count = len(progress) - len(tile_urls)
+    log.info(
+        "Rechecking %d tile URLs (%d mosaic URLs not rechecked) …",
+        len(tile_urls),
+        mosaic_count,
+    )
+
+    # Build a disk index once
+    existing_files = _build_disk_index(output_dir)
+    log.debug("Found %d tile files on disk.", len(existing_files))
+
+    bad_urls: list[str] = []
+
+    for url in tile_urls:
+        p = _parse_tile_url(url)
+        scan_id = p["scan_id"]
+
+        # Find tile files that live under a directory named after this scan_id
+        candidates = [path for path in existing_files if str(scan_id) in path.parts]
+
+        if not candidates:
+            bad_urls.append(url)
+            continue
+
+        if not any(existing_files[path] > 0 for path in candidates):
+            bad_urls.append(url)
+
+    if not bad_urls:
+        log.info("All %d tile URLs look healthy.", len(tile_urls))
+        return 0
+
+    log.warning(
+        "Found %d suspect tile URL(s). Removing from progress.",
+        len(bad_urls),
+    )
+    for url in bad_urls:
+        progress.discard(url)
+    progress.save()
+    log.info(
+        "Removed %d URL(s) from .progress.json — they will be re-downloaded on next run.",
+        len(bad_urls),
+    )
+    return len(bad_urls)
+
+
+# ---------------------------------------------------------------------------
+# Internal utility
+# ---------------------------------------------------------------------------
+
+
+def _scan_id_from_path(tile_path: Path) -> str | None:
+    """
+    Given a tile path like .../158374/tiles/tile_r0_c0.jpg, return '158374'.
+    Looks for the directory two levels above the filename (parent.parent.name).
+    """
+    try:
+        # structure: <machine>/<date>/<scan_id>/tiles/<filename>
+        return tile_path.parent.parent.name
+    except Exception:
+        return None
@@ -0,0 +1,274 @@
+"""
+HTTP session for a single RootView machine: login, scan listing, tile downloads.
+"""
+
+import logging
+import time
+from datetime import datetime, timezone
+from pathlib import Path
+from urllib.parse import urljoin
+from typing import Any
+
+import requests
+from bs4 import BeautifulSoup
+
+from spruce.parsers import parse_scan_row, parse_scan_view, _grid_values
+
+log = logging.getLogger(__name__)
+
+USER_AGENT = "spruce-scraper/1.0"
+
+
+class MachineSession:
+    """Manages an authenticated HTTP session for one RootView machine."""
+
+    def __init__(self, machine: dict[str, Any], config: dict[str, Any]) -> None:
+        self.machine = machine
+        self.cfg = config
+        self.http = requests.Session()
+        self.http.headers["User-Agent"] = USER_AGENT
+        self.base_url: str = config["base_url"]
+        self.image_base_url: str = config.get(
+            "image_base_url", "http://205.149.147.131:8011/"
+        )
+
+    # ------------------------------------------------------------------
+    # Auth
+    # ------------------------------------------------------------------
+
+    def login(self) -> bool:
+        url = urljoin(self.base_url, "index.php")
+        payload = {
+            "RTLLogin": "1",
+            "RTLNAME": self.machine["option_value"],
+            "RTLUSER": self.cfg["username"],
+            "RTLPWD": self.cfg["password"],
+            "rtl_latest_version": "3.0.0.18",
+            "submit": " submit ",
+        }
+        try:
+            resp = self.http.post(url, data=payload, timeout=self.cfg["timeout"])
+            resp.raise_for_status()
+        except requests.RequestException as exc:
+            log.error("[%s] Login failed: %s", self.machine["label"], exc)
+            return False
+
+        soup = BeautifulSoup(resp.text, "lxml")
+        error_tag = soup.find(class_="error")
+        if error_tag and error_tag.get_text(strip=True):
+            log.error(
+                "[%s] Login rejected: %s",
+                self.machine["label"],
+                error_tag.get_text(strip=True),
+            )
+            return False
+
+        log.info("[%s] Login succeeded.", self.machine["label"])
+        return True
+
+    # ------------------------------------------------------------------
+    # Scan list (paginated)
+    # ------------------------------------------------------------------
+
+    def get_all_scans(self) -> list[dict[str, Any]]:
+        """
+        Fetch the complete scan list across all pages.
+
+        Uses a large FilterCount (320) to minimise round-trips.
+        Falls back to repeated pages if the list is longer.
+        """
+        all_scans: list[dict[str, Any]] = []
+        start = 0
+        page_size = 320
+
+        while True:
+            page_scans = self._fetch_scan_page(start, page_size)
+            if not page_scans:
+                break
+            all_scans.extend(page_scans)
+            log.debug(
+                "[%s] Page start=%d: %d scans (total so far: %d)",
+                self.machine["label"],
+                start,
+                len(page_scans),
+                len(all_scans),
+            )
+            if len(page_scans) < page_size:
+                break
+            start += page_size
+            time.sleep(self.cfg["request_delay"])
+
+        log.info("[%s] Found %d scans.", self.machine["label"], len(all_scans))
+        return all_scans
+
+    def _fetch_scan_page(
+        self, start: int, page_size: int
+    ) -> list[dict[str, Any]]:
+        """POST the scan list form and parse the returned table."""
+        time.sleep(self.cfg["request_delay"])
+        resp = self.http.post(
+            urljoin(self.base_url, "index.php"),
+            data={
+                "cmd": "scan",
+                "start": str(start),
+                "order": "0",
+                "order_dir": "1",
+                "FilterScanStatus": "2",  # Completed scans
+                "FilterUser": "",
+                "hidedate": "",
+                "FilterDtFrom": "",
+                "FilterDtTo": "",
+                "FilterIdFrom": "0",
+                "FilterIdTo": "0",
+                "FilterCount": str(page_size),
+            },
+            timeout=self.cfg["timeout"],
+        )
+        resp.raise_for_status()
+
+        soup = BeautifulSoup(resp.text, "lxml")
+        scans: list[dict[str, Any]] = []
+        for row in soup.find_all("tr"):
+            cells = [td.get_text(strip=True) for td in row.find_all("td")]
+            scan = parse_scan_row(cells)
+            if scan:
+                scans.append(scan)
+        return scans
+
+    # ------------------------------------------------------------------
+    # Scan detail
+    # ------------------------------------------------------------------
+
+    def get_scan_metadata(self, scan_id: int) -> dict[str, Any]:
+        """Fetch the scan view page and extract grid parameters."""
+        time.sleep(self.cfg["request_delay"])
+        resp = self.http.get(
+            urljoin(self.base_url, "index.php"),
+            params={"cmd": "scan", "mode": "view", "id": str(scan_id)},
+            timeout=self.cfg["timeout"],
+        )
+        resp.raise_for_status()
+        return parse_scan_view(resp.text)
+
+    # ------------------------------------------------------------------
+    # Tile enumeration
+    # ------------------------------------------------------------------
+
+    def enumerate_tiles(self, scan_meta: dict[str, Any]) -> list[dict[str, Any]]:
+        """
+        Generate the full list of tile descriptors for a scan.
+
+        Each descriptor has: url, row_index, col_index, x_mm, y_mm
+        """
+        scan_id = scan_meta["scan_id"]
+        nx: int = scan_meta.get("nx", 0)
+        ny: int = scan_meta.get("ny", 0)
+        start_x: float = scan_meta.get("start_x", 0.0)
+        start_y: float = scan_meta.get("start_y", 0.0)
+        dx: float = scan_meta.get("dx", 1.0)
+        dy: float = scan_meta.get("dy", 1.0)
+        scale: int = self.cfg.get("tile_scale", 1)
+
+        xs = _grid_values(start_x, nx, dx)
+        ys = _grid_values(start_y, ny, dy)
+
+        tiles: list[dict[str, Any]] = []
+        for row_idx, y in enumerate(ys):
+            for col_idx, x in enumerate(xs):
+                url = (
+                    urljoin(self.base_url, "index.php")
+                    + f"?cmd=image&mode=image_scan&id={scan_id}"
+                    + f"&s={scale}&x={x}&y={y}"
+                )
+                tiles.append(
+                    {
+                        "scan_id": scan_id,
+                        "row_index": row_idx,
+                        "col_index": col_idx,
+                        "x_mm": x,
+                        "y_mm": y,
+                        "url": url,
+                    }
+                )
+        return tiles
+
+    # ------------------------------------------------------------------
+    # Mosaic URL
+    # ------------------------------------------------------------------
+
+    def mosaic_url(self, scan_id: int) -> str:
+        return urljoin(
+            self.image_base_url, f"RootView_Database/{scan_id}/mosaic.jpg"
+        )
+
+    # ------------------------------------------------------------------
+    # Downloads
+    # ------------------------------------------------------------------
+
+    def download_file(self, url: str, dest: Path, retries: int = 3) -> int:
+        """Stream-download url to dest with retries. Returns bytes written (0 on failure)."""
+        dest.parent.mkdir(parents=True, exist_ok=True)
+        backoff = 5.0
+        for attempt in range(1, retries + 1):
+            try:
+                resp = self.http.get(
+                    url, timeout=self.cfg["timeout"], stream=True
+                )
+                resp.raise_for_status()
+                size = 0
+                with open(dest, "wb") as fh:
+                    for chunk in resp.iter_content(chunk_size=65536):
+                        if chunk:
+                            fh.write(chunk)
+                            size += len(chunk)
+                return size
+            except Exception as exc:
+                if attempt < retries:
+                    log.debug(
+                        "Attempt %d/%d failed %s: %s — retrying in %.0fs",
+                        attempt,
+                        retries,
+                        url,
+                        exc,
+                        backoff,
+                    )
+                    time.sleep(backoff)
+                    backoff *= 2
+                else:
+                    log.warning(
+                        "Download failed after %d attempts %s: %s",
+                        retries,
+                        url,
+                        exc,
+                    )
+        return 0
+
+    def download_tile(
+        self, tile: dict[str, Any], dest: Path, dry_run: bool
+    ) -> dict[str, Any]:
+        """Download a single tile. Returns a metadata row dict."""
+        row: dict[str, Any] = {
+            "machine": self.machine["label"],
+            "machine_id": self.machine["machine_id"],
+            "scan_id": tile["scan_id"],
+            "scan_time": tile.get("scan_time", ""),
+            "row_index": tile["row_index"],
+            "col_index": tile["col_index"],
+            "x_mm": tile["x_mm"],
+            "y_mm": tile["y_mm"],
+            "url": tile["url"],
+            "local_path": str(dest),
+            "downloaded_at": "",
+            "file_size_bytes": "",
+        }
+        if dry_run:
+            return row
+        if dest.exists():
+            row["downloaded_at"] = "already_exists"
+            row["file_size_bytes"] = dest.stat().st_size
+            return row
+        size = self.download_file(tile["url"], dest)
+        if size:
+            row["downloaded_at"] = datetime.now(timezone.utc).isoformat()
+            row["file_size_bytes"] = size
+        return row
@@ -0,0 +1,109 @@
+"""
+Constants, field lists, and config loading for the spruce scraper.
+"""
+
+import logging
+import sys
+
+import yaml
+
+# Module-level logger, named after this module.
+log = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# File-name constants
+# ---------------------------------------------------------------------------
+
+# Default path of the YAML configuration file.
+DEFAULT_CONFIG = "config.yaml"
+# File name of the JSON file that records completed download URLs.
+PROGRESS_FILENAME = ".progress.json"
+# File names of the per-scan and per-tile CSV outputs.
+SCANS_CSV_FILENAME = "scans.csv"
+TILES_CSV_FILENAME = "tiles.csv"
+
+# ---------------------------------------------------------------------------
+# CSV field lists
+# ---------------------------------------------------------------------------
+
+# Column names, in output order, for the scans CSV.
+SCANS_CSV_FIELDS: list[str] = [
+    "machine",
+    "machine_id",
+    "scan_id",
+    "name",
+    "scan_time",
+    "start_x",
+    "start_y",
+    "end_x",
+    "end_y",
+    "dx",
+    "dy",
+    "nx",
+    "ny",
+    "total_tiles",
+    "scan_lines",
+    "scan_mode",
+    "start_datetime",
+    "end_datetime",
+    "status",
+    "user",
+    "disk_space_mb",
+    "mosaic_url",
+    "mosaic_local_path",
+    "mosaic_downloaded",
+]
+
+# Column names, in output order, for the tiles CSV.
+TILES_CSV_FIELDS: list[str] = [
+    "machine",
+    "machine_id",
+    "scan_id",
+    "scan_time",
+    "row_index",
+    "col_index",
+    "x_mm",
+    "y_mm",
+    "url",
+    "local_path",
+    "downloaded_at",
+    "file_size_bytes",
+]
+
+# ---------------------------------------------------------------------------
+# Worker safety
+# ---------------------------------------------------------------------------
+
+# Upper bound applied to the configured worker count.
+MAX_SAFE_WORKERS = 4  # above this the RootView server starts timing out
+
+
+def _clamp_workers(n: int) -> int:
+    """Return n clamped to MAX_SAFE_WORKERS, logging a warning if clamped."""
+    if n > MAX_SAFE_WORKERS:
+        log.warning(
+            "workers=%d exceeds the safe limit of %d. "
+            "The RootView server will time out under this load, causing lost tiles. "
+            "Capping at %d.",
+            n,
+            MAX_SAFE_WORKERS,
+            MAX_SAFE_WORKERS,
+        )
+        return MAX_SAFE_WORKERS
+    return n
+
+
+# ---------------------------------------------------------------------------
+# Config loader
+# ---------------------------------------------------------------------------
+
+
+def load_config(path: str) -> dict:
+    """Load and validate config.yaml. Exits on missing required fields."""
+    with open(path) as fh:
+        cfg = yaml.safe_load(fh)
+    missing = [k for k in ("username", "password") if not cfg.get(k)]
+    if missing:
+        sys.exit(f"Config {path} is missing required fields: {missing}")
+    cfg.setdefault("base_url", "http://205.149.147.131:8010/")
+    cfg.setdefault("image_base_url", "http://205.149.147.131:8011/")
+    cfg.setdefault("output_dir", "archives")
+    cfg.setdefault("workers", 2)
+    cfg.setdefault("timeout", 60)
+    cfg.setdefault("request_delay", 0.5)
+    cfg.setdefault("tile_scale", 1)
+    cfg["workers"] = _clamp_workers(cfg["workers"])
+    return cfg