Initial commit

Add spruce scraper with CLI, session management, parsers, progress tracking, recheck logic, and test suite. Includes example config and README.
2026-04-22 10:41:18 -04:00
commit e122f6435a
23 changed files with 3789 additions and 0 deletions
@@ -0,0 +1,274 @@
+"""
+HTTP session for a single RootView machine: login, scan listing, tile downloads.
+"""
+
+import logging
+import time
+from datetime import datetime, timezone
+from pathlib import Path
+from urllib.parse import urljoin
+from typing import Any
+
+import requests
+from bs4 import BeautifulSoup
+
+from spruce.parsers import parse_scan_row, parse_scan_view, _grid_values
+
+log = logging.getLogger(__name__)
+
+USER_AGENT = "spruce-scraper/1.0"
+
+
+class MachineSession:
+    """Manages an authenticated HTTP session for one RootView machine."""
+
+    def __init__(self, machine: dict[str, Any], config: dict[str, Any]) -> None:
+        self.machine = machine
+        self.cfg = config
+        self.http = requests.Session()
+        self.http.headers["User-Agent"] = USER_AGENT
+        self.base_url: str = config["base_url"]
+        self.image_base_url: str = config.get(
+            "image_base_url", "http://205.149.147.131:8011/"
+        )
+
+    # ------------------------------------------------------------------
+    # Auth
+    # ------------------------------------------------------------------
+
+    def login(self) -> bool:
+        url = urljoin(self.base_url, "index.php")
+        payload = {
+            "RTLLogin": "1",
+            "RTLNAME": self.machine["option_value"],
+            "RTLUSER": self.cfg["username"],
+            "RTLPWD": self.cfg["password"],
+            "rtl_latest_version": "3.0.0.18",
+            "submit": " submit ",
+        }
+        try:
+            resp = self.http.post(url, data=payload, timeout=self.cfg["timeout"])
+            resp.raise_for_status()
+        except requests.RequestException as exc:
+            log.error("[%s] Login failed: %s", self.machine["label"], exc)
+            return False
+
+        soup = BeautifulSoup(resp.text, "lxml")
+        error_tag = soup.find(class_="error")
+        if error_tag and error_tag.get_text(strip=True):
+            log.error(
+                "[%s] Login rejected: %s",
+                self.machine["label"],
+                error_tag.get_text(strip=True),
+            )
+            return False
+
+        log.info("[%s] Login succeeded.", self.machine["label"])
+        return True
+
+    # ------------------------------------------------------------------
+    # Scan list (paginated)
+    # ------------------------------------------------------------------
+
+    def get_all_scans(self) -> list[dict[str, Any]]:
+        """
+        Fetch the complete scan list across all pages.
+
+        Uses a large FilterCount (320) to minimise round-trips.
+        Falls back to repeated pages if the list is longer.
+        """
+        all_scans: list[dict[str, Any]] = []
+        start = 0
+        page_size = 320
+
+        while True:
+            page_scans = self._fetch_scan_page(start, page_size)
+            if not page_scans:
+                break
+            all_scans.extend(page_scans)
+            log.debug(
+                "[%s] Page start=%d: %d scans (total so far: %d)",
+                self.machine["label"],
+                start,
+                len(page_scans),
+                len(all_scans),
+            )
+            if len(page_scans) < page_size:
+                break
+            start += page_size
+            time.sleep(self.cfg["request_delay"])
+
+        log.info("[%s] Found %d scans.", self.machine["label"], len(all_scans))
+        return all_scans
+
+    def _fetch_scan_page(
+        self, start: int, page_size: int
+    ) -> list[dict[str, Any]]:
+        """POST the scan list form and parse the returned table."""
+        time.sleep(self.cfg["request_delay"])
+        resp = self.http.post(
+            urljoin(self.base_url, "index.php"),
+            data={
+                "cmd": "scan",
+                "start": str(start),
+                "order": "0",
+                "order_dir": "1",
+                "FilterScanStatus": "2",  # Completed scans
+                "FilterUser": "",
+                "hidedate": "",
+                "FilterDtFrom": "",
+                "FilterDtTo": "",
+                "FilterIdFrom": "0",
+                "FilterIdTo": "0",
+                "FilterCount": str(page_size),
+            },
+            timeout=self.cfg["timeout"],
+        )
+        resp.raise_for_status()
+
+        soup = BeautifulSoup(resp.text, "lxml")
+        scans: list[dict[str, Any]] = []
+        for row in soup.find_all("tr"):
+            cells = [td.get_text(strip=True) for td in row.find_all("td")]
+            scan = parse_scan_row(cells)
+            if scan:
+                scans.append(scan)
+        return scans
+
+    # ------------------------------------------------------------------
+    # Scan detail
+    # ------------------------------------------------------------------
+
+    def get_scan_metadata(self, scan_id: int) -> dict[str, Any]:
+        """Fetch the scan view page and extract grid parameters."""
+        time.sleep(self.cfg["request_delay"])
+        resp = self.http.get(
+            urljoin(self.base_url, "index.php"),
+            params={"cmd": "scan", "mode": "view", "id": str(scan_id)},
+            timeout=self.cfg["timeout"],
+        )
+        resp.raise_for_status()
+        return parse_scan_view(resp.text)
+
+    # ------------------------------------------------------------------
+    # Tile enumeration
+    # ------------------------------------------------------------------
+
+    def enumerate_tiles(self, scan_meta: dict[str, Any]) -> list[dict[str, Any]]:
+        """
+        Generate the full list of tile descriptors for a scan.
+
+        Each descriptor has: url, row_index, col_index, x_mm, y_mm
+        """
+        scan_id = scan_meta["scan_id"]
+        nx: int = scan_meta.get("nx", 0)
+        ny: int = scan_meta.get("ny", 0)
+        start_x: float = scan_meta.get("start_x", 0.0)
+        start_y: float = scan_meta.get("start_y", 0.0)
+        dx: float = scan_meta.get("dx", 1.0)
+        dy: float = scan_meta.get("dy", 1.0)
+        scale: int = self.cfg.get("tile_scale", 1)
+
+        xs = _grid_values(start_x, nx, dx)
+        ys = _grid_values(start_y, ny, dy)
+
+        tiles: list[dict[str, Any]] = []
+        for row_idx, y in enumerate(ys):
+            for col_idx, x in enumerate(xs):
+                url = (
+                    urljoin(self.base_url, "index.php")
+                    + f"?cmd=image&mode=image_scan&id={scan_id}"
+                    + f"&s={scale}&x={x}&y={y}"
+                )
+                tiles.append(
+                    {
+                        "scan_id": scan_id,
+                        "row_index": row_idx,
+                        "col_index": col_idx,
+                        "x_mm": x,
+                        "y_mm": y,
+                        "url": url,
+                    }
+                )
+        return tiles
+
+    # ------------------------------------------------------------------
+    # Mosaic URL
+    # ------------------------------------------------------------------
+
+    def mosaic_url(self, scan_id: int) -> str:
+        return urljoin(
+            self.image_base_url, f"RootView_Database/{scan_id}/mosaic.jpg"
+        )
+
+    # ------------------------------------------------------------------
+    # Downloads
+    # ------------------------------------------------------------------
+
+    def download_file(self, url: str, dest: Path, retries: int = 3) -> int:
+        """Stream-download url to dest with retries. Returns bytes written (0 on failure)."""
+        dest.parent.mkdir(parents=True, exist_ok=True)
+        backoff = 5.0
+        for attempt in range(1, retries + 1):
+            try:
+                resp = self.http.get(
+                    url, timeout=self.cfg["timeout"], stream=True
+                )
+                resp.raise_for_status()
+                size = 0
+                with open(dest, "wb") as fh:
+                    for chunk in resp.iter_content(chunk_size=65536):
+                        if chunk:
+                            fh.write(chunk)
+                            size += len(chunk)
+                return size
+            except Exception as exc:
+                if attempt < retries:
+                    log.debug(
+                        "Attempt %d/%d failed %s: %s — retrying in %.0fs",
+                        attempt,
+                        retries,
+                        url,
+                        exc,
+                        backoff,
+                    )
+                    time.sleep(backoff)
+                    backoff *= 2
+                else:
+                    log.warning(
+                        "Download failed after %d attempts %s: %s",
+                        retries,
+                        url,
+                        exc,
+                    )
+        return 0
+
+    def download_tile(
+        self, tile: dict[str, Any], dest: Path, dry_run: bool
+    ) -> dict[str, Any]:
+        """Download a single tile. Returns a metadata row dict."""
+        row: dict[str, Any] = {
+            "machine": self.machine["label"],
+            "machine_id": self.machine["machine_id"],
+            "scan_id": tile["scan_id"],
+            "scan_time": tile.get("scan_time", ""),
+            "row_index": tile["row_index"],
+            "col_index": tile["col_index"],
+            "x_mm": tile["x_mm"],
+            "y_mm": tile["y_mm"],
+            "url": tile["url"],
+            "local_path": str(dest),
+            "downloaded_at": "",
+            "file_size_bytes": "",
+        }
+        if dry_run:
+            return row
+        if dest.exists():
+            row["downloaded_at"] = "already_exists"
+            row["file_size_bytes"] = dest.stat().st_size
+            return row
+        size = self.download_file(tile["url"], dest)
+        if size:
+            row["downloaded_at"] = datetime.now(timezone.utc).isoformat()
+            row["file_size_bytes"] = size
+        return row