e122f6435a
Add spruce scraper with CLI, session management, parsers, progress tracking, recheck logic, and test suite. Includes example config and README.
275 lines
9.5 KiB
Python
275 lines
9.5 KiB
Python
"""
|
|
HTTP session for a single RootView machine: login, scan listing, tile downloads.
|
|
"""
|
|
|
|
import logging
|
|
import time
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from urllib.parse import urljoin
|
|
from typing import Any
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
from spruce.parsers import parse_scan_row, parse_scan_view, _grid_values
|
|
|
|
# Module-level logger, named after this module so log output can be filtered per module.
log = logging.getLogger(__name__)
|
|
|
|
# Value installed as the "User-Agent" header on each MachineSession's HTTP session.
USER_AGENT = "spruce-scraper/1.0"
|
|
|
|
|
|
class MachineSession:
    """Manages an authenticated HTTP session for one RootView machine.

    The session wraps a single ``requests.Session`` (so cookies set by
    ``login`` are reused by every later request) and exposes helpers to list
    scans, read a scan's grid metadata, enumerate tile URLs and download
    files.

    Keys read from ``machine``: ``"option_value"``, ``"label"``,
    ``"machine_id"``.
    Keys read from ``config``: ``"base_url"``, ``"username"``, ``"password"``,
    ``"timeout"``, ``"request_delay"`` and, optionally, ``"image_base_url"``
    and ``"tile_scale"``.
    """

    # Fallback image host used when the config has no "image_base_url" entry.
    DEFAULT_IMAGE_BASE_URL = "http://205.149.147.131:8011/"

    # Number of rows requested per scan-list POST (sent as "FilterCount").
    SCAN_PAGE_SIZE = 320

    def __init__(self, machine: dict[str, Any], config: dict[str, Any]) -> None:
        """Create the underlying HTTP session; no network traffic happens here.

        Args:
            machine: Descriptor of the machine this session talks to.
            config: Scraper configuration; ``"base_url"`` is required.

        Raises:
            KeyError: If ``config`` has no ``"base_url"`` entry.
        """
        self.machine = machine
        self.cfg = config
        self.http = requests.Session()
        self.http.headers["User-Agent"] = USER_AGENT
        self.base_url: str = config["base_url"]
        self.image_base_url: str = config.get(
            "image_base_url", self.DEFAULT_IMAGE_BASE_URL
        )

    # ------------------------------------------------------------------
    # Auth
    # ------------------------------------------------------------------

    def login(self) -> bool:
        """POST the login form for this machine.

        Returns:
            ``True`` when the request succeeded and the response page holds no
            non-empty element with CSS class ``error``; ``False`` on any
            request failure or when such an error element is present (both
            cases are logged at ERROR level).
        """
        url = urljoin(self.base_url, "index.php")
        payload = {
            "RTLLogin": "1",
            "RTLNAME": self.machine["option_value"],
            "RTLUSER": self.cfg["username"],
            "RTLPWD": self.cfg["password"],
            "rtl_latest_version": "3.0.0.18",
            "submit": " submit ",
        }
        try:
            resp = self.http.post(url, data=payload, timeout=self.cfg["timeout"])
            resp.raise_for_status()
        except requests.RequestException as exc:
            log.error("[%s] Login failed: %s", self.machine["label"], exc)
            return False

        # An HTTP success status alone is not treated as a successful login:
        # the returned page is inspected for an error message.
        soup = BeautifulSoup(resp.text, "lxml")
        error_tag = soup.find(class_="error")
        error_text = error_tag.get_text(strip=True) if error_tag else ""
        if error_text:
            log.error(
                "[%s] Login rejected: %s",
                self.machine["label"],
                error_text,
            )
            return False

        log.info("[%s] Login succeeded.", self.machine["label"])
        return True

    # ------------------------------------------------------------------
    # Scan list (paginated)
    # ------------------------------------------------------------------

    def get_all_scans(self) -> list[dict[str, Any]]:
        """Fetch the complete scan list across all pages.

        Uses a large FilterCount (``SCAN_PAGE_SIZE``) to minimise round-trips
        and keeps requesting further pages while each page comes back full.

        Pagination stops when a page is empty, is shorter than the requested
        size, or is identical to the previous page (a guard against looping
        forever if the ``start`` offset is not honoured).

        Returns:
            Every scan dict produced by ``parse_scan_row``, in page order.

        Raises:
            requests.RequestException: Propagated from ``_fetch_scan_page``.
        """
        all_scans: list[dict[str, Any]] = []
        page_size = self.SCAN_PAGE_SIZE
        start = 0
        previous_page: list[dict[str, Any]] | None = None

        while True:
            # _fetch_scan_page sleeps for "request_delay" before each request,
            # so no additional throttling is needed in this loop.
            page_scans = self._fetch_scan_page(start, page_size)
            if not page_scans:
                break
            if page_scans == previous_page:
                log.warning(
                    "[%s] Page start=%d repeats the previous page; "
                    "stopping pagination.",
                    self.machine["label"],
                    start,
                )
                break
            all_scans.extend(page_scans)
            log.debug(
                "[%s] Page start=%d: %d scans (total so far: %d)",
                self.machine["label"],
                start,
                len(page_scans),
                len(all_scans),
            )
            if len(page_scans) < page_size:
                break
            previous_page = page_scans
            start += page_size

        log.info("[%s] Found %d scans.", self.machine["label"], len(all_scans))
        return all_scans

    def _fetch_scan_page(
        self, start: int, page_size: int
    ) -> list[dict[str, Any]]:
        """POST the scan list form and parse the returned table.

        Sleeps for ``cfg["request_delay"]`` before sending the request.

        Args:
            start: Value sent as the form's ``start`` field.
            page_size: Value sent as the form's ``FilterCount`` field.

        Returns:
            One dict per ``<tr>`` for which ``parse_scan_row`` returned a
            truthy value; other rows are skipped.

        Raises:
            requests.RequestException: On transport errors or an HTTP error
                status.
        """
        time.sleep(self.cfg["request_delay"])
        resp = self.http.post(
            urljoin(self.base_url, "index.php"),
            data={
                "cmd": "scan",
                "start": str(start),
                "order": "0",
                "order_dir": "1",
                "FilterScanStatus": "2",  # Completed scans
                "FilterUser": "",
                "hidedate": "",
                "FilterDtFrom": "",
                "FilterDtTo": "",
                "FilterIdFrom": "0",
                "FilterIdTo": "0",
                "FilterCount": str(page_size),
            },
            timeout=self.cfg["timeout"],
        )
        resp.raise_for_status()

        soup = BeautifulSoup(resp.text, "lxml")
        scans: list[dict[str, Any]] = []
        for row in soup.find_all("tr"):
            cells = [td.get_text(strip=True) for td in row.find_all("td")]
            scan = parse_scan_row(cells)
            if scan:
                scans.append(scan)
        return scans

    # ------------------------------------------------------------------
    # Scan detail
    # ------------------------------------------------------------------

    def get_scan_metadata(self, scan_id: int) -> dict[str, Any]:
        """Fetch the scan view page and extract grid parameters.

        Sleeps for ``cfg["request_delay"]`` before sending the request.

        Args:
            scan_id: Identifier sent as the ``id`` query parameter.

        Returns:
            Whatever ``parse_scan_view`` extracts from the page HTML.

        Raises:
            requests.RequestException: On transport errors or an HTTP error
                status.
        """
        time.sleep(self.cfg["request_delay"])
        resp = self.http.get(
            urljoin(self.base_url, "index.php"),
            params={"cmd": "scan", "mode": "view", "id": str(scan_id)},
            timeout=self.cfg["timeout"],
        )
        resp.raise_for_status()
        return parse_scan_view(resp.text)

    # ------------------------------------------------------------------
    # Tile enumeration
    # ------------------------------------------------------------------

    def enumerate_tiles(self, scan_meta: dict[str, Any]) -> list[dict[str, Any]]:
        """Generate the full list of tile descriptors for a scan.

        Performs no network I/O. Grid coordinates come from ``_grid_values``
        (imported from ``spruce.parsers``), called once per axis with
        ``(start, count, step)``.

        Args:
            scan_meta: Scan metadata. ``"scan_id"`` is required; ``"nx"`` and
                ``"ny"`` default to 0, ``"start_x"`` and ``"start_y"`` to 0.0,
                ``"dx"`` and ``"dy"`` to 1.0.

        Returns:
            One dict per (y, x) pair, rows outermost, with keys ``scan_id``,
            ``row_index``, ``col_index``, ``x_mm``, ``y_mm`` and ``url``.

        Raises:
            KeyError: If ``scan_meta`` has no ``"scan_id"`` entry.
        """
        scan_id = scan_meta["scan_id"]
        nx: int = scan_meta.get("nx", 0)
        ny: int = scan_meta.get("ny", 0)
        start_x: float = scan_meta.get("start_x", 0.0)
        start_y: float = scan_meta.get("start_y", 0.0)
        dx: float = scan_meta.get("dx", 1.0)
        dy: float = scan_meta.get("dy", 1.0)
        scale: int = self.cfg.get("tile_scale", 1)

        xs = _grid_values(start_x, nx, dx)
        ys = _grid_values(start_y, ny, dy)

        # The endpoint and the scan-specific query prefix are the same for
        # every tile, so they are built once outside the loops.
        url_prefix = (
            urljoin(self.base_url, "index.php")
            + f"?cmd=image&mode=image_scan&id={scan_id}"
            + f"&s={scale}"
        )

        tiles: list[dict[str, Any]] = []
        for row_idx, y in enumerate(ys):
            for col_idx, x in enumerate(xs):
                tiles.append(
                    {
                        "scan_id": scan_id,
                        "row_index": row_idx,
                        "col_index": col_idx,
                        "x_mm": x,
                        "y_mm": y,
                        "url": f"{url_prefix}&x={x}&y={y}",
                    }
                )
        return tiles

    # ------------------------------------------------------------------
    # Mosaic URL
    # ------------------------------------------------------------------

    def mosaic_url(self, scan_id: int) -> str:
        """Return the mosaic image URL for ``scan_id`` under ``image_base_url``."""
        return urljoin(
            self.image_base_url, f"RootView_Database/{scan_id}/mosaic.jpg"
        )

    # ------------------------------------------------------------------
    # Downloads
    # ------------------------------------------------------------------

    @staticmethod
    def _discard(path: Path) -> None:
        """Remove ``path`` if present, ignoring filesystem errors."""
        try:
            path.unlink()
        except OSError:
            # Missing file or failed cleanup must not mask the download error.
            pass

    def download_file(self, url: str, dest: Path, retries: int = 3) -> int:
        """Stream-download ``url`` to ``dest`` with retries.

        Data is written to a ``.part`` sibling of ``dest`` and moved into
        place only after the whole body has been received, so an interrupted
        transfer never leaves a truncated file at ``dest``.

        Args:
            url: Address to fetch with the session's cookies and headers.
            dest: Final file path; parent directories are created as needed.
            retries: Maximum number of attempts. The wait between attempts
                starts at 5 seconds and doubles after each failure.

        Returns:
            Bytes written, or 0 when every attempt failed (or ``retries`` is
            not positive).
        """
        dest.parent.mkdir(parents=True, exist_ok=True)
        partial = dest.with_name(dest.name + ".part")
        backoff = 5.0
        for attempt in range(1, retries + 1):
            try:
                resp = self.http.get(
                    url, timeout=self.cfg["timeout"], stream=True
                )
                try:
                    resp.raise_for_status()
                    size = 0
                    with open(partial, "wb") as fh:
                        for chunk in resp.iter_content(chunk_size=65536):
                            if chunk:
                                fh.write(chunk)
                                size += len(chunk)
                finally:
                    # With stream=True the connection stays checked out until
                    # the body is fully consumed or the response is closed.
                    resp.close()
                partial.replace(dest)
                return size
            except Exception as exc:  # retry boundary: any failure is retried
                self._discard(partial)
                if attempt < retries:
                    log.debug(
                        "Attempt %d/%d failed %s: %s — retrying in %.0fs",
                        attempt,
                        retries,
                        url,
                        exc,
                        backoff,
                    )
                    time.sleep(backoff)
                    backoff *= 2
                else:
                    log.warning(
                        "Download failed after %d attempts %s: %s",
                        retries,
                        url,
                        exc,
                    )
        return 0

    def download_tile(
        self, tile: dict[str, Any], dest: Path, dry_run: bool
    ) -> dict[str, Any]:
        """Download a single tile. Returns a metadata row dict.

        Args:
            tile: Tile descriptor; ``scan_id``, ``row_index``, ``col_index``,
                ``x_mm``, ``y_mm`` and ``url`` are required, ``scan_time`` is
                optional (defaults to ``""``).
            dest: Local path for the tile image.
            dry_run: When true, nothing is checked or downloaded and the row
                is returned with empty ``downloaded_at``/``file_size_bytes``.

        Returns:
            The metadata row. ``downloaded_at`` is ``"already_exists"`` when
            ``dest`` was already present, a UTC ISO-8601 timestamp after a
            download that wrote at least one byte, and ``""`` otherwise.
        """
        row: dict[str, Any] = {
            "machine": self.machine["label"],
            "machine_id": self.machine["machine_id"],
            "scan_id": tile["scan_id"],
            "scan_time": tile.get("scan_time", ""),
            "row_index": tile["row_index"],
            "col_index": tile["col_index"],
            "x_mm": tile["x_mm"],
            "y_mm": tile["y_mm"],
            "url": tile["url"],
            "local_path": str(dest),
            "downloaded_at": "",
            "file_size_bytes": "",
        }
        if dry_run:
            return row
        if dest.exists():
            row["downloaded_at"] = "already_exists"
            row["file_size_bytes"] = dest.stat().st_size
            return row
        size = self.download_file(tile["url"], dest)
        if size:
            row["downloaded_at"] = datetime.now(timezone.utc).isoformat()
            row["file_size_bytes"] = size
        return row
|