f2193011ca
- Add --metadata-only flag: fetches scan detail pages, writes metadata.json + scans.csv rows, skips all image downloads. Re-runs skip scans whose metadata.json already exists. - Atomic progress.json saves (temp-file rename). - Heal-on-resume: tiles on disk but not in progress are silently re-marked before building the pending list. - scans.csv dedup: skip row if mosaic URL already in progress. - Rename mosaic_downloaded -> mosaic_on_disk (reflects disk state). - --recheck now checks mosaics as well as tiles. - RunStats dataclass replaces raw int return; richer run summary. - Fix argparse allow_abbrev reverted; fix --scan-id + --metadata-only glob fallback when scan_time is absent. - Add .venv/ to .gitignore. - README: fix typo, update worker counts, document all new behaviour.
271 lines
9.4 KiB
Python
271 lines
9.4 KiB
Python
"""
|
|
HTTP session for a single RootView machine: login, scan listing, tile downloads.
|
|
"""
|
|
|
|
import logging
|
|
import time
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from urllib.parse import urljoin
|
|
from typing import Any
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
from spruce.parsers import parse_scan_row, parse_scan_view, _grid_values
|
|
|
|
# Module-level logger, named after this module; used by MachineSession for
# login, scan-listing and download diagnostics.
log = logging.getLogger(__name__)
|
|
|
|
# Value installed as the "User-Agent" header on the requests session that
# MachineSession creates, so it is sent with every request made through it.
USER_AGENT = "spruce-scraper/1.0"
|
|
|
|
|
|
class MachineSession:
    """Manages an authenticated HTTP session for one RootView machine.

    One instance wraps a single ``requests.Session`` (``self.http``) and
    provides login, paginated scan listing, scan-detail retrieval, tile URL
    enumeration and file downloads against ``config["base_url"]``.
    """

    def __init__(self, machine: dict[str, Any], config: dict[str, Any]) -> None:
        """Create the HTTP session; no network traffic happens here.

        Args:
            machine: Machine descriptor. This class reads the keys
                ``"label"``, ``"option_value"`` and ``"machine_id"``.
            config: Scraper configuration. This class reads ``"base_url"``,
                ``"username"``, ``"password"``, ``"timeout"``,
                ``"request_delay"`` and the optional ``"image_base_url"``
                and ``"tile_scale"``.
        """
        self.machine = machine
        self.cfg = config
        self.http = requests.Session()
        self.http.headers["User-Agent"] = USER_AGENT
        self.base_url: str = config["base_url"]
        self.image_base_url: str = config.get(
            "image_base_url", "http://205.149.147.131:8011/"
        )

    # ------------------------------------------------------------------
    # Auth
    # ------------------------------------------------------------------

    def login(self) -> bool:
        """POST the login form to ``index.php`` and report whether it worked.

        Returns:
            ``False`` when the request fails (transport error or HTTP error
            status) or when the response page contains an element with
            class ``error`` that has non-empty text; ``True`` otherwise.
        """
        url = urljoin(self.base_url, "index.php")
        payload = {
            "RTLLogin": "1",
            "RTLNAME": self.machine["option_value"],
            "RTLUSER": self.cfg["username"],
            "RTLPWD": self.cfg["password"],
            "rtl_latest_version": "3.0.0.18",
            "submit": " submit ",
        }
        try:
            resp = self.http.post(url, data=payload, timeout=self.cfg["timeout"])
            resp.raise_for_status()
        except requests.RequestException as exc:
            log.error("[%s] Login failed: %s", self.machine["label"], exc)
            return False

        # The HTTP status alone does not decide the outcome: a rejection is
        # detected from a non-empty ``class="error"`` element in the page.
        soup = BeautifulSoup(resp.text, "lxml")
        error_tag = soup.find(class_="error")
        if error_tag and error_tag.get_text(strip=True):
            log.error(
                "[%s] Login rejected: %s",
                self.machine["label"],
                error_tag.get_text(strip=True),
            )
            return False

        log.info("[%s] Login succeeded.", self.machine["label"])
        return True

    # ------------------------------------------------------------------
    # Scan list (paginated)
    # ------------------------------------------------------------------

    def get_all_scans(self) -> list[dict[str, Any]]:
        """Fetch the complete scan list across all pages.

        Uses a large FilterCount (320) to minimise round-trips and keeps
        requesting further pages while each page comes back full.

        Returns:
            Every scan dict produced by the page parser, in page order.

        Raises:
            requests.RequestException: Propagated from the page fetch.
        """
        all_scans: list[dict[str, Any]] = []
        start = 0
        page_size = 320

        while True:
            page_scans = self._fetch_scan_page(start, page_size)
            if not page_scans:
                break
            all_scans.extend(page_scans)
            log.debug(
                "[%s] Page start=%d: %d scans (total so far: %d)",
                self.machine["label"],
                start,
                len(page_scans),
                len(all_scans),
            )
            # A short page means the listing is exhausted.
            if len(page_scans) < page_size:
                break
            start += page_size
            # This pause is in addition to the one _fetch_scan_page takes
            # before each request.
            time.sleep(self.cfg["request_delay"])

        log.info("[%s] Found %d scans.", self.machine["label"], len(all_scans))
        return all_scans

    def _fetch_scan_page(
        self, start: int, page_size: int
    ) -> list[dict[str, Any]]:
        """POST the scan list form and parse the returned table.

        Sleeps ``cfg["request_delay"]`` seconds before the request.

        Args:
            start: Value sent as the form's ``start`` field.
            page_size: Value sent as the form's ``FilterCount`` field.

        Returns:
            One dict per ``<tr>`` for which ``parse_scan_row`` returned a
            truthy value; other rows are skipped.

        Raises:
            requests.RequestException: On transport failure or HTTP error
                status.
        """
        time.sleep(self.cfg["request_delay"])
        resp = self.http.post(
            urljoin(self.base_url, "index.php"),
            data={
                "cmd": "scan",
                "start": str(start),
                "order": "0",
                "order_dir": "1",
                "FilterScanStatus": "2",  # Completed scans
                "FilterUser": "",
                "hidedate": "",
                "FilterDtFrom": "",
                "FilterDtTo": "",
                "FilterIdFrom": "0",
                "FilterIdTo": "0",
                "FilterCount": str(page_size),
            },
            timeout=self.cfg["timeout"],
        )
        resp.raise_for_status()

        soup = BeautifulSoup(resp.text, "lxml")
        scans: list[dict[str, Any]] = []
        for row in soup.find_all("tr"):
            cells = [td.get_text(strip=True) for td in row.find_all("td")]
            scan = parse_scan_row(cells)
            if scan:
                scans.append(scan)
        return scans

    # ------------------------------------------------------------------
    # Scan detail
    # ------------------------------------------------------------------

    def get_scan_metadata(self, scan_id: int) -> dict[str, Any]:
        """Fetch the scan view page and extract grid parameters.

        Sleeps ``cfg["request_delay"]`` seconds before the request.

        Args:
            scan_id: Sent as the ``id`` query parameter.

        Returns:
            Whatever ``parse_scan_view`` extracts from the page HTML.

        Raises:
            requests.RequestException: On transport failure or HTTP error
                status.
        """
        time.sleep(self.cfg["request_delay"])
        resp = self.http.get(
            urljoin(self.base_url, "index.php"),
            params={"cmd": "scan", "mode": "view", "id": str(scan_id)},
            timeout=self.cfg["timeout"],
        )
        resp.raise_for_status()
        return parse_scan_view(resp.text)

    # ------------------------------------------------------------------
    # Tile enumeration
    # ------------------------------------------------------------------

    def enumerate_tiles(self, scan_meta: dict[str, Any]) -> list[dict[str, Any]]:
        """Generate the full list of tile descriptors for a scan.

        Args:
            scan_meta: Must contain ``"scan_id"``. The optional keys
                ``"nx"``/``"ny"`` default to 0, ``"start_x"``/``"start_y"``
                to 0.0 and ``"dx"``/``"dy"`` to 1.0.

        Returns:
            One dict per (y, x) pair with keys ``scan_id``, ``row_index``,
            ``col_index``, ``x_mm``, ``y_mm`` and ``url``. Rows follow the
            y values and columns the x values, with x varying fastest.

        Raises:
            KeyError: If ``scan_meta`` has no ``"scan_id"``.
        """
        scan_id = scan_meta["scan_id"]
        nx: int = scan_meta.get("nx", 0)
        ny: int = scan_meta.get("ny", 0)
        start_x: float = scan_meta.get("start_x", 0.0)
        start_y: float = scan_meta.get("start_y", 0.0)
        dx: float = scan_meta.get("dx", 1.0)
        dy: float = scan_meta.get("dy", 1.0)
        scale: int = self.cfg.get("tile_scale", 1)

        # NOTE(review): _grid_values presumably yields n coordinates starting
        # at `start` with spacing `d` - confirm against spruce.parsers.
        xs = _grid_values(start_x, nx, dx)
        ys = _grid_values(start_y, ny, dy)

        # Identical for every tile, so build it once instead of per tile.
        url_prefix = (
            urljoin(self.base_url, "index.php")
            + f"?cmd=image&mode=image_scan&id={scan_id}"
        )

        tiles: list[dict[str, Any]] = []
        for row_idx, y in enumerate(ys):
            for col_idx, x in enumerate(xs):
                tiles.append(
                    {
                        "scan_id": scan_id,
                        "row_index": row_idx,
                        "col_index": col_idx,
                        "x_mm": x,
                        "y_mm": y,
                        "url": url_prefix + f"&s={scale}&x={x}&y={y}",
                    }
                )
        return tiles

    # ------------------------------------------------------------------
    # Mosaic URL
    # ------------------------------------------------------------------

    def mosaic_url(self, scan_id: int) -> str:
        """Return the mosaic JPEG URL for ``scan_id`` under ``image_base_url``."""
        return urljoin(
            self.image_base_url, f"RootView_Database/{scan_id}/mosaic.jpg"
        )

    # ------------------------------------------------------------------
    # Downloads
    # ------------------------------------------------------------------

    def download_file(self, url: str, dest: Path, retries: int = 3) -> int:
        """Stream-download url to dest with retries.

        The body is streamed into a sibling ``<name>.part`` file and renamed
        onto ``dest`` only after it has been written completely, so ``dest``
        never holds a truncated download and an existing ``dest`` survives a
        failed attempt.

        Args:
            url: Resource to fetch through the session.
            dest: Final path; parent directories are created as needed.
            retries: Maximum number of attempts. The wait between attempts
                starts at 5 seconds and doubles each time.

        Returns:
            Bytes written, or 0 if every attempt failed (or ``retries`` is
            less than 1).
        """
        dest.parent.mkdir(parents=True, exist_ok=True)
        backoff = 5.0
        for attempt in range(1, retries + 1):
            try:
                return self._stream_to_file(url, dest)
            except Exception as exc:
                # Deliberately broad: downloads are best-effort and the
                # caller is told about failure through the 0 return value.
                if attempt < retries:
                    log.debug(
                        "Attempt %d/%d failed %s: %s — retrying in %.0fs",
                        attempt,
                        retries,
                        url,
                        exc,
                        backoff,
                    )
                    time.sleep(backoff)
                    backoff *= 2
                else:
                    log.warning(
                        "Download failed after %d attempts %s: %s",
                        retries,
                        url,
                        exc,
                    )
        return 0

    def _stream_to_file(self, url: str, dest: Path) -> int:
        """Make one download attempt; return the number of bytes written.

        Any exception from the request, the status check, the streaming loop
        or the file system propagates to the caller. The response is always
        closed and the ``.part`` file is always removed unless it was
        successfully renamed onto ``dest``.
        """
        partial = dest.with_name(dest.name + ".part")
        resp = self.http.get(url, timeout=self.cfg["timeout"], stream=True)
        try:
            resp.raise_for_status()
            size = 0
            with open(partial, "wb") as fh:
                for chunk in resp.iter_content(chunk_size=65536):
                    if chunk:
                        fh.write(chunk)
                        size += len(chunk)
            # Publish only a fully written file at the final path.
            partial.replace(dest)
            return size
        finally:
            try:
                # No-op after a successful rename; removes leftovers otherwise.
                partial.unlink(missing_ok=True)
            finally:
                resp.close()

    def download_tile(
        self, tile: dict[str, Any], dest: Path, dry_run: bool
    ) -> dict[str, Any]:
        """Download a single tile. Returns a metadata row dict.

        Args:
            tile: Tile descriptor with keys ``scan_id``, ``row_index``,
                ``col_index``, ``x_mm``, ``y_mm`` and ``url``; an optional
                ``scan_time`` is copied through (empty string if absent).
            dest: Where the tile is written.
            dry_run: When true, build the row but download nothing.

        Returns:
            The metadata row. ``downloaded_at`` (UTC ISO-8601 timestamp) and
            ``file_size_bytes`` are filled in only when the download wrote
            at least one byte; otherwise both stay as empty strings.
        """
        row: dict[str, Any] = {
            "machine": self.machine["label"],
            "machine_id": self.machine["machine_id"],
            "scan_id": tile["scan_id"],
            "scan_time": tile.get("scan_time", ""),
            "row_index": tile["row_index"],
            "col_index": tile["col_index"],
            "x_mm": tile["x_mm"],
            "y_mm": tile["y_mm"],
            "url": tile["url"],
            "local_path": str(dest),
            "downloaded_at": "",
            "file_size_bytes": "",
        }
        if dry_run:
            return row
        size = self.download_file(tile["url"], dest)
        if size:
            row["downloaded_at"] = datetime.now(timezone.utc).isoformat()
            row["file_size_bytes"] = size
        return row
|