# Repository snapshot: e122f6435a
# Commit message: Add spruce scraper with CLI, session management, parsers,
# progress tracking, recheck logic, and test suite. Includes example config
# and README.
# File info at that snapshot: 214 lines, 8.5 KiB, Python
"""
Pure HTML / text parsing functions for the RootView web application.

All functions are side-effect-free: string (or list[str]) in, dict/list out.
No network access, no filesystem access.
"""

import math
import re
from typing import Any
from urllib.parse import unquote

from bs4 import BeautifulSoup


# ---------------------------------------------------------------------------
# Machine descriptor
# ---------------------------------------------------------------------------


def parse_machine_option(label: str, raw_value: str) -> dict[str, Any]:
    """Decode the pipe-delimited <option> value for a machine.

    ``raw_value`` is URL-decoded and split on ``|``. The resulting fields are
    mapped positionally to ``name``, ``ip``, ``port1``, ``machine_id``,
    ``port2`` and ``version``; fields beyond the sixth are ignored and
    missing fields become empty strings.

    Args:
        label: Visible text of the <option>; used as ``name`` when the
            decoded value has no (or an empty) first field.
        raw_value: The still URL-encoded ``value`` attribute of the <option>.

    Returns:
        A dict with keys ``label``, ``option_value`` (the untouched
        ``raw_value``), ``name``, ``ip``, ``port1``, ``machine_id``,
        ``port2`` and ``version``. All values are strings.
    """
    fields = unquote(raw_value).split("|")
    # Pad so the unpacking below never fails on short values; a negative
    # multiplier yields an empty list, so long values are unaffected.
    fields += [""] * (6 - len(fields))
    name, ip, port1, machine_id, port2, version = fields[:6]
    return {
        "label": label,
        "option_value": raw_value,
        # str.split() always yields at least one element, so an absent name
        # shows up as an empty string rather than a missing field.
        "name": name or label,
        "ip": ip,
        "port1": port1,
        "machine_id": machine_id,
        "port2": port2,
        "version": version,
    }


# ---------------------------------------------------------------------------
# Scan list row
# ---------------------------------------------------------------------------


def parse_scan_row(cells: list[str]) -> dict[str, Any] | None:
|
|
"""
|
|
Parse one table row from the scan list into a scan dict.
|
|
|
|
Expected columns (from the observed HTML):
|
|
ID, Name, Scan Time, Step Units, (X,Y)-(X,Y)-(DX,DY),
|
|
Dwell Time ms, Scan Lines, Scan Mode, Start Time, End Time,
|
|
Cancelled, User, Scan Status, Archived, [View link]
|
|
|
|
Returns None for header rows or rows whose first cell is not a digit.
|
|
"""
|
|
if not cells or not cells[0].strip().isdigit():
|
|
return None
|
|
try:
|
|
scan_id = int(cells[0].strip())
|
|
return {
|
|
"scan_id": scan_id,
|
|
"name": cells[1].strip() if len(cells) > 1 else "",
|
|
"scan_time": cells[2].strip() if len(cells) > 2 else "",
|
|
"step_units": cells[3].strip() if len(cells) > 3 else "",
|
|
"coord_str": cells[4].strip() if len(cells) > 4 else "",
|
|
"dwell_ms": cells[5].strip() if len(cells) > 5 else "",
|
|
"scan_lines": cells[6].strip() if len(cells) > 6 else "",
|
|
"scan_mode": cells[7].strip() if len(cells) > 7 else "",
|
|
"start_datetime": cells[8].strip() if len(cells) > 8 else "",
|
|
"end_datetime": cells[9].strip() if len(cells) > 9 else "",
|
|
"cancelled": cells[10].strip() if len(cells) > 10 else "",
|
|
"user": cells[11].strip() if len(cells) > 11 else "",
|
|
"status": cells[12].strip() if len(cells) > 12 else "",
|
|
"archived": cells[13].strip() if len(cells) > 13 else "",
|
|
}
|
|
except (ValueError, IndexError):
|
|
return None


# ---------------------------------------------------------------------------
# Scan view page
# ---------------------------------------------------------------------------


def parse_scan_view(html: str) -> dict[str, Any]:
    """
    Extract grid parameters and scan metadata from a scan view page.

    Grid parameters are read from the query string of the first
    ``tile_view.php?...`` URL found in the page; any parameter missing (or
    malformed) there falls back to a ``var <name> = <number>`` JavaScript
    declaration. Metadata is read from ``<td>`` label/value pairs.

    Args:
        html: Full HTML source of the scan view page.

    Returns:
        A dict containing whichever of these keys could be extracted:
          start_x, start_y, end_x, end_y, dx, dy   (float, from URL / JS)
          scan_id                                  (int)
          name, scan_time, start_x_label, start_y_label, end_x_label,
          end_y_label, dx_label, dy_label, scan_lines, scan_mode,
          start_datetime, end_datetime, status, user   (str, from the table)
          total_tiles, nx, ny                      (int)
          disk_space_mb                            (float)
        nx / ny / total_tiles are computed from the grid parameters when the
        table does not provide them.
    """

    def to_float(text: str) -> float | None:
        """Return float(text), or None when text is not a valid number."""
        # The numeric regexes below use [\d.]+, which also matches strings
        # such as "1.2.3" or "." that float() rejects.
        try:
            return float(text)
        except ValueError:
            return None

    # (URL query key, JS variable name, canonical result key)
    grid_fields = (
        ("sX", "startX", "start_x"),
        ("sY", "startY", "start_y"),
        ("eX", "endX", "end_x"),
        ("eY", "endY", "end_y"),
        ("dX", "deltaX", "dx"),
        ("dY", "deltaY", "dy"),
    )

    # Table labels whose neighbouring cell is stored verbatim.
    text_fields = {
        "Name:": "name",
        "Scan Time:": "scan_time",
        "Starting X:": "start_x_label",
        "Starting Y:": "start_y_label",
        "Ending X:": "end_x_label",
        "Ending Y:": "end_y_label",
        "DX:": "dx_label",
        "DY:": "dy_label",
        "Scan Lines:": "scan_lines",
        "Scan Mode:": "scan_mode",
        "Start Time:": "start_datetime",
        "End Time:": "end_datetime",
        "Scan Status:": "status",
        "User:": "user",
    }

    # Extract grid params from the show_tile() URL inside the page JS.
    # The scan view embeds them as query params in a JS string, e.g.:
    #   "include/tile_view.php?...&sX=0&sY=0&eX=310&eY=740&dX=3.01&dY=2.26&..."
    tile_url_m = re.search(r'tile_view\.php\?([^"\']+)', html)
    query = tile_url_m.group(1) if tile_url_m else None

    grid: dict[str, float] = {}
    for qs_key, js_name, canon in grid_fields:
        value = None
        if query is not None:
            m = re.search(rf"(?:^|&){re.escape(qs_key)}=([\d.]+)", query)
            if m:
                value = to_float(m.group(1))
        if value is None:
            # Fallback: standalone JS var declarations (present in tile_view.php)
            m = re.search(rf"var\s+{js_name}\s*=\s*([\d.]+)", html)
            if m:
                value = to_float(m.group(1))
        if value is not None:
            grid[canon] = value

    result: dict[str, Any] = {}

    # Extract from the data table: every cell is checked as a potential
    # label, with the cell to its right as the value.
    soup = BeautifulSoup(html, "lxml")
    for row in soup.find_all("tr"):
        cells = [td.get_text(strip=True) for td in row.find_all("td")]
        for label, value in zip(cells, cells[1:]):
            if label in text_fields:
                result[text_fields[label]] = value
            elif label == "Scan ID:":
                try:
                    result["scan_id"] = int(value)
                except ValueError:
                    # Non-numeric ID: leave "scan_id" unset.
                    pass
            elif label == "Total number of images:":
                # Format: "33784 (103x328)"
                m = re.match(r"(\d+)\s*\((\d+)x(\d+)\)", value)
                if m:
                    result["total_tiles"] = int(m.group(1))
                    result["nx"] = int(m.group(2))
                    result["ny"] = int(m.group(3))
            elif label == "Total Disk Space:":
                m = re.search(r"([\d.]+)\s*Mb", value)
                if m:
                    disk_mb = to_float(m.group(1))
                    if disk_mb is not None:
                        result["disk_space_mb"] = disk_mb

    # Grid params go in after the table fields (table keys never collide
    # with these names).
    result.update(grid)

    # Compute nx/ny from grid params if not parsed from table
    if "nx" not in result and all(k in result for k in ("start_x", "end_x", "dx")):
        result["nx"] = _grid_count(result["start_x"], result["end_x"], result["dx"])
    if "ny" not in result and all(k in result for k in ("start_y", "end_y", "dy")):
        result["ny"] = _grid_count(result["start_y"], result["end_y"], result["dy"])
    if "total_tiles" not in result and "nx" in result and "ny" in result:
        result["total_tiles"] = result["nx"] * result["ny"]

    return result


# ---------------------------------------------------------------------------
# Grid helpers
# ---------------------------------------------------------------------------


def _grid_count(start: float, end: float, step: float) -> int:
|
|
"""Number of grid positions from start up to (but not including) end."""
|
|
if step <= 0:
|
|
return 0
|
|
return math.ceil((end - start) / step)
|
|
|
|
|
|
def _grid_values(start: float, count: int, step: float) -> list[float]:
|
|
"""Generate `count` evenly-spaced grid positions, rounded to 2 dp."""
|
|
return [round(start + i * step, 2) for i in range(count)]
|