Initial commit
Add spruce scraper with CLI, session management, parsers, progress tracking, recheck logic, and test suite. Includes example config and README.
This commit is contained in:
@@ -0,0 +1,213 @@
|
||||
"""
|
||||
Pure HTML / text parsing functions for the RootView web application.
|
||||
|
||||
All functions are side-effect-free: string (or list[str]) in, dict/list out.
|
||||
No network access, no filesystem access.
|
||||
"""
|
||||
|
||||
import math
|
||||
import re
|
||||
from typing import Any
|
||||
from urllib.parse import unquote
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Machine descriptor
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def parse_machine_option(label: str, raw_value: str) -> dict[str, Any]:
    """
    Decode the pipe-delimited <option> value for a machine.

    ``raw_value`` is URL-unquoted and split on "|". The fields are read
    positionally as: name, ip, port1, machine_id, port2, version. Fields
    missing from a short value become "", and fields beyond the sixth are
    ignored.

    Args:
        label: Visible text of the <option>; returned as "label" and used as
            the machine name when the encoded name field is empty.
        raw_value: The still-encoded option value; returned unchanged as
            "option_value".

    Returns:
        A dict with keys label, option_value, name, ip, port1, machine_id,
        port2 and version (all strings).
    """
    fields = unquote(raw_value).split("|")
    # Pad short values so the positional unpacking below never fails.
    fields.extend([""] * (6 - len(fields)))
    name, ip, port1, machine_id, port2, version = fields[:6]
    return {
        "label": label,
        "option_value": raw_value,
        # str.split() always yields at least one element, so a length check
        # can never trigger the fallback; test for an empty name instead.
        "name": name or label,
        "ip": ip,
        "port1": port1,
        "machine_id": machine_id,
        "port2": port2,
        "version": version,
    }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scan list row
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def parse_scan_row(cells: list[str]) -> dict[str, Any] | None:
|
||||
"""
|
||||
Parse one table row from the scan list into a scan dict.
|
||||
|
||||
Expected columns (from the observed HTML):
|
||||
ID, Name, Scan Time, Step Units, (X,Y)-(X,Y)-(DX,DY),
|
||||
Dwell Time ms, Scan Lines, Scan Mode, Start Time, End Time,
|
||||
Cancelled, User, Scan Status, Archived, [View link]
|
||||
|
||||
Returns None for header rows or rows whose first cell is not a digit.
|
||||
"""
|
||||
if not cells or not cells[0].strip().isdigit():
|
||||
return None
|
||||
try:
|
||||
scan_id = int(cells[0].strip())
|
||||
return {
|
||||
"scan_id": scan_id,
|
||||
"name": cells[1].strip() if len(cells) > 1 else "",
|
||||
"scan_time": cells[2].strip() if len(cells) > 2 else "",
|
||||
"step_units": cells[3].strip() if len(cells) > 3 else "",
|
||||
"coord_str": cells[4].strip() if len(cells) > 4 else "",
|
||||
"dwell_ms": cells[5].strip() if len(cells) > 5 else "",
|
||||
"scan_lines": cells[6].strip() if len(cells) > 6 else "",
|
||||
"scan_mode": cells[7].strip() if len(cells) > 7 else "",
|
||||
"start_datetime": cells[8].strip() if len(cells) > 8 else "",
|
||||
"end_datetime": cells[9].strip() if len(cells) > 9 else "",
|
||||
"cancelled": cells[10].strip() if len(cells) > 10 else "",
|
||||
"user": cells[11].strip() if len(cells) > 11 else "",
|
||||
"status": cells[12].strip() if len(cells) > 12 else "",
|
||||
"archived": cells[13].strip() if len(cells) > 13 else "",
|
||||
}
|
||||
except (ValueError, IndexError):
|
||||
return None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scan view page
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _parse_float(text: str) -> float | None:
    """Return ``float(text)``, or None when *text* is not a well-formed number."""
    try:
        return float(text)
    except ValueError:
        return None


def parse_scan_view(html: str) -> dict[str, Any]:
    """
    Extract scan metadata and grid parameters from a scan view page.

    Grid parameters are read from the query string of the first
    ``tile_view.php?...`` URL found in the page; any parameter missing there
    is looked up as a ``var <name> = <number>`` JavaScript declaration.
    Signed values are accepted, and a numeric match that ``float()`` cannot
    convert is treated as absent instead of raising.

    The remaining fields come from "<label>:" / value cell pairs in the
    page's table rows.

    Returns a dict containing whichever of these keys could be extracted:
        scan_id (int), name, scan_time, scan_lines, scan_mode,
        start_datetime, end_datetime, status, user (str),
        start_x_label, start_y_label, end_x_label, end_y_label,
        dx_label, dy_label (str, as shown in the table),
        start_x, start_y, end_x, end_y, dx, dy (float),
        nx, ny, total_tiles (int), disk_space_mb (float).

    nx / ny / total_tiles are taken from the "Total number of images" cell
    when present, otherwise derived from the grid parameters.
    """
    # (URL query key, JS variable name, canonical result key)
    grid_params = (
        ("sX", "startX", "start_x"),
        ("sY", "startY", "start_y"),
        ("eX", "endX", "end_x"),
        ("eY", "endY", "end_y"),
        ("dX", "deltaX", "dx"),
        ("dY", "deltaY", "dy"),
    )
    # Table labels whose neighbouring cell is stored verbatim.
    text_fields = {
        "Name:": "name",
        "Scan Time:": "scan_time",
        "Starting X:": "start_x_label",
        "Starting Y:": "start_y_label",
        "Ending X:": "end_x_label",
        "Ending Y:": "end_y_label",
        "DX:": "dx_label",
        "DY:": "dy_label",
        "Scan Lines:": "scan_lines",
        "Scan Mode:": "scan_mode",
        "Start Time:": "start_datetime",
        "End Time:": "end_datetime",
        "Scan Status:": "status",
        "User:": "user",
    }

    # The scan view embeds the grid params as query params in a JS string, e.g.:
    #   "include/tile_view.php?...&sX=0&sY=0&eX=310&eY=740&dX=3.01&dY=2.26&..."
    tile_url_m = re.search(r'tile_view\.php\?([^"\']+)', html)
    qs = tile_url_m.group(1) if tile_url_m else None

    grid: dict[str, float] = {}
    for qs_key, js_name, canon in grid_params:
        value = None
        if qs is not None:
            m = re.search(rf"(?:^|&){re.escape(qs_key)}=(-?[\d.]+)", qs)
            value = _parse_float(m.group(1)) if m else None
        if value is None:
            # Fallback: standalone JS var declaration (present in tile_view.php).
            m = re.search(rf"var\s+{js_name}\s*=\s*(-?[\d.]+)", html)
            value = _parse_float(m.group(1)) if m else None
        if value is not None:
            grid[canon] = value

    result: dict[str, Any] = {}

    # Extract from the data table: a label cell followed by its value cell.
    soup = BeautifulSoup(html, "lxml")
    for row in soup.find_all("tr"):
        cells = [td.get_text(strip=True) for td in row.find_all("td")]
        for label, value_text in zip(cells, cells[1:]):
            if label in text_fields:
                result[text_fields[label]] = value_text
            elif label == "Scan ID:":
                try:
                    result["scan_id"] = int(value_text)
                except ValueError:
                    # Best effort: leave scan_id unset when it is not an integer.
                    pass
            elif label == "Total number of images:":
                # Format: "33784 (103x328)"
                m = re.match(r"(\d+)\s*\((\d+)x(\d+)\)", value_text)
                if m:
                    result["total_tiles"] = int(m.group(1))
                    result["nx"] = int(m.group(2))
                    result["ny"] = int(m.group(3))
            elif label == "Total Disk Space:":
                m = re.search(r"([\d.]+)\s*Mb", value_text)
                disk_mb = _parse_float(m.group(1)) if m else None
                if disk_mb is not None:
                    result["disk_space_mb"] = disk_mb

    # Grid params go in after the table fields, already under canonical keys.
    result.update(grid)

    # Compute nx/ny from grid params if not parsed from table
    if "nx" not in result and all(k in result for k in ("start_x", "end_x", "dx")):
        result["nx"] = _grid_count(result["start_x"], result["end_x"], result["dx"])
    if "ny" not in result and all(k in result for k in ("start_y", "end_y", "dy")):
        result["ny"] = _grid_count(result["start_y"], result["end_y"], result["dy"])
    if "total_tiles" not in result and "nx" in result and "ny" in result:
        result["total_tiles"] = result["nx"] * result["ny"]

    return result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Grid helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _grid_count(start: float, end: float, step: float) -> int:
|
||||
"""Number of grid positions from start up to (but not including) end."""
|
||||
if step <= 0:
|
||||
return 0
|
||||
return math.ceil((end - start) / step)
|
||||
|
||||
|
||||
def _grid_values(start: float, count: int, step: float) -> list[float]:
|
||||
"""Generate `count` evenly-spaced grid positions, rounded to 2 dp."""
|
||||
return [round(start + i * step, 2) for i in range(count)]
|
||||
Reference in New Issue
Block a user