"""
Pure HTML / text parsing functions for the RootView web application.
All functions are side-effect-free: string (or list[str]) in, dict/list out.
No network access, no filesystem access.
"""
import math
import re
from typing import Any
from urllib.parse import unquote
from bs4 import BeautifulSoup
# ---------------------------------------------------------------------------
# Machine descriptor
# ---------------------------------------------------------------------------
def parse_machine_option(label: str, raw_value: str) -> dict[str, Any]:
    """Decode the pipe-delimited value for a machine.

    ``raw_value`` is percent-decoded and split on ``|``. The segments are
    read positionally as name, ip, port1, machine_id, port2 and version;
    segments that are absent come back as empty strings, and any segments
    beyond the sixth are ignored.

    Args:
        label: Display label for the machine; also used as the name when the
            decoded value does not carry one.
        raw_value: The still-encoded value string. It is returned unchanged
            under ``"option_value"``.

    Returns:
        A dict with the keys ``label``, ``option_value``, ``name``, ``ip``,
        ``port1``, ``machine_id``, ``port2`` and ``version``.
    """
    segments = unquote(raw_value).split("|")
    # Pad to six entries so every positional field can be unpacked directly.
    name, ip, port1, machine_id, port2, version = (segments + [""] * 6)[:6]
    return {
        "label": label,
        "option_value": raw_value,
        # str.split() never returns an empty list, so a missing name shows up
        # as "" rather than as a missing segment; fall back to the label then.
        "name": name or label,
        "ip": ip,
        "port1": port1,
        "machine_id": machine_id,
        "port2": port2,
        "version": version,
    }
# ---------------------------------------------------------------------------
# Scan list row
# ---------------------------------------------------------------------------
def parse_scan_row(cells: list[str]) -> dict[str, Any] | None:
"""
Parse one table row from the scan list into a scan dict.
Expected columns (from the observed HTML):
ID, Name, Scan Time, Step Units, (X,Y)-(X,Y)-(DX,DY),
Dwell Time ms, Scan Lines, Scan Mode, Start Time, End Time,
Cancelled, User, Scan Status, Archived, [View link]
Returns None for header rows or rows whose first cell is not a digit.
"""
if not cells or not cells[0].strip().isdigit():
return None
try:
scan_id = int(cells[0].strip())
return {
"scan_id": scan_id,
"name": cells[1].strip() if len(cells) > 1 else "",
"scan_time": cells[2].strip() if len(cells) > 2 else "",
"step_units": cells[3].strip() if len(cells) > 3 else "",
"coord_str": cells[4].strip() if len(cells) > 4 else "",
"dwell_ms": cells[5].strip() if len(cells) > 5 else "",
"scan_lines": cells[6].strip() if len(cells) > 6 else "",
"scan_mode": cells[7].strip() if len(cells) > 7 else "",
"start_datetime": cells[8].strip() if len(cells) > 8 else "",
"end_datetime": cells[9].strip() if len(cells) > 9 else "",
"cancelled": cells[10].strip() if len(cells) > 10 else "",
"user": cells[11].strip() if len(cells) > 11 else "",
"status": cells[12].strip() if len(cells) > 12 else "",
"archived": cells[13].strip() if len(cells) > 13 else "",
}
except (ValueError, IndexError):
return None
# ---------------------------------------------------------------------------
# Scan view page
# ---------------------------------------------------------------------------
# (query-string key, JS variable name, canonical result key) per grid parameter.
_SCAN_VIEW_GRID_PARAMS = (
    ("sX", "startX", "start_x"),
    ("sY", "startY", "start_y"),
    ("eX", "endX", "end_x"),
    ("eY", "endY", "end_y"),
    ("dX", "deltaX", "dx"),
    ("dY", "deltaY", "dy"),
)

# Table labels whose neighbouring cell text is copied verbatim into the result.
_SCAN_VIEW_TEXT_LABELS = {
    "Name:": "name",
    "Scan Time:": "scan_time",
    "Starting X:": "start_x_label",
    "Starting Y:": "start_y_label",
    "Ending X:": "end_x_label",
    "Ending Y:": "end_y_label",
    "DX:": "dx_label",
    "DY:": "dy_label",
    "Scan Lines:": "scan_lines",
    "Scan Mode:": "scan_mode",
    "Start Time:": "start_datetime",
    "End Time:": "end_datetime",
    "Scan Status:": "status",
    "User:": "user",
}


def _scan_view_float(text: str) -> float | None:
    """Return ``float(text)``, or None when *text* is not a valid number."""
    try:
        return float(text)
    except ValueError:
        # The ``[\d.]+`` captures also match strings such as "1.2.3" or ".".
        return None


def _scan_view_grid_params(html: str) -> dict[str, float]:
    """Collect the grid parameters embedded in the page's JavaScript."""
    grid: dict[str, float] = {}
    # The scan view embeds the parameters as query params in a JS string, e.g.
    # "include/tile_view.php?...&sX=0&sY=0&eX=310&eY=740&dX=3.01&dY=2.26&..."
    url_match = re.search(r'tile_view\.php\?([^"\']+)', html)
    query = url_match.group(1) if url_match else None
    for qs_key, js_name, canon in _SCAN_VIEW_GRID_PARAMS:
        value = None
        if query is not None:
            m = re.search(rf"(?:^|&){re.escape(qs_key)}=([\d.]+)", query)
            if m:
                value = _scan_view_float(m.group(1))
        if value is None:
            # Fallback: standalone JS var declarations (present in tile_view.php).
            m = re.search(rf"var\s+{js_name}\s*=\s*([\d.]+)", html)
            if m:
                value = _scan_view_float(m.group(1))
        if value is not None:
            grid[canon] = value
    return grid


def _scan_view_store_cell(result: dict[str, Any], label: str, value: str) -> None:
    """Record one (label cell, following cell) pair from the data table."""
    text_key = _SCAN_VIEW_TEXT_LABELS.get(label)
    if text_key is not None:
        result[text_key] = value
    elif label == "Scan ID:":
        try:
            result["scan_id"] = int(value)
        except ValueError:
            pass  # best effort: leave scan_id absent when it is not an integer
    elif label == "Total number of images:":
        # Format: "33784 (103x328)"
        m = re.match(r"(\d+)\s*\((\d+)x(\d+)\)", value)
        if m:
            result["total_tiles"] = int(m.group(1))
            result["nx"] = int(m.group(2))
            result["ny"] = int(m.group(3))
    elif label == "Total Disk Space:":
        m = re.search(r"([\d.]+)\s*Mb", value)
        size = _scan_view_float(m.group(1)) if m else None
        if size is not None:
            result["disk_space_mb"] = size


def parse_scan_view(html: str) -> dict[str, Any]:
    """
    Extract grid parameters and metadata from a scan view page.

    Grid parameters are read from the ``tile_view.php?...`` query string
    found in the page, falling back per parameter to ``var startX = ...``
    style JS declarations. Metadata is read from ``<tr>`` rows (parsed with
    BeautifulSoup's "lxml" parser): a cell whose text equals a known label
    such as "Scan ID:" supplies the key and the next cell in the same row
    supplies the value.

    Extraction is best effort: a value that is missing or not a valid
    number is omitted from the result instead of raising.

    Returns a dict that may contain the keys:
        scan_id, name, scan_time, scan_lines, scan_mode, start_datetime,
        end_datetime, status, user, start_x_label, start_y_label,
        end_x_label, end_y_label, dx_label, dy_label, disk_space_mb,
        start_x, start_y, end_x, end_y, dx, dy, nx, ny, total_tiles
    Only keys that could be extracted or derived are present.
    """
    result: dict[str, Any] = {}
    grid = _scan_view_grid_params(html)

    # Extract from the data table.
    soup = BeautifulSoup(html, "lxml")
    for row in soup.find_all("tr"):
        cells = [td.get_text(strip=True) for td in row.find_all("td")]
        # Pair each cell with its right-hand neighbour; the last cell of a
        # row can never act as a label because nothing follows it.
        for label, value in zip(cells, cells[1:]):
            _scan_view_store_cell(result, label, value)

    # Grid parameters go in after the table fields, under their canonical keys.
    result.update(grid)

    # Compute nx/ny from grid params if not parsed from table.
    if "nx" not in result and all(k in result for k in ("start_x", "end_x", "dx")):
        result["nx"] = _grid_count(result["start_x"], result["end_x"], result["dx"])
    if "ny" not in result and all(k in result for k in ("start_y", "end_y", "dy")):
        result["ny"] = _grid_count(result["start_y"], result["end_y"], result["dy"])
    if "total_tiles" not in result and "nx" in result and "ny" in result:
        result["total_tiles"] = result["nx"] * result["ny"]
    return result
# ---------------------------------------------------------------------------
# Grid helpers
# ---------------------------------------------------------------------------
def _grid_count(start: float, end: float, step: float) -> int:
    """Number of grid positions from start up to (but not including) end.

    Args:
        start: First grid position.
        end: Exclusive upper bound of the grid.
        step: Spacing between neighbouring positions.

    Returns:
        The number of positions ``start + i * step`` that lie below ``end``.
        Returns 0 when ``step`` is not positive or when ``end`` does not
        exceed ``start``.
    """
    if step <= 0:
        return 0
    quotient = (end - start) / step
    nearest = round(quotient)
    # Float noise can push an exact multiple just past the integer, e.g.
    # (0.4 - 0.1) / 0.1 == 3.0000000000000004; a plain ceil() would then
    # count one position too many.
    if math.isclose(quotient, nearest, rel_tol=1e-9):
        count = nearest
    else:
        count = math.ceil(quotient)
    # A reversed span has no positions; never report a negative count.
    return max(count, 0)
def _grid_values(start: float, count: int, step: float) -> list[float]:
    """Return ``count`` positions spaced ``step`` apart, beginning at ``start``.

    Each position is computed as ``start + i * step`` and rounded to two
    decimal places. A ``count`` of zero or less gives an empty list.
    """
    offsets = (index * step for index in range(count))
    return [round(start + offset, 2) for offset in offsets]