Initial commit

Add spruce scraper with CLI, session management, parsers, progress tracking,
recheck logic, and test suite. Includes example config and README.
This commit is contained in:
2026-04-22 10:41:18 -04:00
commit e122f6435a
23 changed files with 3789 additions and 0 deletions
+1
View File
@@ -0,0 +1 @@
# spruce — minirhizotron archive library
+259
View File
@@ -0,0 +1,259 @@
"""
Command-line interface for the spruce scraper.
"""
import argparse
import logging
import os
import sys
from pathlib import Path
import yaml
from spruce.orchestrator import scrape_machine
from spruce.parsers import parse_machine_option
from spruce.progress import ProgressTracker, CsvWriter
from spruce.recheck import recheck_archive, recheck_tile_files
from spruce.settings import (
DEFAULT_CONFIG,
MAX_SAFE_WORKERS,
PROGRESS_FILENAME,
SCANS_CSV_FIELDS,
SCANS_CSV_FILENAME,
TILES_CSV_FIELDS,
TILES_CSV_FILENAME,
_clamp_workers,
load_config,
)
from spruce.session import MachineSession
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
log = logging.getLogger(__name__)
def discover_machines(base_url: str, timeout: int = 30) -> list[dict]:
    """
    Fetch the login page and list the machines offered by its selector.

    Requests ``index.php`` relative to ``base_url`` and decodes every
    ``<option>`` of the ``<select name="RTLNAME">`` element with
    ``parse_machine_option``.

    Args:
        base_url: Root URL of the web application.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        One machine descriptor per option that carries a ``value``
        attribute; an empty list when the selector is not on the page.

    Raises:
        requests.RequestException: If the request fails or the server
            answers with an error status.
    """
    resp = requests.get(urljoin(base_url, "index.php"), timeout=timeout)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "lxml")
    select = soup.find("select", {"name": "RTLNAME"})
    if not select:
        log.warning("Could not find machine selector on login page.")
        return []

    machines: list[dict] = []
    for opt in select.find_all("option"):
        raw_value = opt.get("value")
        if raw_value is None:
            # An <option> without a value attribute has nothing to decode;
            # indexing opt["value"] here would raise KeyError.
            continue
        machines.append(parse_machine_option(opt.get_text(strip=True), raw_value))
    return machines
def parse_args() -> argparse.Namespace:
    """
    Build the spruce argument parser and parse the process arguments.

    Returns:
        The namespace produced by ``ArgumentParser.parse_args()``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="Archive minirhizotron image tiles from RootView.",
    )

    def add_switch(*flags: str, text: str) -> None:
        """Register an on/off flag (``store_true``)."""
        parser.add_argument(*flags, action="store_true", help=text)

    # Registration order is kept because it determines the --help layout.
    parser.add_argument(
        "--config",
        metavar="FILE",
        default=DEFAULT_CONFIG,
        help=f"YAML config file (default: {DEFAULT_CONFIG})",
    )
    parser.add_argument(
        "--machine",
        metavar="LABEL",
        help='Scrape only this machine label, e.g. "BW3-20 [AMR-26]"',
    )
    parser.add_argument(
        "--scan-id",
        metavar="ID",
        type=int,
        help="Download only this specific scan ID (use with --machine)",
    )
    add_switch(
        "--mosaic-only",
        text="Download mosaics only; skip individual tiles",
    )
    add_switch(
        "--dry-run",
        text="Preview what would be downloaded without saving any files",
    )
    parser.add_argument(
        "--workers",
        metavar="N",
        type=int,
        help="Override parallel download threads from config",
    )
    add_switch(
        "--list-machines",
        text="Print available machines and exit (no credentials needed)",
    )
    add_switch(
        "--list-scans",
        text="Print all scans for --machine and exit",
    )
    add_switch(
        "--recheck",
        text=(
            "Scan the archive for zero-byte or missing tile files whose URLs "
            "are marked complete in .progress.json, remove them from progress, "
            "and report how many were re-queued. Run before resuming after a crash."
        ),
    )
    add_switch("--verbose", "-v", text="Enable debug logging")

    return parser.parse_args()
def main() -> None:
    """
    Entry point for the spruce command-line interface.

    Configures logging, parses the arguments and then runs exactly one mode:
    ``--list-machines``, ``--recheck``, ``--list-scans``, or the default
    scrape of every selected machine.

    Exits through ``sys.exit`` with a message when the config file is
    missing, the machine list cannot be retrieved, no machine is selected,
    ``--list-scans`` is used without exactly one machine, or its login fails.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)-8s %(message)s",
        datefmt="%H:%M:%S",
    )
    args = parse_args()
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # --list-machines doesn't need credentials
    if args.list_machines:
        base_url = "http://205.149.147.131:8010/"
        timeout = 30
        if os.path.exists(args.config):
            # Use a context manager so the handle is closed deterministically.
            with open(args.config) as fh:
                cfg = yaml.safe_load(fh)
            # An empty file loads as None and a non-mapping document has no
            # .get(); in both cases keep the built-in defaults.
            if isinstance(cfg, dict):
                base_url = cfg.get("base_url", base_url)
                timeout = cfg.get("timeout", timeout)
        machines = discover_machines(base_url, timeout)
        print(f"{'Label':<25} {'ID':>4} {'IP':<17} {'Version'}")
        print("-" * 62)
        for m in machines:
            print(
                f"{m['label']:<25} {m['machine_id']:>4} {m['ip']:<17} {m['version']}"
            )
        return

    if not os.path.exists(args.config):
        sys.exit(
            f"Config file '{args.config}' not found.\n"
            f"Copy config.example.yaml to {args.config} and fill in your credentials."
        )
    config = load_config(args.config)
    # NOTE: a value of 0 is falsy, so `--workers 0` leaves the config untouched.
    if args.workers:
        config["workers"] = _clamp_workers(args.workers)
    output_dir = Path(config["output_dir"])

    # --recheck: validate archive integrity and re-queue bad tiles
    if args.recheck:
        progress = ProgressTracker(output_dir / PROGRESS_FILENAME)
        n_bad = recheck_tile_files(output_dir, progress)
        n_requeued = recheck_archive(output_dir, progress)
        if n_bad == 0 and n_requeued == 0:
            log.info("Archive looks clean. No action needed.")
        else:
            log.info(
                "Recheck complete: %d zero-byte file(s) deleted, "
                "%d URL(s) re-queued for download.",
                n_bad,
                n_requeued,
            )
        return

    # Build machine list
    all_machines = discover_machines(config["base_url"], config["timeout"])
    if not all_machines:
        sys.exit("Could not retrieve machine list from server.")

    # Apply --machine / config machines filter
    filter_labels: list[str] | None = None
    if args.machine:
        filter_labels = [args.machine]
    elif config.get("machines"):
        filter_labels = list(config["machines"])

    if filter_labels:
        machines = [m for m in all_machines if m["label"] in filter_labels]
        # Build the lookup set once rather than once per requested label.
        found_labels = {m["label"] for m in machines}
        not_found = [label for label in filter_labels if label not in found_labels]
        if not_found:
            log.warning("Unknown machine label(s): %s", not_found)
    else:
        machines = all_machines
    if not machines:
        sys.exit("No machines selected.")

    # --list-scans: print and exit
    if args.list_scans:
        if len(machines) != 1:
            sys.exit("--list-scans requires exactly one machine (use --machine).")
        sess = MachineSession(machines[0], config)
        if not sess.login():
            sys.exit("Login failed.")
        scans = sess.get_all_scans()
        print(f"{'ID':>8} {'Date':<22} {'Name':<40} {'Status'}")
        print("-" * 85)
        for sc in scans:
            print(
                f"{sc['scan_id']:>8} {sc.get('scan_time', ''):<22} "
                f"{sc.get('name', ''):<40} {sc.get('status', '')}"
            )
        print(f"\nTotal: {len(scans)} scans")
        return

    log.info(
        "Scraping %d machine(s): %s",
        len(machines),
        ", ".join(m["label"] for m in machines),
    )
    if args.mosaic_only:
        log.info("Mode: mosaics only (individual tiles skipped)")
    if args.dry_run:
        log.info("Mode: dry-run (no files will be written)")

    # Shared progress and CSV writers
    progress = ProgressTracker(output_dir / PROGRESS_FILENAME)
    tiles_csv = CsvWriter(output_dir / TILES_CSV_FILENAME, TILES_CSV_FIELDS)
    scans_csv = CsvWriter(output_dir / SCANS_CSV_FILENAME, SCANS_CSV_FIELDS)
    total = 0
    try:
        for machine in machines:
            count = scrape_machine(
                machine=machine,
                config=config,
                output_dir=output_dir,
                progress=progress,
                tiles_csv=tiles_csv,
                scans_csv=scans_csv,
                dry_run=args.dry_run,
                mosaic_only=args.mosaic_only,
                scan_id_filter=args.scan_id,
            )
            total += count
    finally:
        # Runs on errors and interrupts too, so completed work is persisted.
        tiles_csv.close()
        scans_csv.close()
        progress.save()

    if args.dry_run:
        log.info("Dry run complete.")
    else:
        log.info("Done. Total files downloaded: %d", total)
        log.info("Tiles CSV : %s", output_dir / TILES_CSV_FILENAME)
        log.info("Scans CSV : %s", output_dir / SCANS_CSV_FILENAME)
        log.info("Progress : %s", output_dir / PROGRESS_FILENAME)
+307
View File
@@ -0,0 +1,307 @@
"""
High-level scrape orchestration: drives the per-machine and per-scan loops.
"""
import json
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import Any
from tqdm import tqdm
from spruce.paths import machine_dir_name, tile_dest, mosaic_dest, _extract_date
from spruce.progress import ProgressTracker, CsvWriter
from spruce.session import MachineSession
log = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Per-scan helpers
# ---------------------------------------------------------------------------
def _download_mosaic(
    sess: MachineSession,
    scan_meta: dict[str, Any],
    scan_id: int,
    mosaic_path: Path,
    progress: ProgressTracker,
    machine: dict[str, Any],
    dry_run: bool,
) -> bool:
    """
    Download the mosaic of one scan unless progress already lists its URL.

    Args:
        sess: Session used to build the mosaic URL and download it.
        scan_meta: Scan metadata; not read here, kept for the call signature.
        scan_id: ID of the scan whose mosaic is wanted.
        mosaic_path: Local destination of the mosaic file.
        progress: Tracker consulted before, and updated and saved after, a
            successful download.
        machine: Machine descriptor; its ``label`` prefixes log messages.
        dry_run: When true, only log what would be fetched.

    Returns:
        True only when a file was downloaded with a non-zero size.
    """
    url = sess.mosaic_url(scan_id)
    if progress.is_done(url):
        return False
    if dry_run:
        # Separate source and destination; "%s%s" ran them together.
        log.info("[DRY-RUN] Mosaic: %s -> %s", url, mosaic_path)
        return False

    log.info("[%s] Downloading mosaic for scan %d", machine["label"], scan_id)
    size = sess.download_file(url, mosaic_path)
    if not size:
        # A falsy size is treated as a failed download; nothing is recorded.
        return False

    progress.mark_done(url)
    progress.save()
    log.info(
        "[%s] Mosaic saved: %s (%.1f MB)",
        machine["label"],
        mosaic_path,
        size / 1e6,
    )
    return True
def _download_tiles_for_scan(
    sess: MachineSession,
    tiles: list[dict[str, Any]],
    scan_meta: dict[str, Any],
    scan_id: int,
    output_dir: Path,
    machine: dict[str, Any],
    config: dict[str, Any],
    progress: ProgressTracker,
    tiles_csv: CsvWriter,
    dry_run: bool,
) -> int:
    """
    Download every tile of a scan whose URL is not yet marked done.

    Tiles are fetched on a thread pool of ``config["workers"]`` threads.
    Successful results are buffered and written to ``tiles_csv`` (followed by
    ``progress.save()``) in batches. The buffer is also flushed in a
    ``finally`` block, so an exception or Ctrl-C cannot leave tiles marked
    done in progress without their CSV rows.

    Args:
        sess: Session whose ``download_tile`` performs each download.
        tiles: Tile descriptors; each needs at least a ``url`` key.
        scan_meta: Scan metadata; ``scan_time`` is copied onto pending tiles.
        scan_id: Scan ID, used for logging and the progress bar label.
        output_dir: Archive root handed to ``tile_dest``.
        machine: Machine descriptor; ``label`` is used in messages.
        config: Runtime configuration; ``workers`` is read.
        progress: Tracker used to skip finished URLs and record new ones.
        tiles_csv: Writer receiving one row per downloaded tile.
        dry_run: When true, only log a preview and download nothing.

    Returns:
        Number of tiles downloaded (always 0 in dry-run mode).
    """
    pending = [t for t in tiles if not progress.is_done(t["url"])]
    log.info(
        "[%s] Scan %d: %d tiles total, %d pending.",
        machine["label"],
        scan_id,
        len(tiles),
        len(pending),
    )
    if dry_run:
        for t in pending[:5]:
            log.info("[DRY-RUN] Tile: %s", t["url"])
        if len(pending) > 5:
            log.info("[DRY-RUN] … and %d more tiles.", len(pending) - 5)
        return 0

    # Attach scan_time for CSV rows
    for t in pending:
        t["scan_time"] = scan_meta.get("scan_time", "")

    workers: int = config["workers"]
    save_every = max(50, workers * 4)
    downloaded = 0
    batch: list[dict[str, Any]] = []

    def flush_batch() -> None:
        """Write buffered rows to the CSV, persist progress, empty the buffer."""
        for row in batch:
            tiles_csv.write(row)
        progress.save()
        batch.clear()

    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = {
            pool.submit(
                sess.download_tile,
                tile,
                tile_dest(output_dir, machine, scan_meta, tile),
                False,
            ): tile
            for tile in pending
        }
        try:
            with tqdm(
                total=len(pending),
                desc=f"{machine['label']} scan {scan_id}",
                unit="tile",
                leave=True,
            ) as pbar:
                for future in as_completed(futures):
                    result = future.result()
                    # Only results reporting a non-zero size count as done.
                    if result.get("file_size_bytes"):
                        batch.append(result)
                        progress.mark_done(result["url"])
                        downloaded += 1
                    pbar.update(1)
                    if len(batch) >= save_every:
                        flush_batch()
        finally:
            # Flush the remainder on the normal path and on interruption;
            # those URLs are already marked done and would never be
            # revisited to produce their CSV rows.
            flush_batch()

    log.info(
        "[%s] Scan %d complete: %d tiles downloaded.",
        machine["label"],
        scan_id,
        downloaded,
    )
    return downloaded
# ---------------------------------------------------------------------------
# Per-scan driver
# ---------------------------------------------------------------------------
def process_scan(
    sess: MachineSession,
    scan: dict[str, Any],
    output_dir: Path,
    machine: dict[str, Any],
    config: dict[str, Any],
    progress: ProgressTracker,
    scans_csv: CsvWriter,
    tiles_csv: CsvWriter,
    dry_run: bool,
    mosaic_only: bool,
) -> int:
    """
    Process one scan: fetch metadata, download mosaic and (optionally) tiles.

    Args:
        sess: Logged-in session for the machine.
        scan: List-level scan record; must contain ``scan_id``.
        output_dir: Archive root directory.
        machine: Machine descriptor (``label`` and ``machine_id`` are read).
        config: Runtime configuration passed on to the tile downloader.
        progress: Shared progress tracker.
        scans_csv: Writer that receives one row per processed scan.
        tiles_csv: Writer that receives one row per downloaded tile.
        dry_run: When true, nothing is written: no directories, no
            ``metadata.json``, no CSV row, no downloads.
        mosaic_only: When true, tiles are skipped.

    Returns:
        Total files downloaded for this scan. 0 when the metadata cannot be
        fetched or lacks the grid dimensions.
    """
    scan_id: int = scan["scan_id"]
    log.info("[%s] Processing scan %d", machine["label"], scan_id)
    try:
        scan_meta = sess.get_scan_metadata(scan_id)
    except Exception as exc:
        # Best effort: one unreadable scan must not stop the whole machine.
        log.error(
            "[%s] Cannot fetch scan %d metadata: %s", machine["label"], scan_id, exc
        )
        return 0
    if not scan_meta.get("nx") or not scan_meta.get("ny"):
        log.warning(
            "[%s] Scan %d: missing grid params, skipping.",
            machine["label"],
            scan_id,
        )
        return 0

    # parse_scan_view only sets "scan_id" when it finds a parsable "Scan ID:"
    # cell, while enumerate_tiles indexes scan_meta["scan_id"]. Fall back to
    # the ID that was requested.
    scan_meta.setdefault("scan_id", scan_id)

    # Merge list-level metadata into scan_meta (detail page takes precedence)
    for k in (
        "name",
        "scan_time",
        "start_datetime",
        "end_datetime",
        "status",
        "user",
        "scan_lines",
        "scan_mode",
    ):
        scan_meta.setdefault(k, scan.get(k, ""))

    # Save per-scan metadata.json
    scan_date = _extract_date(scan_meta.get("scan_time", ""))
    scan_dir = output_dir / machine_dir_name(machine) / scan_date / str(scan_id)
    if not dry_run:
        scan_dir.mkdir(parents=True, exist_ok=True)
        meta_file = scan_dir / "metadata.json"
        if not meta_file.exists():
            meta_file.write_text(
                json.dumps(scan_meta, indent=2, default=str), encoding="utf-8"
            )

    # Mosaic
    mosaic_path = mosaic_dest(output_dir, machine, scan_meta, scan_id)
    mosaic_url = sess.mosaic_url(scan_id)
    mosaic_downloaded = _download_mosaic(
        sess, scan_meta, scan_id, mosaic_path, progress, machine, dry_run
    )
    total = 1 if mosaic_downloaded else 0

    # Write scan-level CSV row. Skipped in dry-run mode, which is announced
    # as writing no files.
    if not dry_run:
        scans_csv.write(
            {
                "machine": machine["label"],
                "machine_id": machine["machine_id"],
                "scan_id": scan_id,
                "name": scan_meta.get("name", ""),
                "scan_time": scan_meta.get("scan_time", ""),
                "start_x": scan_meta.get("start_x", ""),
                "start_y": scan_meta.get("start_y", ""),
                "end_x": scan_meta.get("end_x", ""),
                "end_y": scan_meta.get("end_y", ""),
                "dx": scan_meta.get("dx", ""),
                "dy": scan_meta.get("dy", ""),
                "nx": scan_meta.get("nx", ""),
                "ny": scan_meta.get("ny", ""),
                "total_tiles": scan_meta.get("total_tiles", ""),
                "scan_lines": scan_meta.get("scan_lines", ""),
                "scan_mode": scan_meta.get("scan_mode", ""),
                "start_datetime": scan_meta.get("start_datetime", ""),
                "end_datetime": scan_meta.get("end_datetime", ""),
                "status": scan_meta.get("status", ""),
                "user": scan_meta.get("user", ""),
                "disk_space_mb": scan_meta.get("disk_space_mb", ""),
                "mosaic_url": mosaic_url,
                "mosaic_local_path": str(mosaic_path),
                "mosaic_downloaded": mosaic_downloaded,
            }
        )
    if mosaic_only:
        return total

    # Tiles
    tiles = sess.enumerate_tiles(scan_meta)
    total += _download_tiles_for_scan(
        sess,
        tiles,
        scan_meta,
        scan_id,
        output_dir,
        machine,
        config,
        progress,
        tiles_csv,
        dry_run,
    )
    return total
# ---------------------------------------------------------------------------
# Per-machine driver
# ---------------------------------------------------------------------------
def scrape_machine(
    machine: dict[str, Any],
    config: dict[str, Any],
    output_dir: Path,
    progress: ProgressTracker,
    tiles_csv: CsvWriter,
    scans_csv: CsvWriter,
    dry_run: bool,
    mosaic_only: bool,
    scan_id_filter: int | None,
) -> int:
    """
    Log in to one machine and archive its scans.

    With ``scan_id_filter`` set, only that scan is processed and the scan
    list is not fetched; otherwise every scan returned by the session is.

    Returns:
        Sum of the per-scan download counts; 0 when login fails or no scans
        are found.
    """
    session = MachineSession(machine, config)
    if not session.login():
        return 0

    if scan_id_filter is None:
        scans: list[dict[str, Any]] = session.get_all_scans()
    else:
        # Synthesize a minimal list-level record for the requested scan.
        scans = [{"scan_id": scan_id_filter, "status": "Completed"}]
        log.info("[%s] Targeting scan ID %d.", machine["label"], scan_id_filter)

    if not scans:
        log.warning("[%s] No scans found.", machine["label"])
        return 0

    return sum(
        process_scan(
            sess=session,
            scan=entry,
            output_dir=output_dir,
            machine=machine,
            config=config,
            progress=progress,
            scans_csv=scans_csv,
            tiles_csv=tiles_csv,
            dry_run=dry_run,
            mosaic_only=mosaic_only,
        )
        for entry in scans
    )
+213
View File
@@ -0,0 +1,213 @@
"""
Pure HTML / text parsing functions for the RootView web application.
All functions are side-effect-free: string (or list[str]) in, dict/list out.
No network access, no filesystem access.
"""
import math
import re
from typing import Any
from urllib.parse import unquote
from bs4 import BeautifulSoup
# ---------------------------------------------------------------------------
# Machine descriptor
# ---------------------------------------------------------------------------
def parse_machine_option(label: str, raw_value: str) -> dict[str, Any]:
    """
    Decode the pipe-delimited <option> value for a machine.

    The value is URL-unquoted and split on ``|``. The first six fields are,
    in order: name, ip, port1, machine_id, port2, version. Missing fields
    become ``""`` and surplus fields are ignored.

    Args:
        label: Visible text of the option.
        raw_value: The option's ``value`` attribute, still URL-encoded.

    Returns:
        Dict with keys ``label``, ``option_value`` (the untouched
        ``raw_value``), ``name``, ``ip``, ``port1``, ``machine_id``,
        ``port2`` and ``version``. ``name`` falls back to ``label`` when the
        first field is empty.
    """
    fields = unquote(raw_value).split("|")
    # Pad to six entries so the unpacking below never fails.
    padded = fields[:6] + [""] * (6 - len(fields))
    name, ip, port1, machine_id, port2, version = padded
    return {
        "label": label,
        "option_value": raw_value,
        # str.split never returns an empty list, so test for an empty first
        # field rather than a missing one; otherwise the fallback is dead.
        "name": name or label,
        "ip": ip,
        "port1": port1,
        "machine_id": machine_id,
        "port2": port2,
        "version": version,
    }
# ---------------------------------------------------------------------------
# Scan list row
# ---------------------------------------------------------------------------
def parse_scan_row(cells: list[str]) -> dict[str, Any] | None:
"""
Parse one table row from the scan list into a scan dict.
Expected columns (from the observed HTML):
ID, Name, Scan Time, Step Units, (X,Y)-(X,Y)-(DX,DY),
Dwell Time ms, Scan Lines, Scan Mode, Start Time, End Time,
Cancelled, User, Scan Status, Archived, [View link]
Returns None for header rows or rows whose first cell is not a digit.
"""
if not cells or not cells[0].strip().isdigit():
return None
try:
scan_id = int(cells[0].strip())
return {
"scan_id": scan_id,
"name": cells[1].strip() if len(cells) > 1 else "",
"scan_time": cells[2].strip() if len(cells) > 2 else "",
"step_units": cells[3].strip() if len(cells) > 3 else "",
"coord_str": cells[4].strip() if len(cells) > 4 else "",
"dwell_ms": cells[5].strip() if len(cells) > 5 else "",
"scan_lines": cells[6].strip() if len(cells) > 6 else "",
"scan_mode": cells[7].strip() if len(cells) > 7 else "",
"start_datetime": cells[8].strip() if len(cells) > 8 else "",
"end_datetime": cells[9].strip() if len(cells) > 9 else "",
"cancelled": cells[10].strip() if len(cells) > 10 else "",
"user": cells[11].strip() if len(cells) > 11 else "",
"status": cells[12].strip() if len(cells) > 12 else "",
"archived": cells[13].strip() if len(cells) > 13 else "",
}
except (ValueError, IndexError):
return None
# ---------------------------------------------------------------------------
# Scan view page
# ---------------------------------------------------------------------------
def parse_scan_view(html: str) -> dict[str, Any]:
    """
    Pull scan details and grid parameters out of a scan view page.

    Three sources are combined:
      1. query parameters of the ``tile_view.php?...`` URL embedded in the
         page (sX, sY, eX, eY, dX, dY);
      2. ``var startX = ...`` style JS declarations, for any grid parameter
         the URL did not provide;
      3. label/value cell pairs of the page's tables.

    The returned dict may contain: scan_id, name, scan_time, the *_label
    strings of the table, scan_lines, scan_mode, start_datetime,
    end_datetime, status, user, total_tiles, nx, ny, disk_space_mb, and the
    numeric start_x, start_y, end_x, end_y, dx, dy. nx, ny and total_tiles
    are derived from the grid parameters when the table did not supply them.
    """
    # Grid parameters under their JS names; copied into the result last so
    # the canonical keys come after the table keys.
    grid: dict[str, float] = {}

    # e.g. "include/tile_view.php?...&sX=0&sY=0&eX=310&eY=740&dX=3.01&dY=2.26&..."
    url_match = re.search(r'tile_view\.php\?([^"\']+)', html)
    if url_match is not None:
        query = url_match.group(1)
        for short_key, js_name in (
            ("sX", "startX"),
            ("sY", "startY"),
            ("eX", "endX"),
            ("eY", "endY"),
            ("dX", "deltaX"),
            ("dY", "deltaY"),
        ):
            hit = re.search(rf"(?:^|&){re.escape(short_key)}=([\d.]+)", query)
            if hit:
                grid[js_name] = float(hit.group(1))

    # Fallback: standalone JS var declarations (present in tile_view.php)
    for js_name in ("startX", "startY", "endX", "endY", "deltaX", "deltaY"):
        if js_name in grid:
            continue
        hit = re.search(rf"var\s+{js_name}\s*=\s*([\d.]+)", html)
        if hit:
            grid[js_name] = float(hit.group(1))

    # Table labels whose neighbouring cell is stored verbatim.
    plain_fields = {
        "Name:": "name",
        "Scan Time:": "scan_time",
        "Starting X:": "start_x_label",
        "Starting Y:": "start_y_label",
        "Ending X:": "end_x_label",
        "Ending Y:": "end_y_label",
        "DX:": "dx_label",
        "DY:": "dy_label",
        "Scan Lines:": "scan_lines",
        "Scan Mode:": "scan_mode",
        "Start Time:": "start_datetime",
        "End Time:": "end_datetime",
        "Scan Status:": "status",
        "User:": "user",
    }

    info: dict[str, Any] = {}
    for table_row in BeautifulSoup(html, "lxml").find_all("tr"):
        texts = [td.get_text(strip=True) for td in table_row.find_all("td")]
        # Pair every cell with its right-hand neighbour; the last cell has
        # none and is therefore never treated as a label.
        for label, value in zip(texts, texts[1:]):
            if label in plain_fields:
                info[plain_fields[label]] = value
            elif label == "Scan ID:":
                try:
                    info["scan_id"] = int(value)
                except ValueError:
                    pass
            elif label == "Total number of images:":
                # Format: "33784 (103x328)"
                counts = re.match(r"(\d+)\s*\((\d+)x(\d+)\)", value)
                if counts:
                    info["total_tiles"] = int(counts.group(1))
                    info["nx"] = int(counts.group(2))
                    info["ny"] = int(counts.group(3))
            elif label == "Total Disk Space:":
                space = re.search(r"([\d.]+)\s*Mb", value)
                if space:
                    info["disk_space_mb"] = float(space.group(1))

    # Publish the grid parameters under their canonical names.
    for js_name, canonical in (
        ("startX", "start_x"),
        ("startY", "start_y"),
        ("endX", "end_x"),
        ("endY", "end_y"),
        ("deltaX", "dx"),
        ("deltaY", "dy"),
    ):
        if js_name in grid:
            info[canonical] = grid[js_name]

    # Derive grid dimensions when the table did not provide them.
    if "nx" not in info and all(k in info for k in ("start_x", "end_x", "dx")):
        info["nx"] = _grid_count(info["start_x"], info["end_x"], info["dx"])
    if "ny" not in info and all(k in info for k in ("start_y", "end_y", "dy")):
        info["ny"] = _grid_count(info["start_y"], info["end_y"], info["dy"])
    if "total_tiles" not in info and "nx" in info and "ny" in info:
        info["total_tiles"] = info["nx"] * info["ny"]
    return info
# ---------------------------------------------------------------------------
# Grid helpers
# ---------------------------------------------------------------------------
def _grid_count(start: float, end: float, step: float) -> int:
"""Number of grid positions from start up to (but not including) end."""
if step <= 0:
return 0
return math.ceil((end - start) / step)
def _grid_values(start: float, count: int, step: float) -> list[float]:
"""Generate `count` evenly-spaced grid positions, rounded to 2 dp."""
return [round(start + i * step, 2) for i in range(count)]
+62
View File
@@ -0,0 +1,62 @@
"""
Pure path-helper functions — no network, no JSON, no progress state.
"""
import re
from pathlib import Path
from typing import Any
def machine_dir_name(machine: dict[str, Any]) -> str:
    """
    Sanitise machine label for use as a directory name.

    Every character other than word characters, ``-`` and ``.`` becomes
    ``_``; leading and trailing underscores are then removed.

    Args:
        machine: Machine descriptor; only ``label`` is read.

    Returns:
        The sanitised label, or ``"unknown"`` when nothing but dots (or
        nothing at all) is left.
    """
    cleaned = re.sub(r"[^\w\-.]", "_", machine["label"]).strip("_")
    # "", "." and ".." are not usable directory names: joined onto a path
    # they refer to the archive root, the current directory or the parent.
    if not cleaned.strip("."):
        return "unknown"
    return cleaned
def _extract_date(dt_str: str) -> str:
"""Pull YYYY-MM-DD from a datetime string, fall back to 'unknown'."""
m = re.search(r"(\d{4}-\d{2}-\d{2})", dt_str)
return m.group(1) if m else "unknown"
def tile_dest(
    output_dir: Path,
    machine: dict[str, Any],
    scan_meta: dict[str, Any],
    tile: dict[str, Any],
) -> Path:
    """
    Build the local file path of one tile.

    Layout: <output_dir>/<machine dir>/<scan date>/<scan id>/tiles/<file>,
    where the file is ``tile_r<row>_c<col>.jpg``. Row and column indices are
    zero-padded to the number of digits of the largest index of the grid
    (``ny - 1`` and ``nx - 1``).
    """

    def digits_for(count: int) -> int:
        """Digits needed for the largest index of a `count`-sized axis."""
        return len(str(count - 1)) if count > 1 else 1

    date_part = _extract_date(scan_meta.get("scan_time", ""))
    scan_part = str(tile["scan_id"])
    row_width = digits_for(scan_meta.get("ny", 1))
    col_width = digits_for(scan_meta.get("nx", 1))
    filename = (
        f"tile_r{tile['row_index']:0{row_width}d}"
        f"_c{tile['col_index']:0{col_width}d}.jpg"
    )
    return output_dir.joinpath(
        machine_dir_name(machine), date_part, scan_part, "tiles", filename
    )
def mosaic_dest(
    output_dir: Path,
    machine: dict[str, Any],
    scan_meta: dict[str, Any],
    scan_id: int,
) -> Path:
    """
    Build the local file path of a scan's mosaic.

    Layout: <output_dir>/<machine dir>/<scan date>/<scan id>/mosaic.jpg
    """
    date_part = _extract_date(scan_meta.get("scan_time", ""))
    return output_dir.joinpath(
        machine_dir_name(machine), date_part, str(scan_id), "mosaic.jpg"
    )
+82
View File
@@ -0,0 +1,82 @@
"""
Progress tracking (JSON) and CSV writing.
"""
import csv
import json
import logging
from pathlib import Path
from typing import Iterator
log = logging.getLogger(__name__)
class ProgressTracker:
    """
    Tracks successfully downloaded URLs in a JSON file so runs can be resumed.

    The file holds one object with a single key, ``completed_urls``, whose
    value is the sorted list of URLs.

    Public API (all external code should use only these methods):
        is_done(url)   — True if url has been downloaded
        mark_done(url) — Record url as complete (call save() to persist)
        discard(url)   — Remove url from the completed set
        iter_urls()    — Iterate over all completed URLs
        __len__()      — Number of completed URLs
        save()         — Flush state to disk
    """

    def __init__(self, path: Path) -> None:
        """Remember `path` and load any state already stored there."""
        self.path = path
        self._done: set[str] = set()
        self._load()

    def _load(self) -> None:
        """Read the progress file if present; start empty if it is unusable."""
        if not self.path.exists():
            return
        try:
            data = json.loads(self.path.read_text(encoding="utf-8"))
            self._done = set(data.get("completed_urls", []))
        except (OSError, ValueError, AttributeError, TypeError) as exc:
            # Unreadable file, invalid JSON, or JSON of an unexpected shape.
            log.warning("Could not read progress file; starting fresh. (%s)", exc)
        else:
            log.info("Resuming: %d URLs already downloaded.", len(self._done))

    def is_done(self, url: str) -> bool:
        """Return True if `url` is recorded as downloaded."""
        return url in self._done

    def mark_done(self, url: str) -> None:
        """Record `url` as downloaded (in memory only until save())."""
        self._done.add(url)

    def discard(self, url: str) -> None:
        """Remove a URL from the completed set (re-queues it for download)."""
        self._done.discard(url)

    def iter_urls(self) -> Iterator[str]:
        """Iterate over all completed URLs."""
        return iter(self._done)

    def __len__(self) -> int:
        """Number of completed URLs."""
        return len(self._done)

    def save(self) -> None:
        """
        Write the completed set to disk.

        The JSON is written to a sibling ``<name>.tmp`` file and then moved
        over the real file, so an interrupted save cannot leave a truncated
        progress file that the next run would discard.
        """
        self.path.parent.mkdir(parents=True, exist_ok=True)
        payload = json.dumps({"completed_urls": sorted(self._done)}, indent=2)
        tmp_path = self.path.with_name(self.path.name + ".tmp")
        tmp_path.write_text(payload, encoding="utf-8")
        tmp_path.replace(self.path)
class CsvWriter:
    """
    Append-mode CSV writer that writes a header when the file starts empty.

    The header is emitted when the file does not exist yet or exists with a
    size of zero bytes. Every written row is flushed immediately. Instances
    can be used as context managers, which closes the file on exit.
    """

    def __init__(self, path: Path, fields: list[str]) -> None:
        """
        Open `path` for appending, creating parent directories as needed.

        Args:
            path: CSV file to append to.
            fields: Column names, in output order.
        """
        # A zero-byte leftover needs a header just as a missing file does.
        needs_header = not path.exists() or path.stat().st_size == 0
        path.parent.mkdir(parents=True, exist_ok=True)
        self._fields = fields
        self._fh = open(path, "a", newline="", encoding="utf-8")
        self._writer = csv.DictWriter(self._fh, fieldnames=fields)
        if needs_header:
            self._writer.writeheader()

    def __enter__(self) -> "CsvWriter":
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    def write(self, row: dict) -> None:
        """Append one row; unknown keys are dropped, missing ones become ""."""
        self._writer.writerow({f: row.get(f, "") for f in self._fields})
        self._fh.flush()

    def close(self) -> None:
        """Close the underlying file."""
        self._fh.close()
+156
View File
@@ -0,0 +1,156 @@
"""
Archive integrity checks — find corrupt / missing tiles and remove them
from the progress tracker so they are re-downloaded on the next run.
"""
import logging
import urllib.parse
from pathlib import Path
from typing import Any
from spruce.progress import ProgressTracker
log = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Private helpers
# ---------------------------------------------------------------------------
def _parse_tile_url(url: str) -> dict[str, str]:
"""Extract scan_id, x, y from a tile URL query string."""
qs = dict(urllib.parse.parse_qsl(urllib.parse.urlparse(url).query))
return {
"scan_id": qs.get("id", ""),
"x": qs.get("x", ""),
"y": qs.get("y", ""),
}
def _build_disk_index(output_dir: Path) -> dict[Path, int]:
"""Return {tile_path: size_bytes} for every tile file found on disk."""
return {p: p.stat().st_size for p in output_dir.rglob("tile_r*.jpg")}
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def recheck_tile_files(output_dir: Path, progress: ProgressTracker) -> int:
    """
    Delete every zero-byte tile file on disk and re-queue its scan's tiles.

    For each deleted file, all completed tile URLs (those containing
    ``cmd=image``) whose ``id`` parameter equals the file's scan directory
    name are removed from progress. The discard is deliberately scan-wide:
    mapping a file back to one exact URL would need the x/y grid of the scan.

    Args:
        output_dir: Archive root that is searched recursively.
        progress: Tracker to update; saved when at least one file was deleted.

    Returns:
        The count of files deleted.
    """
    # Group completed tile URLs by scan ID so each deleted file costs one
    # lookup, and so URLs sharing (id, x, y) are all kept and discarded.
    urls_by_scan: dict[str, list[str]] = {}
    for url in progress.iter_urls():
        if "cmd=image" in url:
            urls_by_scan.setdefault(_parse_tile_url(url)["scan_id"], []).append(url)

    deleted = 0
    # Materialise the listing first: files are unlinked inside the loop.
    for tile_path in list(output_dir.rglob("tile_r*.jpg")):
        if tile_path.stat().st_size != 0:
            continue
        log.warning("Deleting zero-byte tile: %s", tile_path)
        tile_path.unlink()
        deleted += 1
        scan_id = _scan_id_from_path(tile_path)
        if scan_id:
            # pop() makes later zero-byte files of the same scan a no-op.
            for url in urls_by_scan.pop(scan_id, ()):
                progress.discard(url)

    if deleted:
        log.info("Deleted %d zero-byte tile file(s).", deleted)
        progress.save()
    else:
        log.info("No zero-byte tile files found on disk.")
    return deleted
def recheck_archive(output_dir: Path, progress: ProgressTracker) -> int:
    """
    Remove tile URLs from progress whose scan has no usable tile on disk.

    The check is scan-level, not per tile: a completed tile URL is kept when
    at least one non-empty ``tile_r*.jpg`` file lies below a path component
    equal to the URL's ``id`` parameter, and removed otherwise. Removed URLs
    are re-downloaded on the next run.

    Only tile URLs (containing ``cmd=image``) are checked (mosaic URLs are
    skipped — mosaics are large single files and are unlikely to be
    partially written due to streaming).

    Args:
        output_dir: Archive root that is indexed recursively.
        progress: Tracker to clean; saved when entries were removed.

    Returns:
        The count of entries removed.
    """
    if len(progress) == 0:
        log.info("Progress file is empty — nothing to recheck.")
        return 0

    tile_urls = [u for u in progress.iter_urls() if "cmd=image" in u]
    mosaic_count = len(progress) - len(tile_urls)
    log.info(
        "Rechecking %d tile URLs (%d mosaic URLs not rechecked) …",
        len(tile_urls),
        mosaic_count,
    )

    # Build a disk index once
    existing_files = _build_disk_index(output_dir)
    log.debug("Found %d tile files on disk.", len(existing_files))

    # Collect every path component of every non-empty tile once, so each URL
    # is a set lookup instead of a pass over all files.
    healthy_parts: set[str] = set()
    for path, size in existing_files.items():
        if size > 0:
            healthy_parts.update(path.parts)

    bad_urls = [
        url
        for url in tile_urls
        if _parse_tile_url(url)["scan_id"] not in healthy_parts
    ]
    if not bad_urls:
        log.info("All %d tile URLs look healthy.", len(tile_urls))
        return 0

    log.warning(
        "Found %d suspect tile URL(s). Removing from progress.",
        len(bad_urls),
    )
    for url in bad_urls:
        progress.discard(url)
    progress.save()
    log.info(
        "Removed %d URL(s) from .progress.json — they will be re-downloaded on next run.",
        len(bad_urls),
    )
    return len(bad_urls)
# ---------------------------------------------------------------------------
# Internal utility
# ---------------------------------------------------------------------------
def _scan_id_from_path(tile_path: Path) -> str | None:
"""
Given a tile path like .../158374/tiles/tile_r0_c0.jpg, return '158374'.
Looks for the directory two levels above the filename (parent.parent.name).
"""
try:
# structure: <machine>/<date>/<scan_id>/tiles/<filename>
return tile_path.parent.parent.name
except Exception:
return None
+274
View File
@@ -0,0 +1,274 @@
"""
HTTP session for a single RootView machine: login, scan listing, tile downloads.
"""
import logging
import time
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urljoin
from typing import Any
import requests
from bs4 import BeautifulSoup
from spruce.parsers import parse_scan_row, parse_scan_view, _grid_values
log = logging.getLogger(__name__)
USER_AGENT = "spruce-scraper/1.0"
class MachineSession:
"""Manages an authenticated HTTP session for one RootView machine."""
def __init__(self, machine: dict[str, Any], config: dict[str, Any]) -> None:
    """
    Create the HTTP session for `machine`.

    Stores the machine descriptor and config, opens a ``requests.Session``
    that sends the scraper's User-Agent, and reads ``base_url`` (required)
    and ``image_base_url`` (optional, with a built-in default) from config.
    """
    http_session = requests.Session()
    http_session.headers["User-Agent"] = USER_AGENT

    self.machine = machine
    self.cfg = config
    self.http = http_session
    self.base_url: str = config["base_url"]
    self.image_base_url: str = config.get(
        "image_base_url", "http://205.149.147.131:8011/"
    )
# ------------------------------------------------------------------
# Auth
# ------------------------------------------------------------------
def login(self) -> bool:
    """
    Post the login form for this machine.

    Returns:
        False when the request fails, the server answers with an error
        status, or the response page has a non-empty element of class
        ``error``; True otherwise.
    """
    form = dict(
        RTLLogin="1",
        RTLNAME=self.machine["option_value"],
        RTLUSER=self.cfg["username"],
        RTLPWD=self.cfg["password"],
        rtl_latest_version="3.0.0.18",
        submit=" submit ",
    )
    endpoint = urljoin(self.base_url, "index.php")
    try:
        resp = self.http.post(endpoint, data=form, timeout=self.cfg["timeout"])
        resp.raise_for_status()
    except requests.RequestException as exc:
        log.error("[%s] Login failed: %s", self.machine["label"], exc)
        return False

    # The page reports a rejected login in an element of class "error".
    rejection = BeautifulSoup(resp.text, "lxml").find(class_="error")
    reason = rejection.get_text(strip=True) if rejection else ""
    if reason:
        log.error(
            "[%s] Login rejected: %s",
            self.machine["label"],
            reason,
        )
        return False

    log.info("[%s] Login succeeded.", self.machine["label"])
    return True
# ------------------------------------------------------------------
# Scan list (paginated)
# ------------------------------------------------------------------
def get_all_scans(self) -> list[dict[str, Any]]:
    """
    Fetch the complete scan list across all pages.

    Uses a large FilterCount (320) to minimise round-trips and keeps
    requesting further pages while full pages come back. Scans are
    de-duplicated by ``scan_id``, and paging stops as soon as a page adds
    nothing new, so a server that ignores the ``start`` offset cannot cause
    an endless loop.

    Returns:
        All distinct scans, in the order first seen.
    """
    all_scans: list[dict[str, Any]] = []
    seen_ids: set[int] = set()
    start = 0
    page_size = 320
    while True:
        page_scans = self._fetch_scan_page(start, page_size)
        fresh: list[dict[str, Any]] = []
        for scan in page_scans:
            if scan["scan_id"] not in seen_ids:
                seen_ids.add(scan["scan_id"])
                fresh.append(scan)
        if not fresh:
            # Either an empty page or only rows that were already collected.
            break
        all_scans.extend(fresh)
        log.debug(
            "[%s] Page start=%d: %d scans (total so far: %d)",
            self.machine["label"],
            start,
            len(fresh),
            len(all_scans),
        )
        if len(page_scans) < page_size:
            # A short page is the last one.
            break
        start += page_size
        time.sleep(self.cfg["request_delay"])
    log.info("[%s] Found %d scans.", self.machine["label"], len(all_scans))
    return all_scans
def _fetch_scan_page(self, start: int, page_size: int) -> list[dict[str, Any]]:
    """
    Request one page of the scan list and parse its table rows.

    Sleeps ``request_delay`` first, then POSTs the scan-list filter form
    with ``start`` as the offset and ``page_size`` as ``FilterCount``.
    Every ``<tr>`` of the response is handed to ``parse_scan_row`` as a
    list of stripped cell texts; rows for which it returns a falsy value
    are dropped.  Raises whatever ``raise_for_status`` raises on an HTTP
    error.
    """
    form = {
        "cmd": "scan",
        "start": str(start),
        "order": "0",
        "order_dir": "1",
        "FilterScanStatus": "2",  # Completed scans
        "FilterUser": "",
        "hidedate": "",
        "FilterDtFrom": "",
        "FilterDtTo": "",
        "FilterIdFrom": "0",
        "FilterIdTo": "0",
        "FilterCount": str(page_size),
    }
    time.sleep(self.cfg["request_delay"])
    response = self.http.post(
        urljoin(self.base_url, "index.php"),
        data=form,
        timeout=self.cfg["timeout"],
    )
    response.raise_for_status()

    table_rows = BeautifulSoup(response.text, "lxml").find_all("tr")
    parsed = (
        parse_scan_row([td.get_text(strip=True) for td in tr.find_all("td")])
        for tr in table_rows
    )
    return [scan for scan in parsed if scan]
# ------------------------------------------------------------------
# Scan detail
# ------------------------------------------------------------------
def get_scan_metadata(self, scan_id: int) -> dict[str, Any]:
    """
    Fetch the view page of one scan and return its parsed metadata.

    Sleeps ``request_delay``, GETs ``index.php?cmd=scan&mode=view&id=<scan_id>``
    and returns whatever ``parse_scan_view`` extracts from the HTML.
    Raises whatever ``raise_for_status`` raises on an HTTP error.
    """
    time.sleep(self.cfg["request_delay"])
    query = {"cmd": "scan", "mode": "view", "id": str(scan_id)}
    view_url = urljoin(self.base_url, "index.php")
    page = self.http.get(view_url, params=query, timeout=self.cfg["timeout"])
    page.raise_for_status()
    return parse_scan_view(page.text)
# ------------------------------------------------------------------
# Tile enumeration
# ------------------------------------------------------------------
def enumerate_tiles(self, scan_meta: dict[str, Any]) -> list[dict[str, Any]]:
    """
    Build one descriptor per tile of a scan's grid.

    The x and y positions come from ``_grid_values`` applied to the scan's
    start/count/step values (missing keys default to 0 tiles, origin 0.0,
    step 1.0).  Descriptors are ordered row by row (y outer, x inner) and
    each holds: scan_id, row_index, col_index, x_mm, y_mm, url.
    """
    scan_id = scan_meta["scan_id"]
    x_count: int = scan_meta.get("nx", 0)
    y_count: int = scan_meta.get("ny", 0)
    x_origin: float = scan_meta.get("start_x", 0.0)
    y_origin: float = scan_meta.get("start_y", 0.0)
    x_step: float = scan_meta.get("dx", 1.0)
    y_step: float = scan_meta.get("dy", 1.0)
    scale: int = self.cfg.get("tile_scale", 1)

    columns = _grid_values(x_origin, x_count, x_step)
    rows = _grid_values(y_origin, y_count, y_step)

    # Same endpoint for every tile; only the query string varies.
    endpoint = urljoin(self.base_url, "index.php")
    return [
        {
            "scan_id": scan_id,
            "row_index": row_idx,
            "col_index": col_idx,
            "x_mm": x,
            "y_mm": y,
            "url": (
                f"{endpoint}?cmd=image&mode=image_scan&id={scan_id}"
                f"&s={scale}&x={x}&y={y}"
            ),
        }
        for row_idx, y in enumerate(rows)
        for col_idx, x in enumerate(columns)
    ]
# ------------------------------------------------------------------
# Mosaic URL
# ------------------------------------------------------------------
def mosaic_url(self, scan_id: int) -> str:
    """Return the mosaic image URL for *scan_id*, resolved against ``image_base_url``."""
    relative_path = f"RootView_Database/{scan_id}/mosaic.jpg"
    return urljoin(self.image_base_url, relative_path)
# ------------------------------------------------------------------
# Downloads
# ------------------------------------------------------------------
def download_file(self, url: str, dest: Path, retries: int = 3) -> int:
    """
    Stream-download *url* to *dest*, retrying on failure.

    The body is streamed into a sibling ``<name>.part`` file which is moved
    onto *dest* only after the whole response has been written.  A failed
    or interrupted transfer therefore never leaves a truncated file at
    *dest*; this matters because ``download_tile`` treats an existing
    *dest* as already downloaded.

    Args:
        url: URL fetched with this session's HTTP client.
        dest: Final location of the file; missing parent directories are
            created.
        retries: Maximum number of attempts.  Failed attempts are separated
            by a pause that starts at 5 seconds and doubles each time.

    Returns:
        Number of bytes written, or 0 when every attempt failed.  A fully
        streamed but empty body also yields 0.
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    # Scratch file next to dest, so the final rename stays within one directory.
    partial = dest.with_name(dest.name + ".part")
    backoff = 5.0
    for attempt in range(1, retries + 1):
        try:
            resp = self.http.get(url, timeout=self.cfg["timeout"], stream=True)
            try:
                resp.raise_for_status()
                size = 0
                with open(partial, "wb") as fh:
                    for chunk in resp.iter_content(chunk_size=65536):
                        if chunk:
                            fh.write(chunk)
                            size += len(chunk)
            finally:
                # With stream=True the connection is held until the body is
                # consumed or the response is closed; always release it.
                resp.close()
            # Publish the complete file in a single rename.
            partial.replace(dest)
            return size
        except Exception as exc:  # deliberate best effort: any failure means retry, then 0
            try:
                partial.unlink(missing_ok=True)
            except OSError:
                pass  # cleanup is best effort; the next attempt truncates the file anyway
            if attempt < retries:
                log.debug(
                    "Attempt %d/%d failed %s: %s — retrying in %.0fs",
                    attempt,
                    retries,
                    url,
                    exc,
                    backoff,
                )
                time.sleep(backoff)
                backoff *= 2
            else:
                log.warning(
                    "Download failed after %d attempts %s: %s",
                    retries,
                    url,
                    exc,
                )
    return 0
def download_tile(
    self, tile: dict[str, Any], dest: Path, dry_run: bool
) -> dict[str, Any]:
    """
    Fetch one tile (unless *dry_run*) and return its metadata row.

    The row always carries the machine, scan, grid-position, url and
    local_path fields.  ``downloaded_at`` and ``file_size_bytes`` stay
    empty strings for a dry run or a failed download; for a file already
    on disk they are ``"already_exists"`` and its current size; after a
    successful download they are a UTC ISO timestamp and the byte count.
    """
    record: dict[str, Any] = {
        "machine": self.machine["label"],
        "machine_id": self.machine["machine_id"],
        "scan_id": tile["scan_id"],
        "scan_time": tile.get("scan_time", ""),
        **{
            key: tile[key]
            for key in ("row_index", "col_index", "x_mm", "y_mm", "url")
        },
        "local_path": str(dest),
        "downloaded_at": "",
        "file_size_bytes": "",
    }
    if not dry_run:
        if dest.exists():
            record.update(
                downloaded_at="already_exists",
                file_size_bytes=dest.stat().st_size,
            )
        else:
            written = self.download_file(tile["url"], dest)
            if written:
                record.update(
                    downloaded_at=datetime.now(timezone.utc).isoformat(),
                    file_size_bytes=written,
                )
    return record
+109
View File
@@ -0,0 +1,109 @@
"""
Constants, field lists, and config loading for the spruce scraper.
"""
import logging
import sys
import yaml
# Module-level logger; _clamp_workers uses it to warn when a worker count is capped.
log = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# File-name constants
# ---------------------------------------------------------------------------
# Default value of the CLI's --config option.
DEFAULT_CONFIG = "config.yaml"
# NOTE(review): the three names below are imported by the CLI; presumably they
# are the progress-state file and the two metadata CSVs — confirm against
# spruce.progress.
PROGRESS_FILENAME = ".progress.json"
SCANS_CSV_FILENAME = "scans.csv"
TILES_CSV_FILENAME = "tiles.csv"
# ---------------------------------------------------------------------------
# CSV field lists
# ---------------------------------------------------------------------------
# Field names for one scan record.
# NOTE(review): presumably the header/column order of scans.csv — confirm
# against the CsvWriter call site before reordering or renaming entries.
SCANS_CSV_FIELDS: list[str] = [
"machine",
"machine_id",
"scan_id",
"name",
"scan_time",
"start_x",
"start_y",
"end_x",
"end_y",
"dx",
"dy",
"nx",
"ny",
"total_tiles",
"scan_lines",
"scan_mode",
"start_datetime",
"end_datetime",
"status",
"user",
"disk_space_mb",
"mosaic_url",
"mosaic_local_path",
"mosaic_downloaded",
]
# Field names for one tile record.  These are the same keys, in the same
# order, as the row dict returned by download_tile; keep the two in sync.
# NOTE(review): presumably also the header/column order of tiles.csv — confirm
# against the CsvWriter call site.
TILES_CSV_FIELDS: list[str] = [
"machine",
"machine_id",
"scan_id",
"scan_time",
"row_index",
"col_index",
"x_mm",
"y_mm",
"url",
"local_path",
"downloaded_at",
"file_size_bytes",
]
# ---------------------------------------------------------------------------
# Worker safety
# ---------------------------------------------------------------------------
MAX_SAFE_WORKERS = 4  # above this the RootView server starts timing out


def _clamp_workers(n: int) -> int:
    """
    Cap a requested worker count at MAX_SAFE_WORKERS.

    Values not above the limit are returned unchanged.  Larger values are
    replaced by the limit, and a warning explaining the cap is logged.
    """
    over_limit = n > MAX_SAFE_WORKERS
    if over_limit:
        log.warning(
            "workers=%d exceeds the safe limit of %d. "
            "The RootView server will time out under this load, causing lost tiles. "
            "Capping at %d.",
            n,
            MAX_SAFE_WORKERS,
            MAX_SAFE_WORKERS,
        )
    return MAX_SAFE_WORKERS if over_limit else n
# ---------------------------------------------------------------------------
# Config loader
# ---------------------------------------------------------------------------
def load_config(path: str) -> dict:
    """
    Load config.yaml, validate it, and fill in defaults.

    Args:
        path: Location of the YAML config file, read as UTF-8.

    Returns:
        The config mapping with every optional key populated and
        ``workers`` capped by ``_clamp_workers``.

    Exits the process (``sys.exit``) when the file's top level is not a
    mapping, or when ``username`` / ``password`` are missing or empty.  An
    empty file is treated as an empty mapping and so reports the missing
    fields rather than crashing.
    """
    with open(path, encoding="utf-8") as fh:
        cfg = yaml.safe_load(fh)
    if cfg is None:
        # safe_load returns None for an empty document.
        cfg = {}
    if not isinstance(cfg, dict):
        sys.exit(f"Config {path} must contain a YAML mapping at the top level")
    missing = [k for k in ("username", "password") if not cfg.get(k)]
    if missing:
        sys.exit(f"Config {path} is missing required fields: {missing}")
    defaults = {
        "base_url": "http://205.149.147.131:8010/",
        "image_base_url": "http://205.149.147.131:8011/",
        "output_dir": "archives",
        "workers": 2,
        "timeout": 60,
        "request_delay": 0.5,
        "tile_scale": 1,
    }
    for key, value in defaults.items():
        # A key written with no value (e.g. "workers:") loads as None;
        # treat it like a missing key instead of passing None downstream.
        if cfg.get(key) is None:
            cfg[key] = value
    cfg["workers"] = _clamp_workers(cfg["workers"])
    return cfg