Initial commit
Add spruce scraper with CLI, session management, parsers, progress tracking, recheck logic, and test suite. Includes example config and README.
This commit is contained in:
@@ -0,0 +1 @@
|
||||
# spruce — minirhizotron archive library
|
||||
+259
@@ -0,0 +1,259 @@
|
||||
"""
|
||||
Command-line interface for the spruce scraper.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
|
||||
from spruce.orchestrator import scrape_machine
|
||||
from spruce.parsers import parse_machine_option
|
||||
from spruce.progress import ProgressTracker, CsvWriter
|
||||
from spruce.recheck import recheck_archive, recheck_tile_files
|
||||
from spruce.settings import (
|
||||
DEFAULT_CONFIG,
|
||||
MAX_SAFE_WORKERS,
|
||||
PROGRESS_FILENAME,
|
||||
SCANS_CSV_FIELDS,
|
||||
SCANS_CSV_FILENAME,
|
||||
TILES_CSV_FIELDS,
|
||||
TILES_CSV_FILENAME,
|
||||
_clamp_workers,
|
||||
load_config,
|
||||
)
|
||||
from spruce.session import MachineSession
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urljoin
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def discover_machines(base_url: str, timeout: int = 30) -> list[dict]:
    """
    Fetch the login page and return one descriptor per machine listed on it.

    Requests ``index.php`` relative to *base_url*, locates the ``<select>``
    element named ``RTLNAME`` and hands each ``<option>``'s visible text and
    ``value`` attribute to ``parse_machine_option``.

    Returns an empty list (after logging a warning) when the selector is not
    present.  A non-success HTTP status propagates as the exception raised by
    ``raise_for_status()``.
    """
    resp = requests.get(urljoin(base_url, "index.php"), timeout=timeout)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "lxml")
    select = soup.find("select", {"name": "RTLNAME"})
    if not select:
        log.warning("Could not find machine selector on login page.")
        return []

    machines: list[dict] = []
    for opt in select.find_all("option"):
        raw_value = opt.get("value")
        if raw_value is None:
            # Indexing a tag with opt["value"] raises KeyError when the
            # attribute is absent; an <option> without a value cannot
            # describe a machine, so skip it instead of crashing.
            log.debug("Skipping <option> without a value attribute.")
            continue
        machines.append(parse_machine_option(opt.get_text(strip=True), raw_value))
    return machines
|
||||
|
||||
|
||||
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """
    Build the command-line parser and parse *argv*.

    *argv* is the list of argument strings to parse.  When it is ``None``
    (the default, and what existing callers get) argparse falls back to the
    process command line, so behaviour is unchanged for ``parse_args()``.
    Passing an explicit list lets the parser be exercised programmatically.

    Returns the populated ``argparse.Namespace``.
    """
    p = argparse.ArgumentParser(
        description="Archive minirhizotron image tiles from RootView.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p.add_argument(
        "--config",
        default=DEFAULT_CONFIG,
        metavar="FILE",
        help=f"YAML config file (default: {DEFAULT_CONFIG})",
    )
    p.add_argument(
        "--machine",
        metavar="LABEL",
        help='Scrape only this machine label, e.g. "BW3-20 [AMR-26]"',
    )
    p.add_argument(
        "--scan-id",
        type=int,
        metavar="ID",
        help="Download only this specific scan ID (use with --machine)",
    )
    p.add_argument(
        "--mosaic-only",
        action="store_true",
        help="Download mosaics only; skip individual tiles",
    )
    p.add_argument(
        "--dry-run",
        action="store_true",
        help="Preview what would be downloaded without saving any files",
    )
    p.add_argument(
        "--workers",
        type=int,
        metavar="N",
        help="Override parallel download threads from config",
    )
    p.add_argument(
        "--list-machines",
        action="store_true",
        help="Print available machines and exit (no credentials needed)",
    )
    p.add_argument(
        "--list-scans",
        action="store_true",
        help="Print all scans for --machine and exit",
    )
    p.add_argument(
        "--recheck",
        action="store_true",
        help=(
            "Scan the archive for zero-byte or missing tile files whose URLs "
            "are marked complete in .progress.json, remove them from progress, "
            "and report how many were re-queued. Run before resuming after a crash."
        ),
    )
    p.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable debug logging",
    )
    # parse_args(None) reads the process arguments, exactly as before.
    return p.parse_args(argv)
|
||||
|
||||
|
||||
def main() -> None:
    """
    Entry point for the command-line interface.

    Depending on the parsed flags this runs one of four modes:

    * ``--list-machines``: print the machine table and return (the config
      file is optional here; only ``base_url`` and ``timeout`` are read).
    * ``--recheck``: run both archive integrity checks and return.
    * ``--list-scans``: log in to the single selected machine, print its
      scans and return.
    * otherwise: scrape every selected machine, then log a summary.

    Exits via ``sys.exit`` with a message when the config file is missing,
    the machine list cannot be retrieved, no machine is selected, or a login
    needed for ``--list-scans`` fails.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)-8s %(message)s",
        datefmt="%H:%M:%S",
    )
    args = parse_args()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # --list-machines doesn't need credentials
    if args.list_machines:
        base_url = "http://205.149.147.131:8010/"
        timeout = 30
        if os.path.exists(args.config):
            # Close the handle deterministically, and treat an empty YAML
            # document (safe_load returns None) as "no overrides".
            with open(args.config, encoding="utf-8") as fh:
                cfg = yaml.safe_load(fh) or {}
            base_url = cfg.get("base_url", base_url)
            timeout = cfg.get("timeout", timeout)
        machines = discover_machines(base_url, timeout)
        print(f"{'Label':<25} {'ID':>4} {'IP':<17} {'Version'}")
        print("-" * 62)
        for m in machines:
            print(
                f"{m['label']:<25} {m['machine_id']:>4} {m['ip']:<17} {m['version']}"
            )
        return

    if not os.path.exists(args.config):
        sys.exit(
            f"Config file '{args.config}' not found.\n"
            f"Copy config.example.yaml to {args.config} and fill in your credentials."
        )

    config = load_config(args.config)
    # NOTE(review): a value of 0 is falsy, so `--workers 0` leaves the
    # configured worker count untouched rather than being clamped.
    if args.workers:
        config["workers"] = _clamp_workers(args.workers)

    output_dir = Path(config["output_dir"])

    # --recheck: validate archive integrity and re-queue bad tiles
    if args.recheck:
        progress = ProgressTracker(output_dir / PROGRESS_FILENAME)
        n_bad = recheck_tile_files(output_dir, progress)
        n_requeued = recheck_archive(output_dir, progress)
        if n_bad == 0 and n_requeued == 0:
            log.info("Archive looks clean. No action needed.")
        else:
            log.info(
                "Recheck complete: %d zero-byte file(s) deleted, "
                "%d URL(s) re-queued for download.",
                n_bad,
                n_requeued,
            )
        return

    # Build machine list
    all_machines = discover_machines(config["base_url"], config["timeout"])
    if not all_machines:
        sys.exit("Could not retrieve machine list from server.")

    # Apply --machine / config machines filter (the CLI flag wins)
    filter_labels: list[str] | None = None
    if args.machine:
        filter_labels = [args.machine]
    elif config.get("machines"):
        filter_labels = list(config["machines"])

    if filter_labels:
        machines = [m for m in all_machines if m["label"] in filter_labels]
        # Build the set of matched labels once instead of once per label.
        found_labels = {m["label"] for m in machines}
        not_found = [label for label in filter_labels if label not in found_labels]
        if not_found:
            log.warning("Unknown machine label(s): %s", not_found)
    else:
        machines = all_machines

    if not machines:
        sys.exit("No machines selected.")

    # --list-scans: print and exit
    if args.list_scans:
        if len(machines) != 1:
            sys.exit("--list-scans requires exactly one machine (use --machine).")
        sess = MachineSession(machines[0], config)
        if not sess.login():
            sys.exit("Login failed.")
        scans = sess.get_all_scans()
        print(f"{'ID':>8} {'Date':<22} {'Name':<40} {'Status'}")
        print("-" * 85)
        for sc in scans:
            print(
                f"{sc['scan_id']:>8} {sc.get('scan_time', ''):<22} "
                f"{sc.get('name', ''):<40} {sc.get('status', '')}"
            )
        print(f"\nTotal: {len(scans)} scans")
        return

    log.info(
        "Scraping %d machine(s): %s",
        len(machines),
        ", ".join(m["label"] for m in machines),
    )
    if args.mosaic_only:
        log.info("Mode: mosaics only (individual tiles skipped)")
    if args.dry_run:
        log.info("Mode: dry-run (no files will be written)")

    # Shared progress and CSV writers
    progress = ProgressTracker(output_dir / PROGRESS_FILENAME)
    tiles_csv = CsvWriter(output_dir / TILES_CSV_FILENAME, TILES_CSV_FIELDS)
    scans_csv = CsvWriter(output_dir / SCANS_CSV_FILENAME, SCANS_CSV_FIELDS)

    total = 0
    try:
        for machine in machines:
            count = scrape_machine(
                machine=machine,
                config=config,
                output_dir=output_dir,
                progress=progress,
                tiles_csv=tiles_csv,
                scans_csv=scans_csv,
                dry_run=args.dry_run,
                mosaic_only=args.mosaic_only,
                scan_id_filter=args.scan_id,
            )
            total += count
    finally:
        # Always release the CSV handles and persist progress, even when a
        # machine fails part-way through.
        tiles_csv.close()
        scans_csv.close()
        progress.save()

    if args.dry_run:
        log.info("Dry run complete.")
    else:
        log.info("Done. Total files downloaded: %d", total)
        log.info("Tiles CSV : %s", output_dir / TILES_CSV_FILENAME)
        log.info("Scans CSV : %s", output_dir / SCANS_CSV_FILENAME)
        log.info("Progress : %s", output_dir / PROGRESS_FILENAME)
|
||||
@@ -0,0 +1,307 @@
|
||||
"""
|
||||
High-level scrape orchestration: drives the per-machine and per-scan loops.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
from spruce.paths import machine_dir_name, tile_dest, mosaic_dest, _extract_date
|
||||
from spruce.progress import ProgressTracker, CsvWriter
|
||||
from spruce.session import MachineSession
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Per-scan helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _download_mosaic(
    sess: MachineSession,
    scan_meta: dict[str, Any],
    scan_id: int,
    mosaic_path: Path,
    progress: ProgressTracker,
    machine: dict[str, Any],
    dry_run: bool,
) -> bool:
    """
    Fetch the mosaic image of one scan unless progress already lists it.

    Returns True only when a file was actually downloaded during this call;
    an already-completed URL, a dry run, or a falsy size from the session's
    ``download_file`` all yield False.
    """
    mosaic_url = sess.mosaic_url(scan_id)

    # Nothing to do when a previous run already recorded this URL.
    if progress.is_done(mosaic_url):
        return False

    if dry_run:
        log.info("[DRY-RUN] Mosaic: %s → %s", mosaic_url, mosaic_path)
        return False

    label = machine["label"]
    log.info("[%s] Downloading mosaic for scan %d …", label, scan_id)
    n_bytes = sess.download_file(mosaic_url, mosaic_path)
    if not n_bytes:
        return False

    # Record and persist immediately so the mosaic is not fetched again.
    progress.mark_done(mosaic_url)
    progress.save()
    log.info("[%s] Mosaic saved: %s (%.1f MB)", label, mosaic_path, n_bytes / 1e6)
    return True
|
||||
|
||||
|
||||
def _download_tiles_for_scan(
    sess: MachineSession,
    tiles: list[dict[str, Any]],
    scan_meta: dict[str, Any],
    scan_id: int,
    output_dir: Path,
    machine: dict[str, Any],
    config: dict[str, Any],
    progress: ProgressTracker,
    tiles_csv: CsvWriter,
    dry_run: bool,
) -> int:
    """
    Download all pending tiles for a scan. Returns count of tiles downloaded.

    A tile is pending when its URL is not yet recorded in *progress*.  In a
    dry run only the first few pending URLs are logged and 0 is returned.

    Otherwise each pending tile is submitted to a thread pool of
    ``config["workers"]`` threads.  Results carrying a truthy
    ``file_size_bytes`` are marked done and buffered; the buffer is written
    to *tiles_csv* and progress is saved every ``max(50, workers * 4)``
    successes and once more at the end.

    Side effect: every pending tile dict gets a ``scan_time`` key copied from
    *scan_meta* so that the CSV rows carry it.

    A worker that raises is logged and skipped; its URL stays unmarked so the
    tile is retried on the next run.
    """
    pending = [t for t in tiles if not progress.is_done(t["url"])]
    log.info(
        "[%s] Scan %d: %d tiles total, %d pending.",
        machine["label"],
        scan_id,
        len(tiles),
        len(pending),
    )

    if dry_run:
        for t in pending[:5]:
            log.info("[DRY-RUN] Tile: %s", t["url"])
        if len(pending) > 5:
            log.info("[DRY-RUN] … and %d more tiles.", len(pending) - 5)
        return 0

    # Attach scan_time for CSV rows
    for t in pending:
        t["scan_time"] = scan_meta.get("scan_time", "")

    workers: int = config["workers"]
    downloaded = 0

    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = {
            pool.submit(
                sess.download_tile,
                tile,
                tile_dest(output_dir, machine, scan_meta, tile),
                False,
            ): tile
            for tile in pending
        }

        save_every = max(50, workers * 4)
        batch: list[dict[str, Any]] = []

        with tqdm(
            total=len(pending),
            desc=f"{machine['label']} scan {scan_id}",
            unit="tile",
            leave=True,
        ) as pbar:
            for future in as_completed(futures):
                try:
                    result = future.result()
                except Exception as exc:
                    # Letting this propagate would abandon the buffered CSV
                    # rows (whose URLs are already marked done) and leave the
                    # remaining finished downloads unrecorded.  Log it and
                    # carry on; the tile is not marked done, so it is retried.
                    log.error(
                        "[%s] Scan %d: tile %s failed: %s",
                        machine["label"],
                        scan_id,
                        futures[future]["url"],
                        exc,
                    )
                    pbar.update(1)
                    continue

                if result.get("file_size_bytes"):
                    batch.append(result)
                    progress.mark_done(result["url"])
                    downloaded += 1
                pbar.update(1)

                if len(batch) >= save_every:
                    for row in batch:
                        tiles_csv.write(row)
                    progress.save()
                    batch.clear()

        # Flush whatever is left from the final partial batch.
        for row in batch:
            tiles_csv.write(row)
        progress.save()

    log.info(
        "[%s] Scan %d complete: %d tiles downloaded.",
        machine["label"],
        scan_id,
        downloaded,
    )
    return downloaded
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Per-scan driver
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def process_scan(
    sess: MachineSession,
    scan: dict[str, Any],
    output_dir: Path,
    machine: dict[str, Any],
    config: dict[str, Any],
    progress: ProgressTracker,
    scans_csv: CsvWriter,
    tiles_csv: CsvWriter,
    dry_run: bool,
    mosaic_only: bool,
) -> int:
    """
    Handle a single scan end to end.

    Fetches the scan's detail metadata, fills gaps from the list-level
    *scan* dict, writes ``metadata.json`` (once, and not in a dry run),
    downloads the mosaic, appends one row to *scans_csv* and, unless
    *mosaic_only* is set, downloads the tiles.

    Returns the number of files downloaded for this scan; 0 when metadata
    cannot be fetched or lacks the ``nx``/``ny`` grid size.
    """
    label = machine["label"]
    scan_id: int = scan["scan_id"]
    log.info("[%s] Processing scan %d …", label, scan_id)

    try:
        scan_meta = sess.get_scan_metadata(scan_id)
    except Exception as exc:
        log.error("[%s] Cannot fetch scan %d metadata: %s", label, scan_id, exc)
        return 0

    if not (scan_meta.get("nx") and scan_meta.get("ny")):
        log.warning("[%s] Scan %d: missing grid params, skipping.", label, scan_id)
        return 0

    # Values from the detail page win; the list row only fills the gaps.
    list_level_keys = (
        "name",
        "scan_time",
        "start_datetime",
        "end_datetime",
        "status",
        "user",
        "scan_lines",
        "scan_mode",
    )
    for key in list_level_keys:
        scan_meta.setdefault(key, scan.get(key, ""))

    # Per-scan metadata.json lives next to the downloaded images.
    date_part = _extract_date(scan_meta.get("scan_time", ""))
    scan_dir = output_dir / machine_dir_name(machine) / date_part / str(scan_id)
    if not dry_run:
        scan_dir.mkdir(parents=True, exist_ok=True)
        meta_file = scan_dir / "metadata.json"
        if not meta_file.exists():
            serialised = json.dumps(scan_meta, indent=2, default=str)
            meta_file.write_text(serialised, encoding="utf-8")

    # Mosaic
    mosaic_path = mosaic_dest(output_dir, machine, scan_meta, scan_id)
    mosaic_url = sess.mosaic_url(scan_id)
    mosaic_downloaded = _download_mosaic(
        sess, scan_meta, scan_id, mosaic_path, progress, machine, dry_run
    )
    total = int(bool(mosaic_downloaded))

    # Scan-level CSV row: identity columns, metadata columns, mosaic columns.
    row: dict[str, Any] = {
        "machine": machine["label"],
        "machine_id": machine["machine_id"],
        "scan_id": scan_id,
    }
    meta_columns = (
        "name",
        "scan_time",
        "start_x",
        "start_y",
        "end_x",
        "end_y",
        "dx",
        "dy",
        "nx",
        "ny",
        "total_tiles",
        "scan_lines",
        "scan_mode",
        "start_datetime",
        "end_datetime",
        "status",
        "user",
        "disk_space_mb",
    )
    row.update({column: scan_meta.get(column, "") for column in meta_columns})
    row["mosaic_url"] = mosaic_url
    row["mosaic_local_path"] = str(mosaic_path)
    row["mosaic_downloaded"] = mosaic_downloaded
    scans_csv.write(row)

    if mosaic_only:
        return total

    # Tiles
    tiles = sess.enumerate_tiles(scan_meta)
    tile_count = _download_tiles_for_scan(
        sess,
        tiles,
        scan_meta,
        scan_id,
        output_dir,
        machine,
        config,
        progress,
        tiles_csv,
        dry_run,
    )
    return total + tile_count
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Per-machine driver
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def scrape_machine(
    machine: dict[str, Any],
    config: dict[str, Any],
    output_dir: Path,
    progress: ProgressTracker,
    tiles_csv: CsvWriter,
    scans_csv: CsvWriter,
    dry_run: bool,
    mosaic_only: bool,
    scan_id_filter: int | None,
) -> int:
    """
    Open a session for one machine and process its scans.

    With *scan_id_filter* set, only that scan ID is processed (using a
    synthetic list entry); otherwise every scan the session reports is
    processed.  Returns the total number of files downloaded, or 0 when the
    login fails or the machine reports no scans.
    """
    sess = MachineSession(machine, config)
    if not sess.login():
        return 0

    scans: list[dict[str, Any]]
    if scan_id_filter is None:
        scans = sess.get_all_scans()
        if not scans:
            log.warning("[%s] No scans found.", machine["label"])
            return 0
    else:
        # A single targeted scan does not need the full scan list.
        scans = [{"scan_id": scan_id_filter, "status": "Completed"}]
        log.info("[%s] Targeting scan ID %d.", machine["label"], scan_id_filter)

    return sum(
        process_scan(
            sess=sess,
            scan=entry,
            output_dir=output_dir,
            machine=machine,
            config=config,
            progress=progress,
            scans_csv=scans_csv,
            tiles_csv=tiles_csv,
            dry_run=dry_run,
            mosaic_only=mosaic_only,
        )
        for entry in scans
    )
|
||||
@@ -0,0 +1,213 @@
|
||||
"""
|
||||
Pure HTML / text parsing functions for the RootView web application.
|
||||
|
||||
All functions are side-effect-free: string (or list[str]) in, dict/list out.
|
||||
No network access, no filesystem access.
|
||||
"""
|
||||
|
||||
import math
|
||||
import re
|
||||
from typing import Any
|
||||
from urllib.parse import unquote
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Machine descriptor
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def parse_machine_option(label: str, raw_value: str) -> dict[str, Any]:
    """
    Decode the pipe-delimited <option> value for a machine.

    *raw_value* is percent-decoded and split on ``|``.  The fields are read
    positionally as name, ip, port1, machine_id, port2 and version; any
    field missing from a short value becomes ``""``.  The undecoded value is
    kept under ``option_value`` and *label* under ``label``.

    When the name field is empty, *label* is used as the name.
    """
    parts = unquote(raw_value).split("|")

    def field(index: int) -> str:
        """Return the field at *index*, or "" when the value is too short."""
        return parts[index] if index < len(parts) else ""

    return {
        "label": label,
        "option_value": raw_value,
        # str.split always yields at least one element, so a length check
        # can never trigger the label fallback; test for emptiness instead.
        "name": parts[0] or label,
        "ip": field(1),
        "port1": field(2),
        "machine_id": field(3),
        "port2": field(4),
        "version": field(5),
    }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scan list row
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def parse_scan_row(cells: list[str]) -> dict[str, Any] | None:
|
||||
"""
|
||||
Parse one table row from the scan list into a scan dict.
|
||||
|
||||
Expected columns (from the observed HTML):
|
||||
ID, Name, Scan Time, Step Units, (X,Y)-(X,Y)-(DX,DY),
|
||||
Dwell Time ms, Scan Lines, Scan Mode, Start Time, End Time,
|
||||
Cancelled, User, Scan Status, Archived, [View link]
|
||||
|
||||
Returns None for header rows or rows whose first cell is not a digit.
|
||||
"""
|
||||
if not cells or not cells[0].strip().isdigit():
|
||||
return None
|
||||
try:
|
||||
scan_id = int(cells[0].strip())
|
||||
return {
|
||||
"scan_id": scan_id,
|
||||
"name": cells[1].strip() if len(cells) > 1 else "",
|
||||
"scan_time": cells[2].strip() if len(cells) > 2 else "",
|
||||
"step_units": cells[3].strip() if len(cells) > 3 else "",
|
||||
"coord_str": cells[4].strip() if len(cells) > 4 else "",
|
||||
"dwell_ms": cells[5].strip() if len(cells) > 5 else "",
|
||||
"scan_lines": cells[6].strip() if len(cells) > 6 else "",
|
||||
"scan_mode": cells[7].strip() if len(cells) > 7 else "",
|
||||
"start_datetime": cells[8].strip() if len(cells) > 8 else "",
|
||||
"end_datetime": cells[9].strip() if len(cells) > 9 else "",
|
||||
"cancelled": cells[10].strip() if len(cells) > 10 else "",
|
||||
"user": cells[11].strip() if len(cells) > 11 else "",
|
||||
"status": cells[12].strip() if len(cells) > 12 else "",
|
||||
"archived": cells[13].strip() if len(cells) > 13 else "",
|
||||
}
|
||||
except (ValueError, IndexError):
|
||||
return None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scan view page
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def parse_scan_view(html: str) -> dict[str, Any]:
    """
    Pull the grid parameters and descriptive fields out of a scan view page.

    Three sources are consulted in turn: the query string of the embedded
    ``tile_view.php`` URL, standalone JS ``var`` declarations (only for
    values the URL did not provide), and the label/value cells of the page's
    tables.  Grid values end up under ``start_x``, ``start_y``, ``end_x``,
    ``end_y``, ``dx`` and ``dy``; ``nx``, ``ny`` and ``total_tiles`` are
    derived from them when the table did not state them.

    Returns a dict containing whichever keys could be extracted.
    """
    parsed: dict[str, Any] = {}
    grid_names = ("startX", "startY", "endX", "endY", "deltaX", "deltaY")

    # Source 1: query parameters of the show_tile() URL in the page JS, e.g.
    # "include/tile_view.php?...&sX=0&sY=0&eX=310&eY=740&dX=3.01&dY=2.26&..."
    url_match = re.search(r'tile_view\.php\?([^"\']+)', html)
    if url_match:
        query = url_match.group(1)
        for short_name, long_name in zip(
            ("sX", "sY", "eX", "eY", "dX", "dY"), grid_names
        ):
            hit = re.search(rf"(?:^|&){re.escape(short_name)}=([\d.]+)", query)
            if hit:
                parsed[long_name] = float(hit.group(1))

    # Source 2: JS var declarations, used only where the URL gave nothing.
    for long_name in grid_names:
        if long_name in parsed:
            continue
        hit = re.search(rf"var\s+{long_name}\s*=\s*([\d.]+)", html)
        if hit:
            parsed[long_name] = float(hit.group(1))

    # Source 3: label cell followed by value cell in the data table.
    text_fields = {
        "Name:": "name",
        "Scan Time:": "scan_time",
        "Starting X:": "start_x_label",
        "Starting Y:": "start_y_label",
        "Ending X:": "end_x_label",
        "Ending Y:": "end_y_label",
        "DX:": "dx_label",
        "DY:": "dy_label",
        "Scan Lines:": "scan_lines",
        "Scan Mode:": "scan_mode",
        "Start Time:": "start_datetime",
        "End Time:": "end_datetime",
        "Scan Status:": "status",
        "User:": "user",
    }
    soup = BeautifulSoup(html, "lxml")
    for table_row in soup.find_all("tr"):
        texts = [td.get_text(strip=True) for td in table_row.find_all("td")]
        # Pair each cell with its right-hand neighbour; the last cell has none.
        for label, value in zip(texts, texts[1:]):
            if label in text_fields:
                parsed[text_fields[label]] = value
            elif label == "Scan ID:":
                try:
                    parsed["scan_id"] = int(value)
                except ValueError:
                    pass
            elif label == "Total number of images:":
                # Format: "33784 (103x328)"
                counts = re.match(r"(\d+)\s*\((\d+)x(\d+)\)", value)
                if counts:
                    parsed["total_tiles"] = int(counts.group(1))
                    parsed["nx"] = int(counts.group(2))
                    parsed["ny"] = int(counts.group(3))
            elif label == "Total Disk Space:":
                space = re.search(r"([\d.]+)\s*Mb", value)
                if space:
                    parsed["disk_space_mb"] = float(space.group(1))

    # Rename the JS/URL grid names to the canonical snake_case keys.
    canonical_names = ("start_x", "start_y", "end_x", "end_y", "dx", "dy")
    for long_name, canonical in zip(grid_names, canonical_names):
        if long_name in parsed:
            parsed[canonical] = parsed.pop(long_name)

    # Derive the grid size when the table did not state it.
    if "nx" not in parsed and {"start_x", "end_x", "dx"} <= parsed.keys():
        parsed["nx"] = _grid_count(parsed["start_x"], parsed["end_x"], parsed["dx"])
    if "ny" not in parsed and {"start_y", "end_y", "dy"} <= parsed.keys():
        parsed["ny"] = _grid_count(parsed["start_y"], parsed["end_y"], parsed["dy"])
    if "total_tiles" not in parsed and "nx" in parsed and "ny" in parsed:
        parsed["total_tiles"] = parsed["nx"] * parsed["ny"]

    return parsed
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Grid helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _grid_count(start: float, end: float, step: float) -> int:
|
||||
"""Number of grid positions from start up to (but not including) end."""
|
||||
if step <= 0:
|
||||
return 0
|
||||
return math.ceil((end - start) / step)
|
||||
|
||||
|
||||
def _grid_values(start: float, count: int, step: float) -> list[float]:
|
||||
"""Generate `count` evenly-spaced grid positions, rounded to 2 dp."""
|
||||
return [round(start + i * step, 2) for i in range(count)]
|
||||
@@ -0,0 +1,62 @@
|
||||
"""
|
||||
Pure path-helper functions — no network, no JSON, no progress state.
|
||||
"""
|
||||
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
def machine_dir_name(machine: dict[str, Any]) -> str:
    """
    Derive a directory-safe name from the machine's label.

    Every character other than a word character, hyphen or dot is replaced
    by an underscore, after which leading and trailing underscores are
    removed.
    """
    label = machine["label"]
    replaced = re.sub(r"[^\w\-.]", "_", label)
    return replaced.strip("_")
|
||||
|
||||
|
||||
def _extract_date(dt_str: str) -> str:
|
||||
"""Pull YYYY-MM-DD from a datetime string, fall back to 'unknown'."""
|
||||
m = re.search(r"(\d{4}-\d{2}-\d{2})", dt_str)
|
||||
return m.group(1) if m else "unknown"
|
||||
|
||||
|
||||
def tile_dest(
    output_dir: Path,
    machine: dict[str, Any],
    scan_meta: dict[str, Any],
    tile: dict[str, Any],
) -> Path:
    """
    Compute where a single tile image is stored locally.

    Layout: ``<output_dir>/<machine dir>/<scan date>/<scan id>/tiles/``
    followed by ``tile_r<row>_c<col>.jpg``.  Row and column indices are
    zero-padded to the number of digits of the largest index in the grid
    (``ny - 1`` and ``nx - 1``).
    """
    date_part = _extract_date(scan_meta.get("scan_time", ""))
    scan_id = tile["scan_id"]
    n_rows = scan_meta.get("ny", 1)
    n_cols = scan_meta.get("nx", 1)

    # A one-element (or smaller) axis needs a single digit.
    row_width = 1
    if n_rows > 1:
        row_width = len(str(n_rows - 1))
    col_width = 1
    if n_cols > 1:
        col_width = len(str(n_cols - 1))

    row_part = format(tile["row_index"], f"0{row_width}d")
    col_part = format(tile["col_index"], f"0{col_width}d")
    filename = f"tile_r{row_part}_c{col_part}.jpg"

    return output_dir.joinpath(
        machine_dir_name(machine), date_part, str(scan_id), "tiles", filename
    )
|
||||
|
||||
|
||||
def mosaic_dest(
    output_dir: Path,
    machine: dict[str, Any],
    scan_meta: dict[str, Any],
    scan_id: int,
) -> Path:
    """
    Compute where a scan's mosaic image is stored locally.

    Layout: ``<output_dir>/<machine dir>/<scan date>/<scan id>/mosaic.jpg``,
    where the date comes from the scan's ``scan_time``.
    """
    date_part = _extract_date(scan_meta.get("scan_time", ""))
    return output_dir.joinpath(
        machine_dir_name(machine), date_part, str(scan_id), "mosaic.jpg"
    )
|
||||
@@ -0,0 +1,82 @@
|
||||
"""
|
||||
Progress tracking (JSON) and CSV writing.
|
||||
"""
|
||||
|
||||
import csv
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Iterator
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ProgressTracker:
    """
    Tracks successfully downloaded URLs in a JSON file so runs can be resumed.

    The file holds a single object with a ``completed_urls`` list.

    Public API (all external code should use only these methods):
        is_done(url)    — True if url has been downloaded
        mark_done(url)  — Record url as complete (call save() to persist)
        discard(url)    — Remove url from the completed set
        iter_urls()     — Iterate over all completed URLs
        __len__()       — Number of completed URLs
        save()          — Flush state to disk
    """

    def __init__(self, path: Path) -> None:
        """Remember *path* and load any state already stored there."""
        self.path = path
        self._done: set[str] = set()
        self._load()

    def _load(self) -> None:
        """Read the completed set from disk; keep it empty if unreadable."""
        if not self.path.exists():
            return
        try:
            data = json.loads(self.path.read_text(encoding="utf-8"))
            self._done = set(data.get("completed_urls", []))
        except (OSError, ValueError, TypeError, AttributeError):
            # Unreadable file, invalid JSON/encoding, or an unexpected
            # top-level shape: fall back to an empty set.
            log.warning("Could not read progress file; starting fresh.")
        else:
            log.info("Resuming: %d URLs already downloaded.", len(self._done))

    def is_done(self, url: str) -> bool:
        """Return True if *url* is recorded as downloaded."""
        return url in self._done

    def mark_done(self, url: str) -> None:
        """Record *url* as downloaded (in memory only until save())."""
        self._done.add(url)

    def discard(self, url: str) -> None:
        """Remove a URL from the completed set (re-queues it for download)."""
        self._done.discard(url)

    def iter_urls(self) -> Iterator[str]:
        """Iterate over all completed URLs."""
        return iter(self._done)

    def __len__(self) -> int:
        """Return the number of completed URLs."""
        return len(self._done)

    def save(self) -> None:
        """
        Write the completed set to ``self.path`` as sorted JSON.

        The data goes to a sibling ``<name>.tmp`` file first and is then
        renamed over the real file, so an interruption during the write
        cannot leave a truncated progress file behind.
        """
        self.path.parent.mkdir(parents=True, exist_ok=True)
        payload = json.dumps({"completed_urls": sorted(self._done)}, indent=2)
        tmp_path = self.path.with_name(self.path.name + ".tmp")
        tmp_path.write_text(payload, encoding="utf-8")
        tmp_path.replace(self.path)
|
||||
|
||||
|
||||
class CsvWriter:
    """
    Append-mode CSV writer that writes a header when the file has no content.

    The header row is emitted when the target file is missing or zero bytes
    long, so a file left empty by an interrupted earlier run still ends up
    with column names.
    """

    def __init__(self, path: Path, fields: list[str]) -> None:
        """Open *path* for appending, creating parent directories as needed."""
        path.parent.mkdir(parents=True, exist_ok=True)
        needs_header = not path.exists() or path.stat().st_size == 0
        self._fh = open(path, "a", newline="", encoding="utf-8")
        self._fields = fields
        self._writer = csv.DictWriter(self._fh, fieldnames=fields)
        if needs_header:
            self._writer.writeheader()
            # Push the header out now so it is not stuck in the buffer if
            # the process stops before the first row is written.
            self._fh.flush()

    def write(self, row: dict) -> None:
        """
        Append one row and flush it.

        Only the configured fields are written: keys of *row* outside the
        field list are dropped and missing fields become "".
        """
        self._writer.writerow({f: row.get(f, "") for f in self._fields})
        self._fh.flush()

    def close(self) -> None:
        """Close the underlying file handle."""
        self._fh.close()
|
||||
@@ -0,0 +1,156 @@
|
||||
"""
|
||||
Archive integrity checks — find corrupt / missing tiles and remove them
|
||||
from the progress tracker so they are re-downloaded on the next run.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import urllib.parse
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from spruce.progress import ProgressTracker
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Private helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _parse_tile_url(url: str) -> dict[str, str]:
|
||||
"""Extract scan_id, x, y from a tile URL query string."""
|
||||
qs = dict(urllib.parse.parse_qsl(urllib.parse.urlparse(url).query))
|
||||
return {
|
||||
"scan_id": qs.get("id", ""),
|
||||
"x": qs.get("x", ""),
|
||||
"y": qs.get("y", ""),
|
||||
}
|
||||
|
||||
|
||||
def _build_disk_index(output_dir: Path) -> dict[Path, int]:
|
||||
"""Return {tile_path: size_bytes} for every tile file found on disk."""
|
||||
return {p: p.stat().st_size for p in output_dir.rglob("tile_r*.jpg")}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public API
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def recheck_tile_files(output_dir: Path, progress: ProgressTracker) -> int:
    """
    Delete zero-byte tile files and forget the progress entries of their scans.

    Every ``tile_r*.jpg`` below *output_dir* is inspected.  A file of size
    zero is deleted, and every completed tile URL (one containing
    ``cmd=image``) whose ``id`` parameter equals the name of the file's scan
    directory is discarded from *progress*.  The discard is deliberately
    scan-wide: a tile file cannot be mapped back to its exact x/y URL from the
    path alone, so all tiles of the affected scan are queued for re-download.

    Progress is saved only when at least one file was deleted.

    Returns the count of files deleted.
    """
    # Group completed tile URLs by scan id once.  Keeping *every* URL of a
    # scan (rather than one per (scan_id, x, y)) ensures URLs that differ only
    # in another parameter, such as the scale, are all discarded.
    urls_by_scan: dict[str, list[str]] = {}
    for url in progress.iter_urls():
        if "cmd=image" in url:
            scan_key = _parse_tile_url(url)["scan_id"]
            urls_by_scan.setdefault(scan_key, []).append(url)

    deleted = 0
    for tile_path in output_dir.rglob("tile_r*.jpg"):
        if tile_path.stat().st_size != 0:
            continue

        log.warning("Deleting zero-byte tile: %s", tile_path)
        tile_path.unlink()
        deleted += 1

        scan_id = _scan_id_from_path(tile_path)
        if not scan_id:
            continue
        # pop() so a second empty tile of the same scan does no repeat work.
        for url in urls_by_scan.pop(scan_id, ()):
            progress.discard(url)

    if deleted:
        log.info("Deleted %d zero-byte tile file(s).", deleted)
        progress.save()
    else:
        log.info("No zero-byte tile files found on disk.")
    return deleted
|
||||
|
||||
|
||||
def recheck_archive(output_dir: Path, progress: ProgressTracker) -> int:
    """
    Verify completed tile URLs against the files on disk and drop stale ones.

    A tile URL (one containing ``cmd=image``) is considered healthy when at
    least one non-empty ``tile_r*.jpg`` file lives under a directory named
    after the URL's scan id.  Note that this is a scan-level check: it does
    not prove that the specific x/y tile of the URL exists.  URLs that fail
    the check are removed from *progress* and the progress file is saved, so
    the next run re-downloads them.

    Only tile URLs are checked (mosaic URLs are skipped — mosaics are large
    single files and are unlikely to be partially written due to streaming).

    Returns the count of entries removed.
    """
    if len(progress) == 0:
        log.info("Progress file is empty — nothing to recheck.")
        return 0

    tile_urls = [u for u in progress.iter_urls() if "cmd=image" in u]
    mosaic_count = len(progress) - len(tile_urls)
    log.info(
        "Rechecking %d tile URLs (%d mosaic URLs not rechecked) …",
        len(tile_urls),
        mosaic_count,
    )

    # Build a disk index once
    existing_files = _build_disk_index(output_dir)
    log.debug("Found %d tile files on disk.", len(existing_files))

    # Collect, in one pass, every path component that has at least one
    # non-empty tile beneath it.  A URL is healthy exactly when its scan id is
    # in this set, which avoids rescanning the whole index for every URL.
    healthy_parts: set[str] = set()
    for path, size in existing_files.items():
        if size > 0:
            healthy_parts.update(path.parts)

    bad_urls: list[str] = [
        url
        for url in tile_urls
        if str(_parse_tile_url(url)["scan_id"]) not in healthy_parts
    ]

    if not bad_urls:
        log.info("All %d tile URLs look healthy.", len(tile_urls))
        return 0

    log.warning(
        "Found %d suspect tile URL(s). Removing from progress.",
        len(bad_urls),
    )
    for url in bad_urls:
        progress.discard(url)
    progress.save()
    log.info(
        "Removed %d URL(s) from .progress.json — they will be re-downloaded on next run.",
        len(bad_urls),
    )
    return len(bad_urls)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Internal utility
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _scan_id_from_path(tile_path: Path) -> str | None:
|
||||
"""
|
||||
Given a tile path like .../158374/tiles/tile_r0_c0.jpg, return '158374'.
|
||||
Looks for the directory two levels above the filename (parent.parent.name).
|
||||
"""
|
||||
try:
|
||||
# structure: <machine>/<date>/<scan_id>/tiles/<filename>
|
||||
return tile_path.parent.parent.name
|
||||
except Exception:
|
||||
return None
|
||||
@@ -0,0 +1,274 @@
|
||||
"""
|
||||
HTTP session for a single RootView machine: login, scan listing, tile downloads.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from urllib.parse import urljoin
|
||||
from typing import Any
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from spruce.parsers import parse_scan_row, parse_scan_view, _grid_values
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
USER_AGENT = "spruce-scraper/1.0"
|
||||
|
||||
|
||||
class MachineSession:
    """Manages an authenticated HTTP session for one RootView machine.

    One instance wraps a ``requests.Session`` and offers login, paginated
    scan listing, per-scan metadata lookup, tile URL enumeration and file
    downloads.  All network calls use ``config["timeout"]``; listing and
    metadata calls additionally sleep ``config["request_delay"]`` seconds
    before each request.
    """

    def __init__(self, machine: dict[str, Any], config: dict[str, Any]) -> None:
        """Create the session.

        Args:
            machine: Machine descriptor; ``label``, ``option_value`` and
                ``machine_id`` are read by the methods of this class.
            config: Scraper configuration; ``base_url`` is required here,
                ``image_base_url`` is optional.
        """
        self.machine = machine
        self.cfg = config
        self.http = requests.Session()
        self.http.headers["User-Agent"] = USER_AGENT
        self.base_url: str = config["base_url"]
        self.image_base_url: str = config.get(
            "image_base_url", "http://205.149.147.131:8011/"
        )

    # ------------------------------------------------------------------
    # Auth
    # ------------------------------------------------------------------

    def login(self) -> bool:
        """POST the login form; return True unless the request or login fails.

        A login counts as rejected when the response page contains an element
        with class ``error`` that has non-empty text.
        """
        url = urljoin(self.base_url, "index.php")
        payload = {
            "RTLLogin": "1",
            "RTLNAME": self.machine["option_value"],
            "RTLUSER": self.cfg["username"],
            "RTLPWD": self.cfg["password"],
            "rtl_latest_version": "3.0.0.18",
            "submit": " submit ",
        }
        try:
            resp = self.http.post(url, data=payload, timeout=self.cfg["timeout"])
            resp.raise_for_status()
        except requests.RequestException as exc:
            log.error("[%s] Login failed: %s", self.machine["label"], exc)
            return False

        soup = BeautifulSoup(resp.text, "lxml")
        error_tag = soup.find(class_="error")
        if error_tag and error_tag.get_text(strip=True):
            log.error(
                "[%s] Login rejected: %s",
                self.machine["label"],
                error_tag.get_text(strip=True),
            )
            return False

        log.info("[%s] Login succeeded.", self.machine["label"])
        return True

    # ------------------------------------------------------------------
    # Scan list (paginated)
    # ------------------------------------------------------------------

    def get_all_scans(self) -> list[dict[str, Any]]:
        """
        Fetch the complete scan list across all pages.

        Uses a large FilterCount (320) to minimise round-trips.
        Keeps requesting further pages until a page comes back empty or
        shorter than the page size.
        """
        all_scans: list[dict[str, Any]] = []
        start = 0
        page_size = 320

        while True:
            page_scans = self._fetch_scan_page(start, page_size)
            if not page_scans:
                break
            all_scans.extend(page_scans)
            log.debug(
                "[%s] Page start=%d: %d scans (total so far: %d)",
                self.machine["label"],
                start,
                len(page_scans),
                len(all_scans),
            )
            # A short page means there is nothing after it.
            if len(page_scans) < page_size:
                break
            start += page_size
            time.sleep(self.cfg["request_delay"])

        log.info("[%s] Found %d scans.", self.machine["label"], len(all_scans))
        return all_scans

    def _fetch_scan_page(
        self, start: int, page_size: int
    ) -> list[dict[str, Any]]:
        """POST the scan list form and parse the returned table.

        Every ``<tr>`` of the response is handed to ``parse_scan_row`` as a
        list of stripped cell texts; rows for which it returns a falsy value
        are skipped.  Raises ``requests.HTTPError`` on a non-success status.
        """
        time.sleep(self.cfg["request_delay"])
        resp = self.http.post(
            urljoin(self.base_url, "index.php"),
            data={
                "cmd": "scan",
                "start": str(start),
                "order": "0",
                "order_dir": "1",
                "FilterScanStatus": "2",  # Completed scans
                "FilterUser": "",
                "hidedate": "",
                "FilterDtFrom": "",
                "FilterDtTo": "",
                "FilterIdFrom": "0",
                "FilterIdTo": "0",
                "FilterCount": str(page_size),
            },
            timeout=self.cfg["timeout"],
        )
        resp.raise_for_status()

        soup = BeautifulSoup(resp.text, "lxml")
        scans: list[dict[str, Any]] = []
        for row in soup.find_all("tr"):
            cells = [td.get_text(strip=True) for td in row.find_all("td")]
            scan = parse_scan_row(cells)
            if scan:
                scans.append(scan)
        return scans

    # ------------------------------------------------------------------
    # Scan detail
    # ------------------------------------------------------------------

    def get_scan_metadata(self, scan_id: int) -> dict[str, Any]:
        """Fetch the scan view page and extract grid parameters.

        Raises ``requests.HTTPError`` on a non-success status.
        """
        time.sleep(self.cfg["request_delay"])
        resp = self.http.get(
            urljoin(self.base_url, "index.php"),
            params={"cmd": "scan", "mode": "view", "id": str(scan_id)},
            timeout=self.cfg["timeout"],
        )
        resp.raise_for_status()
        return parse_scan_view(resp.text)

    # ------------------------------------------------------------------
    # Tile enumeration
    # ------------------------------------------------------------------

    def enumerate_tiles(self, scan_meta: dict[str, Any]) -> list[dict[str, Any]]:
        """
        Generate the full list of tile descriptors for a scan.

        ``scan_meta`` must contain ``scan_id``; ``nx``/``ny`` default to 0,
        ``start_x``/``start_y`` to 0.0 and ``dx``/``dy`` to 1.0 when absent.
        Tiles are produced row by row (outer loop over y, inner over x).

        Each descriptor has: scan_id, url, row_index, col_index, x_mm, y_mm
        """
        scan_id = scan_meta["scan_id"]
        nx: int = scan_meta.get("nx", 0)
        ny: int = scan_meta.get("ny", 0)
        start_x: float = scan_meta.get("start_x", 0.0)
        start_y: float = scan_meta.get("start_y", 0.0)
        dx: float = scan_meta.get("dx", 1.0)
        dy: float = scan_meta.get("dy", 1.0)
        scale: int = self.cfg.get("tile_scale", 1)

        xs = _grid_values(start_x, nx, dx)
        ys = _grid_values(start_y, ny, dy)

        tiles: list[dict[str, Any]] = []
        for row_idx, y in enumerate(ys):
            for col_idx, x in enumerate(xs):
                # The exact URL text doubles as the progress-tracking key,
                # so its format must stay stable between runs.
                url = (
                    urljoin(self.base_url, "index.php")
                    + f"?cmd=image&mode=image_scan&id={scan_id}"
                    + f"&s={scale}&x={x}&y={y}"
                )
                tiles.append(
                    {
                        "scan_id": scan_id,
                        "row_index": row_idx,
                        "col_index": col_idx,
                        "x_mm": x,
                        "y_mm": y,
                        "url": url,
                    }
                )
        return tiles

    # ------------------------------------------------------------------
    # Mosaic URL
    # ------------------------------------------------------------------

    def mosaic_url(self, scan_id: int) -> str:
        """Return the mosaic image URL for *scan_id* on the image server."""
        return urljoin(
            self.image_base_url, f"RootView_Database/{scan_id}/mosaic.jpg"
        )

    # ------------------------------------------------------------------
    # Downloads
    # ------------------------------------------------------------------

    @staticmethod
    def _discard_partial(part: Path) -> None:
        """Best-effort removal of a leftover partial download file."""
        try:
            part.unlink(missing_ok=True)
        except OSError as exc:
            log.debug("Could not remove partial download %s: %s", part, exc)

    def download_file(self, url: str, dest: Path, retries: int = 3) -> int:
        """Stream-download url to dest with retries. Returns bytes written (0 on failure).

        The body is streamed into a sibling ``<dest name>.part`` file and only
        moved onto *dest* once the whole response has been received, so a
        failed or interrupted transfer never leaves a truncated or empty file
        at *dest*.  Failed attempts are retried with a doubling back-off that
        starts at 5 seconds.  An empty response body is treated as a failure
        and returns 0 without creating *dest*.
        """
        dest.parent.mkdir(parents=True, exist_ok=True)
        part = dest.with_name(dest.name + ".part")
        backoff = 5.0
        for attempt in range(1, retries + 1):
            try:
                size = 0
                # Context manager releases the streamed connection even when
                # reading the body fails half-way.
                with self.http.get(
                    url, timeout=self.cfg["timeout"], stream=True
                ) as resp:
                    resp.raise_for_status()
                    with open(part, "wb") as fh:
                        for chunk in resp.iter_content(chunk_size=65536):
                            if chunk:
                                fh.write(chunk)
                                size += len(chunk)
                if size == 0:
                    self._discard_partial(part)
                    return 0
                part.replace(dest)
                return size
            except Exception as exc:
                # Deliberately broad: this is the per-file boundary of a
                # best-effort bulk download and must not abort the whole run.
                self._discard_partial(part)
                if attempt < retries:
                    log.debug(
                        "Attempt %d/%d failed %s: %s — retrying in %.0fs",
                        attempt,
                        retries,
                        url,
                        exc,
                        backoff,
                    )
                    time.sleep(backoff)
                    backoff *= 2
                else:
                    log.warning(
                        "Download failed after %d attempts %s: %s",
                        retries,
                        url,
                        exc,
                    )
        # Reached when every attempt failed or when retries <= 0.
        return 0

    def download_tile(
        self, tile: dict[str, Any], dest: Path, dry_run: bool
    ) -> dict[str, Any]:
        """Download a single tile. Returns a metadata row dict.

        With ``dry_run`` nothing is fetched and the row keeps empty
        ``downloaded_at`` / ``file_size_bytes``.  A non-empty file already at
        *dest* is reported as ``already_exists``; a zero-byte file is treated
        as missing and downloaded again.  On download failure the two fields
        stay empty.
        """
        row: dict[str, Any] = {
            "machine": self.machine["label"],
            "machine_id": self.machine["machine_id"],
            "scan_id": tile["scan_id"],
            "scan_time": tile.get("scan_time", ""),
            "row_index": tile["row_index"],
            "col_index": tile["col_index"],
            "x_mm": tile["x_mm"],
            "y_mm": tile["y_mm"],
            "url": tile["url"],
            "local_path": str(dest),
            "downloaded_at": "",
            "file_size_bytes": "",
        }
        if dry_run:
            return row

        existing_size = dest.stat().st_size if dest.exists() else 0
        if existing_size > 0:
            row["downloaded_at"] = "already_exists"
            row["file_size_bytes"] = existing_size
            return row

        size = self.download_file(tile["url"], dest)
        if size:
            row["downloaded_at"] = datetime.now(timezone.utc).isoformat()
            row["file_size_bytes"] = size
        return row
|
||||
@@ -0,0 +1,109 @@
|
||||
"""
|
||||
Constants, field lists, and config loading for the spruce scraper.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import sys
|
||||
|
||||
import yaml
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# File-name constants
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Name of the YAML configuration file used when none is specified.
DEFAULT_CONFIG = "config.yaml"
# Name of the JSON file that records completed URLs.
PROGRESS_FILENAME = ".progress.json"
# Names of the CSV outputs: one for scans, one for tiles.
SCANS_CSV_FILENAME = "scans.csv"
TILES_CSV_FILENAME = "tiles.csv"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CSV field lists
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Column order of the scans CSV.  The order here is the order on disk, so
# only append new columns at the end of an existing archive's layout.
SCANS_CSV_FIELDS: list[str] = [
    # Machine and scan identification
    "machine", "machine_id", "scan_id", "name", "scan_time",
    # Grid parameters of the scan
    "start_x", "start_y", "end_x", "end_y",
    "dx", "dy", "nx", "ny", "total_tiles",
    # Remaining scan attributes
    "scan_lines", "scan_mode", "start_datetime", "end_datetime",
    "status", "user", "disk_space_mb",
    # Mosaic bookkeeping
    "mosaic_url", "mosaic_local_path", "mosaic_downloaded",
]
|
||||
|
||||
# Column order of the tiles CSV; one row is written per tile.
TILES_CSV_FIELDS: list[str] = [
    # Which machine and scan the tile belongs to
    "machine", "machine_id", "scan_id", "scan_time",
    # Position of the tile within the scan grid
    "row_index", "col_index", "x_mm", "y_mm",
    # Where it came from and where it was stored
    "url", "local_path", "downloaded_at", "file_size_bytes",
]
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Worker safety
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
MAX_SAFE_WORKERS = 4 # above this the RootView server starts timing out
|
||||
|
||||
|
||||
def _clamp_workers(n: int) -> int:
|
||||
"""Return n clamped to MAX_SAFE_WORKERS, logging a warning if clamped."""
|
||||
if n > MAX_SAFE_WORKERS:
|
||||
log.warning(
|
||||
"workers=%d exceeds the safe limit of %d. "
|
||||
"The RootView server will time out under this load, causing lost tiles. "
|
||||
"Capping at %d.",
|
||||
n,
|
||||
MAX_SAFE_WORKERS,
|
||||
MAX_SAFE_WORKERS,
|
||||
)
|
||||
return MAX_SAFE_WORKERS
|
||||
return n
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Config loader
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def load_config(path: str) -> dict:
    """Load and validate config.yaml. Exits on missing required fields.

    The file is read as UTF-8.  The process exits with a message when the
    file does not contain a YAML mapping (for example when it is empty) or
    when ``username`` / ``password`` are missing or empty.  Optional settings
    receive defaults, and ``workers`` is capped via ``_clamp_workers``.

    Returns the configuration dict.
    """
    with open(path, encoding="utf-8") as fh:
        cfg = yaml.safe_load(fh)

    # safe_load yields None for an empty document and may yield a list or
    # scalar for other content; only a mapping is a usable configuration.
    if not isinstance(cfg, dict):
        sys.exit(f"Config {path} must contain a YAML mapping of settings.")

    missing = [k for k in ("username", "password") if not cfg.get(k)]
    if missing:
        sys.exit(f"Config {path} is missing required fields: {missing}")

    defaults = {
        "base_url": "http://205.149.147.131:8010/",
        "image_base_url": "http://205.149.147.131:8011/",
        "output_dir": "archives",
        "workers": 2,
        "timeout": 60,
        "request_delay": 0.5,
        "tile_scale": 1,
    }
    for key, value in defaults.items():
        cfg.setdefault(key, value)

    cfg["workers"] = _clamp_workers(cfg["workers"])
    return cfg
|
||||
Reference in New Issue
Block a user