Initial commit

Add spruce scraper with CLI, session management, parsers, progress tracking,
recheck logic, and test suite. Includes example config and README.
This commit is contained in:
2026-04-22 10:41:18 -04:00
commit e122f6435a
23 changed files with 3789 additions and 0 deletions
+109
View File
@@ -0,0 +1,109 @@
"""
Constants, field lists, and config loading for the spruce scraper.
"""
import logging
import sys
import yaml
# Module-level logger, named after this module via __name__.
log = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# File-name constants
# ---------------------------------------------------------------------------
# Bare file names only (no directory component). The code that joins them
# onto a directory is not in this module.
# NOTE(review): presumably the config path used when none is given -- confirm
# against the CLI.
DEFAULT_CONFIG = "config.yaml"
# NOTE(review): presumably the resume/progress state file -- confirm against
# the progress tracker.
PROGRESS_FILENAME = ".progress.json"
# NOTE(review): presumably the CSV outputs whose columns are listed in
# SCANS_CSV_FIELDS / TILES_CSV_FIELDS -- confirm against the writers.
SCANS_CSV_FILENAME = "scans.csv"
TILES_CSV_FILENAME = "tiles.csv"
# ---------------------------------------------------------------------------
# CSV field lists
# ---------------------------------------------------------------------------
# Ordered field names for scan records. The order is significant to anything
# that uses this list as a CSV header, so entries must not be re-sorted.
# NOTE(review): the row groupings below are a reading aid inferred from the
# field names only -- confirm against the parser that fills these fields.
SCANS_CSV_FIELDS: list[str] = [
    # machine / scan identification
    "machine", "machine_id", "scan_id", "name", "scan_time",
    # start/end positions, steps, and counts
    "start_x", "start_y", "end_x", "end_y",
    "dx", "dy", "nx", "ny",
    "total_tiles", "scan_lines", "scan_mode",
    # run bookkeeping
    "start_datetime", "end_datetime", "status", "user", "disk_space_mb",
    # mosaic image
    "mosaic_url", "mosaic_local_path", "mosaic_downloaded",
]
# Ordered field names for tile records. As with SCANS_CSV_FIELDS, the order
# matters to any consumer that treats this list as a CSV header.
# NOTE(review): row groupings are inferred from the field names -- confirm
# against the code that produces tile rows.
TILES_CSV_FIELDS: list[str] = [
    # which machine / scan the tile belongs to
    "machine", "machine_id", "scan_id", "scan_time",
    # tile position
    "row_index", "col_index", "x_mm", "y_mm",
    # download bookkeeping
    "url", "local_path", "downloaded_at", "file_size_bytes",
]
# ---------------------------------------------------------------------------
# Worker safety
# ---------------------------------------------------------------------------
# Ceiling that _clamp_workers() enforces on the configured ``workers`` value.
MAX_SAFE_WORKERS = 4  # above this the RootView server starts timing out
def _clamp_workers(n: int) -> int:
    """Cap a requested worker count at ``MAX_SAFE_WORKERS``.

    Values at or below the limit are returned untouched. Anything larger
    is replaced by the limit, and a warning is logged so the operator can
    see that the configured value was overridden.
    """
    over_limit = n > MAX_SAFE_WORKERS
    if not over_limit:
        return n

    # The limit is passed twice: once as "the safe limit" and once as the
    # value actually being used.
    log.warning(
        "workers=%d exceeds the safe limit of %d. "
        "The RootView server will time out under this load, causing lost tiles. "
        "Capping at %d.",
        n, MAX_SAFE_WORKERS, MAX_SAFE_WORKERS,
    )
    return MAX_SAFE_WORKERS
# ---------------------------------------------------------------------------
# Config loader
# ---------------------------------------------------------------------------
def load_config(path: str) -> dict:
    """Load the YAML config at *path*, validate it, and fill in defaults.

    Args:
        path: Location of the YAML config file.

    Returns:
        The parsed mapping with every optional setting populated and
        ``workers`` capped by ``_clamp_workers``.

    Raises:
        SystemExit: If the document is not a YAML mapping, or if
            ``username`` / ``password`` is absent or empty.
        OSError: If *path* cannot be opened (propagated from ``open``).
    """
    # Read as UTF-8 explicitly rather than relying on the platform locale.
    with open(path, encoding="utf-8") as fh:
        cfg = yaml.safe_load(fh)

    # An empty or comment-only file parses to None. Treat it as an empty
    # mapping so it is reported through the normal missing-fields message
    # instead of crashing on ``None.get``.
    if cfg is None:
        cfg = {}
    if not isinstance(cfg, dict):
        sys.exit(
            f"Config {path} must contain a YAML mapping at the top level, "
            f"got {type(cfg).__name__}"
        )

    missing = [k for k in ("username", "password") if not cfg.get(k)]
    if missing:
        sys.exit(f"Config {path} is missing required fields: {missing}")

    defaults = {
        "base_url": "http://205.149.147.131:8010/",
        "image_base_url": "http://205.149.147.131:8011/",
        "output_dir": "archives",
        "workers": 2,
        "timeout": 60,
        "request_delay": 0.5,
        "tile_scale": 1,
    }
    for key, value in defaults.items():
        # A key written with no value (``workers:``) parses to None, which
        # ``setdefault`` would leave in place; treat it the same as absent.
        if cfg.get(key) is None:
            cfg[key] = value

    cfg["workers"] = _clamp_workers(cfg["workers"])
    return cfg