e122f6435a
Add spruce scraper with CLI, session management, parsers, progress tracking, recheck logic, and test suite. Includes example config and README.
110 lines
2.8 KiB
Python
110 lines
2.8 KiB
Python
"""
Constants, field lists, and config loading for the spruce scraper.
"""
|
|
|
|
import logging
|
|
import sys
|
|
|
|
import yaml
|
|
|
|
# Module-level logger, named after this module via __name__.
log = logging.getLogger(__name__)
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# File-name constants
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Name of the YAML config file used when none is given explicitly.
# NOTE(review): the consumer of this default is not in this module.
DEFAULT_CONFIG: str = "config.yaml"

# File names for the progress record and the two CSV outputs.
PROGRESS_FILENAME: str = ".progress.json"
SCANS_CSV_FILENAME: str = "scans.csv"
TILES_CSV_FILENAME: str = "tiles.csv"
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# CSV field lists
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Ordered field names for the scans CSV.
# NOTE(review): presumably used as the CSV header / writer field names; the
# writer is not in this module, so treat the order as significant.
SCANS_CSV_FIELDS: list[str] = [
    "machine", "machine_id", "scan_id", "name", "scan_time",
    "start_x", "start_y", "end_x", "end_y",
    "dx", "dy", "nx", "ny",
    "total_tiles", "scan_lines", "scan_mode",
    "start_datetime", "end_datetime",
    "status", "user", "disk_space_mb",
    "mosaic_url", "mosaic_local_path", "mosaic_downloaded",
]
|
|
|
|
# Ordered field names for the tiles CSV.
# NOTE(review): presumably used as the CSV header / writer field names; the
# writer is not in this module, so treat the order as significant.
TILES_CSV_FIELDS: list[str] = [
    "machine", "machine_id", "scan_id", "scan_time",
    "row_index", "col_index",
    "x_mm", "y_mm",
    "url", "local_path",
    "downloaded_at", "file_size_bytes",
]
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Worker safety
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Upper bound on concurrent workers: above this the RootView server starts
# timing out.
MAX_SAFE_WORKERS: int = 4
|
|
|
|
|
|
def _clamp_workers(n: int) -> int:
    """Cap a requested worker count at MAX_SAFE_WORKERS.

    Args:
        n: Requested number of workers.

    Returns:
        ``n`` unchanged when it does not exceed MAX_SAFE_WORKERS; otherwise
        MAX_SAFE_WORKERS, after a warning has been logged.
    """
    limit = MAX_SAFE_WORKERS

    # Fast path: the request is within the limit, nothing to report.
    if not n > limit:
        return n

    message = (
        "workers=%d exceeds the safe limit of %d. "
        "The RootView server will time out under this load, "
        "causing lost tiles. "
        "Capping at %d."
    )
    log.warning(message, n, limit, limit)
    return limit
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Config loader
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def load_config(path: str) -> dict:
    """Load and validate the scraper's YAML config file.

    The file is parsed with ``yaml.safe_load``. ``username`` and ``password``
    must be present and truthy. Every other recognised key receives a default
    when absent, and ``workers`` is passed through ``_clamp_workers``.

    Args:
        path: Path to the YAML config file.

    Returns:
        The parsed config mapping with defaults filled in.

    Raises:
        SystemExit: If the top-level YAML value is not a mapping, or if a
            required field is missing or empty.
        OSError: If *path* cannot be opened.

    Parse errors raised by PyYAML propagate unchanged.
    """
    # YAML streams are UTF-8 by default; do not depend on the locale encoding.
    with open(path, encoding="utf-8") as fh:
        cfg = yaml.safe_load(fh)

    # An empty or comment-only file parses to None. Treat it as an empty
    # mapping so the missing-field check below reports it cleanly instead of
    # failing with AttributeError on ``cfg.get``.
    if cfg is None:
        cfg = {}
    if not isinstance(cfg, dict):
        sys.exit(
            f"Config {path} must be a YAML mapping of settings, "
            f"got {type(cfg).__name__}"
        )

    # Required keys must be present AND truthy (an empty string counts as
    # missing).
    missing = [k for k in ("username", "password") if not cfg.get(k)]
    if missing:
        sys.exit(f"Config {path} is missing required fields: {missing}")

    # Fill in defaults only for keys the file does not define at all.
    defaults = {
        "base_url": "http://205.149.147.131:8010/",
        "image_base_url": "http://205.149.147.131:8011/",
        "output_dir": "archives",
        "workers": 2,
        "timeout": 60,
        "request_delay": 0.5,
        "tile_scale": 1,
    }
    for key, value in defaults.items():
        cfg.setdefault(key, value)

    # Cap the worker count at the safe limit (a warning is logged if capped).
    cfg["workers"] = _clamp_workers(cfg["workers"])
    return cfg
|