""" Constants, field lists, and config loading for the spruce scraper. """ import logging import sys import yaml log = logging.getLogger(__name__) # --------------------------------------------------------------------------- # File-name constants # --------------------------------------------------------------------------- DEFAULT_CONFIG = "config.yaml" PROGRESS_FILENAME = ".progress.json" SCANS_CSV_FILENAME = "scans.csv" TILES_CSV_FILENAME = "tiles.csv" # --------------------------------------------------------------------------- # CSV field lists # --------------------------------------------------------------------------- SCANS_CSV_FIELDS: list[str] = [ "machine", "machine_id", "scan_id", "name", "scan_time", "start_x", "start_y", "end_x", "end_y", "dx", "dy", "nx", "ny", "total_tiles", "scan_lines", "scan_mode", "start_datetime", "end_datetime", "status", "user", "disk_space_mb", "mosaic_url", "mosaic_local_path", "mosaic_on_disk", "mosaic_download_status", "mosaic_error", "mosaic_error_code", "mosaic_error_class", ] TILES_CSV_FIELDS: list[str] = [ "machine", "machine_id", "scan_id", "scan_time", "row_index", "col_index", "x_mm", "y_mm", "url", "local_path", "status", "error", "error_code", "error_class", "downloaded_at", "file_size_bytes", ] # --------------------------------------------------------------------------- # Worker safety # --------------------------------------------------------------------------- MAX_SAFE_WORKERS = 4 # above this the RootView server starts timing out def _clamp_workers(n: int) -> int: """Return n clamped to MAX_SAFE_WORKERS, logging a warning if clamped.""" if n > MAX_SAFE_WORKERS: log.warning( "workers=%d exceeds the safe limit of %d. " "The RootView server will time out under this load, causing lost tiles. " "Capping at %d.", n, MAX_SAFE_WORKERS, MAX_SAFE_WORKERS, ) return MAX_SAFE_WORKERS return n # --------------------------------------------------------------------------- # Config loader # --------------------------------------------------------------------------- def load_config(path: str) -> dict: """Load and validate config.yaml. Exits on missing required fields.""" with open(path) as fh: cfg = yaml.safe_load(fh) missing = [k for k in ("username", "password") if not cfg.get(k)] if missing: sys.exit(f"Config {path} is missing required fields: {missing}") cfg.setdefault("base_url", "http://205.149.147.131:8010/") cfg.setdefault("image_base_url", "http://205.149.147.131:8011/") cfg.setdefault("output_dir", "archives") cfg.setdefault("workers", 2) cfg.setdefault("timeout", 60) cfg.setdefault("request_delay", 0.5) cfg.setdefault("tile_scale", 1) cfg.setdefault("write_exif", True) cfg.setdefault("machine_metadata", {}) cfg["workers"] = _clamp_workers(cfg["workers"]) return cfg