"""
Constants, field lists, and config loading for the spruce scraper.
"""
|
|
|
|
import logging
|
|
import sys
|
|
|
|
import yaml
|
|
|
|
# Module-level logger, named after this module via __name__.
log = logging.getLogger(__name__)
|
|
|
|
# ---------------------------------------------------------------------------
# File-name constants
# ---------------------------------------------------------------------------

# Config file name; presumably the default path handed to load_config() --
# confirm at the call site (load_config itself always takes an explicit path).
DEFAULT_CONFIG = "config.yaml"
# NOTE(review): not referenced in this module; presumably the scraper's
# resume/progress state file -- confirm against the code that writes it.
PROGRESS_FILENAME = ".progress.json"
# File names for the two CSV outputs; the matching column lists are
# SCANS_CSV_FIELDS and TILES_CSV_FIELDS.
SCANS_CSV_FILENAME = "scans.csv"
TILES_CSV_FILENAME = "tiles.csv"
|
|
|
|
# ---------------------------------------------------------------------------
# CSV field lists
# ---------------------------------------------------------------------------

# Column names for the scans CSV, one entry per column.
# NOTE(review): presumably used as the CSV header in exactly this order --
# confirm against the writer before reordering or renaming anything.
SCANS_CSV_FIELDS: list[str] = [
    "machine",
    "machine_id",
    "scan_id",
    "name",
    "scan_time",
    "start_x",
    "start_y",
    "end_x",
    "end_y",
    "dx",
    "dy",
    "nx",
    "ny",
    "total_tiles",
    "scan_lines",
    "scan_mode",
    "start_datetime",
    "end_datetime",
    "status",
    "user",
    "disk_space_mb",
    "mosaic_url",
    "mosaic_local_path",
    "mosaic_on_disk",
    "mosaic_download_status",
    "mosaic_error",
    "mosaic_error_code",
    "mosaic_error_class",
]
|
|
|
|
# Column names for the tiles CSV, one entry per column.
# NOTE(review): presumably used as the CSV header in exactly this order --
# confirm against the writer before reordering or renaming anything.
TILES_CSV_FIELDS: list[str] = [
    "machine",
    "machine_id",
    "scan_id",
    "scan_time",
    "row_index",
    "col_index",
    "x_mm",
    "y_mm",
    "url",
    "local_path",
    "status",
    "error",
    "error_code",
    "error_class",
    "downloaded_at",
    "file_size_bytes",
]
|
|
|
|
# ---------------------------------------------------------------------------
# Worker safety
# ---------------------------------------------------------------------------

# Upper bound applied to the configured "workers" value by _clamp_workers().
MAX_SAFE_WORKERS = 4  # above this the RootView server starts timing out
|
|
|
|
|
|
def _clamp_workers(n: int) -> int:
    """Return *n* limited to the range ``1..MAX_SAFE_WORKERS``.

    Args:
        n: Requested number of workers (as read from the config).

    Returns:
        ``n`` unchanged when it is already within range,
        ``MAX_SAFE_WORKERS`` when it is larger, and ``1`` when it is
        smaller than 1. A warning is logged whenever the value is changed.
    """
    if n > MAX_SAFE_WORKERS:
        log.warning(
            "workers=%d exceeds the safe limit of %d. "
            "The RootView server will time out under this load, causing lost tiles. "
            "Capping at %d.",
            n,
            MAX_SAFE_WORKERS,
            MAX_SAFE_WORKERS,
        )
        return MAX_SAFE_WORKERS
    if n < 1:
        # A zero or negative count cannot run any work (and is rejected
        # outright by the stdlib executors), so fall back to one worker
        # instead of passing the bad value on.
        log.warning("workers=%d is below the minimum of 1. Using 1.", n)
        return 1
    return n
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Config loader
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def load_config(path: str) -> dict:
    """Load and validate config.yaml. Exits on missing required fields.

    Args:
        path: Path of the YAML config file to read.

    Returns:
        The parsed config mapping, with defaults filled in for every
        optional key that is absent and ``workers`` passed through
        ``_clamp_workers``.

    Raises:
        SystemExit: If the file's top level is not a mapping, or if
            ``username`` or ``password`` is missing or empty.
        OSError: If the file cannot be opened (propagated from ``open``).
        yaml.YAMLError: If the file is not valid YAML (propagated from
            ``yaml.safe_load``).
    """
    # YAML streams are Unicode; decode explicitly rather than relying on
    # the platform's locale encoding.
    with open(path, encoding="utf-8") as fh:
        cfg = yaml.safe_load(fh)

    # safe_load returns None for an empty document; treat that as an empty
    # mapping so the required-field check below reports what is missing.
    if cfg is None:
        cfg = {}
    if not isinstance(cfg, dict):
        sys.exit(
            f"Config {path} must contain a YAML mapping at the top level, "
            f"got {type(cfg).__name__}"
        )

    missing = [k for k in ("username", "password") if not cfg.get(k)]
    if missing:
        sys.exit(f"Config {path} is missing required fields: {missing}")

    # Built per call so the mutable "machine_metadata" default is never
    # shared between two loaded configs.
    defaults = {
        "base_url": "http://205.149.147.131:8010/",
        "image_base_url": "http://205.149.147.131:8011/",
        "output_dir": "archives",
        "workers": 2,
        "timeout": 60,
        "request_delay": 0.5,
        "tile_scale": 1,
        "write_exif": True,
        "machine_metadata": {},
    }
    for key, value in defaults.items():
        cfg.setdefault(key, value)

    cfg["workers"] = _clamp_workers(cfg["workers"])
    return cfg
|