Initial commit
Add spruce scraper with CLI, session management, parsers, progress tracking, recheck logic, and test suite. Includes example config and README.
This commit is contained in:
@@ -0,0 +1,109 @@
|
||||
"""
|
||||
Constants, field lists, and config loading for the spruce scraper.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import sys
|
||||
|
||||
import yaml
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# File-name constants
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Config file name; presumably the default path handed to load_config
# when none is given — confirm at the call site.
DEFAULT_CONFIG = "config.yaml"
# File name for saved progress state (its reader/writer is not in this module).
PROGRESS_FILENAME = ".progress.json"
# File names of the two CSV outputs; presumably their columns are
# SCANS_CSV_FIELDS and TILES_CSV_FIELDS respectively — confirm at the writer.
SCANS_CSV_FILENAME = "scans.csv"
TILES_CSV_FILENAME = "tiles.csv"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CSV field lists
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Ordered field names for one scan record. Presumably the header row of
# SCANS_CSV_FILENAME, in which case the order here fixes the column order —
# confirm against the CSV writer before reordering.
SCANS_CSV_FIELDS: list[str] = [
    "machine",
    "machine_id",
    "scan_id",
    "name",
    "scan_time",
    "start_x",
    "start_y",
    "end_x",
    "end_y",
    "dx",
    "dy",
    "nx",
    "ny",
    "total_tiles",
    "scan_lines",
    "scan_mode",
    "start_datetime",
    "end_datetime",
    "status",
    "user",
    "disk_space_mb",
    "mosaic_url",
    "mosaic_local_path",
    "mosaic_downloaded",
]
|
||||
|
||||
# Ordered field names for one tile record. Presumably the header row of
# TILES_CSV_FILENAME, in which case the order here fixes the column order —
# confirm against the CSV writer before reordering.
TILES_CSV_FIELDS: list[str] = [
    "machine",
    "machine_id",
    "scan_id",
    "scan_time",
    "row_index",
    "col_index",
    "x_mm",
    "y_mm",
    "url",
    "local_path",
    "downloaded_at",
    "file_size_bytes",
]
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Worker safety
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Upper bound applied to the configured ``workers`` value; above this the
# RootView server starts timing out.
MAX_SAFE_WORKERS = 4
|
||||
|
||||
|
||||
def _clamp_workers(n: int) -> int:
    """Cap a requested worker count at MAX_SAFE_WORKERS.

    A value that does not exceed the limit is returned unchanged. A larger
    value is replaced by MAX_SAFE_WORKERS and a warning is logged.
    """
    over_limit = n > MAX_SAFE_WORKERS
    if not over_limit:
        return n

    message = (
        "workers=%d exceeds the safe limit of %d. "
        "The RootView server will time out under this load, causing lost tiles. "
        "Capping at %d."
    )
    log.warning(message, n, MAX_SAFE_WORKERS, MAX_SAFE_WORKERS)
    return MAX_SAFE_WORKERS
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Config loader
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def load_config(path: str) -> dict:
    """Load and validate the YAML config at *path*.

    Reads the file as UTF-8, requires non-empty ``username`` and ``password``
    entries, fills in defaults for the optional settings, and caps
    ``workers`` via ``_clamp_workers``.

    Args:
        path: Location of the YAML config file.

    Returns:
        The parsed config mapping with defaults applied.

    Raises:
        SystemExit: If the top level of the file is not a mapping, or if
            ``username`` or ``password`` is missing or empty.
        OSError: If the file cannot be opened (propagated from ``open``).
    """
    # Pin the encoding so the file parses the same way regardless of locale.
    with open(path, encoding="utf-8") as fh:
        cfg = yaml.safe_load(fh)

    # safe_load returns None for an empty document. Treat that as an empty
    # mapping so the user gets the "missing required fields" message below
    # instead of an AttributeError on cfg.get.
    if cfg is None:
        cfg = {}
    if not isinstance(cfg, dict):
        sys.exit(
            f"Config {path} must contain a mapping of settings at the top "
            f"level, got {type(cfg).__name__}"
        )

    missing = [k for k in ("username", "password") if not cfg.get(k)]
    if missing:
        sys.exit(f"Config {path} is missing required fields: {missing}")

    # Defaults only fill in keys that are absent; values present in the file
    # are kept as-is.
    defaults = {
        "base_url": "http://205.149.147.131:8010/",
        "image_base_url": "http://205.149.147.131:8011/",
        "output_dir": "archives",
        "workers": 2,
        "timeout": 60,
        "request_delay": 0.5,
        "tile_scale": 1,
    }
    for key, value in defaults.items():
        cfg.setdefault(key, value)

    cfg["workers"] = _clamp_workers(cfg["workers"])
    return cfg
|
||||
Reference in New Issue
Block a user