Files
poprhythm 08a29d124a Add offline mosaic EXIF tagging (stitch --write-exif, tag_mosaic_exif CLI)
- spruce.exif: tag_mosaic_jpeg_for_scan_dir, resolve_machine_label_for_scan_dir; ProcessingSoftware for tile-stitched mosaics
- spruce.settings: load_config(require_credentials=False) for config without login
- scripts/tag_mosaic_exif.py and tests; stitch script --write-exif path
2026-04-26 20:47:23 -04:00

127 lines
3.3 KiB
Python

"""
Constants, field lists, and config loading for the spruce scraper.
"""
import logging
import sys
import yaml
# Module-level logger, named after this module via __name__.
log = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# File-name constants
# ---------------------------------------------------------------------------
# Default config file name.
DEFAULT_CONFIG = "config.yaml"
# NOTE(review): presumably a progress/resume marker file; confirm against the code that reads it.
PROGRESS_FILENAME = ".progress.json"
# CSV file names. Presumably written with SCANS_CSV_FIELDS and TILES_CSV_FIELDS
# as their respective headers; confirm against the CSV writers.
SCANS_CSV_FILENAME = "scans.csv"
TILES_CSV_FILENAME = "tiles.csv"
# ---------------------------------------------------------------------------
# CSV field lists
# ---------------------------------------------------------------------------
# Ordered column names for scan records. The order of this list is significant
# to anything that uses it as a CSV header, so append rather than reorder.
SCANS_CSV_FIELDS: list[str] = [
    # machine / scan identification
    "machine", "machine_id", "scan_id", "name", "scan_time",
    # start_* / end_* coordinates
    "start_x", "start_y", "end_x", "end_y",
    # d* / n* columns and tile / line counts
    "dx", "dy", "nx", "ny", "total_tiles", "scan_lines", "scan_mode",
    # run timestamps, status and owner
    "start_datetime", "end_datetime", "status", "user", "disk_space_mb",
    # mosaic_* columns
    "mosaic_url", "mosaic_local_path", "mosaic_on_disk",
    "mosaic_download_status",
    "mosaic_error", "mosaic_error_code", "mosaic_error_class",
]
# Ordered column names for tile records. The order of this list is significant
# to anything that uses it as a CSV header, so append rather than reorder.
TILES_CSV_FIELDS: list[str] = [
    # machine / scan identification
    "machine", "machine_id", "scan_id", "scan_time",
    # tile indices and *_mm coordinates
    "row_index", "col_index", "x_mm", "y_mm",
    # remote and local location
    "url", "local_path",
    # status and error* columns
    "status", "error", "error_code", "error_class",
    # download bookkeeping
    "downloaded_at", "file_size_bytes",
]
# ---------------------------------------------------------------------------
# Worker safety
# ---------------------------------------------------------------------------
MAX_SAFE_WORKERS = 4  # above this the RootView server starts timing out


def _clamp_workers(n: int) -> int:
    """Cap a requested worker count at MAX_SAFE_WORKERS.

    A value that does not exceed the limit is returned unchanged and nothing
    is logged. A larger value triggers a warning on the module logger and is
    replaced by MAX_SAFE_WORKERS.
    """
    limit = MAX_SAFE_WORKERS
    # Guard clause: nothing to do unless the request is over the limit.
    if not n > limit:
        return n
    log.warning(
        "workers=%d exceeds the safe limit of %d. "
        "The RootView server will time out under this load, causing lost tiles. "
        "Capping at %d.",
        n,
        limit,
        limit,
    )
    return limit
# ---------------------------------------------------------------------------
# Config loader
# ---------------------------------------------------------------------------
def load_config(path: str, *, require_credentials: bool = True) -> dict:
    """Load a YAML config file, validate it, and fill in defaults.

    Args:
        path: Location of the YAML config file.
        require_credentials: When true (the default), ``username`` and
            ``password`` must both be present and non-empty. Pass ``False``
            for work that needs no login (e.g. offline EXIF tagging); the
            defaults below are applied either way.

    Returns:
        The parsed config dict with every optional key defaulted and
        ``workers`` passed through ``_clamp_workers``. An empty file is
        treated as an empty mapping.

    Raises:
        SystemExit: If the top-level YAML value is not a mapping, or if
            credentials are required but missing or empty.
        OSError: If ``path`` cannot be opened.
    """
    # Decode as UTF-8 explicitly: without it open() uses the platform locale
    # encoding, which can mis-decode non-ASCII values (e.g. on Windows).
    with open(path, encoding="utf-8") as fh:
        cfg = yaml.safe_load(fh)
    if cfg is None:
        # safe_load returned nothing (e.g. an empty file).
        cfg = {}
    if not isinstance(cfg, dict):
        # A list or scalar at the top level would otherwise fail later with
        # an opaque AttributeError on cfg.get / cfg.setdefault.
        sys.exit(
            f"Config {path} must contain a YAML mapping at the top level, "
            f"got {type(cfg).__name__}"
        )
    if require_credentials:
        # Empty strings count as missing, hence the truthiness test.
        missing = [k for k in ("username", "password") if not cfg.get(k)]
        if missing:
            sys.exit(f"Config {path} is missing required fields: {missing}")
    cfg.setdefault("base_url", "http://205.149.147.131:8010/")
    cfg.setdefault("image_base_url", "http://205.149.147.131:8011/")
    cfg.setdefault("output_dir", "archives")
    cfg.setdefault("workers", 2)
    cfg.setdefault("timeout", 60)
    cfg.setdefault("request_delay", 0.5)
    cfg.setdefault("tile_scale", 1)
    cfg.setdefault("write_exif", True)
    cfg.setdefault("machine_metadata", {})
    # Apply the cap after defaulting so "workers" is always present.
    cfg["workers"] = _clamp_workers(cfg["workers"])
    return cfg