# SPRUCE-scraper/spruce/orchestrator.py

"""
High-level scrape orchestration: drives the per-machine and per-scan loops.
"""

import json
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import Any

from tqdm import tqdm

from spruce.paths import machine_dir_name, tile_dest, mosaic_dest, _extract_date
from spruce.progress import ProgressTracker, CsvWriter
from spruce.session import MachineSession

log = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Per-scan helpers
# ---------------------------------------------------------------------------


def _download_mosaic(
    sess: MachineSession,
    scan_meta: dict[str, Any],
    scan_id: int,
    mosaic_path: Path,
    progress: ProgressTracker,
    machine: dict[str, Any],
    dry_run: bool,
) -> bool:
    """
    Fetch the mosaic image of one scan unless it is already recorded as done.

    The mosaic URL doubles as the key in ``progress``. In dry-run mode the
    intended download is only logged. ``scan_meta`` is accepted for signature
    compatibility but not used here.

    Returns True only when a download actually happened in this call, i.e.
    ``sess.download_file`` reported a truthy size.
    """
    target_url = sess.mosaic_url(scan_id)

    # Nothing to do when a previous run already fetched this mosaic.
    if progress.is_done(target_url):
        return False

    if dry_run:
        log.info("[DRY-RUN] Mosaic: %s → %s", target_url, mosaic_path)
        return False

    log.info("[%s] Downloading mosaic for scan %d …", machine["label"], scan_id)
    byte_count = sess.download_file(target_url, mosaic_path)
    if not byte_count:
        # A falsy size is treated as "nothing downloaded"; progress stays untouched.
        return False

    # Persist immediately so an interrupted run does not re-fetch the mosaic.
    progress.mark_done(target_url)
    progress.save()
    log.info(
        "[%s] Mosaic saved: %s (%.1f MB)",
        machine["label"],
        mosaic_path,
        byte_count / 1e6,
    )
    return True


def _flush_tile_batch(
    batch: list[dict[str, Any]],
    tiles_csv: CsvWriter,
    progress: ProgressTracker,
) -> None:
    """Write buffered tile rows to the CSV, save progress, and empty the buffer."""
    for row in batch:
        tiles_csv.write(row)
    progress.save()
    batch.clear()


def _download_tiles_for_scan(
    sess: MachineSession,
    tiles: list[dict[str, Any]],
    scan_meta: dict[str, Any],
    scan_id: int,
    output_dir: Path,
    machine: dict[str, Any],
    config: dict[str, Any],
    progress: ProgressTracker,
    tiles_csv: CsvWriter,
    dry_run: bool,
) -> int:
    """
    Download all pending tiles for a scan using a thread pool.

    A tile is pending when its ``url`` is not yet marked done in ``progress``.
    In dry-run mode only a preview of the pending URLs is logged and nothing
    is downloaded.

    Each pending tile dict is updated in place with the scan's ``scan_time``
    before being handed to ``sess.download_tile``. A result counts as
    downloaded when it carries a truthy ``file_size_bytes``; such results are
    marked done and buffered, and the buffer is flushed to ``tiles_csv``
    (together with ``progress.save()``) every ``max(50, workers * 4)`` rows
    and once more at the end.

    A tile whose download raises is logged and skipped so that one bad tile
    does not abort the scan, and the buffered rows are flushed even when the
    loop is interrupted, so completed downloads are not lost.

    Returns the count of tiles downloaded.
    """
    pending = [t for t in tiles if not progress.is_done(t["url"])]
    log.info(
        "[%s] Scan %d: %d tiles total, %d pending.",
        machine["label"],
        scan_id,
        len(tiles),
        len(pending),
    )

    if dry_run:
        for t in pending[:5]:
            log.info("[DRY-RUN] Tile: %s", t["url"])
        if len(pending) > 5:
            log.info("[DRY-RUN] … and %d more tiles.", len(pending) - 5)
        return 0

    # Attach scan_time for CSV rows
    scan_time = scan_meta.get("scan_time", "")
    for t in pending:
        t["scan_time"] = scan_time

    workers: int = config["workers"]
    save_every = max(50, workers * 4)
    downloaded = 0
    failed = 0
    batch: list[dict[str, Any]] = []

    with ThreadPoolExecutor(max_workers=workers) as pool:
        # NOTE(review): the meaning of the third positional argument (False)
        # is defined by MachineSession.download_tile — confirm there.
        futures = {
            pool.submit(
                sess.download_tile,
                tile,
                tile_dest(output_dir, machine, scan_meta, tile),
                False,
            ): tile
            for tile in pending
        }

        try:
            with tqdm(
                total=len(pending),
                desc=f"{machine['label']} scan {scan_id}",
                unit="tile",
                leave=True,
            ) as pbar:
                for future in as_completed(futures):
                    try:
                        # result() re-raises whatever the worker raised.
                        result = future.result()
                    except Exception as exc:
                        failed += 1
                        log.error(
                            "[%s] Scan %d: tile %s failed: %s",
                            machine["label"],
                            scan_id,
                            futures[future]["url"],
                            exc,
                        )
                    else:
                        if result.get("file_size_bytes"):
                            batch.append(result)
                            progress.mark_done(result["url"])
                            downloaded += 1
                    pbar.update(1)

                    if len(batch) >= save_every:
                        _flush_tile_batch(batch, tiles_csv, progress)
        finally:
            # Runs on success and on interruption alike, so rows for tiles
            # already on disk always reach the CSV and the progress file.
            _flush_tile_batch(batch, tiles_csv, progress)

    if failed:
        log.warning(
            "[%s] Scan %d: %d tiles failed and remain pending.",
            machine["label"],
            scan_id,
            failed,
        )
    log.info(
        "[%s] Scan %d complete: %d tiles downloaded.",
        machine["label"],
        scan_id,
        downloaded,
    )
    return downloaded


# ---------------------------------------------------------------------------
# Per-scan driver
# ---------------------------------------------------------------------------


def process_scan(
    sess: MachineSession,
    scan: dict[str, Any],
    output_dir: Path,
    machine: dict[str, Any],
    config: dict[str, Any],
    progress: ProgressTracker,
    scans_csv: CsvWriter,
    tiles_csv: CsvWriter,
    dry_run: bool,
    mosaic_only: bool,
) -> int:
    """
    Handle a single scan end to end.

    Fetches the scan's detail metadata, fills in fields it lacks from the
    list-level ``scan`` entry, writes ``metadata.json`` (skipped in dry-run
    mode or when the file already exists), downloads the mosaic, records one
    row in ``scans_csv`` and, unless ``mosaic_only`` is set, downloads the
    scan's tiles.

    Returns the number of files downloaded for this scan (mosaic plus tiles),
    or 0 when the metadata cannot be fetched or lacks ``nx``/``ny``.
    """
    scan_id: int = scan["scan_id"]
    log.info("[%s] Processing scan %d …", machine["label"], scan_id)

    try:
        scan_meta = sess.get_scan_metadata(scan_id)
    except Exception as exc:
        log.error(
            "[%s] Cannot fetch scan %d metadata: %s", machine["label"], scan_id, exc
        )
        return 0

    # Both grid dimensions are required to go any further.
    if not (scan_meta.get("nx") and scan_meta.get("ny")):
        log.warning(
            "[%s] Scan %d: missing grid params, skipping.",
            machine["label"],
            scan_id,
        )
        return 0

    # Fill gaps from the list-level entry; keys already present in the
    # detail metadata are left as they are.
    list_level_fields = (
        "name",
        "scan_time",
        "start_datetime",
        "end_datetime",
        "status",
        "user",
        "scan_lines",
        "scan_mode",
    )
    for field in list_level_fields:
        scan_meta.setdefault(field, scan.get(field, ""))

    # Per-scan metadata.json lives in <output>/<machine>/<date>/<scan_id>/.
    date_part = _extract_date(scan_meta.get("scan_time", ""))
    scan_dir = output_dir / machine_dir_name(machine) / date_part / str(scan_id)
    if not dry_run:
        scan_dir.mkdir(parents=True, exist_ok=True)
        metadata_path = scan_dir / "metadata.json"
        if not metadata_path.exists():
            payload = json.dumps(scan_meta, indent=2, default=str)
            metadata_path.write_text(payload, encoding="utf-8")

    # Mosaic
    mosaic_path = mosaic_dest(output_dir, machine, scan_meta, scan_id)
    mosaic_url = sess.mosaic_url(scan_id)
    mosaic_downloaded = _download_mosaic(
        sess, scan_meta, scan_id, mosaic_path, progress, machine, dry_run
    )
    file_count = int(bool(mosaic_downloaded))

    # Scan-level CSV row: identity columns, then metadata columns (blank
    # when absent), then the mosaic columns.
    meta_columns = (
        "name",
        "scan_time",
        "start_x",
        "start_y",
        "end_x",
        "end_y",
        "dx",
        "dy",
        "nx",
        "ny",
        "total_tiles",
        "scan_lines",
        "scan_mode",
        "start_datetime",
        "end_datetime",
        "status",
        "user",
        "disk_space_mb",
    )
    row: dict[str, Any] = {
        "machine": machine["label"],
        "machine_id": machine["machine_id"],
        "scan_id": scan_id,
    }
    row.update({column: scan_meta.get(column, "") for column in meta_columns})
    row["mosaic_url"] = mosaic_url
    row["mosaic_local_path"] = str(mosaic_path)
    row["mosaic_downloaded"] = mosaic_downloaded
    scans_csv.write(row)

    if mosaic_only:
        return file_count

    # Tiles
    tile_list = sess.enumerate_tiles(scan_meta)
    return file_count + _download_tiles_for_scan(
        sess=sess,
        tiles=tile_list,
        scan_meta=scan_meta,
        scan_id=scan_id,
        output_dir=output_dir,
        machine=machine,
        config=config,
        progress=progress,
        tiles_csv=tiles_csv,
        dry_run=dry_run,
    )


# ---------------------------------------------------------------------------
# Per-machine driver
# ---------------------------------------------------------------------------


def scrape_machine(
    machine: dict[str, Any],
    config: dict[str, Any],
    output_dir: Path,
    progress: ProgressTracker,
    tiles_csv: CsvWriter,
    scans_csv: CsvWriter,
    dry_run: bool,
    mosaic_only: bool,
    scan_id_filter: int | None,
) -> int:
    """
    Scrape one machine: log in, determine the scans to process, process each.

    When ``scan_id_filter`` is given, only that scan is processed and the
    machine's scan list is not fetched. Otherwise every scan returned by
    ``get_all_scans`` is processed in order.

    Returns the total number of files downloaded, or 0 when login fails or
    the machine reports no scans.
    """
    session = MachineSession(machine, config)
    if not session.login():
        return 0

    scans: list[dict[str, Any]]
    if scan_id_filter is None:
        scans = session.get_all_scans()
        if not scans:
            log.warning("[%s] No scans found.", machine["label"])
            return 0
    else:
        # Minimal synthetic list entry standing in for the scan-list lookup.
        scans = [{"scan_id": scan_id_filter, "status": "Completed"}]
        log.info("[%s] Targeting scan ID %d.", machine["label"], scan_id_filter)

    return sum(
        process_scan(
            sess=session,
            scan=entry,
            output_dir=output_dir,
            machine=machine,
            config=config,
            progress=progress,
            scans_csv=scans_csv,
            tiles_csv=tiles_csv,
            dry_run=dry_run,
            mosaic_only=mosaic_only,
        )
        for entry in scans
    )