# SPRUCE-scraper/spruce/recheck.py

"""
Archive integrity checks — find corrupt / missing tiles and remove them
from the progress tracker so they are re-downloaded on the next run.
"""

import logging
import urllib.parse
from pathlib import Path
from typing import Any

from spruce.progress import ProgressTracker

log = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Private helpers
# ---------------------------------------------------------------------------


def _parse_tile_url(url: str) -> dict[str, str]:
    """Pull the scan id and x/y tile coordinates out of a tile URL's query string.

    The ``id`` query parameter is reported under ``scan_id``; ``x`` and ``y``
    keep their names. A parameter that is absent or blank yields ``""``, and
    when a parameter is repeated the last occurrence wins.
    """
    query_string = urllib.parse.urlparse(url).query

    # Later duplicates overwrite earlier ones, same as dict() over the pairs.
    params: dict[str, str] = {}
    for name, value in urllib.parse.parse_qsl(query_string):
        params[name] = value

    # (result key, query parameter) pairs, in the order they appear in the result.
    field_to_param = (("scan_id", "id"), ("x", "x"), ("y", "y"))
    return {field: params.get(param, "") for field, param in field_to_param}


def _build_disk_index(output_dir: Path) -> dict[Path, int]:
    """Map each tile path under *output_dir* to its size in bytes.

    The search is recursive and only picks up paths matching ``tile_r*.jpg``.
    """
    sizes: dict[Path, int] = {}
    for tile_file in output_dir.rglob("tile_r*.jpg"):
        sizes[tile_file] = tile_file.stat().st_size
    return sizes


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------


def recheck_tile_files(output_dir: Path, progress: ProgressTracker) -> int:
    """
    Walk every tile file on disk and delete any that are zero bytes.
    Also removes the affected scan's tile URLs from progress in the same
    pass, so a single --recheck call is sufficient before resuming.

    The discard is scan-wide: precise x/y matching of a file to its URL
    requires metadata.json, so when a zero-byte tile is found every tile URL
    (one containing ``cmd=image``) recorded for that scan id is removed from
    progress and will be re-downloaded on the next run.

    Progress is saved only when at least one file was deleted.

    Returns the count of files deleted.
    """
    # Group completed tile URLs by scan id once, so each deleted file costs a
    # single dict lookup instead of a pass over every URL in progress. Keeping
    # a list per scan also ensures URLs that share the same x/y (and differ
    # only in other query parameters) are all discarded.
    urls_by_scan: dict[str, list[str]] = {}
    for url in progress.iter_urls():
        if "cmd=image" in url:
            url_scan_id = _parse_tile_url(url)["scan_id"]
            urls_by_scan.setdefault(url_scan_id, []).append(url)

    deleted = 0
    for tile_path in output_dir.rglob("tile_r*.jpg"):
        if tile_path.stat().st_size != 0:
            continue

        log.warning("Deleting zero-byte tile: %s", tile_path)
        tile_path.unlink()
        deleted += 1

        scan_id = _scan_id_from_path(tile_path)
        if not scan_id:
            # No usable scan directory name in the path; nothing to match on.
            continue

        # pop() so further zero-byte tiles of the same scan have nothing left
        # to discard and do no extra work.
        for url in urls_by_scan.pop(scan_id, ()):
            progress.discard(url)

    if deleted:
        log.info("Deleted %d zero-byte tile file(s).", deleted)
        progress.save()
    else:
        log.info("No zero-byte tile files found on disk.")
    return deleted


def recheck_archive(output_dir: Path, progress: ProgressTracker) -> int:
    """
    Walk every URL in .progress.json and verify its local file exists and is
    non-empty. Removes bad entries from progress so the next run re-downloads
    them. Returns the count of entries removed.

    Both tile URLs and mosaic URLs are checked.

    Tile URLs (those containing ``cmd=image``) are verified at scan level: a
    URL is kept when at least one non-empty ``tile_r*.jpg`` file exists whose
    path has a component equal to the URL's scan id. Mosaic URLs (those
    containing ``mosaic.jpg``) are kept when a non-empty
    ``<output_dir>/*/*/<scan_id>/mosaic.jpg`` exists.

    Progress is saved only when at least one entry was removed.
    """
    if len(progress) == 0:
        log.info("Progress file is empty — nothing to recheck.")
        return 0

    all_urls = list(progress.iter_urls())
    tile_urls = [u for u in all_urls if "cmd=image" in u]
    mosaic_urls = [u for u in all_urls if "mosaic.jpg" in u]
    log.info(
        "Rechecking %d tile URL(s) and %d mosaic URL(s) …",
        len(tile_urls),
        len(mosaic_urls),
    )

    # Build a disk index of all tile files once
    existing_tiles = _build_disk_index(output_dir)
    log.debug("Found %d tile files on disk.", len(existing_tiles))

    # Every path component that has at least one non-empty tile file at or
    # below it. Checking a scan id against this set gives the same answer as
    # scanning the whole disk index for each URL, without the
    # O(urls x tiles) cost.
    healthy_parts: set[str] = set()
    for tile_path, size in existing_tiles.items():
        if size > 0:
            healthy_parts.update(tile_path.parts)

    # --- Tile check ---
    bad_urls: list[str] = [
        url
        for url in tile_urls
        if _parse_tile_url(url)["scan_id"] not in healthy_parts
    ]

    # --- Mosaic check ---
    for url in mosaic_urls:
        # Mosaic URLs: http://<host>:8011/RootView_Database/<scan_id>/mosaic.jpg
        # Corresponding local path: <output_dir>/**/<scan_id>/mosaic.jpg
        try:
            scan_id = url.rstrip("/").split("/")[-2]
        except IndexError:
            # URL has no path segment before the file name, so no scan id.
            bad_urls.append(url)
            continue

        matches = list(output_dir.glob(f"*/*/{scan_id}/mosaic.jpg"))
        if not matches or not any(m.stat().st_size > 0 for m in matches):
            log.debug("Mosaic missing or zero-byte for scan %s: %s", scan_id, url)
            bad_urls.append(url)

    if not bad_urls:
        log.info(
            "All %d tile URL(s) and %d mosaic URL(s) look healthy.",
            len(tile_urls),
            len(mosaic_urls),
        )
        return 0

    log.warning(
        "Found %d suspect URL(s). Removing from progress.",
        len(bad_urls),
    )
    for url in bad_urls:
        progress.discard(url)
    progress.save()
    log.info(
        "Removed %d URL(s) from .progress.json — they will be re-downloaded on next run.",
        len(bad_urls),
    )
    return len(bad_urls)


# ---------------------------------------------------------------------------
# Internal utility
# ---------------------------------------------------------------------------


def _scan_id_from_path(tile_path: Path) -> str | None:
    """
    Given a tile path like .../158374/tiles/tile_r0_c0.jpg, return '158374'.
    Looks for the directory two levels above the filename (parent.parent.name).
    """
    try:
        # structure: <machine>/<date>/<scan_id>/tiles/<filename>
        return tile_path.parent.parent.name
    except Exception:
        return None