f2193011ca
- Add --metadata-only flag: fetches scan detail pages, writes metadata.json + scans.csv rows, skips all image downloads. Re-runs skip scans whose metadata.json already exists. - Atomic progress.json saves (temp-file rename). - Heal-on-resume: tiles on disk but not in progress are silently re-marked before building the pending list. - scans.csv dedup: skip row if mosaic URL already in progress. - Rename mosaic_downloaded -> mosaic_on_disk (reflects disk state). - --recheck now checks mosaics as well as tiles. - RunStats dataclass replaces raw int return; richer run summary. - Fix argparse allow_abbrev reverted; fix --scan-id + --metadata-only glob fallback when scan_time is absent. - Add .venv/ to .gitignore. - README: fix typo, update worker counts, document all new behaviour.
177 lines
5.8 KiB
Python
177 lines
5.8 KiB
Python
"""
Archive integrity checks — find corrupt / missing tiles and remove them
from the progress tracker so they are re-downloaded on the next run.
"""
|
|
|
|
import logging
|
|
import urllib.parse
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
from spruce.progress import ProgressTracker
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Private helpers
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _parse_tile_url(url: str) -> dict[str, str]:
    """
    Extract scan_id, x, y from a tile URL query string.

    The values come from the ``id``, ``x`` and ``y`` query parameters and are
    returned as strings. A parameter that is absent (or blank) yields ``""``.

    A URL that cannot be parsed at all also yields empty strings for every
    field instead of raising, so a single corrupt entry in the progress file
    cannot abort an integrity check.
    """
    try:
        query = urllib.parse.urlparse(url).query
    except ValueError:
        # urlparse rejects some malformed URLs (e.g. an unbalanced "[" in the
        # host part). Treat those as having no query parameters.
        query = ""

    params = dict(urllib.parse.parse_qsl(query))
    return {
        "scan_id": params.get("id", ""),
        "x": params.get("x", ""),
        "y": params.get("y", ""),
    }
|
|
|
|
|
|
def _build_disk_index(output_dir: Path) -> dict[Path, int]:
    """
    Map every tile file below *output_dir* to its size in bytes.

    The directory tree is searched recursively for files matching
    ``tile_r*.jpg``; each match is stat'ed once and recorded as
    ``{tile_path: size_bytes}``.
    """
    sizes: dict[Path, int] = {}
    for tile_file in output_dir.rglob("tile_r*.jpg"):
        sizes[tile_file] = tile_file.stat().st_size
    return sizes
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def recheck_tile_files(output_dir: Path, progress: ProgressTracker) -> int:
    """
    Walk every tile file on disk and delete any that are zero bytes.

    For each deleted tile, every completed tile URL belonging to the same
    scan is discarded from *progress* in the same pass, so a single --recheck
    call is sufficient before resuming. The discard is scan-wide rather than
    per-tile because the exact x/y of a tile file cannot be derived from its
    path alone.

    *progress* is saved only when at least one file was deleted.

    Returns the count of files deleted.
    """
    # Group completed tile URLs by scan id. Keeping a list per scan (rather
    # than one URL per (scan_id, x, y)) ensures URLs that share coordinates
    # but differ elsewhere are all discarded, and lets each scan be handled
    # with a single lookup instead of a scan over every URL.
    urls_by_scan: dict[str, list[str]] = {}
    for url in progress.iter_urls():
        if "cmd=image" in url:
            scan_id = _parse_tile_url(url)["scan_id"]
            urls_by_scan.setdefault(scan_id, []).append(url)

    deleted = 0
    for tile_path in output_dir.rglob("tile_r*.jpg"):
        if tile_path.stat().st_size != 0:
            continue

        log.warning("Deleting zero-byte tile: %s", tile_path)
        tile_path.unlink()
        deleted += 1

        scan_id = _scan_id_from_path(tile_path)
        if not scan_id:
            continue

        # pop() so later zero-byte tiles of the same scan do no extra work.
        for url in urls_by_scan.pop(scan_id, ()):
            progress.discard(url)

    if deleted:
        log.info("Deleted %d zero-byte tile file(s).", deleted)
        progress.save()
    else:
        log.info("No zero-byte tile files found on disk.")
    return deleted
|
|
|
|
|
|
def recheck_archive(output_dir: Path, progress: ProgressTracker) -> int:
    """
    Walk every URL in .progress.json and verify its local file exists and is
    non-empty. Removes bad entries from progress so the next run re-downloads
    them. Returns the count of entries removed.

    Both tile URLs and mosaic URLs are checked.

    The tile check is scan-level: a tile URL counts as healthy when at least
    one non-empty tile file lives under a directory named after its scan id.
    A mosaic URL counts as healthy when a non-empty
    ``<output_dir>/*/*/<scan_id>/mosaic.jpg`` exists.

    *progress* is saved only when at least one entry was removed.
    """
    if len(progress) == 0:
        log.info("Progress file is empty — nothing to recheck.")
        return 0

    all_urls = list(progress.iter_urls())
    tile_urls = [u for u in all_urls if "cmd=image" in u]
    mosaic_urls = [u for u in all_urls if "mosaic.jpg" in u]
    log.info(
        "Rechecking %d tile URL(s) and %d mosaic URL(s) …",
        len(tile_urls),
        len(mosaic_urls),
    )

    # Build a disk index of all tile files once
    existing_tiles = _build_disk_index(output_dir)
    log.debug("Found %d tile files on disk.", len(existing_tiles))

    # Every path component of every non-empty tile. A scan id found here has
    # at least one healthy tile beneath it; membership is O(1), so the tile
    # check no longer rescans the whole disk index for each URL.
    healthy_tile_parts = {
        part
        for tile_path, size in existing_tiles.items()
        if size > 0
        for part in tile_path.parts
    }

    # Scan ids that own a non-empty mosaic, collected with a single glob
    # instead of one filesystem glob per mosaic URL.
    healthy_mosaic_scans = {
        mosaic_path.parent.name
        for mosaic_path in output_dir.glob("*/*/*/mosaic.jpg")
        if mosaic_path.stat().st_size > 0
    }

    # --- Tile check ---
    bad_urls: list[str] = [
        url
        for url in tile_urls
        if _parse_tile_url(url)["scan_id"] not in healthy_tile_parts
    ]

    # --- Mosaic check ---
    for url in mosaic_urls:
        # Mosaic URLs: http://<host>:8011/RootView_Database/<scan_id>/mosaic.jpg
        # Corresponding local path: <output_dir>/**/<scan_id>/mosaic.jpg
        segments = url.rstrip("/").split("/")
        if len(segments) < 2:
            # No parent segment to read a scan id from.
            bad_urls.append(url)
            continue

        scan_id = segments[-2]
        if scan_id not in healthy_mosaic_scans:
            log.debug("Mosaic missing or zero-byte for scan %s: %s", scan_id, url)
            bad_urls.append(url)

    if not bad_urls:
        log.info(
            "All %d tile URL(s) and %d mosaic URL(s) look healthy.",
            len(tile_urls),
            len(mosaic_urls),
        )
        return 0

    log.warning(
        "Found %d suspect URL(s). Removing from progress.",
        len(bad_urls),
    )
    for url in bad_urls:
        progress.discard(url)
    progress.save()
    log.info(
        "Removed %d URL(s) from .progress.json — they will be re-downloaded on next run.",
        len(bad_urls),
    )
    return len(bad_urls)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Internal utility
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _scan_id_from_path(tile_path: Path) -> str | None:
|
|
"""
|
|
Given a tile path like .../158374/tiles/tile_r0_c0.jpg, return '158374'.
|
|
Looks for the directory two levels above the filename (parent.parent.name).
|
|
"""
|
|
try:
|
|
# structure: <machine>/<date>/<scan_id>/tiles/<filename>
|
|
return tile_path.parent.parent.name
|
|
except Exception:
|
|
return None
|