e122f6435a
Add spruce scraper with CLI, session management, parsers, progress tracking, recheck logic, and test suite. Includes example config and README.
150 lines
4.4 KiB
Python
150 lines
4.4 KiB
Python
"""
|
|
Tests for spruce.recheck — synthetic archive tree under tmp_path.
|
|
|
|
These tests verify the key improvement: a single --recheck pass is enough.
|
|
Zero-byte tiles are deleted from disk AND their URLs removed from progress
|
|
without needing a second pass.
|
|
"""
|
|
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
from spruce.progress import ProgressTracker
|
|
from spruce.recheck import recheck_archive, recheck_tile_files
|
|
|
|
|
|
# Endpoint prefix used by _tile_url() to build the tile URLs that these tests
# record in the progress tracker. 192.0.2.1 lies in TEST-NET-1 (192.0.2.0/24),
# the block reserved for documentation and examples, so it is a placeholder
# rather than a real host.
BASE_URL = "http://192.0.2.1:8010/index.php"
|
|
|
|
|
|
def _tile_url(scan_id: int, x: float, y: float) -> str:
    """Build the image-scan tile URL for *scan_id* at offset (*x*, *y*).

    The result is ``BASE_URL`` followed by a fixed query string; *x* and *y*
    are interpolated with their default ``str()`` formatting.
    """
    query = f"cmd=image&mode=image_scan&id={scan_id}&s=1&x={x}&y={y}"
    return f"{BASE_URL}?{query}"
|
|
|
|
|
|
def _make_tile(path: Path, size: int = 1024) -> None:
|
|
"""Create a tile file. size=0 simulates a zero-byte / corrupt download."""
|
|
path.parent.mkdir(parents=True, exist_ok=True)
|
|
path.write_bytes(b"\xff" * size)
|
|
|
|
|
|
def _archive_tile_path(tmp_path: Path, scan_id: int, row: int, col: int) -> Path:
|
|
return (
|
|
tmp_path
|
|
/ "BW3-20__AMR-26_"
|
|
/ "2024-07-29"
|
|
/ str(scan_id)
|
|
/ "tiles"
|
|
/ f"tile_r{row:03d}_c{col:03d}.jpg"
|
|
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# recheck_tile_files
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_recheck_tile_files_no_zero_bytes(tmp_path):
    """With only a non-empty tile present, nothing is deleted or un-marked."""
    tracker = ProgressTracker(tmp_path / ".progress.json")
    tile_path = _archive_tile_path(tmp_path, 158374, 0, 0)
    tile_url = _tile_url(158374, 0.0, 0.0)

    # Arrange: one 1 KiB tile on disk, its URL recorded as done and saved.
    _make_tile(tile_path, size=1024)
    tracker.mark_done(tile_url)
    tracker.save()

    deleted_count = recheck_tile_files(tmp_path, tracker)

    assert deleted_count == 0
    assert tile_path.exists()
    assert tracker.is_done(tile_url)
|
|
|
|
|
|
def test_recheck_tile_files_deletes_zero_byte(tmp_path):
    """A zero-byte tile is counted as deleted and removed from disk."""
    tracker = ProgressTracker(tmp_path / ".progress.json")
    empty_tile = _archive_tile_path(tmp_path, 158374, 0, 0)
    empty_tile_url = _tile_url(158374, 0.0, 0.0)

    # Arrange: a zero-byte tile whose URL is nevertheless marked done.
    _make_tile(empty_tile, size=0)
    tracker.mark_done(empty_tile_url)
    tracker.save()

    deleted_count = recheck_tile_files(tmp_path, tracker)

    assert deleted_count == 1
    assert not empty_tile.exists()
|
|
|
|
|
|
def test_recheck_tile_files_single_pass_removes_url(tmp_path):
    """
    One call to recheck_tile_files is enough: the URL of a zero-byte tile is
    no longer marked done in the progress file on disk afterwards, so no
    second pass is needed.
    """
    progress_file = tmp_path / ".progress.json"
    tracker = ProgressTracker(progress_file)
    empty_tile = _archive_tile_path(tmp_path, 158374, 0, 0)
    empty_tile_url = _tile_url(158374, 0.0, 0.0)

    _make_tile(empty_tile, size=0)
    tracker.mark_done(empty_tile_url)
    tracker.save()

    recheck_tile_files(tmp_path, tracker)

    # A fresh tracker reads the file back, proving the removal was persisted
    # and not just applied to the in-memory object.
    reloaded = ProgressTracker(progress_file)
    assert not reloaded.is_done(empty_tile_url)
|
|
|
|
|
|
def test_recheck_tile_files_healthy_tiles_untouched(tmp_path):
    """Of two sibling tiles, only the zero-byte one is deleted from disk."""
    tracker = ProgressTracker(tmp_path / ".progress.json")
    healthy_tile = _archive_tile_path(tmp_path, 158374, 0, 0)
    empty_tile = _archive_tile_path(tmp_path, 158374, 0, 1)

    # Arrange: same scan, one 512-byte tile and one zero-byte tile.
    _make_tile(healthy_tile, size=512)
    _make_tile(empty_tile, size=0)

    healthy_url = _tile_url(158374, 0.0, 0.0)
    empty_url = _tile_url(158374, 3.01, 0.0)
    for done_url in (healthy_url, empty_url):
        tracker.mark_done(done_url)
    tracker.save()

    deleted_count = recheck_tile_files(tmp_path, tracker)

    assert deleted_count == 1
    assert healthy_tile.exists()
    assert not empty_tile.exists()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# recheck_archive
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_recheck_archive_empty_progress(tmp_path):
    """With nothing marked done, recheck_archive reports zero removals."""
    tracker = ProgressTracker(tmp_path / ".progress.json")

    removed_count = recheck_archive(tmp_path, tracker)

    assert removed_count == 0
|
|
|
|
|
|
def test_recheck_archive_healthy(tmp_path):
    """A done URL whose scan has a non-empty tile on disk is not removed."""
    tracker = ProgressTracker(tmp_path / ".progress.json")
    tile_path = _archive_tile_path(tmp_path, 158374, 0, 0)
    _make_tile(tile_path, size=1024)

    tile_url = _tile_url(158374, 0.0, 0.0)
    tracker.mark_done(tile_url)
    tracker.save()

    removed_count = recheck_archive(tmp_path, tracker)

    assert removed_count == 0
|
|
|
|
|
|
def test_recheck_archive_removes_missing_scan(tmp_path):
    """A done URL with no files on disk for its scan is dropped from progress."""
    tracker = ProgressTracker(tmp_path / ".progress.json")

    # Scan 999999 is recorded as done, but nothing is ever written to disk.
    orphan_url = _tile_url(999999, 0.0, 0.0)
    tracker.mark_done(orphan_url)
    tracker.save()

    removed_count = recheck_archive(tmp_path, tracker)

    assert removed_count == 1
    assert not tracker.is_done(orphan_url)
|
|
|
|
|
|
def test_recheck_archive_skips_mosaic_urls(tmp_path):
    """A mosaic URL stays marked done even though no files exist on disk."""
    tracker = ProgressTracker(tmp_path / ".progress.json")
    mosaic_url = "http://192.0.2.1:8011/RootView_Database/158374/mosaic.jpg"

    tracker.mark_done(mosaic_url)
    tracker.save()

    removed_count = recheck_archive(tmp_path, tracker)

    assert removed_count == 0
    # Mosaics are never touched by the archive recheck.
    assert tracker.is_done(mosaic_url)
|