Initial commit
Add spruce scraper with CLI, session management, parsers, progress tracking, recheck logic, and test suite. Includes example config and README.
This commit is contained in:
@@ -0,0 +1,149 @@
|
||||
"""
|
||||
Tests for spruce.recheck — synthetic archive tree under tmp_path.
|
||||
|
||||
These tests verify the key improvement: a single --recheck pass is enough.
|
||||
Zero-byte tiles are deleted from disk AND their URLs removed from progress
|
||||
without needing a second pass.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from spruce.progress import ProgressTracker
|
||||
from spruce.recheck import recheck_archive, recheck_tile_files
|
||||
|
||||
|
||||
BASE_URL = "http://192.0.2.1:8010/index.php"
|
||||
|
||||
|
||||
def _tile_url(scan_id: int, x: float, y: float) -> str:
    """Build the image-scan tile URL for *scan_id* at offset (*x*, *y*).

    The query string is appended to the module-level ``BASE_URL``; *x* and
    *y* are interpolated with their default string formatting.
    """
    return f"{BASE_URL}?cmd=image&mode=image_scan&id={scan_id}&s=1&x={x}&y={y}"
|
||||
|
||||
|
||||
def _make_tile(path: Path, size: int = 1024) -> None:
    """Create a tile file of *size* ``0xFF`` bytes at *path*.

    Missing parent directories are created first. ``size=0`` produces an
    empty file, which simulates a zero-byte / corrupt download.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_bytes(b"\xff" * size)
|
||||
|
||||
|
||||
def _archive_tile_path(tmp_path: Path, scan_id: int, row: int, col: int) -> Path:
    """Return the path of one tile inside the synthetic archive tree.

    The layout is
    ``<tmp_path>/BW3-20__AMR-26_/2024-07-29/<scan_id>/tiles/tile_rRRR_cCCC.jpg``
    where *row* and *col* are zero-padded to three digits. Nothing is
    created on disk; only the path is computed.
    """
    return (
        tmp_path
        / "BW3-20__AMR-26_"
        / "2024-07-29"
        / str(scan_id)
        / "tiles"
        / f"tile_r{row:03d}_c{col:03d}.jpg"
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# recheck_tile_files
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_recheck_tile_files_no_zero_bytes(tmp_path: Path) -> None:
    """A non-empty tile marked done is left alone by ``recheck_tile_files``.

    Expects a deletion count of 0, the file still on disk, and the URL
    still marked done.
    """
    p = ProgressTracker(tmp_path / ".progress.json")
    tile = _archive_tile_path(tmp_path, 158374, 0, 0)
    _make_tile(tile, size=1024)
    url = _tile_url(158374, 0.0, 0.0)
    p.mark_done(url)
    p.save()

    deleted = recheck_tile_files(tmp_path, p)
    assert deleted == 0
    assert tile.exists()
    assert p.is_done(url)
|
||||
|
||||
|
||||
def test_recheck_tile_files_deletes_zero_byte(tmp_path: Path) -> None:
    """A zero-byte tile is counted and removed from disk by ``recheck_tile_files``."""
    p = ProgressTracker(tmp_path / ".progress.json")
    tile = _archive_tile_path(tmp_path, 158374, 0, 0)
    _make_tile(tile, size=0)  # empty file stands in for a corrupt download
    url = _tile_url(158374, 0.0, 0.0)
    p.mark_done(url)
    p.save()

    deleted = recheck_tile_files(tmp_path, p)
    assert deleted == 1
    assert not tile.exists()
|
||||
|
||||
|
||||
def test_recheck_tile_files_single_pass_removes_url(tmp_path: Path) -> None:
    """
    The two-run wart is fixed: after recheck_tile_files the URL is already
    removed from progress — no second pass required.

    The check is made on a fresh ``ProgressTracker`` loaded from the same
    file, so it also confirms the removal was written to disk.
    """
    p = ProgressTracker(tmp_path / ".progress.json")
    tile = _archive_tile_path(tmp_path, 158374, 0, 0)
    _make_tile(tile, size=0)
    url = _tile_url(158374, 0.0, 0.0)
    p.mark_done(url)
    p.save()

    recheck_tile_files(tmp_path, p)
    # Reload progress from disk to confirm the change was persisted
    p2 = ProgressTracker(tmp_path / ".progress.json")
    assert not p2.is_done(url)
|
||||
|
||||
|
||||
def test_recheck_tile_files_healthy_tiles_untouched(tmp_path: Path) -> None:
    """With one good and one zero-byte tile, only the zero-byte file is deleted.

    Expects a deletion count of 1, the 512-byte tile still on disk, and the
    empty tile gone.
    """
    p = ProgressTracker(tmp_path / ".progress.json")
    good = _archive_tile_path(tmp_path, 158374, 0, 0)
    bad = _archive_tile_path(tmp_path, 158374, 0, 1)
    _make_tile(good, size=512)
    _make_tile(bad, size=0)
    url_good = _tile_url(158374, 0.0, 0.0)
    # NOTE(review): x=3.01 presumably corresponds to column 1 — confirm
    # against the scraper's tile spacing.
    url_bad = _tile_url(158374, 3.01, 0.0)
    p.mark_done(url_good)
    p.mark_done(url_bad)
    p.save()

    deleted = recheck_tile_files(tmp_path, p)
    assert deleted == 1
    assert good.exists()
    assert not bad.exists()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# recheck_archive
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_recheck_archive_empty_progress(tmp_path: Path) -> None:
    """``recheck_archive`` reports 0 removals when nothing has been marked done."""
    p = ProgressTracker(tmp_path / ".progress.json")
    removed = recheck_archive(tmp_path, p)
    assert removed == 0
|
||||
|
||||
|
||||
def test_recheck_archive_healthy(tmp_path: Path) -> None:
    """``recheck_archive`` reports 0 removals when the done tile exists on disk."""
    p = ProgressTracker(tmp_path / ".progress.json")
    tile = _archive_tile_path(tmp_path, 158374, 0, 0)
    _make_tile(tile, size=1024)
    p.mark_done(_tile_url(158374, 0.0, 0.0))
    p.save()

    removed = recheck_archive(tmp_path, p)
    assert removed == 0
|
||||
|
||||
|
||||
def test_recheck_archive_removes_missing_scan(tmp_path: Path) -> None:
    """A done URL with no files on disk is removed by ``recheck_archive``.

    Expects a removal count of 1 and the URL no longer marked done.
    """
    p = ProgressTracker(tmp_path / ".progress.json")
    # Mark a URL done but create no files on disk
    p.mark_done(_tile_url(999999, 0.0, 0.0))
    p.save()

    removed = recheck_archive(tmp_path, p)
    assert removed == 1
    assert not p.is_done(_tile_url(999999, 0.0, 0.0))
|
||||
|
||||
|
||||
def test_recheck_archive_skips_mosaic_urls(tmp_path: Path) -> None:
    """A done mosaic URL is left in progress by ``recheck_archive``.

    No files exist on disk here; the test expects a removal count of 0 and
    the mosaic URL still marked done.
    """
    p = ProgressTracker(tmp_path / ".progress.json")
    mosaic_url = "http://192.0.2.1:8011/RootView_Database/158374/mosaic.jpg"
    p.mark_done(mosaic_url)
    p.save()

    removed = recheck_archive(tmp_path, p)
    assert removed == 0
    assert p.is_done(mosaic_url)  # mosaics are never touched
|
||||
Reference in New Issue
Block a user