Initial commit

Add spruce scraper with CLI, session management, parsers, progress tracking,
recheck logic, and test suite. Includes example config and README.
This commit is contained in:
2026-04-22 10:41:18 -04:00
commit e122f6435a
23 changed files with 3789 additions and 0 deletions
+149
View File
@@ -0,0 +1,149 @@
"""
Tests for spruce.recheck — synthetic archive tree under tmp_path.
These tests verify the key improvement: a single --recheck pass is enough.
Zero-byte tiles are deleted from disk AND their URLs removed from progress
without needing a second pass.
"""
from pathlib import Path
import pytest
from spruce.progress import ProgressTracker
from spruce.recheck import recheck_archive, recheck_tile_files
BASE_URL = "http://192.0.2.1:8010/index.php"
def _tile_url(scan_id: int, x: float, y: float) -> str:
return f"{BASE_URL}?cmd=image&mode=image_scan&id={scan_id}&s=1&x={x}&y={y}"
def _make_tile(path: Path, size: int = 1024) -> None:
"""Create a tile file. size=0 simulates a zero-byte / corrupt download."""
path.parent.mkdir(parents=True, exist_ok=True)
path.write_bytes(b"\xff" * size)
def _archive_tile_path(tmp_path: Path, scan_id: int, row: int, col: int) -> Path:
return (
tmp_path
/ "BW3-20__AMR-26_"
/ "2024-07-29"
/ str(scan_id)
/ "tiles"
/ f"tile_r{row:03d}_c{col:03d}.jpg"
)
# ---------------------------------------------------------------------------
# recheck_tile_files
# ---------------------------------------------------------------------------
def test_recheck_tile_files_no_zero_bytes(tmp_path):
    """A non-empty tile is kept on disk and its URL stays marked done."""
    p = ProgressTracker(tmp_path / ".progress.json")
    tile = _archive_tile_path(tmp_path, 158374, 0, 0)
    _make_tile(tile, size=1024)
    url = _tile_url(158374, 0.0, 0.0)
    p.mark_done(url)
    p.save()

    deleted = recheck_tile_files(tmp_path, p)

    assert deleted == 0
    assert tile.exists()
    assert p.is_done(url)
def test_recheck_tile_files_deletes_zero_byte(tmp_path):
    """A zero-byte tile is counted as deleted and removed from disk."""
    p = ProgressTracker(tmp_path / ".progress.json")
    tile = _archive_tile_path(tmp_path, 158374, 0, 0)
    _make_tile(tile, size=0)
    url = _tile_url(158374, 0.0, 0.0)
    p.mark_done(url)
    p.save()

    deleted = recheck_tile_files(tmp_path, p)

    assert deleted == 1
    assert not tile.exists()
def test_recheck_tile_files_single_pass_removes_url(tmp_path):
    """
    The two-run wart is fixed: after recheck_tile_files the URL is already
    removed from progress — no second pass required.
    """
    p = ProgressTracker(tmp_path / ".progress.json")
    tile = _archive_tile_path(tmp_path, 158374, 0, 0)
    _make_tile(tile, size=0)
    url = _tile_url(158374, 0.0, 0.0)
    p.mark_done(url)
    p.save()

    recheck_tile_files(tmp_path, p)

    # Reload progress from disk to confirm the change was persisted
    p2 = ProgressTracker(tmp_path / ".progress.json")
    assert not p2.is_done(url)
def test_recheck_tile_files_healthy_tiles_untouched(tmp_path):
    """With one good and one zero-byte tile, only the zero-byte file is deleted."""
    p = ProgressTracker(tmp_path / ".progress.json")
    good = _archive_tile_path(tmp_path, 158374, 0, 0)
    bad = _archive_tile_path(tmp_path, 158374, 0, 1)
    _make_tile(good, size=512)
    _make_tile(bad, size=0)
    url_good = _tile_url(158374, 0.0, 0.0)
    url_bad = _tile_url(158374, 3.01, 0.0)
    p.mark_done(url_good)
    p.mark_done(url_bad)
    p.save()

    deleted = recheck_tile_files(tmp_path, p)

    # NOTE(review): only the on-disk outcome is asserted here; the progress
    # state of url_good / url_bad after the recheck is not checked.
    assert deleted == 1
    assert good.exists()
    assert not bad.exists()
# ---------------------------------------------------------------------------
# recheck_archive
# ---------------------------------------------------------------------------
def test_recheck_archive_empty_progress(tmp_path):
    """With nothing marked done, recheck_archive reports zero removals."""
    p = ProgressTracker(tmp_path / ".progress.json")

    removed = recheck_archive(tmp_path, p)

    assert removed == 0
def test_recheck_archive_healthy(tmp_path):
    """A done URL whose scan has a non-empty tile on disk is left alone."""
    p = ProgressTracker(tmp_path / ".progress.json")
    tile = _archive_tile_path(tmp_path, 158374, 0, 0)
    _make_tile(tile, size=1024)
    url = _tile_url(158374, 0.0, 0.0)
    p.mark_done(url)
    p.save()

    removed = recheck_archive(tmp_path, p)

    assert removed == 0
    # Zero removals must also mean the entry is still recorded as done.
    assert p.is_done(url)
def test_recheck_archive_removes_missing_scan(tmp_path):
    """A done URL with no files on disk is dropped from progress."""
    p = ProgressTracker(tmp_path / ".progress.json")
    # Mark a URL done but create no files on disk
    url = _tile_url(999999, 0.0, 0.0)
    p.mark_done(url)
    p.save()

    removed = recheck_archive(tmp_path, p)

    assert removed == 1
    assert not p.is_done(url)
def test_recheck_archive_skips_mosaic_urls(tmp_path):
    """A mosaic URL stays marked done even though no file backs it on disk."""
    p = ProgressTracker(tmp_path / ".progress.json")
    mosaic_url = "http://192.0.2.1:8011/RootView_Database/158374/mosaic.jpg"
    p.mark_done(mosaic_url)
    p.save()

    removed = recheck_archive(tmp_path, p)

    assert removed == 0
    assert p.is_done(mosaic_url)  # mosaics are never touched