Initial commit

Add spruce scraper with CLI, session management, parsers, progress tracking,
recheck logic, and test suite. Includes example config and README.
This commit is contained in:
2026-04-22 10:41:18 -04:00
commit e122f6435a
23 changed files with 3789 additions and 0 deletions
+138
View File
@@ -0,0 +1,138 @@
"""Tests for spruce.progress — file I/O only, uses tmp_path."""
import csv
import json
from pathlib import Path
import pytest
from spruce.progress import CsvWriter, ProgressTracker
# ---------------------------------------------------------------------------
# ProgressTracker
# ---------------------------------------------------------------------------
def test_progress_mark_and_check(tmp_path):
    """A URL is reported as done only after ``mark_done`` has been called."""
    url = "http://example.com/a"
    tracker = ProgressTracker(tmp_path / ".progress.json")

    # Nothing has been recorded yet.
    assert not tracker.is_done(url)

    tracker.mark_done(url)
    assert tracker.is_done(url)
def test_progress_roundtrip(tmp_path):
    """URLs saved by one tracker are seen as done by a new tracker on the same path."""
    progress_file = tmp_path / ".progress.json"
    saved_urls = ("http://example.com/a", "http://example.com/b")

    original = ProgressTracker(progress_file)
    for url in saved_urls:
        original.mark_done(url)
    original.save()

    # A second tracker built from the same path must see exactly what was saved.
    reloaded = ProgressTracker(progress_file)
    for url in saved_urls:
        assert reloaded.is_done(url)
    assert not reloaded.is_done("http://example.com/c")
def test_progress_discard(tmp_path):
    """``discard`` removes a URL that was previously marked done."""
    url = "http://example.com/x"
    tracker = ProgressTracker(tmp_path / ".progress.json")

    tracker.mark_done(url)
    assert tracker.is_done(url)

    tracker.discard(url)
    assert not tracker.is_done(url)
def test_progress_discard_nonexistent_is_noop(tmp_path):
    """Discarding a URL that was never marked done neither raises nor changes state."""
    p = ProgressTracker(tmp_path / ".progress.json")
    url = "http://example.com/never"

    p.discard(url)  # should not raise

    # The test name promises a no-op, so pin the resulting state as well:
    # the tracker must still be empty and must not report the URL as done.
    assert not p.is_done(url)
    assert len(p) == 0
def test_progress_iter_urls(tmp_path):
    """``iter_urls`` yields every URL that has been marked done."""
    urls = ("http://example.com/1", "http://example.com/2")
    tracker = ProgressTracker(tmp_path / ".progress.json")
    for url in urls:
        tracker.mark_done(url)

    # Compare as sets: the test does not depend on iteration order.
    assert set(tracker.iter_urls()) == set(urls)
def test_progress_len(tmp_path):
    """``len`` follows the number of done URLs through mark_done and discard."""
    first_url = "http://example.com/1"
    second_url = "http://example.com/2"
    tracker = ProgressTracker(tmp_path / ".progress.json")
    assert len(tracker) == 0

    # Each newly marked URL grows the count by one.
    for expected_count, url in enumerate((first_url, second_url), start=1):
        tracker.mark_done(url)
        assert len(tracker) == expected_count

    # Discarding one of them shrinks it again.
    tracker.discard(first_url)
    assert len(tracker) == 1
def test_progress_save_creates_parent(tmp_path):
    """``save`` writes the file even when its parent directories do not exist yet."""
    url = "http://example.com/z"
    target = tmp_path.joinpath("nested", "dir", ".progress.json")

    tracker = ProgressTracker(target)
    tracker.mark_done(url)
    tracker.save()

    assert target.exists()
    # The saved JSON lists done URLs under the "completed_urls" key.
    payload = json.loads(target.read_text())
    assert url in payload["completed_urls"]
def test_progress_corrupt_file_starts_fresh(tmp_path):
    """A progress file that is not valid JSON yields an empty tracker, not an error."""
    corrupt_file = tmp_path / ".progress.json"
    corrupt_file.write_text("not valid json")

    # Construction itself must not raise; the tracker simply starts empty.
    tracker = ProgressTracker(corrupt_file)
    assert len(tracker) == 0
# ---------------------------------------------------------------------------
# CsvWriter
# ---------------------------------------------------------------------------
# Column names passed to every CsvWriter constructed in the tests below.
FIELDS = ["a", "b", "c"]
def test_csv_writer_creates_header(tmp_path):
    """Opening and closing a CsvWriter on a new path leaves a header-only file."""
    path = tmp_path / "out.csv"
    w = CsvWriter(path, FIELDS)
    w.close()

    # Read through a context manager so the handle is closed deterministically
    # instead of leaking (ResourceWarning); newline="" is what the csv module
    # expects for files handed to a reader.
    with path.open(newline="") as fh:
        rows = list(csv.DictReader(fh))
    assert rows == []

    header = path.read_text().splitlines()[0]
    assert header == "a,b,c"
def test_csv_writer_write_row(tmp_path):
    """A fully populated row written through CsvWriter can be read back by column."""
    path = tmp_path / "out.csv"
    w = CsvWriter(path, FIELDS)
    w.write({"a": "1", "b": "2", "c": "3"})
    w.close()

    # Read through a context manager so the handle is closed deterministically
    # instead of leaking (ResourceWarning); newline="" is what the csv module
    # expects for files handed to a reader.
    with path.open(newline="") as fh:
        rows = list(csv.DictReader(fh))

    assert len(rows) == 1
    assert rows[0]["a"] == "1"
    assert rows[0]["b"] == "2"
    assert rows[0]["c"] == "3"
def test_csv_writer_missing_fields_fill_empty(tmp_path):
    """Fields absent from the written dict come back as empty strings."""
    path = tmp_path / "out.csv"
    w = CsvWriter(path, FIELDS)
    w.write({"a": "hello"})  # b and c missing
    w.close()

    # Read through a context manager so the handle is closed deterministically
    # instead of leaking (ResourceWarning); newline="" is what the csv module
    # expects for files handed to a reader.
    with path.open(newline="") as fh:
        rows = list(csv.DictReader(fh))

    # Check the row count first so a missing row fails with a clear assertion
    # rather than an IndexError on rows[0].
    assert len(rows) == 1
    assert rows[0]["a"] == "hello"
    assert rows[0]["b"] == ""
    assert rows[0]["c"] == ""
def test_csv_writer_appends_on_second_open(tmp_path):
    """A second CsvWriter on an existing file appends rows and keeps one header."""
    path = tmp_path / "out.csv"

    w = CsvWriter(path, FIELDS)
    w.write({"a": "first"})
    w.close()

    w2 = CsvWriter(path, FIELDS)
    w2.write({"a": "second"})
    w2.close()

    # Read through a context manager so the handle is closed deterministically
    # instead of leaking (ResourceWarning); newline="" is what the csv module
    # expects for files handed to a reader.
    with path.open(newline="") as fh:
        rows = list(csv.DictReader(fh))

    # Exactly two data rows: a repeated header would show up as an extra row.
    assert len(rows) == 2
    assert rows[0]["a"] == "first"
    assert rows[1]["a"] == "second"