Initial commit
Add spruce scraper with CLI, session management, parsers, progress tracking, recheck logic, and test suite. Includes example config and README.
This commit is contained in:
@@ -0,0 +1,138 @@
|
||||
"""Tests for spruce.progress — file I/O only, uses tmp_path."""
|
||||
|
||||
import csv
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from spruce.progress import CsvWriter, ProgressTracker
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# ProgressTracker
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_progress_mark_and_check(tmp_path):
    """A URL is reported as not done until ``mark_done`` is called for it."""
    url = "http://example.com/a"
    tracker = ProgressTracker(tmp_path / ".progress.json")

    assert not tracker.is_done(url)
    tracker.mark_done(url)
    assert tracker.is_done(url)
|
||||
|
||||
|
||||
def test_progress_roundtrip(tmp_path):
    """URLs saved by one tracker are seen as done by a new tracker on the same file."""
    state_file = tmp_path / ".progress.json"
    saved_urls = ("http://example.com/a", "http://example.com/b")

    writer = ProgressTracker(state_file)
    for url in saved_urls:
        writer.mark_done(url)
    writer.save()

    # A second tracker built from the same path must see what was saved.
    reader = ProgressTracker(state_file)
    for url in saved_urls:
        assert reader.is_done(url)
    assert not reader.is_done("http://example.com/c")
|
||||
|
||||
|
||||
def test_progress_discard(tmp_path):
    """``discard`` makes a previously marked URL read as not done again."""
    url = "http://example.com/x"
    tracker = ProgressTracker(tmp_path / ".progress.json")

    tracker.mark_done(url)
    assert tracker.is_done(url)

    tracker.discard(url)
    assert not tracker.is_done(url)
|
||||
|
||||
|
||||
def test_progress_discard_nonexistent_is_noop(tmp_path):
    """Discarding a URL that was never marked neither raises nor changes state."""
    url = "http://example.com/never"
    p = ProgressTracker(tmp_path / ".progress.json")

    p.discard(url)  # should not raise

    # "No-op" means more than "no exception": the tracker must still be empty
    # and must not report the URL as done.
    assert not p.is_done(url)
    assert len(p) == 0
|
||||
|
||||
|
||||
def test_progress_iter_urls(tmp_path):
    """``iter_urls`` yields exactly the URLs that were marked done."""
    marked = ("http://example.com/1", "http://example.com/2")
    tracker = ProgressTracker(tmp_path / ".progress.json")

    for url in marked:
        tracker.mark_done(url)

    # Compared as sets, so the iteration order is not part of the check.
    expected = {
        "http://example.com/1",
        "http://example.com/2",
    }
    assert set(tracker.iter_urls()) == expected
|
||||
|
||||
|
||||
def test_progress_len(tmp_path):
    """``len`` of a tracker follows ``mark_done`` and ``discard`` calls."""
    first = "http://example.com/1"
    second = "http://example.com/2"
    tracker = ProgressTracker(tmp_path / ".progress.json")

    assert len(tracker) == 0

    tracker.mark_done(first)
    assert len(tracker) == 1

    tracker.mark_done(second)
    assert len(tracker) == 2

    tracker.discard(first)
    assert len(tracker) == 1
|
||||
|
||||
|
||||
def test_progress_save_creates_parent(tmp_path):
    """``save`` writes the file even when its parent directories do not exist yet."""
    url = "http://example.com/z"
    target = tmp_path / "nested" / "dir" / ".progress.json"

    tracker = ProgressTracker(target)
    tracker.mark_done(url)
    tracker.save()

    assert target.exists()

    # The saved JSON lists the marked URL under the "completed_urls" key.
    raw = target.read_text()
    payload = json.loads(raw)
    assert url in payload["completed_urls"]
|
||||
|
||||
|
||||
def test_progress_corrupt_file_starts_fresh(tmp_path):
    """A progress file that is not valid JSON yields an empty tracker, not an error."""
    corrupt = tmp_path / ".progress.json"
    corrupt.write_text("not valid json")

    tracker = ProgressTracker(corrupt)

    # Construction must not raise, and nothing is considered done.
    assert len(tracker) == 0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CsvWriter
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Column names passed to every CsvWriter in the tests below.
FIELDS = ["a", "b", "c"]
|
||||
|
||||
|
||||
def test_csv_writer_creates_header(tmp_path):
    """Opening and closing a ``CsvWriter`` with no writes leaves a header-only file."""
    path = tmp_path / "out.csv"
    w = CsvWriter(path, FIELDS)
    w.close()

    # Read through a context manager so the handle is closed deterministically;
    # newline="" is what the csv module asks for on file objects it parses.
    with path.open(newline="") as fh:
        rows = list(csv.DictReader(fh))
    assert rows == []

    header = path.read_text().splitlines()[0]
    assert header == "a,b,c"
|
||||
|
||||
|
||||
def test_csv_writer_write_row(tmp_path):
    """A single written row can be read back with its field values intact."""
    path = tmp_path / "out.csv"
    w = CsvWriter(path, FIELDS)
    w.write({"a": "1", "b": "2", "c": "3"})
    w.close()

    # Read through a context manager so the handle is closed deterministically;
    # newline="" is what the csv module asks for on file objects it parses.
    with path.open(newline="") as fh:
        rows = list(csv.DictReader(fh))

    assert len(rows) == 1
    assert rows[0]["a"] == "1"
    assert rows[0]["c"] == "3"
|
||||
|
||||
|
||||
def test_csv_writer_missing_fields_fill_empty(tmp_path):
    """Fields absent from a written row are read back as empty strings."""
    path = tmp_path / "out.csv"
    w = CsvWriter(path, FIELDS)
    w.write({"a": "hello"})  # b and c missing
    w.close()

    # Read through a context manager so the handle is closed deterministically;
    # newline="" is what the csv module asks for on file objects it parses.
    with path.open(newline="") as fh:
        rows = list(csv.DictReader(fh))

    assert rows[0]["b"] == ""
    assert rows[0]["c"] == ""
|
||||
|
||||
|
||||
def test_csv_writer_appends_on_second_open(tmp_path):
    """A second ``CsvWriter`` on an existing file adds rows after the earlier ones."""
    path = tmp_path / "out.csv"

    w = CsvWriter(path, FIELDS)
    w.write({"a": "first"})
    w.close()

    w2 = CsvWriter(path, FIELDS)
    w2.write({"a": "second"})
    w2.close()

    # Read through a context manager so the handle is closed deterministically;
    # newline="" is what the csv module asks for on file objects it parses.
    with path.open(newline="") as fh:
        rows = list(csv.DictReader(fh))

    # Exactly two data rows, in write order: the first row survived the reopen.
    assert len(rows) == 2
    assert rows[0]["a"] == "first"
    assert rows[1]["a"] == "second"
|
||||
Reference in New Issue
Block a user