e122f6435a
Add spruce scraper with CLI, session management, parsers, progress tracking, recheck logic, and test suite. Includes example config and README.
139 lines
3.9 KiB
Python
139 lines
3.9 KiB
Python
"""Tests for spruce.progress — file I/O only, uses tmp_path."""
|
|
|
|
import csv
|
|
import json
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
from spruce.progress import CsvWriter, ProgressTracker
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# ProgressTracker
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_progress_mark_and_check(tmp_path):
    """A URL is reported as done only after ``mark_done`` is called for it."""
    url = "http://example.com/a"
    tracker = ProgressTracker(tmp_path / ".progress.json")

    assert not tracker.is_done(url)
    tracker.mark_done(url)
    assert tracker.is_done(url)
|
|
|
|
|
|
def test_progress_roundtrip(tmp_path):
    """URLs saved by one tracker are seen by a new tracker on the same path."""
    progress_file = tmp_path / ".progress.json"
    saved_urls = ("http://example.com/a", "http://example.com/b")

    writer = ProgressTracker(progress_file)
    for url in saved_urls:
        writer.mark_done(url)
    writer.save()

    # A second tracker built from the same file must report the saved URLs
    # as done, and must not report a URL that was never marked.
    reader = ProgressTracker(progress_file)
    for url in saved_urls:
        assert reader.is_done(url)
    assert not reader.is_done("http://example.com/c")
|
|
|
|
|
|
def test_progress_discard(tmp_path):
    """``discard`` undoes a previous ``mark_done`` for the same URL."""
    url = "http://example.com/x"
    tracker = ProgressTracker(tmp_path / ".progress.json")

    tracker.mark_done(url)
    assert tracker.is_done(url)

    tracker.discard(url)
    assert not tracker.is_done(url)
|
|
|
|
|
|
def test_progress_discard_nonexistent_is_noop(tmp_path):
    """Discarding a URL that was never marked neither raises nor changes state."""
    url = "http://example.com/never"
    p = ProgressTracker(tmp_path / ".progress.json")

    p.discard(url)  # should not raise

    # Verify the "noop" part as well: the tracker is still empty afterwards.
    assert not p.is_done(url)
    assert len(p) == 0
|
|
|
|
|
|
def test_progress_iter_urls(tmp_path):
    """``iter_urls`` yields exactly the URLs that were marked done."""
    marked = ("http://example.com/1", "http://example.com/2")
    tracker = ProgressTracker(tmp_path / ".progress.json")
    for url in marked:
        tracker.mark_done(url)

    # Compared as sets: the test does not depend on iteration order.
    assert set(tracker.iter_urls()) == set(marked)
|
|
|
|
|
|
def test_progress_len(tmp_path):
    """``len(tracker)`` follows the number of URLs currently marked done."""
    first = "http://example.com/1"
    second = "http://example.com/2"
    tracker = ProgressTracker(tmp_path / ".progress.json")
    assert len(tracker) == 0

    # Each mark_done grows the count by one ...
    tracker.mark_done(first)
    assert len(tracker) == 1
    tracker.mark_done(second)
    assert len(tracker) == 2

    # ... and discard shrinks it again.
    tracker.discard(first)
    assert len(tracker) == 1
|
|
|
|
|
|
def test_progress_save_creates_parent(tmp_path):
    """``save`` writes the file even when its parent directories are missing."""
    target = tmp_path.joinpath("nested", "dir", ".progress.json")
    tracker = ProgressTracker(target)
    tracker.mark_done("http://example.com/z")
    tracker.save()

    assert target.exists()

    # The saved JSON lists the marked URL under the "completed_urls" key.
    saved = json.loads(target.read_text())
    assert "http://example.com/z" in saved["completed_urls"]
|
|
|
|
|
|
def test_progress_corrupt_file_starts_fresh(tmp_path):
    """A progress file that is not valid JSON yields an empty tracker."""
    corrupt = tmp_path / ".progress.json"
    corrupt.write_text("not valid json")

    tracker = ProgressTracker(corrupt)

    # Construction must not raise, and nothing is carried over from the file.
    assert len(tracker) == 0
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# CsvWriter
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Column names passed to every CsvWriter constructed in the CSV tests.
FIELDS = [
    "a",
    "b",
    "c",
]
|
|
|
|
|
|
def test_csv_writer_creates_header(tmp_path):
    """Opening and closing a writer on a new path leaves a header-only CSV."""
    path = tmp_path / "out.csv"
    w = CsvWriter(path, FIELDS)
    w.close()

    # Read through a context manager so the handle is closed deterministically
    # instead of leaking until garbage collection (ResourceWarning).
    # newline="" is what the csv module expects for files it parses.
    with path.open(newline="") as fh:
        rows = list(csv.DictReader(fh))
    assert rows == []

    header = path.read_text().splitlines()[0]
    assert header == "a,b,c"
|
|
|
|
|
|
def test_csv_writer_write_row(tmp_path):
    """A single written row can be read back with the values it was given."""
    path = tmp_path / "out.csv"
    w = CsvWriter(path, FIELDS)
    w.write({"a": "1", "b": "2", "c": "3"})
    w.close()

    # Read through a context manager so the handle is closed deterministically
    # instead of leaking until garbage collection (ResourceWarning).
    with path.open(newline="") as fh:
        rows = list(csv.DictReader(fh))

    assert len(rows) == 1
    assert rows[0]["a"] == "1"
    assert rows[0]["c"] == "3"
|
|
|
|
|
|
def test_csv_writer_missing_fields_fill_empty(tmp_path):
    """Fields absent from the written dict come back as empty strings."""
    path = tmp_path / "out.csv"
    w = CsvWriter(path, FIELDS)
    w.write({"a": "hello"})  # b and c missing
    w.close()

    # Read through a context manager so the handle is closed deterministically
    # instead of leaking until garbage collection (ResourceWarning).
    with path.open(newline="") as fh:
        rows = list(csv.DictReader(fh))

    assert rows[0]["b"] == ""
    assert rows[0]["c"] == ""
|
|
|
|
|
|
def test_csv_writer_appends_on_second_open(tmp_path):
    """A second writer on an existing file adds its rows after the first's."""
    path = tmp_path / "out.csv"
    w = CsvWriter(path, FIELDS)
    w.write({"a": "first"})
    w.close()

    w2 = CsvWriter(path, FIELDS)
    w2.write({"a": "second"})
    w2.close()

    # Read through a context manager so the handle is closed deterministically
    # instead of leaking until garbage collection (ResourceWarning).
    with path.open(newline="") as fh:
        rows = list(csv.DictReader(fh))

    # Exactly two data rows: a header repeated by the second writer would
    # show up here as an extra row.
    assert len(rows) == 2
    assert rows[0]["a"] == "first"
    assert rows[1]["a"] == "second"
|