Initial commit

Add spruce scraper with CLI, session management, parsers, progress tracking,
recheck logic, and test suite. Includes example config and README.
This commit is contained in:
2026-04-22 10:41:18 -04:00
commit e122f6435a
23 changed files with 3789 additions and 0 deletions
+82
View File
@@ -0,0 +1,82 @@
"""
Progress tracking (JSON) and CSV writing.
"""
import csv
import json
import logging
from pathlib import Path
from typing import Iterator
log = logging.getLogger(__name__)


class ProgressTracker:
    """
    Tracks successfully downloaded URLs in a JSON file so runs can be resumed.

    The file holds a single object of the form
    ``{"completed_urls": [<url>, ...]}``.

    Public API (all external code should use only these methods):
        is_done(url)   — True if url has been downloaded
        mark_done(url) — Record url as complete (call save() to persist)
        discard(url)   — Remove url from the completed set
        iter_urls()    — Iterate over all completed URLs
        __len__()      — Number of completed URLs
        save()         — Flush state to disk
    """

    def __init__(self, path: Path) -> None:
        """Create a tracker backed by *path*, loading prior state if present."""
        self.path = path
        self._done: set[str] = set()
        self._load()

    def _load(self) -> None:
        """Populate the completed set from disk.

        A missing file is not an error. An unreadable, malformed, or
        unexpectedly shaped file is logged as a warning and ignored, so the
        tracker starts with an empty set instead of aborting the run.
        """
        if not self.path.exists():
            return
        try:
            data = json.loads(self.path.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
            log.warning("Could not read progress file; starting fresh.")
            return

        urls = data.get("completed_urls", []) if isinstance(data, dict) else None
        # Reject anything that is not a list of strings; in particular a bare
        # string here would otherwise be exploded into a set of characters.
        if not isinstance(urls, list) or not all(isinstance(u, str) for u in urls):
            log.warning("Could not read progress file; starting fresh.")
            return

        self._done = set(urls)
        log.info("Resuming: %d URLs already downloaded.", len(self._done))

    def is_done(self, url: str) -> bool:
        """Return True if *url* is recorded as completed."""
        return url in self._done

    def mark_done(self, url: str) -> None:
        """Record *url* as completed (in memory only until save() is called)."""
        self._done.add(url)

    def discard(self, url: str) -> None:
        """Remove a URL from the completed set (re-queues it for download)."""
        self._done.discard(url)

    def iter_urls(self) -> Iterator[str]:
        """Iterate over all completed URLs."""
        return iter(self._done)

    def __len__(self) -> int:
        """Return the number of completed URLs."""
        return len(self._done)

    def save(self) -> None:
        """Write the completed set to disk as sorted, indented JSON.

        The payload is written to a sibling temporary file and then renamed
        over the target, so an interruption mid-write cannot leave a
        truncated progress file behind.
        """
        self.path.parent.mkdir(parents=True, exist_ok=True)
        payload = json.dumps({"completed_urls": sorted(self._done)}, indent=2)
        tmp_path = self.path.with_name(self.path.name + ".tmp")
        try:
            tmp_path.write_text(payload, encoding="utf-8")
            tmp_path.replace(self.path)
        except OSError:
            # Do not leave a stray temp file next to the real one.
            tmp_path.unlink(missing_ok=True)
            raise
class CsvWriter:
    """Append-mode CSV writer that writes a header when the file is empty.

    Rows are appended to *path* as UTF-8. The header row is emitted whenever
    the file has no content yet — both when it is newly created and when an
    empty file already exists — so the output always starts with a header.

    Can be used as a context manager; the file is closed on exit.
    """

    def __init__(self, path: Path, fields: list[str]) -> None:
        """Open *path* for appending, creating parent directories as needed.

        Args:
            path: Destination CSV file.
            fields: Column names, in output order.
        """
        path.parent.mkdir(parents=True, exist_ok=True)
        self._fields = fields
        self._fh = open(path, "a", newline="", encoding="utf-8")
        try:
            self._writer = csv.DictWriter(self._fh, fieldnames=fields)
            # Decide by size, not prior existence: an empty pre-existing
            # file still needs its header.
            if path.stat().st_size == 0:
                self._writer.writeheader()
                # Flush so the on-disk size reflects the header immediately.
                self._fh.flush()
        except Exception:
            # Do not leak the handle if setup fails part-way.
            self._fh.close()
            raise

    def __enter__(self) -> "CsvWriter":
        """Return the writer itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        """Close the underlying file; exceptions are not suppressed."""
        self.close()

    def write(self, row: dict) -> None:
        """Append one row and flush it to disk.

        Only the configured fields are written; fields missing from *row*
        are written as empty strings and extra keys are ignored.
        """
        self._writer.writerow({f: row.get(f, "") for f in self._fields})
        self._fh.flush()

    def close(self) -> None:
        """Close the underlying file handle."""
        self._fh.close()