Initial commit

Add spruce scraper with CLI, session management, parsers, progress tracking, recheck logic, and test suite. Includes example config and README.
2026-04-22 10:41:18 -04:00
commit e122f6435a
23 changed files with 3789 additions and 0 deletions
@@ -0,0 +1,82 @@
+"""
+Progress tracking (JSON) and CSV writing.
+"""
+
+import csv
+import json
+import logging
+from pathlib import Path
+from typing import Iterator
+
+# Module-level logger, named after this module via __name__.
+log = logging.getLogger(__name__)
+
+
+class ProgressTracker:
+    """
+    Tracks successfully downloaded URLs in a JSON file so runs can be resumed.
+
+    The file holds one JSON object of the form ``{"completed_urls": [...]}``.
+    Changes are kept in memory until save() is called.
+
+    Public API (all external code should use only these methods):
+      is_done(url)    — True if url has been downloaded
+      mark_done(url)  — Record url as complete (call save() to persist)
+      discard(url)    — Remove url from the completed set
+      iter_urls()     — Iterate over all completed URLs
+      __len__()       — Number of completed URLs
+      save()          — Flush state to disk
+    """
+
+    def __init__(self, path: Path) -> None:
+        """Bind the tracker to *path* and load any state already saved there."""
+        self.path = path
+        self._done: set[str] = set()
+        self._load()
+
+    def _load(self) -> None:
+        """Fill the completed set from the progress file, if one exists.
+
+        A missing file leaves the set empty. A file that cannot be read,
+        is not valid JSON, or does not have the expected shape is reported
+        with a warning and likewise leaves the set empty.
+        """
+        if not self.path.exists():
+            return
+        try:
+            data = json.loads(self.path.read_text(encoding="utf-8"))
+            if not isinstance(data, dict):
+                raise ValueError("top-level JSON value is not an object")
+            urls = data.get("completed_urls", [])
+            # Reject non-lists explicitly: set("abc") would otherwise
+            # silently yield a set of single characters.
+            if not isinstance(urls, list):
+                raise ValueError('"completed_urls" is not a list')
+            loaded = set(urls)  # TypeError if an entry is unhashable
+        except (OSError, ValueError, TypeError) as exc:
+            # json.JSONDecodeError and UnicodeDecodeError are ValueErrors.
+            log.warning("Could not read progress file; starting fresh.")
+            log.debug("Progress file %s rejected: %s", self.path, exc)
+        else:
+            self._done = loaded
+            log.info("Resuming: %d URLs already downloaded.", len(self._done))
+
+    def is_done(self, url: str) -> bool:
+        """Return True if *url* is in the completed set."""
+        return url in self._done
+
+    def mark_done(self, url: str) -> None:
+        """Add *url* to the completed set (in memory only until save())."""
+        self._done.add(url)
+
+    def discard(self, url: str) -> None:
+        """Remove a URL from the completed set (re-queues it for download)."""
+        self._done.discard(url)
+
+    def iter_urls(self) -> Iterator[str]:
+        """Iterate over all completed URLs."""
+        return iter(self._done)
+
+    def __len__(self) -> int:
+        """Return the number of completed URLs."""
+        return len(self._done)
+
+    def save(self) -> None:
+        """Write the completed set to disk as sorted, indented JSON.
+
+        The data is written to a sibling ``<name>.tmp`` file first and then
+        moved over the real path, so an interruption during the write does
+        not leave a truncated progress file behind.
+        """
+        self.path.parent.mkdir(parents=True, exist_ok=True)
+        payload = json.dumps({"completed_urls": sorted(self._done)}, indent=2)
+        tmp_path = self.path.with_name(self.path.name + ".tmp")
+        tmp_path.write_text(payload, encoding="utf-8")
+        tmp_path.replace(self.path)
+
+
+class CsvWriter:
+    """Append-mode CSV writer that writes a header when the file starts empty.
+
+    Rows are flushed to disk after every write. The writer can be used as a
+    context manager, in which case the file is closed on exit.
+    """
+
+    def __init__(self, path: Path, fields: list[str]) -> None:
+        """Open *path* for appending, creating parent directories as needed.
+
+        Args:
+            path: Destination CSV file.
+            fields: Column names, in output order.
+
+        Raises:
+            OSError: If the directory or file cannot be created or opened.
+        """
+        path.parent.mkdir(parents=True, exist_ok=True)
+        # Decide by size rather than existence: a zero-byte file (for
+        # example one left by an interrupted run) still needs a header.
+        try:
+            needs_header = path.stat().st_size == 0
+        except FileNotFoundError:
+            needs_header = True
+        # Copy so later mutation of the caller's list cannot change columns.
+        self._fields = list(fields)
+        self._fh = open(path, "a", newline="", encoding="utf-8")
+        try:
+            self._writer = csv.DictWriter(self._fh, fieldnames=self._fields)
+            if needs_header:
+                self._writer.writeheader()
+        except Exception:
+            # Do not leak the handle if construction fails part-way.
+            self._fh.close()
+            raise
+
+    def __enter__(self) -> "CsvWriter":
+        """Return the writer itself for use in a ``with`` statement."""
+        return self
+
+    def __exit__(self, exc_type, exc, tb) -> None:
+        """Close the underlying file; exceptions are not suppressed."""
+        self.close()
+
+    def write(self, row: dict) -> None:
+        """Append one row and flush it to disk.
+
+        Keys of *row* that are not configured fields are ignored; fields
+        missing from *row* are written as empty strings.
+        """
+        self._writer.writerow({f: row.get(f, "") for f in self._fields})
+        self._fh.flush()
+
+    def close(self) -> None:
+        """Close the underlying file."""
+        self._fh.close()