e122f6435a
Add spruce scraper with CLI, session management, parsers, progress tracking, recheck logic, and test suite. Includes example config and README.
83 lines
2.5 KiB
Python
83 lines
2.5 KiB
Python
"""
|
|
Progress tracking (JSON) and CSV writing.
|
|
"""
|
|
|
|
import csv
|
|
import json
|
|
import logging
|
|
from pathlib import Path
|
|
from typing import Iterator
|
|
|
|
# Module-level logger, named after this module via __name__.
log = logging.getLogger(__name__)
|
|
|
|
|
|
class ProgressTracker:
    """Track successfully downloaded URLs in a JSON file so runs can resume.

    The on-disk format is a JSON object with a single ``"completed_urls"``
    key holding a sorted list of URL strings. State is read once when the
    tracker is constructed; later changes live in memory until ``save()``
    is called.

    Public API (all external code should use only these methods):
        is_done(url)   -- True if url has been downloaded
        mark_done(url) -- Record url as complete (call save() to persist)
        discard(url)   -- Remove url from the completed set
        iter_urls()    -- Iterate over all completed URLs
        __len__()      -- Number of completed URLs
        save()         -- Flush state to disk
    """

    def __init__(self, path: Path) -> None:
        """Create a tracker backed by *path* and load any existing state."""
        self.path = path
        self._done: set[str] = set()
        self._load()

    def _load(self) -> None:
        """Populate the completed set from ``self.path`` if the file exists.

        A file that cannot be read, is not valid JSON, or does not have the
        expected shape is logged as a warning and leaves the tracker empty,
        so a damaged progress file never aborts a run.
        """
        if not self.path.exists():
            return
        try:
            # save() emits ASCII-only JSON (json.dumps escapes non-ASCII),
            # so decoding as UTF-8 also reads files written with another
            # platform default encoding.
            data = json.loads(self.path.read_text(encoding="utf-8"))
            if not isinstance(data, dict):
                raise TypeError("progress file must contain a JSON object")
            urls = data.get("completed_urls", [])
            # A bare string would otherwise be split into characters.
            if not isinstance(urls, list):
                raise TypeError('"completed_urls" must be a JSON list')
        except (OSError, ValueError, TypeError):
            # ValueError covers JSONDecodeError and UnicodeDecodeError.
            log.warning(
                "Could not read progress file; starting fresh.", exc_info=True
            )
            return
        # Drop non-string entries: they can never match a URL and would make
        # sorted() in save() fail on mixed types.
        self._done = {url for url in urls if isinstance(url, str)}
        log.info("Resuming: %d URLs already downloaded.", len(self._done))

    def is_done(self, url: str) -> bool:
        """Return True if *url* is recorded as downloaded."""
        return url in self._done

    def mark_done(self, url: str) -> None:
        """Record *url* as complete (in memory only; call save() to persist)."""
        self._done.add(url)

    def discard(self, url: str) -> None:
        """Remove a URL from the completed set (re-queues it for download)."""
        self._done.discard(url)

    def iter_urls(self) -> Iterator[str]:
        """Iterate over all completed URLs."""
        return iter(self._done)

    def __len__(self) -> int:
        """Return the number of completed URLs."""
        return len(self._done)

    def save(self) -> None:
        """Write the completed set to ``self.path`` as sorted, indented JSON.

        The data is first written to a sibling ``<name>.tmp`` file and then
        moved over the real file, so an interrupted save cannot leave a
        truncated progress file behind.

        Raises:
            OSError: if the directory or file cannot be written.
        """
        self.path.parent.mkdir(parents=True, exist_ok=True)
        payload = json.dumps({"completed_urls": sorted(self._done)}, indent=2)
        tmp_path = self.path.with_name(self.path.name + ".tmp")
        try:
            tmp_path.write_text(payload, encoding="utf-8")
            # Same-directory rename: atomic replacement of the old file.
            tmp_path.replace(self.path)
        finally:
            # No-op after a successful replace; cleans up after a failure.
            tmp_path.unlink(missing_ok=True)
|
|
|
|
|
|
class CsvWriter:
    """Append-mode CSV writer that emits a header when the file is empty.

    Each row is flushed as soon as it is written. The writer owns an open
    file handle: call ``close()`` when done, or use the instance as a
    context manager.
    """

    def __init__(self, path: Path, fields: list[str]) -> None:
        """Open *path* for appending, creating parent directories as needed.

        Args:
            path: Destination CSV file.
            fields: Column names, in output order.

        Raises:
            OSError: if the file cannot be created or opened.
        """
        path.parent.mkdir(parents=True, exist_ok=True)
        self._fields = fields
        self._fh = open(path, "a", newline="", encoding="utf-8")
        try:
            self._writer = csv.DictWriter(self._fh, fieldnames=fields)
            # Decide by size, not existence: an existing but empty file
            # (e.g. left by a run that died before its first flush) still
            # needs a header.
            if path.stat().st_size == 0:
                self._writer.writeheader()
                # Flush so the header survives a crash before the first row.
                self._fh.flush()
        except BaseException:
            # Do not leak the handle if setup fails after open().
            self._fh.close()
            raise

    def __enter__(self) -> "CsvWriter":
        """Return the writer itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        """Close the underlying file when leaving the ``with`` block."""
        self.close()

    def write(self, row: dict) -> None:
        """Append one row and flush it to disk.

        Keys not listed in the configured fields are ignored; missing
        fields are written as empty strings.
        """
        self._writer.writerow({f: row.get(f, "") for f in self._fields})
        self._fh.flush()

    def close(self) -> None:
        """Close the underlying file handle."""
        self._fh.close()
|