Files
SPRUCE-scraper/spruce/download_result.py
T

64 lines
1.5 KiB
Python

"""
Structured HTTP download result and error classification.
"""
from __future__ import annotations
from dataclasses import dataclass
import requests
# Public constants for the ``error_class`` value (stored on DownloadResult and
# returned by classify_http_error).
PERMANENT_MISSING = "permanent_missing"  # HTTP 404 / 410
TRANSIENT = "transient"  # HTTP 5xx, timeouts and transport-level failures
UNKNOWN = "unknown"  # fallback when no other rule matches
OK = ""  # success (no error class)
@dataclass(frozen=True)
class DownloadResult:
    """Result of a streaming download (after all retries if applicable).

    Instances are immutable (``frozen=True``).
    """

    # Size of what was downloaded; ``ok`` requires it to be strictly positive.
    # NOTE(review): presumably a byte count -- confirm against the caller.
    size: int
    # HTTP status code, or None when no status is available.
    status_code: int | None
    # Error description, or None when no error was recorded.
    error: str | None
    # Error classification string.
    # NOTE(review): presumably one of the module-level error_class constants.
    error_class: str

    @property
    def ok(self) -> bool:
        """Return True only if ``size`` is positive and ``error`` is None."""
        return self.size > 0 and self.error is None
def classify_http_error(
    status_code: int | None, exc: BaseException | None
) -> str:
    """
    Map an HTTP status code and/or exception to an error class constant.

    404/410 => likely gone forever (PERMANENT_MISSING).
    5xx and transport/timeouts => retry may help (TRANSIENT).
    Anything else => UNKNOWN.

    The status code is examined first; ``exc`` is only consulted when the
    status code does not decide the outcome.
    """
    if status_code in (404, 410):
        return PERMANENT_MISSING
    if status_code is not None and 500 <= status_code < 600:
        return TRANSIENT
    if exc is None:
        return UNKNOWN

    # Transport-level failures raised by requests. ``Timeout`` already covers
    # its ConnectTimeout and ReadTimeout subclasses.
    transport_errors = (
        requests.exceptions.Timeout,
        requests.exceptions.ConnectionError,
        requests.exceptions.ChunkedEncodingError,
    )
    if isinstance(exc, transport_errors):
        return TRANSIENT

    # RequestException derives from IOError/OSError, so this check must come
    # before the generic OSError test below; otherwise non-transport errors
    # (HTTPError for a 4xx, InvalidURL, TooManyRedirects, ...) would all be
    # reported as transient.
    if isinstance(exc, requests.exceptions.RequestException):
        return UNKNOWN

    # Plain OS-level I/O errors (socket errors and the like).
    if isinstance(exc, OSError):
        return TRANSIENT
    return UNKNOWN
def error_code_str(status_code: int | None) -> str:
    """Return *status_code* as a string, or ``""`` when it is None."""
    return "" if status_code is None else str(status_code)