Files
SPRUCE-scraper/spruce/orchestrator.py
T
poprhythm e122f6435a Initial commit
Add spruce scraper with CLI, session management, parsers, progress tracking,
recheck logic, and test suite. Includes example config and README.
2026-04-22 10:41:18 -04:00

308 lines
8.9 KiB
Python

"""
High-level scrape orchestration: drives the per-machine and per-scan loops.
"""
import json
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import Any
from tqdm import tqdm
from spruce.paths import machine_dir_name, tile_dest, mosaic_dest, _extract_date
from spruce.progress import ProgressTracker, CsvWriter
from spruce.session import MachineSession
log = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Per-scan helpers
# ---------------------------------------------------------------------------
def _download_mosaic(
sess: MachineSession,
scan_meta: dict[str, Any],
scan_id: int,
mosaic_path: Path,
progress: ProgressTracker,
machine: dict[str, Any],
dry_run: bool,
) -> bool:
"""Download the scan mosaic if not already done. Returns True if downloaded."""
url = sess.mosaic_url(scan_id)
if progress.is_done(url):
return False
if dry_run:
log.info("[DRY-RUN] Mosaic: %s%s", url, mosaic_path)
return False
log.info("[%s] Downloading mosaic for scan %d …", machine["label"], scan_id)
size = sess.download_file(url, mosaic_path)
if size:
progress.mark_done(url)
progress.save()
log.info(
"[%s] Mosaic saved: %s (%.1f MB)",
machine["label"],
mosaic_path,
size / 1e6,
)
return True
return False
def _download_tiles_for_scan(
sess: MachineSession,
tiles: list[dict[str, Any]],
scan_meta: dict[str, Any],
scan_id: int,
output_dir: Path,
machine: dict[str, Any],
config: dict[str, Any],
progress: ProgressTracker,
tiles_csv: CsvWriter,
dry_run: bool,
) -> int:
"""Download all pending tiles for a scan. Returns count of tiles downloaded."""
pending = [t for t in tiles if not progress.is_done(t["url"])]
log.info(
"[%s] Scan %d: %d tiles total, %d pending.",
machine["label"],
scan_id,
len(tiles),
len(pending),
)
if dry_run:
for t in pending[:5]:
log.info("[DRY-RUN] Tile: %s", t["url"])
if len(pending) > 5:
log.info("[DRY-RUN] … and %d more tiles.", len(pending) - 5)
return 0
# Attach scan_time for CSV rows
for t in pending:
t["scan_time"] = scan_meta.get("scan_time", "")
workers: int = config["workers"]
downloaded = 0
with ThreadPoolExecutor(max_workers=workers) as pool:
futures = {
pool.submit(
sess.download_tile,
tile,
tile_dest(output_dir, machine, scan_meta, tile),
False,
): tile
for tile in pending
}
save_every = max(50, workers * 4)
batch: list[dict[str, Any]] = []
with tqdm(
total=len(pending),
desc=f"{machine['label']} scan {scan_id}",
unit="tile",
leave=True,
) as pbar:
for future in as_completed(futures):
result = future.result()
if result.get("file_size_bytes"):
batch.append(result)
progress.mark_done(result["url"])
downloaded += 1
pbar.update(1)
if len(batch) >= save_every:
for row in batch:
tiles_csv.write(row)
progress.save()
batch.clear()
for row in batch:
tiles_csv.write(row)
progress.save()
log.info(
"[%s] Scan %d complete: %d tiles downloaded.",
machine["label"],
scan_id,
downloaded,
)
return downloaded
# ---------------------------------------------------------------------------
# Per-scan driver
# ---------------------------------------------------------------------------
def process_scan(
sess: MachineSession,
scan: dict[str, Any],
output_dir: Path,
machine: dict[str, Any],
config: dict[str, Any],
progress: ProgressTracker,
scans_csv: CsvWriter,
tiles_csv: CsvWriter,
dry_run: bool,
mosaic_only: bool,
) -> int:
"""
Process one scan: fetch metadata, download mosaic and (optionally) tiles.
Returns total files downloaded for this scan.
"""
scan_id: int = scan["scan_id"]
log.info("[%s] Processing scan %d …", machine["label"], scan_id)
try:
scan_meta = sess.get_scan_metadata(scan_id)
except Exception as exc:
log.error(
"[%s] Cannot fetch scan %d metadata: %s", machine["label"], scan_id, exc
)
return 0
if not scan_meta.get("nx") or not scan_meta.get("ny"):
log.warning(
"[%s] Scan %d: missing grid params, skipping.",
machine["label"],
scan_id,
)
return 0
# Merge list-level metadata into scan_meta (detail page takes precedence)
for k in (
"name",
"scan_time",
"start_datetime",
"end_datetime",
"status",
"user",
"scan_lines",
"scan_mode",
):
scan_meta.setdefault(k, scan.get(k, ""))
# Save per-scan metadata.json
scan_date = _extract_date(scan_meta.get("scan_time", ""))
scan_dir = output_dir / machine_dir_name(machine) / scan_date / str(scan_id)
if not dry_run:
scan_dir.mkdir(parents=True, exist_ok=True)
meta_file = scan_dir / "metadata.json"
if not meta_file.exists():
meta_file.write_text(
json.dumps(scan_meta, indent=2, default=str), encoding="utf-8"
)
# Mosaic
mosaic_path = mosaic_dest(output_dir, machine, scan_meta, scan_id)
mosaic_url = sess.mosaic_url(scan_id)
mosaic_downloaded = _download_mosaic(
sess, scan_meta, scan_id, mosaic_path, progress, machine, dry_run
)
total = 1 if mosaic_downloaded else 0
# Write scan-level CSV row
scans_csv.write(
{
"machine": machine["label"],
"machine_id": machine["machine_id"],
"scan_id": scan_id,
"name": scan_meta.get("name", ""),
"scan_time": scan_meta.get("scan_time", ""),
"start_x": scan_meta.get("start_x", ""),
"start_y": scan_meta.get("start_y", ""),
"end_x": scan_meta.get("end_x", ""),
"end_y": scan_meta.get("end_y", ""),
"dx": scan_meta.get("dx", ""),
"dy": scan_meta.get("dy", ""),
"nx": scan_meta.get("nx", ""),
"ny": scan_meta.get("ny", ""),
"total_tiles": scan_meta.get("total_tiles", ""),
"scan_lines": scan_meta.get("scan_lines", ""),
"scan_mode": scan_meta.get("scan_mode", ""),
"start_datetime": scan_meta.get("start_datetime", ""),
"end_datetime": scan_meta.get("end_datetime", ""),
"status": scan_meta.get("status", ""),
"user": scan_meta.get("user", ""),
"disk_space_mb": scan_meta.get("disk_space_mb", ""),
"mosaic_url": mosaic_url,
"mosaic_local_path": str(mosaic_path),
"mosaic_downloaded": mosaic_downloaded,
}
)
if mosaic_only:
return total
# Tiles
tiles = sess.enumerate_tiles(scan_meta)
total += _download_tiles_for_scan(
sess,
tiles,
scan_meta,
scan_id,
output_dir,
machine,
config,
progress,
tiles_csv,
dry_run,
)
return total
# ---------------------------------------------------------------------------
# Per-machine driver
# ---------------------------------------------------------------------------
def scrape_machine(
machine: dict[str, Any],
config: dict[str, Any],
output_dir: Path,
progress: ProgressTracker,
tiles_csv: CsvWriter,
scans_csv: CsvWriter,
dry_run: bool,
mosaic_only: bool,
scan_id_filter: int | None,
) -> int:
"""Login, fetch scans, and download all content for one machine."""
sess = MachineSession(machine, config)
if not sess.login():
return 0
if scan_id_filter is not None:
scans: list[dict[str, Any]] = [
{"scan_id": scan_id_filter, "status": "Completed"}
]
log.info("[%s] Targeting scan ID %d.", machine["label"], scan_id_filter)
else:
scans = sess.get_all_scans()
if not scans:
log.warning("[%s] No scans found.", machine["label"])
return 0
total = 0
for scan in scans:
total += process_scan(
sess=sess,
scan=scan,
output_dir=output_dir,
machine=machine,
config=config,
progress=progress,
scans_csv=scans_csv,
tiles_csv=tiles_csv,
dry_run=dry_run,
mosaic_only=mosaic_only,
)
return total