Initial commit
Add spruce scraper with CLI, session management, parsers, progress tracking, recheck logic, and test suite. Includes example config and README.
This commit is contained in:
@@ -0,0 +1,307 @@
|
||||
"""
|
||||
High-level scrape orchestration: drives the per-machine and per-scan loops.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
from spruce.paths import machine_dir_name, tile_dest, mosaic_dest, _extract_date
|
||||
from spruce.progress import ProgressTracker, CsvWriter
|
||||
from spruce.session import MachineSession
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Per-scan helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _download_mosaic(
|
||||
sess: MachineSession,
|
||||
scan_meta: dict[str, Any],
|
||||
scan_id: int,
|
||||
mosaic_path: Path,
|
||||
progress: ProgressTracker,
|
||||
machine: dict[str, Any],
|
||||
dry_run: bool,
|
||||
) -> bool:
|
||||
"""Download the scan mosaic if not already done. Returns True if downloaded."""
|
||||
url = sess.mosaic_url(scan_id)
|
||||
if progress.is_done(url):
|
||||
return False
|
||||
if dry_run:
|
||||
log.info("[DRY-RUN] Mosaic: %s → %s", url, mosaic_path)
|
||||
return False
|
||||
log.info("[%s] Downloading mosaic for scan %d …", machine["label"], scan_id)
|
||||
size = sess.download_file(url, mosaic_path)
|
||||
if size:
|
||||
progress.mark_done(url)
|
||||
progress.save()
|
||||
log.info(
|
||||
"[%s] Mosaic saved: %s (%.1f MB)",
|
||||
machine["label"],
|
||||
mosaic_path,
|
||||
size / 1e6,
|
||||
)
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _download_tiles_for_scan(
|
||||
sess: MachineSession,
|
||||
tiles: list[dict[str, Any]],
|
||||
scan_meta: dict[str, Any],
|
||||
scan_id: int,
|
||||
output_dir: Path,
|
||||
machine: dict[str, Any],
|
||||
config: dict[str, Any],
|
||||
progress: ProgressTracker,
|
||||
tiles_csv: CsvWriter,
|
||||
dry_run: bool,
|
||||
) -> int:
|
||||
"""Download all pending tiles for a scan. Returns count of tiles downloaded."""
|
||||
pending = [t for t in tiles if not progress.is_done(t["url"])]
|
||||
log.info(
|
||||
"[%s] Scan %d: %d tiles total, %d pending.",
|
||||
machine["label"],
|
||||
scan_id,
|
||||
len(tiles),
|
||||
len(pending),
|
||||
)
|
||||
|
||||
if dry_run:
|
||||
for t in pending[:5]:
|
||||
log.info("[DRY-RUN] Tile: %s", t["url"])
|
||||
if len(pending) > 5:
|
||||
log.info("[DRY-RUN] … and %d more tiles.", len(pending) - 5)
|
||||
return 0
|
||||
|
||||
# Attach scan_time for CSV rows
|
||||
for t in pending:
|
||||
t["scan_time"] = scan_meta.get("scan_time", "")
|
||||
|
||||
workers: int = config["workers"]
|
||||
downloaded = 0
|
||||
|
||||
with ThreadPoolExecutor(max_workers=workers) as pool:
|
||||
futures = {
|
||||
pool.submit(
|
||||
sess.download_tile,
|
||||
tile,
|
||||
tile_dest(output_dir, machine, scan_meta, tile),
|
||||
False,
|
||||
): tile
|
||||
for tile in pending
|
||||
}
|
||||
|
||||
save_every = max(50, workers * 4)
|
||||
batch: list[dict[str, Any]] = []
|
||||
|
||||
with tqdm(
|
||||
total=len(pending),
|
||||
desc=f"{machine['label']} scan {scan_id}",
|
||||
unit="tile",
|
||||
leave=True,
|
||||
) as pbar:
|
||||
for future in as_completed(futures):
|
||||
result = future.result()
|
||||
if result.get("file_size_bytes"):
|
||||
batch.append(result)
|
||||
progress.mark_done(result["url"])
|
||||
downloaded += 1
|
||||
pbar.update(1)
|
||||
|
||||
if len(batch) >= save_every:
|
||||
for row in batch:
|
||||
tiles_csv.write(row)
|
||||
progress.save()
|
||||
batch.clear()
|
||||
|
||||
for row in batch:
|
||||
tiles_csv.write(row)
|
||||
progress.save()
|
||||
|
||||
log.info(
|
||||
"[%s] Scan %d complete: %d tiles downloaded.",
|
||||
machine["label"],
|
||||
scan_id,
|
||||
downloaded,
|
||||
)
|
||||
return downloaded
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Per-scan driver
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def process_scan(
|
||||
sess: MachineSession,
|
||||
scan: dict[str, Any],
|
||||
output_dir: Path,
|
||||
machine: dict[str, Any],
|
||||
config: dict[str, Any],
|
||||
progress: ProgressTracker,
|
||||
scans_csv: CsvWriter,
|
||||
tiles_csv: CsvWriter,
|
||||
dry_run: bool,
|
||||
mosaic_only: bool,
|
||||
) -> int:
|
||||
"""
|
||||
Process one scan: fetch metadata, download mosaic and (optionally) tiles.
|
||||
Returns total files downloaded for this scan.
|
||||
"""
|
||||
scan_id: int = scan["scan_id"]
|
||||
log.info("[%s] Processing scan %d …", machine["label"], scan_id)
|
||||
|
||||
try:
|
||||
scan_meta = sess.get_scan_metadata(scan_id)
|
||||
except Exception as exc:
|
||||
log.error(
|
||||
"[%s] Cannot fetch scan %d metadata: %s", machine["label"], scan_id, exc
|
||||
)
|
||||
return 0
|
||||
|
||||
if not scan_meta.get("nx") or not scan_meta.get("ny"):
|
||||
log.warning(
|
||||
"[%s] Scan %d: missing grid params, skipping.",
|
||||
machine["label"],
|
||||
scan_id,
|
||||
)
|
||||
return 0
|
||||
|
||||
# Merge list-level metadata into scan_meta (detail page takes precedence)
|
||||
for k in (
|
||||
"name",
|
||||
"scan_time",
|
||||
"start_datetime",
|
||||
"end_datetime",
|
||||
"status",
|
||||
"user",
|
||||
"scan_lines",
|
||||
"scan_mode",
|
||||
):
|
||||
scan_meta.setdefault(k, scan.get(k, ""))
|
||||
|
||||
# Save per-scan metadata.json
|
||||
scan_date = _extract_date(scan_meta.get("scan_time", ""))
|
||||
scan_dir = output_dir / machine_dir_name(machine) / scan_date / str(scan_id)
|
||||
if not dry_run:
|
||||
scan_dir.mkdir(parents=True, exist_ok=True)
|
||||
meta_file = scan_dir / "metadata.json"
|
||||
if not meta_file.exists():
|
||||
meta_file.write_text(
|
||||
json.dumps(scan_meta, indent=2, default=str), encoding="utf-8"
|
||||
)
|
||||
|
||||
# Mosaic
|
||||
mosaic_path = mosaic_dest(output_dir, machine, scan_meta, scan_id)
|
||||
mosaic_url = sess.mosaic_url(scan_id)
|
||||
mosaic_downloaded = _download_mosaic(
|
||||
sess, scan_meta, scan_id, mosaic_path, progress, machine, dry_run
|
||||
)
|
||||
total = 1 if mosaic_downloaded else 0
|
||||
|
||||
# Write scan-level CSV row
|
||||
scans_csv.write(
|
||||
{
|
||||
"machine": machine["label"],
|
||||
"machine_id": machine["machine_id"],
|
||||
"scan_id": scan_id,
|
||||
"name": scan_meta.get("name", ""),
|
||||
"scan_time": scan_meta.get("scan_time", ""),
|
||||
"start_x": scan_meta.get("start_x", ""),
|
||||
"start_y": scan_meta.get("start_y", ""),
|
||||
"end_x": scan_meta.get("end_x", ""),
|
||||
"end_y": scan_meta.get("end_y", ""),
|
||||
"dx": scan_meta.get("dx", ""),
|
||||
"dy": scan_meta.get("dy", ""),
|
||||
"nx": scan_meta.get("nx", ""),
|
||||
"ny": scan_meta.get("ny", ""),
|
||||
"total_tiles": scan_meta.get("total_tiles", ""),
|
||||
"scan_lines": scan_meta.get("scan_lines", ""),
|
||||
"scan_mode": scan_meta.get("scan_mode", ""),
|
||||
"start_datetime": scan_meta.get("start_datetime", ""),
|
||||
"end_datetime": scan_meta.get("end_datetime", ""),
|
||||
"status": scan_meta.get("status", ""),
|
||||
"user": scan_meta.get("user", ""),
|
||||
"disk_space_mb": scan_meta.get("disk_space_mb", ""),
|
||||
"mosaic_url": mosaic_url,
|
||||
"mosaic_local_path": str(mosaic_path),
|
||||
"mosaic_downloaded": mosaic_downloaded,
|
||||
}
|
||||
)
|
||||
|
||||
if mosaic_only:
|
||||
return total
|
||||
|
||||
# Tiles
|
||||
tiles = sess.enumerate_tiles(scan_meta)
|
||||
total += _download_tiles_for_scan(
|
||||
sess,
|
||||
tiles,
|
||||
scan_meta,
|
||||
scan_id,
|
||||
output_dir,
|
||||
machine,
|
||||
config,
|
||||
progress,
|
||||
tiles_csv,
|
||||
dry_run,
|
||||
)
|
||||
return total
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Per-machine driver
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def scrape_machine(
|
||||
machine: dict[str, Any],
|
||||
config: dict[str, Any],
|
||||
output_dir: Path,
|
||||
progress: ProgressTracker,
|
||||
tiles_csv: CsvWriter,
|
||||
scans_csv: CsvWriter,
|
||||
dry_run: bool,
|
||||
mosaic_only: bool,
|
||||
scan_id_filter: int | None,
|
||||
) -> int:
|
||||
"""Login, fetch scans, and download all content for one machine."""
|
||||
sess = MachineSession(machine, config)
|
||||
if not sess.login():
|
||||
return 0
|
||||
|
||||
if scan_id_filter is not None:
|
||||
scans: list[dict[str, Any]] = [
|
||||
{"scan_id": scan_id_filter, "status": "Completed"}
|
||||
]
|
||||
log.info("[%s] Targeting scan ID %d.", machine["label"], scan_id_filter)
|
||||
else:
|
||||
scans = sess.get_all_scans()
|
||||
if not scans:
|
||||
log.warning("[%s] No scans found.", machine["label"])
|
||||
return 0
|
||||
|
||||
total = 0
|
||||
for scan in scans:
|
||||
total += process_scan(
|
||||
sess=sess,
|
||||
scan=scan,
|
||||
output_dir=output_dir,
|
||||
machine=machine,
|
||||
config=config,
|
||||
progress=progress,
|
||||
scans_csv=scans_csv,
|
||||
tiles_csv=tiles_csv,
|
||||
dry_run=dry_run,
|
||||
mosaic_only=mosaic_only,
|
||||
)
|
||||
return total
|
||||
Reference in New Issue
Block a user