e122f6435a
Add spruce scraper with CLI, session management, parsers, progress tracking, recheck logic, and test suite. Includes example config and README.
308 lines
8.9 KiB
Python
308 lines
8.9 KiB
Python
"""
|
|
High-level scrape orchestration: drives the per-machine and per-scan loops.
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
from tqdm import tqdm
|
|
|
|
from spruce.paths import machine_dir_name, tile_dest, mosaic_dest, _extract_date
|
|
from spruce.progress import ProgressTracker, CsvWriter
|
|
from spruce.session import MachineSession
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Per-scan helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _download_mosaic(
    sess: MachineSession,
    scan_meta: dict[str, Any],
    scan_id: int,
    mosaic_path: Path,
    progress: ProgressTracker,
    machine: dict[str, Any],
    dry_run: bool,
) -> bool:
    """Fetch the mosaic of one scan unless progress already records it as done.

    The mosaic URL (from ``sess.mosaic_url``) is used as the progress key.
    ``scan_meta`` is part of the signature but is not read by this function.

    Returns:
        True only when ``sess.download_file`` reported a non-zero size during
        this call. False when the URL was already marked done, when running
        in dry-run mode, or when the download returned a falsy size.
    """
    mosaic_url = sess.mosaic_url(scan_id)

    # Nothing to do: a previous run already fetched this mosaic.
    if progress.is_done(mosaic_url):
        return False

    # Dry-run only reports what would be fetched.
    if dry_run:
        log.info("[DRY-RUN] Mosaic: %s → %s", mosaic_url, mosaic_path)
        return False

    label = machine["label"]
    log.info("[%s] Downloading mosaic for scan %d …", label, scan_id)

    nbytes = sess.download_file(mosaic_url, mosaic_path)
    if not nbytes:
        # Falsy size is treated as "not downloaded"; progress stays untouched.
        return False

    # Persist the progress mark right away so a later crash does not lose it.
    progress.mark_done(mosaic_url)
    progress.save()
    megabytes = nbytes / 1e6
    log.info("[%s] Mosaic saved: %s (%.1f MB)", label, mosaic_path, megabytes)
    return True
|
|
|
|
|
|
def _download_tiles_for_scan(
    sess: MachineSession,
    tiles: list[dict[str, Any]],
    scan_meta: dict[str, Any],
    scan_id: int,
    output_dir: Path,
    machine: dict[str, Any],
    config: dict[str, Any],
    progress: ProgressTracker,
    tiles_csv: CsvWriter,
    dry_run: bool,
) -> int:
    """Download every tile of a scan that progress does not yet mark as done.

    Tiles are fetched concurrently on a thread pool sized by
    ``config["workers"]``. Successful results (those carrying a truthy
    ``"file_size_bytes"``) are marked done and queued; the queue is written
    to ``tiles_csv`` and progress is saved every ``max(50, workers * 4)``
    successes and once more at the end.

    A tile whose worker raises is logged and skipped so the remaining tiles
    still download; it is not marked done and will be pending on the next
    run. Whatever has completed is flushed even if this function exits via
    an exception (for example an interrupt).

    Args:
        sess: Session providing ``download_tile``.
        tiles: Tile dicts; each must have a ``"url"`` key. Pending tiles get
            a ``"scan_time"`` key added in place.
        scan_meta: Scan metadata; ``"scan_time"`` is read from it and it is
            forwarded to ``tile_dest``.
        scan_id: Scan identifier, used for logging and the progress bar.
        output_dir: Root output directory forwarded to ``tile_dest``.
        machine: Machine config; ``"label"`` is used for logging.
        config: Scraper config; ``"workers"`` sets the pool size.
        progress: Tracker used to skip finished URLs and record new ones.
        tiles_csv: Writer receiving one row per downloaded tile.
        dry_run: When true, only log (up to five) pending URLs and return 0.

    Returns:
        Number of tiles downloaded during this call.
    """
    pending = [t for t in tiles if not progress.is_done(t["url"])]
    label = machine["label"]
    log.info(
        "[%s] Scan %d: %d tiles total, %d pending.",
        label,
        scan_id,
        len(tiles),
        len(pending),
    )

    if dry_run:
        for t in pending[:5]:
            log.info("[DRY-RUN] Tile: %s", t["url"])
        if len(pending) > 5:
            log.info("[DRY-RUN] … and %d more tiles.", len(pending) - 5)
        return 0

    # Attach scan_time for CSV rows (looked up once, it is the same per scan).
    scan_time = scan_meta.get("scan_time", "")
    for t in pending:
        t["scan_time"] = scan_time

    workers: int = config["workers"]
    save_every = max(50, workers * 4)
    batch: list[dict[str, Any]] = []
    downloaded = 0

    def _flush() -> None:
        """Write queued rows to the CSV, persist progress, empty the queue."""
        for row in batch:
            tiles_csv.write(row)
        progress.save()
        batch.clear()

    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = {
            pool.submit(
                sess.download_tile,
                tile,
                tile_dest(output_dir, machine, scan_meta, tile),
                # Third positional argument of download_tile; its meaning is
                # defined by MachineSession and is not visible in this module.
                False,
            ): tile
            for tile in pending
        }

        try:
            with tqdm(
                total=len(pending),
                desc=f"{label} scan {scan_id}",
                unit="tile",
                leave=True,
            ) as pbar:
                for future in as_completed(futures):
                    try:
                        # result() re-raises whatever the worker raised.
                        result = future.result()
                    except Exception as exc:
                        # One bad tile must not abort the rest of the scan.
                        log.error(
                            "[%s] Scan %d: tile %s failed: %s",
                            label,
                            scan_id,
                            futures[future]["url"],
                            exc,
                        )
                    else:
                        if result.get("file_size_bytes"):
                            batch.append(result)
                            progress.mark_done(result["url"])
                            downloaded += 1
                    # The bar counts attempted tiles, successful or not.
                    pbar.update(1)

                    if len(batch) >= save_every:
                        _flush()
        finally:
            # Always record what finished, including on abnormal exit.
            _flush()

    log.info(
        "[%s] Scan %d complete: %d tiles downloaded.",
        label,
        scan_id,
        downloaded,
    )
    return downloaded
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Per-scan driver
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def process_scan(
    sess: MachineSession,
    scan: dict[str, Any],
    output_dir: Path,
    machine: dict[str, Any],
    config: dict[str, Any],
    progress: ProgressTracker,
    scans_csv: CsvWriter,
    tiles_csv: CsvWriter,
    dry_run: bool,
    mosaic_only: bool,
) -> int:
    """
    Handle a single scan end to end.

    Steps: fetch the scan's detail metadata, fill gaps from the list-level
    ``scan`` entry, write ``metadata.json`` (skipped in dry-run or when the
    file already exists), download the mosaic, append one row to
    ``scans_csv``, and — unless ``mosaic_only`` — download the tiles.

    Returns the number of files downloaded for this scan; 0 when the
    metadata cannot be fetched or lacks ``nx``/``ny``.
    """
    scan_id: int = scan["scan_id"]
    log.info("[%s] Processing scan %d …", machine["label"], scan_id)

    try:
        scan_meta = sess.get_scan_metadata(scan_id)
    except Exception as exc:
        log.error(
            "[%s] Cannot fetch scan %d metadata: %s", machine["label"], scan_id, exc
        )
        return 0

    # Both grid dimensions must be present and truthy to go on.
    has_grid = scan_meta.get("nx") and scan_meta.get("ny")
    if not has_grid:
        log.warning(
            "[%s] Scan %d: missing grid params, skipping.",
            machine["label"],
            scan_id,
        )
        return 0

    # Fill in list-level fields only where the detail metadata has no entry.
    list_level_keys = (
        "name",
        "scan_time",
        "start_datetime",
        "end_datetime",
        "status",
        "user",
        "scan_lines",
        "scan_mode",
    )
    for key in list_level_keys:
        if key not in scan_meta:
            scan_meta[key] = scan.get(key, "")

    # Per-scan metadata.json lives in <output>/<machine>/<date>/<scan_id>/.
    day = _extract_date(scan_meta.get("scan_time", ""))
    scan_dir = output_dir / machine_dir_name(machine) / day / str(scan_id)
    if not dry_run:
        scan_dir.mkdir(parents=True, exist_ok=True)
        meta_file = scan_dir / "metadata.json"
        if not meta_file.exists():
            serialized = json.dumps(scan_meta, indent=2, default=str)
            meta_file.write_text(serialized, encoding="utf-8")

    # Mosaic
    mosaic_path = mosaic_dest(output_dir, machine, scan_meta, scan_id)
    mosaic_url = sess.mosaic_url(scan_id)
    mosaic_downloaded = _download_mosaic(
        sess=sess,
        scan_meta=scan_meta,
        scan_id=scan_id,
        mosaic_path=mosaic_path,
        progress=progress,
        machine=machine,
        dry_run=dry_run,
    )
    files_fetched = int(bool(mosaic_downloaded))

    # Scan-level CSV row; insertion order matches the column order.
    passthrough_keys = (
        "name",
        "scan_time",
        "start_x",
        "start_y",
        "end_x",
        "end_y",
        "dx",
        "dy",
        "nx",
        "ny",
        "total_tiles",
        "scan_lines",
        "scan_mode",
        "start_datetime",
        "end_datetime",
        "status",
        "user",
        "disk_space_mb",
    )
    row: dict[str, Any] = {
        "machine": machine["label"],
        "machine_id": machine["machine_id"],
        "scan_id": scan_id,
    }
    for key in passthrough_keys:
        row[key] = scan_meta.get(key, "")
    row["mosaic_url"] = mosaic_url
    row["mosaic_local_path"] = str(mosaic_path)
    row["mosaic_downloaded"] = mosaic_downloaded
    scans_csv.write(row)

    if mosaic_only:
        return files_fetched

    # Tiles
    tiles = sess.enumerate_tiles(scan_meta)
    tiles_fetched = _download_tiles_for_scan(
        sess=sess,
        tiles=tiles,
        scan_meta=scan_meta,
        scan_id=scan_id,
        output_dir=output_dir,
        machine=machine,
        config=config,
        progress=progress,
        tiles_csv=tiles_csv,
        dry_run=dry_run,
    )
    return files_fetched + tiles_fetched
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Per-machine driver
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def scrape_machine(
    machine: dict[str, Any],
    config: dict[str, Any],
    output_dir: Path,
    progress: ProgressTracker,
    tiles_csv: CsvWriter,
    scans_csv: CsvWriter,
    dry_run: bool,
    mosaic_only: bool,
    scan_id_filter: int | None,
) -> int:
    """Scrape one machine: log in, pick the scans, and process each in turn.

    When ``scan_id_filter`` is given, only that scan ID is processed and the
    machine's scan list is not fetched. Returns the total number of files
    downloaded; 0 when login fails or the machine reports no scans.
    """
    sess = MachineSession(machine, config)
    if not sess.login():
        return 0

    scans: list[dict[str, Any]]
    if scan_id_filter is None:
        scans = sess.get_all_scans()
        if not scans:
            log.warning("[%s] No scans found.", machine["label"])
            return 0
    else:
        # Synthesize a minimal list entry for the single requested scan.
        scans = [{"scan_id": scan_id_filter, "status": "Completed"}]
        log.info("[%s] Targeting scan ID %d.", machine["label"], scan_id_filter)

    return sum(
        process_scan(
            sess,
            scan,
            output_dir,
            machine,
            config,
            progress,
            scans_csv,
            tiles_csv,
            dry_run,
            mosaic_only,
        )
        for scan in scans
    )
|