"""
High-level scrape orchestration: drives the per-machine and per-scan loops.
"""

import json
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import Any

from tqdm import tqdm

from spruce.paths import machine_dir_name, tile_dest, mosaic_dest, _extract_date
from spruce.progress import ProgressTracker, CsvWriter
from spruce.session import MachineSession

log = logging.getLogger(__name__)

# Keys copied from the scan-list entry into the detail metadata whenever the
# detail metadata does not already define them.
_LIST_LEVEL_KEYS = (
    "name",
    "scan_time",
    "start_datetime",
    "end_datetime",
    "status",
    "user",
    "scan_lines",
    "scan_mode",
)

# scan_meta fields written verbatim to the scans CSV, in column order
# (missing fields are written as "").
_SCAN_CSV_META_FIELDS = (
    "name",
    "scan_time",
    "start_x",
    "start_y",
    "end_x",
    "end_y",
    "dx",
    "dy",
    "nx",
    "ny",
    "total_tiles",
    "scan_lines",
    "scan_mode",
    "start_datetime",
    "end_datetime",
    "status",
    "user",
    "disk_space_mb",
)


# ---------------------------------------------------------------------------
# Per-scan helpers
# ---------------------------------------------------------------------------


def _download_mosaic(
    sess: MachineSession,
    scan_meta: dict[str, Any],
    scan_id: int,
    mosaic_path: Path,
    progress: ProgressTracker,
    machine: dict[str, Any],
    dry_run: bool,
) -> bool:
    """Download the scan mosaic if not already done.

    Args:
        sess: Session used to build the mosaic URL and fetch the file.
        scan_meta: Scan metadata; currently unused, kept for interface
            compatibility.
        scan_id: Identifier of the scan whose mosaic is wanted.
        mosaic_path: Destination path passed to ``sess.download_file``.
        progress: Tracker consulted and updated using the mosaic URL as key.
        machine: Machine description; ``machine["label"]`` is used in logs.
        dry_run: When true, only log what would be downloaded.

    Returns:
        True if the mosaic was downloaded by this call; False if it was
        already marked done, if this is a dry run, or if
        ``sess.download_file`` returned a falsy size.
    """
    url = sess.mosaic_url(scan_id)
    if progress.is_done(url):
        return False

    if dry_run:
        log.info("[DRY-RUN] Mosaic: %s → %s", url, mosaic_path)
        return False

    log.info("[%s] Downloading mosaic for scan %d …", machine["label"], scan_id)
    size = sess.download_file(url, mosaic_path)
    if not size:
        # A falsy size is treated as "nothing downloaded"; the URL stays
        # unmarked so a later run will try again.
        return False

    progress.mark_done(url)
    progress.save()
    log.info(
        "[%s] Mosaic saved: %s (%.1f MB)",
        machine["label"],
        mosaic_path,
        size / 1e6,
    )
    return True


def _download_tiles_for_scan(
    sess: MachineSession,
    tiles: list[dict[str, Any]],
    scan_meta: dict[str, Any],
    scan_id: int,
    output_dir: Path,
    machine: dict[str, Any],
    config: dict[str, Any],
    progress: ProgressTracker,
    tiles_csv: CsvWriter,
    dry_run: bool,
) -> int:
    """Download all pending tiles for a scan.

    Tiles whose ``"url"`` is already marked done in ``progress`` are skipped.
    The rest are fetched concurrently with ``config["workers"]`` threads.
    CSV rows and the progress file are written in batches, and once more
    when the loop ends -- including when it ends because of an exception --
    so that tiles marked done in ``progress`` always get their CSV row.

    A tile whose download raises is logged and skipped; it is not marked
    done, so a later run will retry it.

    Args:
        sess: Session providing ``download_tile``.
        tiles: Tile descriptions; each must have a ``"url"`` key. The
            dictionaries are not modified.
        scan_meta: Scan metadata; ``"scan_time"`` is attached to each tile
            handed to ``download_tile`` and the dict is passed to
            ``tile_dest``.
        scan_id: Scan identifier, used for logging and the progress bar.
        output_dir: Root output directory passed to ``tile_dest``.
        machine: Machine description; ``machine["label"]`` is used in logs.
        config: Configuration; ``config["workers"]`` sets the pool size.
        progress: Tracker consulted and updated using tile URLs as keys.
        tiles_csv: Writer receiving one row per downloaded tile.
        dry_run: When true, only log (up to five of) the pending tile URLs.

    Returns:
        Count of tiles downloaded by this call (0 for a dry run).
    """
    pending = [t for t in tiles if not progress.is_done(t["url"])]
    log.info(
        "[%s] Scan %d: %d tiles total, %d pending.",
        machine["label"],
        scan_id,
        len(tiles),
        len(pending),
    )

    if dry_run:
        for t in pending[:5]:
            log.info("[DRY-RUN] Tile: %s", t["url"])
        if len(pending) > 5:
            log.info("[DRY-RUN] … and %d more tiles.", len(pending) - 5)
        return 0

    # Attach scan_time for CSV rows. Work on shallow copies so the caller's
    # tile dictionaries are left untouched.
    scan_time = scan_meta.get("scan_time", "")
    pending = [{**t, "scan_time": scan_time} for t in pending]

    workers: int = config["workers"]
    save_every = max(50, workers * 4)
    downloaded = 0
    batch: list[dict[str, Any]] = []

    def _flush() -> None:
        """Write buffered rows to the tiles CSV and persist progress."""
        for row in batch:
            tiles_csv.write(row)
        progress.save()
        batch.clear()

    with ThreadPoolExecutor(max_workers=workers) as pool:
        # NOTE(review): the trailing positional ``False`` is forwarded to
        # MachineSession.download_tile as-is; its meaning is defined there.
        futures = {
            pool.submit(
                sess.download_tile,
                tile,
                tile_dest(output_dir, machine, scan_meta, tile),
                False,
            ): tile
            for tile in pending
        }

        try:
            with tqdm(
                total=len(pending),
                desc=f"{machine['label']} scan {scan_id}",
                unit="tile",
                leave=True,
            ) as pbar:
                for future in as_completed(futures):
                    try:
                        result = future.result()
                    except Exception as exc:
                        # One bad tile must not abort the scan or discard the
                        # results of the other in-flight downloads.
                        log.error(
                            "[%s] Scan %d: tile download failed (%s): %s",
                            machine["label"],
                            scan_id,
                            futures[future]["url"],
                            exc,
                        )
                        pbar.update(1)
                        continue

                    # Only results reporting a non-zero size count as
                    # downloaded; anything else stays pending for a re-run.
                    if result.get("file_size_bytes"):
                        batch.append(result)
                        progress.mark_done(result["url"])
                        downloaded += 1
                    pbar.update(1)

                    if len(batch) >= save_every:
                        _flush()
        finally:
            # Always persist what has been marked done, even if the loop was
            # interrupted, so CSV rows and progress cannot drift apart.
            _flush()

    log.info(
        "[%s] Scan %d complete: %d tiles downloaded.",
        machine["label"],
        scan_id,
        downloaded,
    )
    return downloaded


# ---------------------------------------------------------------------------
# Per-scan driver
# ---------------------------------------------------------------------------


def process_scan(
    sess: MachineSession,
    scan: dict[str, Any],
    output_dir: Path,
    machine: dict[str, Any],
    config: dict[str, Any],
    progress: ProgressTracker,
    scans_csv: CsvWriter,
    tiles_csv: CsvWriter,
    dry_run: bool,
    mosaic_only: bool,
) -> int:
    """
    Process one scan: fetch metadata, download mosaic and (optionally) tiles.

    Args:
        sess: Logged-in session for the machine.
        scan: Scan-list entry; must contain ``"scan_id"``. Its list-level
            fields fill in keys missing from the detail metadata.
        output_dir: Root output directory.
        machine: Machine description (``"label"`` and ``"machine_id"`` are
            read here).
        config: Configuration forwarded to the tile downloader.
        progress: Shared progress tracker.
        scans_csv: Writer receiving one row per processed scan.
        tiles_csv: Writer receiving one row per downloaded tile.
        dry_run: When true, nothing is written under ``output_dir`` and no
            files are downloaded; the scans CSV row is still written.
        mosaic_only: When true, tiles are skipped.

    Returns total files downloaded for this scan (0 if the metadata cannot
    be fetched or lacks ``nx``/``ny``).
    """
    scan_id: int = scan["scan_id"]
    log.info("[%s] Processing scan %d …", machine["label"], scan_id)

    try:
        scan_meta = sess.get_scan_metadata(scan_id)
    except Exception as exc:
        log.error(
            "[%s] Cannot fetch scan %d metadata: %s", machine["label"], scan_id, exc
        )
        return 0

    if not scan_meta.get("nx") or not scan_meta.get("ny"):
        log.warning(
            "[%s] Scan %d: missing grid params, skipping.",
            machine["label"],
            scan_id,
        )
        return 0

    # Merge list-level metadata into scan_meta (detail page takes precedence)
    for k in _LIST_LEVEL_KEYS:
        scan_meta.setdefault(k, scan.get(k, ""))

    # Save per-scan metadata.json
    scan_date = _extract_date(scan_meta.get("scan_time", ""))
    scan_dir = output_dir / machine_dir_name(machine) / scan_date / str(scan_id)
    if not dry_run:
        scan_dir.mkdir(parents=True, exist_ok=True)
        meta_file = scan_dir / "metadata.json"
        # An existing metadata.json is never overwritten.
        if not meta_file.exists():
            meta_file.write_text(
                json.dumps(scan_meta, indent=2, default=str), encoding="utf-8"
            )

    # Mosaic
    mosaic_path = mosaic_dest(output_dir, machine, scan_meta, scan_id)
    mosaic_url = sess.mosaic_url(scan_id)
    mosaic_downloaded = _download_mosaic(
        sess, scan_meta, scan_id, mosaic_path, progress, machine, dry_run
    )
    total = 1 if mosaic_downloaded else 0

    # Write scan-level CSV row
    row: dict[str, Any] = {
        "machine": machine["label"],
        "machine_id": machine["machine_id"],
        "scan_id": scan_id,
    }
    row.update((field, scan_meta.get(field, "")) for field in _SCAN_CSV_META_FIELDS)
    row.update(
        mosaic_url=mosaic_url,
        mosaic_local_path=str(mosaic_path),
        mosaic_downloaded=mosaic_downloaded,
    )
    scans_csv.write(row)

    if mosaic_only:
        return total

    # Tiles
    tiles = sess.enumerate_tiles(scan_meta)
    total += _download_tiles_for_scan(
        sess,
        tiles,
        scan_meta,
        scan_id,
        output_dir,
        machine,
        config,
        progress,
        tiles_csv,
        dry_run,
    )
    return total


# ---------------------------------------------------------------------------
# Per-machine driver
# ---------------------------------------------------------------------------


def scrape_machine(
    machine: dict[str, Any],
    config: dict[str, Any],
    output_dir: Path,
    progress: ProgressTracker,
    tiles_csv: CsvWriter,
    scans_csv: CsvWriter,
    dry_run: bool,
    mosaic_only: bool,
    scan_id_filter: int | None,
) -> int:
    """Login, fetch scans, and download all content for one machine.

    Args:
        machine: Machine description used to build the session and for logs.
        config: Configuration passed to the session and the tile downloader.
        output_dir: Root output directory.
        progress: Shared progress tracker.
        tiles_csv: Writer receiving one row per downloaded tile.
        scans_csv: Writer receiving one row per processed scan.
        dry_run: Forwarded to ``process_scan``.
        mosaic_only: Forwarded to ``process_scan``.
        scan_id_filter: When not None, only this scan ID is processed and
            the scan list is not fetched from the machine.

    Returns:
        Total number of files downloaded for this machine; 0 if login fails
        or no scans are found.
    """
    sess = MachineSession(machine, config)
    if not sess.login():
        return 0

    if scan_id_filter is not None:
        # Synthesize a minimal list entry; the remaining fields come from the
        # scan's detail metadata inside process_scan.
        scans: list[dict[str, Any]] = [
            {"scan_id": scan_id_filter, "status": "Completed"}
        ]
        log.info("[%s] Targeting scan ID %d.", machine["label"], scan_id_filter)
    else:
        scans = sess.get_all_scans()
        if not scans:
            log.warning("[%s] No scans found.", machine["label"])
            return 0

    total = 0
    for scan in scans:
        total += process_scan(
            sess=sess,
            scan=scan,
            output_dir=output_dir,
            machine=machine,
            config=config,
            progress=progress,
            scans_csv=scans_csv,
            tiles_csv=tiles_csv,
            dry_run=dry_run,
            mosaic_only=mosaic_only,
        )
    return total