Add --metadata-only mode; harden resume and idempotency
- Add --metadata-only flag: fetches scan detail pages, writes metadata.json + scans.csv rows, skips all image downloads. Re-runs skip scans whose metadata.json already exists.
- Atomic progress.json saves (temp-file rename).
- Heal-on-resume: tiles on disk but not in progress are silently re-marked before building the pending list.
- scans.csv dedup: skip row if mosaic URL already in progress.
- Rename mosaic_downloaded -> mosaic_on_disk (reflects disk state).
- --recheck now checks mosaics as well as tiles.
- RunStats dataclass replaces raw int return; richer run summary.
- Fix argparse allow_abbrev reverted; fix --scan-id + --metadata-only glob fallback when scan_time is absent.
- Add .venv/ to .gitignore.
- README: fix typo, update worker counts, document all new behaviour.
This commit is contained in:
@@ -4,3 +4,4 @@ __pycache__/
|
|||||||
*.pyc
|
*.pyc
|
||||||
.DS_Store
|
.DS_Store
|
||||||
explore_dumps/
|
explore_dumps/
|
||||||
|
.venv/
|
||||||
|
|||||||
@@ -42,15 +42,15 @@ A full-tube scan covers a 310 mm × 740 mm cylinder at 3.01 × 2.26 mm steps, pr
|
|||||||
|
|
||||||
### Download speed
|
### Download speed
|
||||||
|
|
||||||
Tile downloads are server-limited: the RootView PHP backend renders tiles on-demand, sustaining ~**0.67 tiles/sec** with 8 parallel workers regardless of local bandwidth. Mosaics are pre-rendered and download ~20× faster per MB.
|
Tile downloads are server-limited: the RootView PHP backend renders tiles on-demand, sustaining ~**0.67 tiles/sec** with 4 parallel workers regardless of local bandwidth. Mosaics are pre-rendered and download ~20× faster per MB.
|
||||||
|
|
||||||
| Scenario | Estimated time |
|
| Scenario | Estimated time |
|
||||||
|---|---|
|
|---|---|
|
||||||
| All mosaics (4 workers) | ~3 months |
|
| All mosaics (4 workers) | ~3 months |
|
||||||
| Full tiles for one scan (8 workers) | ~14 hours |
|
| Full tiles for one scan (4 workers) | ~14 hours |
|
||||||
| All tiles, full-tube machines only | Years — not recommended |
|
| All tiles, full-tube machines only | Years — not recommended |
|
||||||
|
|
||||||
**Recommended approach:** archive mosaics first (`--mosaic-only`), then selectively download tiles for priority scans.
|
**Recommended approach:** inventory all scans first (`--metadata-only`, ~80 hours serial or ~7 hours if machines run in parallel), then archive mosaics (`--mosaic-only`), then selectively download tiles for priority scans.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@@ -58,7 +58,7 @@ Tile downloads are server-limited: the RootView PHP backend renders tiles on-dem
|
|||||||
|
|
||||||
```bash
|
```bash
|
||||||
# 1. Clone / download this repo
|
# 1. Clone / download this repo
|
||||||
cd spruce_scrapper
|
cd spruce_scraper
|
||||||
|
|
||||||
# 2. Install dependencies (Python 3.10+)
|
# 2. Install dependencies (Python 3.10+)
|
||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
@@ -84,6 +84,10 @@ python scraper.py --list-scans --machine "BW3-20 [AMR-26]"
|
|||||||
# Preview what would be downloaded (dry run)
|
# Preview what would be downloaded (dry run)
|
||||||
python scraper.py --machine "BW3-20 [AMR-26]" --dry-run
|
python scraper.py --machine "BW3-20 [AMR-26]" --dry-run
|
||||||
|
|
||||||
|
# Inventory scan parameters only (no images downloaded) — much faster than downloading mosaics or tiles
|
||||||
|
python scraper.py --metadata-only
|
||||||
|
python scraper.py --machine "BW3-20 [AMR-26]" --metadata-only
|
||||||
|
|
||||||
# Download mosaics only for one machine
|
# Download mosaics only for one machine
|
||||||
python scraper.py --machine "BW3-20 [AMR-26]" --mosaic-only
|
python scraper.py --machine "BW3-20 [AMR-26]" --mosaic-only
|
||||||
|
|
||||||
@@ -103,11 +107,12 @@ python scraper.py --machine "BW3-20 [AMR-26]" --scan-id 158374 --workers 4
|
|||||||
|---|---|
|
|---|---|
|
||||||
| `--config FILE` | Config file path (default: `config.yaml`) |
|
| `--config FILE` | Config file path (default: `config.yaml`) |
|
||||||
| `--machine LABEL` | Restrict to one machine, e.g. `"BW3-20 [AMR-26]"` |
|
| `--machine LABEL` | Restrict to one machine, e.g. `"BW3-20 [AMR-26]"` |
|
||||||
| `--scan-id ID` | Download only this scan (use with `--machine`) |
|
| `--scan-id ID` | Restrict to one scan ID (use with `--machine`; works with all modes) |
|
||||||
| `--mosaic-only` | Download mosaics only; skip individual tiles |
|
| `--mosaic-only` | Download mosaics only; skip individual tiles |
|
||||||
|
| `--metadata-only` | Fetch scan parameters only; write `metadata.json` + `scans.csv` rows, skip all images. Re-runs skip scans whose `metadata.json` already exists |
|
||||||
| `--dry-run` | Print what would be downloaded without saving |
|
| `--dry-run` | Print what would be downloaded without saving |
|
||||||
| `--workers N` | Parallel download threads (default: 2, hard cap: 4) |
|
| `--workers N` | Parallel download threads (default: 2, hard cap: 4) |
|
||||||
| `--recheck` | Scan archive for zero-byte/missing tiles and remove them from `.progress.json` so they re-download on next run |
|
| `--recheck` | Scan archive for zero-byte/missing tiles and mosaics; remove bad entries from `.progress.json` so they re-download on next run |
|
||||||
| `--list-machines` | Print all machines and exit |
|
| `--list-machines` | Print all machines and exit |
|
||||||
| `--list-scans` | Print all scans for `--machine` and exit |
|
| `--list-scans` | Print all scans for `--machine` and exit |
|
||||||
| `--verbose` / `-v` | Debug logging |
|
| `--verbose` / `-v` | Debug logging |
|
||||||
@@ -128,7 +133,7 @@ archives/
|
|||||||
├── metadata.json # full scan parameters (grid, timestamps, etc.)
|
├── metadata.json # full scan parameters (grid, timestamps, etc.)
|
||||||
├── mosaic.jpg # pre-stitched full image (~16 MB)
|
├── mosaic.jpg # pre-stitched full image (~16 MB)
|
||||||
└── tiles/
|
└── tiles/
|
||||||
├── tile_r000_c000.jpg # row 0, column 0
|
├── tile_r000_c000.jpg # row 0, column 0 (zero-padding matches grid size)
|
||||||
├── tile_r000_c001.jpg
|
├── tile_r000_c001.jpg
|
||||||
└── ... # 33,784 tiles total for a full-tube scan
|
└── ... # 33,784 tiles total for a full-tube scan
|
||||||
```
|
```
|
||||||
@@ -137,10 +142,14 @@ Tile filenames encode position: `tile_r{row}_c{col}.jpg` where row increases wit
|
|||||||
|
|
||||||
### Metadata files
|
### Metadata files
|
||||||
|
|
||||||
**`scans.csv`** columns: `machine`, `machine_id`, `scan_id`, `name`, `scan_time`, `start_x`, `start_y`, `end_x`, `end_y`, `dx`, `dy`, `nx`, `ny`, `total_tiles`, `scan_lines`, `scan_mode`, `start_datetime`, `end_datetime`, `status`, `user`, `disk_space_mb`, `mosaic_url`, `mosaic_local_path`, `mosaic_downloaded`
|
**`scans.csv`** columns: `machine`, `machine_id`, `scan_id`, `name`, `scan_time`, `start_x`, `start_y`, `end_x`, `end_y`, `dx`, `dy`, `nx`, `ny`, `total_tiles`, `scan_lines`, `scan_mode`, `start_datetime`, `end_datetime`, `status`, `user`, `disk_space_mb`, `mosaic_url`, `mosaic_local_path`, `mosaic_on_disk`
|
||||||
|
|
||||||
|
- `mosaic_on_disk`: `True` if `mosaic.jpg` exists on disk at row-write time, regardless of which run downloaded it. Useful for inventory — reflects actual archive state rather than what happened in the current run.
|
||||||
|
|
||||||
**`tiles.csv`** columns: `machine`, `machine_id`, `scan_id`, `scan_time`, `row_index`, `col_index`, `x_mm`, `y_mm`, `url`, `local_path`, `downloaded_at`, `file_size_bytes`
|
**`tiles.csv`** columns: `machine`, `machine_id`, `scan_id`, `scan_time`, `row_index`, `col_index`, `x_mm`, `y_mm`, `url`, `local_path`, `downloaded_at`, `file_size_bytes`
|
||||||
|
|
||||||
|
- `downloaded_at`: ISO 8601 UTC timestamp of when the tile was fetched. Empty if the download failed.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Site structure (RootView)
|
## Site structure (RootView)
|
||||||
@@ -161,20 +170,47 @@ Grid coordinates (X, Y) are in millimetres, starting from `(start_x, start_y)` w
|
|||||||
|
|
||||||
## Resume and reliability
|
## Resume and reliability
|
||||||
|
|
||||||
- **Resumable**: `.progress.json` records every completed URL. Re-running the same command skips already-downloaded files.
|
- **Resumable**: `.progress.json` records every completed URL. Re-running the same command skips already-downloaded files. `--metadata-only` re-runs additionally skip any scan whose `metadata.json` already exists on disk — no detail-page request is made for that scan.
|
||||||
|
- **Atomic progress saves**: `.progress.json` is written via a temp-file rename, so a crash mid-save never produces a corrupt or empty progress file.
|
||||||
|
- **Heal on resume**: at the start of each scan's tile pass, any tile file that exists on disk but isn't recorded in progress is silently re-marked as complete, preventing duplicate `tiles.csv` rows and redundant re-downloads.
|
||||||
- **Retry logic**: each tile download retries up to 3 times with exponential backoff (5 s → 10 s → 20 s) before logging a warning and moving on.
|
- **Retry logic**: each tile download retries up to 3 times with exponential backoff (5 s → 10 s → 20 s) before logging a warning and moving on.
|
||||||
- **Worker cap**: the RootView server renders tiles on a single-threaded PHP process. Running more than 4 concurrent requests causes cascading read timeouts. The default is 2 workers; the scraper hard-caps at 4 and warns loudly if you try to exceed it.
|
- **Worker cap**: the RootView server renders tiles on a single-threaded PHP process. Running more than 4 concurrent requests causes cascading timeouts. The default is 2 workers; the scraper hard-caps at 4 and warns if you try to exceed it.
|
||||||
- **Crash recovery**: if a run is killed mid-flight, some in-progress tiles may have been written as zero-byte files without being marked complete. Run `--recheck` before resuming — it deletes zero-byte files on disk and removes their URLs from `.progress.json` so they are cleanly re-downloaded.
|
- **Crash recovery**: run `--recheck` to find tile and mosaic files that are zero-byte or missing on disk and remove their entries from `.progress.json`, so they are cleanly re-downloaded on the next run.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# After any interrupted run, always do this first:
|
# After a hard crash, optionally run recheck before resuming:
|
||||||
python scraper.py --recheck
|
python scraper.py --recheck
|
||||||
# Then resume normally:
|
# Then resume normally — the scraper picks up where it left off:
|
||||||
python scraper.py --machine "BW3-20 [AMR-26]" --scan-id 158374
|
python scraper.py --machine "BW3-20 [AMR-26]" --scan-id 158374
|
||||||
```
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
## Run summary
|
||||||
|
|
||||||
|
Every run prints a summary table on completion:
|
||||||
|
|
||||||
|
```
|
||||||
|
──────────────────────────────────────────────
|
||||||
|
Run complete
|
||||||
|
──────────────────────────────────────────────
|
||||||
|
Machines: 1
|
||||||
|
Scans fetched: 428 (2 already cached, 0 failed)
|
||||||
|
Metadata written: 428 (new JSON files)
|
||||||
|
──────────────────────────────────────────────
|
||||||
|
Scans CSV: archives/scans.csv
|
||||||
|
Progress: archives/.progress.json
|
||||||
|
──────────────────────────────────────────────
|
||||||
|
```
|
||||||
|
|
||||||
|
- **Scans fetched**: metadata detail page was retrieved from the server this run.
|
||||||
|
- **Already cached**: `metadata.json` already existed on disk; no detail-page request was made for that scan.
|
||||||
|
- **Failed**: fetch error or scan missing required grid parameters.
|
||||||
|
- **Metadata written**: new `metadata.json` files created (shown in `--metadata-only` mode).
|
||||||
|
- Mosaic and tile counts appear in their respective modes.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
## Dependencies
|
## Dependencies
|
||||||
|
|
||||||
| Package | Purpose |
|
| Package | Purpose |
|
||||||
|
|||||||
+68
-11
@@ -10,7 +10,7 @@ from pathlib import Path
|
|||||||
|
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
from spruce.orchestrator import scrape_machine
|
from spruce.orchestrator import scrape_machine, RunStats
|
||||||
from spruce.parsers import parse_machine_option
|
from spruce.parsers import parse_machine_option
|
||||||
from spruce.progress import ProgressTracker, CsvWriter
|
from spruce.progress import ProgressTracker, CsvWriter
|
||||||
from spruce.recheck import recheck_archive, recheck_tile_files
|
from spruce.recheck import recheck_archive, recheck_tile_files
|
||||||
@@ -75,6 +75,15 @@ def parse_args() -> argparse.Namespace:
|
|||||||
action="store_true",
|
action="store_true",
|
||||||
help="Download mosaics only; skip individual tiles",
|
help="Download mosaics only; skip individual tiles",
|
||||||
)
|
)
|
||||||
|
p.add_argument(
|
||||||
|
"--metadata-only",
|
||||||
|
action="store_true",
|
||||||
|
help=(
|
||||||
|
"Fetch scan parameters only; write metadata.json and scans.csv "
|
||||||
|
"rows but skip mosaics and tiles. Very fast — suitable for "
|
||||||
|
"inventorying all scans across all machines."
|
||||||
|
),
|
||||||
|
)
|
||||||
p.add_argument(
|
p.add_argument(
|
||||||
"--dry-run",
|
"--dry-run",
|
||||||
action="store_true",
|
action="store_true",
|
||||||
@@ -220,7 +229,11 @@ def main() -> None:
|
|||||||
len(machines),
|
len(machines),
|
||||||
", ".join(m["label"] for m in machines),
|
", ".join(m["label"] for m in machines),
|
||||||
)
|
)
|
||||||
if args.mosaic_only:
|
if args.mosaic_only and args.metadata_only:
|
||||||
|
sys.exit("--mosaic-only and --metadata-only are mutually exclusive.")
|
||||||
|
if args.metadata_only:
|
||||||
|
log.info("Mode: metadata only (mosaics and tiles skipped)")
|
||||||
|
elif args.mosaic_only:
|
||||||
log.info("Mode: mosaics only (individual tiles skipped)")
|
log.info("Mode: mosaics only (individual tiles skipped)")
|
||||||
if args.dry_run:
|
if args.dry_run:
|
||||||
log.info("Mode: dry-run (no files will be written)")
|
log.info("Mode: dry-run (no files will be written)")
|
||||||
@@ -230,10 +243,10 @@ def main() -> None:
|
|||||||
tiles_csv = CsvWriter(output_dir / TILES_CSV_FILENAME, TILES_CSV_FIELDS)
|
tiles_csv = CsvWriter(output_dir / TILES_CSV_FILENAME, TILES_CSV_FIELDS)
|
||||||
scans_csv = CsvWriter(output_dir / SCANS_CSV_FILENAME, SCANS_CSV_FIELDS)
|
scans_csv = CsvWriter(output_dir / SCANS_CSV_FILENAME, SCANS_CSV_FIELDS)
|
||||||
|
|
||||||
total = 0
|
totals = RunStats()
|
||||||
try:
|
try:
|
||||||
for machine in machines:
|
for machine in machines:
|
||||||
count = scrape_machine(
|
stats = scrape_machine(
|
||||||
machine=machine,
|
machine=machine,
|
||||||
config=config,
|
config=config,
|
||||||
output_dir=output_dir,
|
output_dir=output_dir,
|
||||||
@@ -242,18 +255,62 @@ def main() -> None:
|
|||||||
scans_csv=scans_csv,
|
scans_csv=scans_csv,
|
||||||
dry_run=args.dry_run,
|
dry_run=args.dry_run,
|
||||||
mosaic_only=args.mosaic_only,
|
mosaic_only=args.mosaic_only,
|
||||||
|
metadata_only=args.metadata_only,
|
||||||
scan_id_filter=args.scan_id,
|
scan_id_filter=args.scan_id,
|
||||||
)
|
)
|
||||||
total += count
|
totals.merge(stats)
|
||||||
finally:
|
finally:
|
||||||
tiles_csv.close()
|
tiles_csv.close()
|
||||||
scans_csv.close()
|
scans_csv.close()
|
||||||
progress.save()
|
progress.save()
|
||||||
|
|
||||||
if args.dry_run:
|
_print_summary(
|
||||||
log.info("Dry run complete.")
|
totals=totals,
|
||||||
|
machines=machines,
|
||||||
|
output_dir=output_dir,
|
||||||
|
dry_run=args.dry_run,
|
||||||
|
metadata_only=args.metadata_only,
|
||||||
|
mosaic_only=args.mosaic_only,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _print_summary(
|
||||||
|
totals: RunStats,
|
||||||
|
machines: list[dict],
|
||||||
|
output_dir: Path,
|
||||||
|
dry_run: bool,
|
||||||
|
metadata_only: bool,
|
||||||
|
mosaic_only: bool,
|
||||||
|
) -> None:
|
||||||
|
W = 46
|
||||||
|
sep = "─" * W
|
||||||
|
|
||||||
|
def row(label: str, value: str, note: str = "") -> str:
|
||||||
|
note_str = f" ({note})" if note else ""
|
||||||
|
return f" {label:<22}{value}{note_str}"
|
||||||
|
|
||||||
|
log.info(sep)
|
||||||
|
if dry_run:
|
||||||
|
log.info(" Dry run complete — no files written.")
|
||||||
else:
|
else:
|
||||||
log.info("Done. Total files downloaded: %d", total)
|
log.info(" Run complete")
|
||||||
log.info("Tiles CSV : %s", output_dir / TILES_CSV_FILENAME)
|
log.info(sep)
|
||||||
log.info("Scans CSV : %s", output_dir / SCANS_CSV_FILENAME)
|
log.info(row("Machines:", str(len(machines))))
|
||||||
log.info("Progress : %s", output_dir / PROGRESS_FILENAME)
|
log.info(
|
||||||
|
row("Scans fetched:", str(totals.scans_fetched),
|
||||||
|
f"{totals.scans_skipped} already cached, "
|
||||||
|
f"{totals.scans_failed} failed"
|
||||||
|
if totals.scans_skipped or totals.scans_failed else "")
|
||||||
|
)
|
||||||
|
if not metadata_only:
|
||||||
|
log.info(row("Mosaics downloaded:", str(totals.mosaics_downloaded)))
|
||||||
|
if not metadata_only and not mosaic_only:
|
||||||
|
log.info(row("Tiles downloaded:", str(totals.tiles_downloaded)))
|
||||||
|
if metadata_only:
|
||||||
|
log.info(row("Metadata written:", str(totals.metadata_written), "new JSON files"))
|
||||||
|
log.info(sep)
|
||||||
|
log.info(row("Scans CSV:", str(output_dir / SCANS_CSV_FILENAME)))
|
||||||
|
if not metadata_only:
|
||||||
|
log.info(row("Tiles CSV:", str(output_dir / TILES_CSV_FILENAME)))
|
||||||
|
log.info(row("Progress:", str(output_dir / PROGRESS_FILENAME)))
|
||||||
|
log.info(sep)
|
||||||
|
|||||||
+110
-21
@@ -5,9 +5,30 @@ High-level scrape orchestration: drives the per-machine and per-scan loops.
|
|||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
from dataclasses import dataclass, field
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class RunStats:
|
||||||
|
"""Accumulated counters for one or more machines."""
|
||||||
|
|
||||||
|
scans_fetched: int = 0 # metadata fetched from server this run
|
||||||
|
scans_skipped: int = 0 # metadata.json already on disk; no HTTP request
|
||||||
|
scans_failed: int = 0 # fetch error or missing grid params
|
||||||
|
metadata_written: int = 0 # new metadata.json files created
|
||||||
|
mosaics_downloaded: int = 0
|
||||||
|
tiles_downloaded: int = 0
|
||||||
|
|
||||||
|
def merge(self, other: "RunStats") -> None:
|
||||||
|
self.scans_fetched += other.scans_fetched
|
||||||
|
self.scans_skipped += other.scans_skipped
|
||||||
|
self.scans_failed += other.scans_failed
|
||||||
|
self.metadata_written += other.metadata_written
|
||||||
|
self.mosaics_downloaded += other.mosaics_downloaded
|
||||||
|
self.tiles_downloaded += other.tiles_downloaded
|
||||||
|
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from spruce.paths import machine_dir_name, tile_dest, mosaic_dest, _extract_date
|
from spruce.paths import machine_dir_name, tile_dest, mosaic_dest, _extract_date
|
||||||
@@ -66,6 +87,24 @@ def _download_tiles_for_scan(
|
|||||||
dry_run: bool,
|
dry_run: bool,
|
||||||
) -> int:
|
) -> int:
|
||||||
"""Download all pending tiles for a scan. Returns count of tiles downloaded."""
|
"""Download all pending tiles for a scan. Returns count of tiles downloaded."""
|
||||||
|
# Heal progress for tiles that exist on disk but weren't recorded (e.g.
|
||||||
|
# crash between write and batch save). Prevents duplicate tiles.csv rows.
|
||||||
|
healed = 0
|
||||||
|
for t in tiles:
|
||||||
|
if not progress.is_done(t["url"]):
|
||||||
|
dest = tile_dest(output_dir, machine, scan_meta, t)
|
||||||
|
if dest.exists() and dest.stat().st_size > 0:
|
||||||
|
progress.mark_done(t["url"])
|
||||||
|
healed += 1
|
||||||
|
if healed:
|
||||||
|
log.debug(
|
||||||
|
"[%s] Scan %d: healed %d tile(s) already on disk into progress.",
|
||||||
|
machine["label"],
|
||||||
|
scan_id,
|
||||||
|
healed,
|
||||||
|
)
|
||||||
|
progress.save()
|
||||||
|
|
||||||
pending = [t for t in tiles if not progress.is_done(t["url"])]
|
pending = [t for t in tiles if not progress.is_done(t["url"])]
|
||||||
log.info(
|
log.info(
|
||||||
"[%s] Scan %d: %d tiles total, %d pending.",
|
"[%s] Scan %d: %d tiles total, %d pending.",
|
||||||
@@ -152,12 +191,43 @@ def process_scan(
|
|||||||
tiles_csv: CsvWriter,
|
tiles_csv: CsvWriter,
|
||||||
dry_run: bool,
|
dry_run: bool,
|
||||||
mosaic_only: bool,
|
mosaic_only: bool,
|
||||||
) -> int:
|
metadata_only: bool = False,
|
||||||
|
) -> RunStats:
|
||||||
"""
|
"""
|
||||||
Process one scan: fetch metadata, download mosaic and (optionally) tiles.
|
Process one scan: fetch metadata, download mosaic and (optionally) tiles.
|
||||||
Returns total files downloaded for this scan.
|
Returns a RunStats with counters for what happened this call.
|
||||||
|
|
||||||
|
If metadata_only is True, writes metadata.json and the scans.csv row but
|
||||||
|
skips both the mosaic and the tiles.
|
||||||
"""
|
"""
|
||||||
scan_id: int = scan["scan_id"]
|
scan_id: int = scan["scan_id"]
|
||||||
|
stats = RunStats()
|
||||||
|
|
||||||
|
# In metadata-only mode, skip the HTTP fetch if metadata.json already exists.
|
||||||
|
# Try the date-hinted path first; fall back to a glob when scan_time is
|
||||||
|
# absent (e.g. when --scan-id is used and the synthetic scan dict has no
|
||||||
|
# scan_time field).
|
||||||
|
if metadata_only and not dry_run:
|
||||||
|
machine_root = output_dir / machine_dir_name(machine)
|
||||||
|
scan_date_hint = _extract_date(scan.get("scan_time", ""))
|
||||||
|
found_meta: Path | None = None
|
||||||
|
if scan_date_hint and scan_date_hint != "unknown":
|
||||||
|
candidate = machine_root / scan_date_hint / str(scan_id) / "metadata.json"
|
||||||
|
if candidate.exists():
|
||||||
|
found_meta = candidate
|
||||||
|
if found_meta is None:
|
||||||
|
matches = list(machine_root.glob(f"*/{scan_id}/metadata.json"))
|
||||||
|
if matches:
|
||||||
|
found_meta = matches[0]
|
||||||
|
if found_meta is not None:
|
||||||
|
log.debug(
|
||||||
|
"[%s] Scan %d: metadata.json already exists, skipping fetch.",
|
||||||
|
machine["label"],
|
||||||
|
scan_id,
|
||||||
|
)
|
||||||
|
stats.scans_skipped += 1
|
||||||
|
return stats
|
||||||
|
|
||||||
log.info("[%s] Processing scan %d …", machine["label"], scan_id)
|
log.info("[%s] Processing scan %d …", machine["label"], scan_id)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -166,7 +236,8 @@ def process_scan(
|
|||||||
log.error(
|
log.error(
|
||||||
"[%s] Cannot fetch scan %d metadata: %s", machine["label"], scan_id, exc
|
"[%s] Cannot fetch scan %d metadata: %s", machine["label"], scan_id, exc
|
||||||
)
|
)
|
||||||
return 0
|
stats.scans_failed += 1
|
||||||
|
return stats
|
||||||
|
|
||||||
if not scan_meta.get("nx") or not scan_meta.get("ny"):
|
if not scan_meta.get("nx") or not scan_meta.get("ny"):
|
||||||
log.warning(
|
log.warning(
|
||||||
@@ -174,7 +245,10 @@ def process_scan(
|
|||||||
machine["label"],
|
machine["label"],
|
||||||
scan_id,
|
scan_id,
|
||||||
)
|
)
|
||||||
return 0
|
stats.scans_failed += 1
|
||||||
|
return stats
|
||||||
|
|
||||||
|
stats.scans_fetched += 1
|
||||||
|
|
||||||
# Merge list-level metadata into scan_meta (detail page takes precedence)
|
# Merge list-level metadata into scan_meta (detail page takes precedence)
|
||||||
for k in (
|
for k in (
|
||||||
@@ -199,16 +273,29 @@ def process_scan(
|
|||||||
meta_file.write_text(
|
meta_file.write_text(
|
||||||
json.dumps(scan_meta, indent=2, default=str), encoding="utf-8"
|
json.dumps(scan_meta, indent=2, default=str), encoding="utf-8"
|
||||||
)
|
)
|
||||||
|
stats.metadata_written += 1
|
||||||
|
|
||||||
# Mosaic
|
# Mosaic (skipped entirely in metadata-only mode)
|
||||||
mosaic_path = mosaic_dest(output_dir, machine, scan_meta, scan_id)
|
mosaic_path = mosaic_dest(output_dir, machine, scan_meta, scan_id)
|
||||||
mosaic_url = sess.mosaic_url(scan_id)
|
mosaic_url = sess.mosaic_url(scan_id)
|
||||||
mosaic_downloaded = _download_mosaic(
|
mosaic_already_done = progress.is_done(mosaic_url)
|
||||||
|
if metadata_only:
|
||||||
|
mosaic_just_downloaded = False
|
||||||
|
else:
|
||||||
|
mosaic_just_downloaded = _download_mosaic(
|
||||||
sess, scan_meta, scan_id, mosaic_path, progress, machine, dry_run
|
sess, scan_meta, scan_id, mosaic_path, progress, machine, dry_run
|
||||||
)
|
)
|
||||||
total = 1 if mosaic_downloaded else 0
|
if mosaic_just_downloaded:
|
||||||
|
stats.mosaics_downloaded += 1
|
||||||
|
|
||||||
# Write scan-level CSV row
|
# Write scan-level CSV row only if this scan hasn't been recorded before.
|
||||||
|
if mosaic_already_done and not metadata_only:
|
||||||
|
log.debug(
|
||||||
|
"[%s] Scan %d: already in scans.csv (mosaic was previously downloaded), skipping CSV row.",
|
||||||
|
machine["label"],
|
||||||
|
scan_id,
|
||||||
|
)
|
||||||
|
else:
|
||||||
scans_csv.write(
|
scans_csv.write(
|
||||||
{
|
{
|
||||||
"machine": machine["label"],
|
"machine": machine["label"],
|
||||||
@@ -234,16 +321,16 @@ def process_scan(
|
|||||||
"disk_space_mb": scan_meta.get("disk_space_mb", ""),
|
"disk_space_mb": scan_meta.get("disk_space_mb", ""),
|
||||||
"mosaic_url": mosaic_url,
|
"mosaic_url": mosaic_url,
|
||||||
"mosaic_local_path": str(mosaic_path),
|
"mosaic_local_path": str(mosaic_path),
|
||||||
"mosaic_downloaded": mosaic_downloaded,
|
"mosaic_on_disk": mosaic_path.exists(),
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
if mosaic_only:
|
if mosaic_only or metadata_only:
|
||||||
return total
|
return stats
|
||||||
|
|
||||||
# Tiles
|
# Tiles
|
||||||
tiles = sess.enumerate_tiles(scan_meta)
|
tiles = sess.enumerate_tiles(scan_meta)
|
||||||
total += _download_tiles_for_scan(
|
stats.tiles_downloaded += _download_tiles_for_scan(
|
||||||
sess,
|
sess,
|
||||||
tiles,
|
tiles,
|
||||||
scan_meta,
|
scan_meta,
|
||||||
@@ -255,7 +342,7 @@ def process_scan(
|
|||||||
tiles_csv,
|
tiles_csv,
|
||||||
dry_run,
|
dry_run,
|
||||||
)
|
)
|
||||||
return total
|
return stats
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -272,12 +359,13 @@ def scrape_machine(
|
|||||||
scans_csv: CsvWriter,
|
scans_csv: CsvWriter,
|
||||||
dry_run: bool,
|
dry_run: bool,
|
||||||
mosaic_only: bool,
|
mosaic_only: bool,
|
||||||
scan_id_filter: int | None,
|
metadata_only: bool = False,
|
||||||
) -> int:
|
scan_id_filter: int | None = None,
|
||||||
|
) -> RunStats:
|
||||||
"""Login, fetch scans, and download all content for one machine."""
|
"""Login, fetch scans, and download all content for one machine."""
|
||||||
sess = MachineSession(machine, config)
|
sess = MachineSession(machine, config)
|
||||||
if not sess.login():
|
if not sess.login():
|
||||||
return 0
|
return RunStats()
|
||||||
|
|
||||||
if scan_id_filter is not None:
|
if scan_id_filter is not None:
|
||||||
scans: list[dict[str, Any]] = [
|
scans: list[dict[str, Any]] = [
|
||||||
@@ -288,11 +376,11 @@ def scrape_machine(
|
|||||||
scans = sess.get_all_scans()
|
scans = sess.get_all_scans()
|
||||||
if not scans:
|
if not scans:
|
||||||
log.warning("[%s] No scans found.", machine["label"])
|
log.warning("[%s] No scans found.", machine["label"])
|
||||||
return 0
|
return RunStats()
|
||||||
|
|
||||||
total = 0
|
stats = RunStats()
|
||||||
for scan in scans:
|
for scan in scans:
|
||||||
total += process_scan(
|
stats.merge(process_scan(
|
||||||
sess=sess,
|
sess=sess,
|
||||||
scan=scan,
|
scan=scan,
|
||||||
output_dir=output_dir,
|
output_dir=output_dir,
|
||||||
@@ -303,5 +391,6 @@ def scrape_machine(
|
|||||||
tiles_csv=tiles_csv,
|
tiles_csv=tiles_csv,
|
||||||
dry_run=dry_run,
|
dry_run=dry_run,
|
||||||
mosaic_only=mosaic_only,
|
mosaic_only=mosaic_only,
|
||||||
)
|
metadata_only=metadata_only,
|
||||||
return total
|
))
|
||||||
|
return stats
|
||||||
|
|||||||
+3
-1
@@ -57,9 +57,11 @@ class ProgressTracker:
|
|||||||
|
|
||||||
def save(self) -> None:
|
def save(self) -> None:
|
||||||
self.path.parent.mkdir(parents=True, exist_ok=True)
|
self.path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
self.path.write_text(
|
tmp = self.path.with_suffix(".json.tmp")
|
||||||
|
tmp.write_text(
|
||||||
json.dumps({"completed_urls": sorted(self._done)}, indent=2)
|
json.dumps({"completed_urls": sorted(self._done)}, indent=2)
|
||||||
)
|
)
|
||||||
|
tmp.replace(self.path) # atomic on POSIX; avoids corrupt JSON on crash
|
||||||
|
|
||||||
|
|
||||||
class CsvWriter:
|
class CsvWriter:
|
||||||
|
|||||||
+33
-13
@@ -86,47 +86,67 @@ def recheck_archive(output_dir: Path, progress: ProgressTracker) -> int:
|
|||||||
non-empty. Removes bad entries from progress so the next run re-downloads
|
non-empty. Removes bad entries from progress so the next run re-downloads
|
||||||
them. Returns the count of entries removed.
|
them. Returns the count of entries removed.
|
||||||
|
|
||||||
Only tile URLs are checked (mosaic URLs are skipped — mosaics are large
|
Both tile URLs and mosaic URLs are checked.
|
||||||
single files and are unlikely to be partially written due to streaming).
|
|
||||||
"""
|
"""
|
||||||
if len(progress) == 0:
|
if len(progress) == 0:
|
||||||
log.info("Progress file is empty — nothing to recheck.")
|
log.info("Progress file is empty — nothing to recheck.")
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
tile_urls = [u for u in progress.iter_urls() if "cmd=image" in u]
|
all_urls = list(progress.iter_urls())
|
||||||
mosaic_count = len(progress) - len(tile_urls)
|
tile_urls = [u for u in all_urls if "cmd=image" in u]
|
||||||
|
mosaic_urls = [u for u in all_urls if "mosaic.jpg" in u]
|
||||||
log.info(
|
log.info(
|
||||||
"Rechecking %d tile URLs (%d mosaic URLs not rechecked) …",
|
"Rechecking %d tile URL(s) and %d mosaic URL(s) …",
|
||||||
len(tile_urls),
|
len(tile_urls),
|
||||||
mosaic_count,
|
len(mosaic_urls),
|
||||||
)
|
)
|
||||||
|
|
||||||
# Build a disk index once
|
# Build a disk index of all tile files once
|
||||||
existing_files = _build_disk_index(output_dir)
|
existing_tiles = _build_disk_index(output_dir)
|
||||||
log.debug("Found %d tile files on disk.", len(existing_files))
|
log.debug("Found %d tile files on disk.", len(existing_tiles))
|
||||||
|
|
||||||
bad_urls: list[str] = []
|
bad_urls: list[str] = []
|
||||||
|
|
||||||
|
# --- Tile check ---
|
||||||
for url in tile_urls:
|
for url in tile_urls:
|
||||||
p = _parse_tile_url(url)
|
p = _parse_tile_url(url)
|
||||||
scan_id = p["scan_id"]
|
scan_id = p["scan_id"]
|
||||||
|
|
||||||
# Find tile files that live under a directory named after this scan_id
|
# Find tile files that live under a directory named after this scan_id
|
||||||
candidates = [path for path in existing_files if str(scan_id) in path.parts]
|
candidates = [path for path in existing_tiles if str(scan_id) in path.parts]
|
||||||
|
|
||||||
if not candidates:
|
if not candidates:
|
||||||
bad_urls.append(url)
|
bad_urls.append(url)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if not any(existing_files[path] > 0 for path in candidates):
|
if not any(existing_tiles[path] > 0 for path in candidates):
|
||||||
|
bad_urls.append(url)
|
||||||
|
|
||||||
|
# --- Mosaic check ---
|
||||||
|
for url in mosaic_urls:
|
||||||
|
# Mosaic URLs: http://<host>:8011/RootView_Database/<scan_id>/mosaic.jpg
|
||||||
|
# Corresponding local path: <output_dir>/**/<scan_id>/mosaic.jpg
|
||||||
|
try:
|
||||||
|
scan_id = url.rstrip("/").split("/")[-2]
|
||||||
|
except IndexError:
|
||||||
|
bad_urls.append(url)
|
||||||
|
continue
|
||||||
|
|
||||||
|
matches = list(output_dir.glob(f"*/*/{scan_id}/mosaic.jpg"))
|
||||||
|
if not matches or not any(p.stat().st_size > 0 for p in matches):
|
||||||
|
log.debug("Mosaic missing or zero-byte for scan %s: %s", scan_id, url)
|
||||||
bad_urls.append(url)
|
bad_urls.append(url)
|
||||||
|
|
||||||
if not bad_urls:
|
if not bad_urls:
|
||||||
log.info("All %d tile URLs look healthy.", len(tile_urls))
|
log.info(
|
||||||
|
"All %d tile URL(s) and %d mosaic URL(s) look healthy.",
|
||||||
|
len(tile_urls),
|
||||||
|
len(mosaic_urls),
|
||||||
|
)
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
log.warning(
|
log.warning(
|
||||||
"Found %d suspect tile URL(s). Removing from progress.",
|
"Found %d suspect URL(s). Removing from progress.",
|
||||||
len(bad_urls),
|
len(bad_urls),
|
||||||
)
|
)
|
||||||
for url in bad_urls:
|
for url in bad_urls:
|
||||||
|
|||||||
@@ -263,10 +263,6 @@ class MachineSession:
|
|||||||
}
|
}
|
||||||
if dry_run:
|
if dry_run:
|
||||||
return row
|
return row
|
||||||
if dest.exists():
|
|
||||||
row["downloaded_at"] = "already_exists"
|
|
||||||
row["file_size_bytes"] = dest.stat().st_size
|
|
||||||
return row
|
|
||||||
size = self.download_file(tile["url"], dest)
|
size = self.download_file(tile["url"], dest)
|
||||||
if size:
|
if size:
|
||||||
row["downloaded_at"] = datetime.now(timezone.utc).isoformat()
|
row["downloaded_at"] = datetime.now(timezone.utc).isoformat()
|
||||||
|
|||||||
+1
-1
@@ -46,7 +46,7 @@ SCANS_CSV_FIELDS: list[str] = [
|
|||||||
"disk_space_mb",
|
"disk_space_mb",
|
||||||
"mosaic_url",
|
"mosaic_url",
|
||||||
"mosaic_local_path",
|
"mosaic_local_path",
|
||||||
"mosaic_downloaded",
|
"mosaic_on_disk",
|
||||||
]
|
]
|
||||||
|
|
||||||
TILES_CSV_FIELDS: list[str] = [
|
TILES_CSV_FIELDS: list[str] = [
|
||||||
|
|||||||
Reference in New Issue
Block a user