Add --metadata-only mode; harden resume and idempotency
- Add --metadata-only flag: fetches scan detail pages, writes metadata.json + scans.csv rows, and skips all image downloads. Re-runs skip scans whose metadata.json already exists.
- Atomic progress.json saves (temp-file rename).
- Heal-on-resume: tiles on disk but not in progress are silently re-marked before building the pending list.
- scans.csv dedup: skip row if mosaic URL already in progress.
- Rename mosaic_downloaded -> mosaic_on_disk (reflects disk state).
- --recheck now checks mosaics as well as tiles.
- RunStats dataclass replaces raw int return; richer run summary.
- Fix argparse allow_abbrev reverted; fix --scan-id + --metadata-only glob fallback when scan_time is absent.
- Add .venv/ to .gitignore.
- README: fix typo, update worker counts, document all new behaviour.
This commit is contained in:
+33
-13
@@ -86,47 +86,67 @@ def recheck_archive(output_dir: Path, progress: ProgressTracker) -> int:
|
||||
non-empty. Removes bad entries from progress so the next run re-downloads
|
||||
them. Returns the count of entries removed.
|
||||
|
||||
Only tile URLs are checked (mosaic URLs are skipped — mosaics are large
|
||||
single files and are unlikely to be partially written due to streaming).
|
||||
Both tile URLs and mosaic URLs are checked.
|
||||
"""
|
||||
if len(progress) == 0:
|
||||
log.info("Progress file is empty — nothing to recheck.")
|
||||
return 0
|
||||
|
||||
tile_urls = [u for u in progress.iter_urls() if "cmd=image" in u]
|
||||
mosaic_count = len(progress) - len(tile_urls)
|
||||
all_urls = list(progress.iter_urls())
|
||||
tile_urls = [u for u in all_urls if "cmd=image" in u]
|
||||
mosaic_urls = [u for u in all_urls if "mosaic.jpg" in u]
|
||||
log.info(
|
||||
"Rechecking %d tile URLs (%d mosaic URLs not rechecked) …",
|
||||
"Rechecking %d tile URL(s) and %d mosaic URL(s) …",
|
||||
len(tile_urls),
|
||||
mosaic_count,
|
||||
len(mosaic_urls),
|
||||
)
|
||||
|
||||
# Build a disk index once
|
||||
existing_files = _build_disk_index(output_dir)
|
||||
log.debug("Found %d tile files on disk.", len(existing_files))
|
||||
# Build a disk index of all tile files once
|
||||
existing_tiles = _build_disk_index(output_dir)
|
||||
log.debug("Found %d tile files on disk.", len(existing_tiles))
|
||||
|
||||
bad_urls: list[str] = []
|
||||
|
||||
# --- Tile check ---
|
||||
for url in tile_urls:
|
||||
p = _parse_tile_url(url)
|
||||
scan_id = p["scan_id"]
|
||||
|
||||
# Find tile files that live under a directory named after this scan_id
|
||||
candidates = [path for path in existing_files if str(scan_id) in path.parts]
|
||||
candidates = [path for path in existing_tiles if str(scan_id) in path.parts]
|
||||
|
||||
if not candidates:
|
||||
bad_urls.append(url)
|
||||
continue
|
||||
|
||||
if not any(existing_files[path] > 0 for path in candidates):
|
||||
if not any(existing_tiles[path] > 0 for path in candidates):
|
||||
bad_urls.append(url)
|
||||
|
||||
# --- Mosaic check ---
|
||||
for url in mosaic_urls:
|
||||
# Mosaic URLs: http://<host>:8011/RootView_Database/<scan_id>/mosaic.jpg
|
||||
# Corresponding local path: <output_dir>/**/<scan_id>/mosaic.jpg
|
||||
try:
|
||||
scan_id = url.rstrip("/").split("/")[-2]
|
||||
except IndexError:
|
||||
bad_urls.append(url)
|
||||
continue
|
||||
|
||||
matches = list(output_dir.glob(f"*/*/{scan_id}/mosaic.jpg"))
|
||||
if not matches or not any(p.stat().st_size > 0 for p in matches):
|
||||
log.debug("Mosaic missing or zero-byte for scan %s: %s", scan_id, url)
|
||||
bad_urls.append(url)
|
||||
|
||||
if not bad_urls:
|
||||
log.info("All %d tile URLs look healthy.", len(tile_urls))
|
||||
log.info(
|
||||
"All %d tile URL(s) and %d mosaic URL(s) look healthy.",
|
||||
len(tile_urls),
|
||||
len(mosaic_urls),
|
||||
)
|
||||
return 0
|
||||
|
||||
log.warning(
|
||||
"Found %d suspect tile URL(s). Removing from progress.",
|
||||
"Found %d suspect URL(s). Removing from progress.",
|
||||
len(bad_urls),
|
||||
)
|
||||
for url in bad_urls:
|
||||
|
||||
Reference in New Issue
Block a user