Add EXIF writing and machine metadata support
This commit is contained in:
@@ -117,6 +117,15 @@ python scraper.py --machine "BW3-20 [AMR-26]" --scan-id 158374 --workers 4
|
||||
| `--list-scans` | Print all scans for `--machine` and exit |
|
||||
| `--verbose` / `-v` | Debug logging |
|
||||
|
||||
### `config.yaml` (optional keys)
|
||||
|
||||
| Key | Description |
|
||||
|---|---|
|
||||
| `write_exif` | If true (default), write EXIF to each `mosaic.jpg` after download. Set to false to skip. |
|
||||
| `machine_metadata` | Map of machine label → optional fields for mosaic EXIF: `plot_number`, `enclosure` (bool), `temp_treatment` (number or string), `co2_treatment` (`ambient` / `elevated`), `latitude_wgs_84`, `longitude_wgs_84`, `elevation_masl`. Omitted keys are not written. |
|
||||
|
||||
`config.example.yaml` lists all 12 machine labels with full `machine_metadata` (plot, enclosure, treatments, WGS84 coordinates, elevation) and an optional `machines` filter (commented).
|
||||
|
||||
---
|
||||
|
||||
## Output layout
|
||||
@@ -131,7 +140,7 @@ archives/
|
||||
└── 2024-07-29/
|
||||
└── 158374/
|
||||
├── metadata.json # full scan parameters (grid, timestamps, etc.)
|
||||
├── mosaic.jpg # pre-stitched full image (~16 MB)
|
||||
├── mosaic.jpg # pre-stitched full image (~16 MB), EXIF after download
|
||||
└── tiles/
|
||||
├── tile_r000_c000.jpg # row 0, column 0 (zero-padding matches grid size)
|
||||
├── tile_r000_c001.jpg
|
||||
@@ -140,6 +149,8 @@ archives/
|
||||
|
||||
Tile filenames encode position: `tile_r{row}_c{col}.jpg` where row increases with depth (Y in mm) and column increases along the tube circumference (X in mm).
|
||||
|
||||
**Mosaic `mosaic.jpg` EXIF** (when `write_exif` is true in `config.yaml`, default on): written immediately after a successful download via `piexif` (no re-encoding). Includes `DateTime` / `DateTimeOriginal` / `DateTimeDigitized` (from scan time), `ImageDescription` (machine, scan id, name), `Make` = RootView, `Model` = machine label, `Software` = RootView + server version, `ProcessingSoftware` = this scraper, `Artist` (user), a one-line `UserComment` (grid size, pointer to `metadata.json`, and when set in `machine_metadata`: `plot_number`, `enclosure`, `temp_treatment`, `co2_treatment`), `XPKeywords` with the same treatment fields when any of those four are set, and a GPS position when both `latitude_wgs_84` and `longitude_wgs_84` are set (plus GPS altitude when `elevation_masl` is also set). See `config.example.yaml` for the `machine_metadata` layout.
|
||||
|
||||
### Metadata files
|
||||
|
||||
**`scans.csv`** columns: `machine`, `machine_id`, `scan_id`, `name`, `scan_time`, `start_x`, `start_y`, `end_x`, `end_y`, `dx`, `dy`, `nx`, `ny`, `total_tiles`, `scan_lines`, `scan_mode`, `start_datetime`, `end_datetime`, `status`, `user`, `disk_space_mb`, `mosaic_url`, `mosaic_local_path`, `mosaic_on_disk`
|
||||
@@ -195,7 +206,7 @@ Every run prints a summary table on completion:
|
||||
Run complete
|
||||
──────────────────────────────────────────────
|
||||
Machines: 1
|
||||
Scans fetched: 428 (2 already cached, 0 failed)
|
||||
Scans (metadata) fetched: 428 (2 already cached, 0 metadata failed)
|
||||
Metadata written: 428 (new JSON files)
|
||||
──────────────────────────────────────────────
|
||||
Scans CSV: archives/scans.csv
|
||||
@@ -203,10 +214,11 @@ Every run prints a summary table on completion:
|
||||
──────────────────────────────────────────────
|
||||
```
|
||||
|
||||
- **Scans fetched**: metadata detail page was retrieved from the server this run.
|
||||
- **Scans (metadata) fetched**: RootView scan detail page was retrieved (grid params, etc.). This does not mean the mosaic downloaded successfully; use **Mosaics downloaded** / **Mosaics failed** when not in `--metadata-only` mode.
|
||||
- **Already cached**: `metadata.json` already existed on disk; no HTTP request was made.
|
||||
- **Failed**: fetch error or scan missing required grid parameters.
|
||||
- **metadata failed**: metadata fetch error or scan missing required grid parameters.
|
||||
- **Metadata written**: new `metadata.json` files created (shown in `--metadata-only` mode).
|
||||
- **Mosaics failed** (when present): mosaic URL was requested but the file was not saved (e.g. HTTP 404, or empty body). Check the log for the exact URL.
|
||||
- Mosaic and tile counts appear in their respective modes.
|
||||
|
||||
---
|
||||
@@ -219,3 +231,4 @@ Every run prints a summary table on completion:
|
||||
| `beautifulsoup4` + `lxml` | HTML parsing |
|
||||
| `pyyaml` | Config file |
|
||||
| `tqdm` | Progress bars |
|
||||
| `piexif` | EXIF for downloaded mosaics |
|
||||
|
||||
@@ -23,7 +23,121 @@ timeout: 60
|
||||
# Delay between requests to a single machine (seconds, float ok)
|
||||
request_delay: 0.5
|
||||
|
||||
# Write EXIF to each mosaic JPEG after download (GPS, keywords, user comment, etc.)
|
||||
write_exif: true
|
||||
|
||||
# Per-machine plot, enclosure, treatments, WGS84 GPS, and elevation (masl) for mosaic EXIF.
|
||||
# co2_treatment: ambient | elevated. enclosure: true/false (aboveground enclosure).
|
||||
# Omitted or missing machines: no GPS/keywords in EXIF for that machine.
|
||||
machine_metadata:
|
||||
"BW1-4 [AMR-15]":
|
||||
plot_number: 4
|
||||
enclosure: true
|
||||
temp_treatment: 4.5
|
||||
co2_treatment: elevated
|
||||
latitude_wgs_84: 47.5051942
|
||||
longitude_wgs_84: -93.4539794
|
||||
elevation_masl: 412.77
|
||||
"BW1-6 [AMR-19]":
|
||||
plot_number: 6
|
||||
enclosure: true
|
||||
temp_treatment: "+0 Blowers only"
|
||||
co2_treatment: ambient
|
||||
latitude_wgs_84: 47.5050653
|
||||
longitude_wgs_84: -93.4534703
|
||||
elevation_masl: 412.763
|
||||
"BW1-7 [AMR-18]":
|
||||
plot_number: 7
|
||||
enclosure: false
|
||||
temp_treatment: ambient
|
||||
co2_treatment: ambient
|
||||
latitude_wgs_84: 47.5049308
|
||||
longitude_wgs_84: -93.4531358
|
||||
elevation_masl: 412.807
|
||||
"BW2-8 [AMR-25]":
|
||||
plot_number: 8
|
||||
enclosure: true
|
||||
temp_treatment: 6.75
|
||||
co2_treatment: ambient
|
||||
latitude_wgs_84: 47.5055697
|
||||
longitude_wgs_84: -93.4538811
|
||||
elevation_masl: 412.778
|
||||
"BW2-10 [AMR-22]":
|
||||
plot_number: 10
|
||||
enclosure: true
|
||||
temp_treatment: 9
|
||||
co2_treatment: elevated
|
||||
latitude_wgs_84: 47.5054358
|
||||
longitude_wgs_84: -93.4532131
|
||||
elevation_masl: 412.835
|
||||
"BW2-11 [AMR-23]":
|
||||
plot_number: 11
|
||||
enclosure: true
|
||||
temp_treatment: 2.25
|
||||
co2_treatment: elevated
|
||||
latitude_wgs_84: 47.5053442
|
||||
longitude_wgs_84: -93.4526772
|
||||
elevation_masl: 412.858
|
||||
"BW2-13 [AMR-24]":
|
||||
plot_number: 13
|
||||
enclosure: true
|
||||
temp_treatment: 4.5
|
||||
co2_treatment: ambient
|
||||
latitude_wgs_84: 47.5057086
|
||||
longitude_wgs_84: -93.4530022
|
||||
elevation_masl: 412.856
|
||||
"BW3-16 [AMR-16]":
|
||||
plot_number: 16
|
||||
enclosure: true
|
||||
temp_treatment: 6.75
|
||||
co2_treatment: elevated
|
||||
latitude_wgs_84: 47.5060981
|
||||
longitude_wgs_84: -93.4531872
|
||||
elevation_masl: 412.953
|
||||
"BW3-17 [AMR-20]":
|
||||
plot_number: 17
|
||||
enclosure: true
|
||||
temp_treatment: 9
|
||||
co2_treatment: ambient
|
||||
latitude_wgs_84: 47.5059761
|
||||
longitude_wgs_84: -93.4527164
|
||||
elevation_masl: 412.978
|
||||
"BW3-19 [AMR-21]":
|
||||
plot_number: 19
|
||||
enclosure: true
|
||||
temp_treatment: "+0 Blowers only"
|
||||
co2_treatment: elevated
|
||||
latitude_wgs_84: 47.5064783
|
||||
longitude_wgs_84: -93.4534736
|
||||
elevation_masl: 412.977
|
||||
"BW3-20 [AMR-26]":
|
||||
plot_number: 20
|
||||
enclosure: true
|
||||
temp_treatment: 2.25
|
||||
co2_treatment: ambient
|
||||
latitude_wgs_84: 47.5063689
|
||||
longitude_wgs_84: -93.4531658
|
||||
elevation_masl: 412.944
|
||||
"BW3-21 [AMR-17]":
|
||||
plot_number: 21
|
||||
enclosure: false
|
||||
temp_treatment: ambient
|
||||
co2_treatment: ambient
|
||||
latitude_wgs_84: 47.5062539
|
||||
longitude_wgs_84: -93.4527486
|
||||
elevation_masl: 412.908
|
||||
|
||||
# Optional: limit to specific machines by label (comment out to scrape all)
|
||||
# machines:
|
||||
# - "BW1-4 [AMR-15]"
|
||||
# - "BW1-6 [AMR-19]"
|
||||
# - "BW1-7 [AMR-18]"
|
||||
# - "BW2-8 [AMR-25]"
|
||||
# - "BW2-10 [AMR-22]"
|
||||
# - "BW2-11 [AMR-23]"
|
||||
# - "BW2-13 [AMR-24]"
|
||||
# - "BW3-16 [AMR-16]"
|
||||
# - "BW3-17 [AMR-20]"
|
||||
# - "BW3-19 [AMR-21]"
|
||||
# - "BW3-20 [AMR-26]"
|
||||
# - "BW3-21 [AMR-17]"
|
||||
|
||||
@@ -3,4 +3,5 @@ beautifulsoup4>=4.12.0
|
||||
lxml>=5.0.0
|
||||
pyyaml>=6.0.1
|
||||
tqdm>=4.66.0
|
||||
piexif>=1.1.3
|
||||
pytest>=8.0
|
||||
|
||||
+15
-3
@@ -297,13 +297,25 @@ def _print_summary(
|
||||
log.info(sep)
|
||||
log.info(row("Machines:", str(len(machines))))
|
||||
log.info(
|
||||
row("Scans fetched:", str(totals.scans_fetched),
|
||||
row(
|
||||
"Scans (metadata) fetched:",
|
||||
str(totals.scans_fetched),
|
||||
f"{totals.scans_skipped} already cached, "
|
||||
f"{totals.scans_failed} failed"
|
||||
if totals.scans_skipped or totals.scans_failed else "")
|
||||
f"{totals.scans_failed} metadata failed"
|
||||
if totals.scans_skipped or totals.scans_failed
|
||||
else "",
|
||||
)
|
||||
)
|
||||
if not metadata_only:
|
||||
log.info(row("Mosaics downloaded:", str(totals.mosaics_downloaded)))
|
||||
if totals.mosaics_failed:
|
||||
log.info(
|
||||
row(
|
||||
"Mosaics failed:",
|
||||
str(totals.mosaics_failed),
|
||||
"0 bytes or HTTP error; see log above",
|
||||
)
|
||||
)
|
||||
if not metadata_only and not mosaic_only:
|
||||
log.info(row("Tiles downloaded:", str(totals.tiles_downloaded)))
|
||||
if metadata_only:
|
||||
|
||||
+254
@@ -0,0 +1,254 @@
|
||||
"""
|
||||
Write EXIF metadata into downloaded mosaic JPEGs (piexif, no re-encode).
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import piexif
|
||||
from piexif import ExifIFD, GPSIFD, ImageIFD
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
USER_COMMENT_ASCII = b"ASCII\x00\x00\x00"
|
||||
|
||||
|
||||
def _fmt_exif_datetime(scan_time: str) -> str:
|
||||
"""`YYYY-MM-DD HH:MM:SS` -> `YYYY:MM:DD HH:MM:SS` for EXIF; empty on failure."""
|
||||
m = re.match(
|
||||
r"^(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})$", scan_time.strip()
|
||||
)
|
||||
if not m:
|
||||
return ""
|
||||
y, mo, d, h, mi, s = m.groups()
|
||||
return f"{y}:{mo}:{d} {h}:{mi}:{s}"
|
||||
|
||||
|
||||
def _fmt_dim(v: Any) -> str:
|
||||
if v is None:
|
||||
return "?"
|
||||
if isinstance(v, float) and v == int(v):
|
||||
return str(int(v))
|
||||
return str(v)
|
||||
|
||||
|
||||
def _fmt_machine_meta_scalar(v: Any) -> str:
|
||||
if isinstance(v, float) and v == int(v):
|
||||
return str(int(v))
|
||||
if isinstance(v, float):
|
||||
return format(v, "g")
|
||||
return str(v).strip()
|
||||
|
||||
|
||||
def _user_comment_treatment_suffix(machine_meta: dict[str, Any] | None) -> str:
    """Return the ``" | key value | ..."`` tail appended to the user comment.

    Only the treatment fields present in *machine_meta* (``plot_number``,
    ``enclosure``, ``temp_treatment``, ``co2_treatment``) contribute; the
    result is empty when there is no metadata or none of them is set.
    """
    if not machine_meta:
        return ""

    fragments: list[str] = []

    plot = machine_meta.get("plot_number")
    if plot is not None and str(plot).strip() != "":
        fragments.append(f"plot_number {_fmt_machine_meta_scalar(plot)}")

    enclosure = machine_meta.get("enclosure")
    if enclosure is not None:
        if isinstance(enclosure, bool):
            fragments.append("enclosure yes" if enclosure else "enclosure no")
        else:
            fragments.append(f"enclosure {_fmt_machine_meta_scalar(enclosure)}")

    temp = machine_meta.get("temp_treatment")
    if temp is not None:
        fragments.append(f"temp_treatment {_fmt_machine_meta_scalar(temp)}")

    co2 = machine_meta.get("co2_treatment")
    if co2 is not None:
        fragments.append(f"co2_treatment {str(co2).strip().lower()}")

    return " | " + " | ".join(fragments) if fragments else ""
|
||||
|
||||
|
||||
def _build_user_comment(
    scan_meta: dict[str, Any],
    machine: dict[str, Any],
    scan_id: int,
    machine_meta: dict[str, Any] | None = None,
) -> bytes:
    """Build the EXIF ``UserComment`` payload for a mosaic.

    The text names the scan and machine, summarises the grid, points to
    ``metadata.json`` and ends with any treatment suffix. It is returned as
    the ASCII character-code prefix followed by the ASCII-encoded text
    (unencodable characters are replaced).
    """
    cols = scan_meta.get("nx", "?")
    rows = scan_meta.get("ny", "?")
    step_x, step_y, extent_x, extent_y = (
        _fmt_dim(scan_meta.get(key)) for key in ("dx", "dy", "end_x", "end_y")
    )
    segments = [
        f"SPRUCE scan {scan_id}",
        f"machine {machine['label']}",
        f"grid {cols}x{rows} @ {step_x}x{step_y}mm over {extent_x}x{extent_y}mm",
        "see metadata.json",
    ]
    comment = " | ".join(segments) + _user_comment_treatment_suffix(machine_meta)
    return USER_COMMENT_ASCII + comment.encode("ascii", errors="replace")
|
||||
|
||||
|
||||
def _co2_keyword(co2: str) -> str:
|
||||
c = (co2 or "").strip().lower()
|
||||
if c == "ambient":
|
||||
return "aCO2"
|
||||
if c == "elevated":
|
||||
return "eCO2"
|
||||
return c or "co2"
|
||||
|
||||
|
||||
def _enclosure_keyword(enclosure: Any) -> str | None:
|
||||
if enclosure is None:
|
||||
return None
|
||||
if isinstance(enclosure, bool):
|
||||
return "enclosed" if enclosure else "no enclosure"
|
||||
s = str(enclosure).strip().lower()
|
||||
if s in ("yes", "y", "true", "1"):
|
||||
return "enclosed"
|
||||
if s in ("no", "n", "false", "0"):
|
||||
return "no enclosure"
|
||||
return None
|
||||
|
||||
|
||||
def _temp_treatment_keyword(temp: Any) -> str | None:
|
||||
if temp is None or isinstance(temp, bool):
|
||||
return None
|
||||
if isinstance(temp, (int, float)):
|
||||
w = float(temp)
|
||||
s = f"{w:g}C" if w != int(w) else f"{int(w)}C"
|
||||
return f"temp +{s}"
|
||||
t = str(temp).strip()
|
||||
if not t:
|
||||
return None
|
||||
try:
|
||||
w = float(t)
|
||||
s = f"{w:g}C" if w != int(w) else f"{int(w)}C"
|
||||
return f"temp +{s}"
|
||||
except (TypeError, ValueError):
|
||||
return f"temp {t}"
|
||||
|
||||
|
||||
def _build_xp_keywords(machine_meta: dict[str, Any] | None) -> bytes | None:
    """Encode the treatment keywords for the ``XPKeywords`` tag.

    Returns ``None`` when there is no metadata or no treatment field yields
    a keyword. Otherwise returns ``"SPRUCE; <kw>; ..."`` encoded as
    UTF-16LE with a two-byte NUL terminator.
    """
    if not machine_meta:
        return None

    keywords: list[str] = []

    plot = machine_meta.get("plot_number")
    if plot is not None and str(plot).strip() != "":
        keywords.append(f"plot {plot}")

    enclosure_kw = _enclosure_keyword(machine_meta.get("enclosure"))
    if enclosure_kw:
        keywords.append(enclosure_kw)

    temp = machine_meta.get("temp_treatment")
    if temp is not None:
        temp_kw = _temp_treatment_keyword(temp)
        if temp_kw:
            keywords.append(temp_kw)

    co2 = machine_meta.get("co2_treatment")
    if co2 is not None:
        keywords.append(_co2_keyword(str(co2)))

    if not keywords:
        return None
    joined = "; ".join(["SPRUCE", *keywords])
    return joined.encode("utf-16le") + b"\x00\x00"
|
||||
|
||||
def _decimal_to_dms_rational(deg: float) -> list[tuple[tuple[int, int], ...]]:
|
||||
abs_deg = abs(deg)
|
||||
d = int(abs_deg)
|
||||
t_min = (abs_deg - d) * 60.0
|
||||
m = int(t_min)
|
||||
s = (t_min - m) * 60.0
|
||||
sec = round(s * 1_000_000)
|
||||
return [(d, 1), (m, 1), (sec, 1_000_000)]
|
||||
|
||||
|
||||
def _build_gps_ifd(
    machine_meta: dict[str, Any] | None, jpeg_path: Path
) -> dict[int, Any] | None:
    """Build the GPS IFD from *machine_meta*, or return None to omit GPS.

    GPS is written only when both ``latitude_wgs_84`` and
    ``longitude_wgs_84`` are present, numeric and in range. Altitude is
    added when ``elevation_masl`` is a finite number. *jpeg_path* is used
    only in warning messages.
    """
    import math  # local import: only needed for NaN / finiteness handling

    if not machine_meta:
        return None
    lat_raw = machine_meta.get("latitude_wgs_84")
    lon_raw = machine_meta.get("longitude_wgs_84")
    if lat_raw is None or lon_raw is None:
        return None

    try:
        lat = float(lat_raw)
        lon = float(lon_raw)
    except (TypeError, ValueError):
        # Non-numeric config values: treat as invalid, do not abort the write.
        lat = lon = math.nan
    # NaN fails both chained comparisons, so it is rejected here as well.
    if not (-90 <= lat <= 90) or not (-180 <= lon <= 180):
        log.warning("Invalid lat/lon for EXIF GPS, skipping GPS: %s", jpeg_path)
        return None

    gps: dict[int, Any] = {
        GPSIFD.GPSVersionID: (2, 0, 0, 0),
        GPSIFD.GPSLatitudeRef: b"N" if lat >= 0 else b"S",
        GPSIFD.GPSLatitude: _decimal_to_dms_rational(abs(lat)),
        GPSIFD.GPSLongitudeRef: b"E" if lon >= 0 else b"W",
        GPSIFD.GPSLongitude: _decimal_to_dms_rational(abs(lon)),
    }

    alt_raw = machine_meta.get("elevation_masl")
    if alt_raw is not None:
        try:
            alt = float(alt_raw)
        except (TypeError, ValueError):
            alt = math.nan
        if math.isfinite(alt):
            alt_abs = abs(alt)
            # AltitudeRef: 0 when the value is >= 0, 1 when negative.
            gps[GPSIFD.GPSAltitudeRef] = 0 if alt >= 0 else 1
            if alt_abs == int(alt_abs):
                gps[GPSIFD.GPSAltitude] = (int(alt_abs), 1)
            else:
                # Store as a rational in thousandths.
                gps[GPSIFD.GPSAltitude] = (round(alt_abs * 1000), 1000)
        else:
            log.warning(
                "Invalid elevation for EXIF GPS, skipping altitude: %s", jpeg_path
            )
    return gps


def write_mosaic_exif(
    jpeg_path: Path,
    scan_meta: dict[str, Any],
    machine: dict[str, Any],
    scan_id: int,
    machine_meta: dict[str, Any] | None,
    processing_software: str = "spruce-scraper/1.0",
) -> bool:
    """
    Insert EXIF into a mosaic JPEG. Returns True on success.

    Writes description, make/model/software, optional artist and date tags,
    a user comment, optional ``XPKeywords`` and optional GPS into
    *jpeg_path* via ``piexif.dump`` / ``piexif.insert``.

    This is best-effort: on any failure it logs a warning and returns
    False. Invalid GPS values in *machine_meta* only cause the GPS block
    (or altitude) to be skipped, not the whole write.
    """
    try:
        name = (scan_meta.get("name") or "").strip()
        desc = f"{machine['label']} scan {scan_id}"
        if name:
            desc = f"{desc} ({name})"

        ver = (machine.get("version") or "").strip()
        artist = (scan_meta.get("user") or "").strip()
        scan_time = (scan_meta.get("scan_time") or "").strip()
        dt = _fmt_exif_datetime(scan_time)
        # Encoded once and shared by all three date tags.
        dt_b = dt.encode("ascii") if dt else b""

        zeroth: dict[int, Any] = {
            ImageIFD.ImageDescription: desc.encode("utf-8", errors="replace"),
            ImageIFD.Make: b"RootView",
            ImageIFD.Model: str(machine.get("label", "")).encode(
                "utf-8", errors="replace"
            ),
            ImageIFD.Software: (
                f"RootView {ver}".encode("utf-8") if ver else b"RootView"
            ),
            ImageIFD.ProcessingSoftware: processing_software.encode(
                "utf-8", errors="replace"
            ),
        }
        if artist:
            zeroth[ImageIFD.Artist] = artist.encode("utf-8", errors="replace")
        if dt_b:
            zeroth[ImageIFD.DateTime] = dt_b

        keywords = _build_xp_keywords(machine_meta)
        if keywords is not None:
            zeroth[ImageIFD.XPKeywords] = keywords

        exif_ifd: dict[int, Any] = {
            ExifIFD.UserComment: _build_user_comment(
                scan_meta, machine, scan_id, machine_meta
            ),
        }
        if dt_b:
            exif_ifd[ExifIFD.DateTimeOriginal] = dt_b
            exif_ifd[ExifIFD.DateTimeDigitized] = dt_b

        exif_dict: dict[str, Any] = {
            "0th": zeroth,
            "Exif": exif_ifd,
        }

        gps = _build_gps_ifd(machine_meta, jpeg_path)
        if gps is not None:
            exif_dict["GPS"] = gps

        exif_bytes = piexif.dump(exif_dict)
        piexif.insert(exif_bytes, str(jpeg_path))
    except Exception as exc:
        # Deliberate catch-all: report the failure via the return value.
        log.warning("EXIF write failed for %s: %s", jpeg_path, exc)
        return False
    return True
|
||||
+29
-4
@@ -14,11 +14,12 @@ from typing import Any
|
||||
class RunStats:
|
||||
"""Accumulated counters for one or more machines."""
|
||||
|
||||
scans_fetched: int = 0 # metadata fetched from server this run
|
||||
scans_fetched: int = 0 # scan detail page fetched (metadata), not tiles/mosaics
|
||||
scans_skipped: int = 0 # metadata.json already on disk; no HTTP request
|
||||
scans_failed: int = 0 # fetch error or missing grid params
|
||||
scans_failed: int = 0 # metadata fetch error or missing grid params
|
||||
metadata_written: int = 0 # new metadata.json files created
|
||||
mosaics_downloaded: int = 0
|
||||
mosaics_failed: int = 0 # mosaic URL attempted but 0 bytes / HTTP error
|
||||
tiles_downloaded: int = 0
|
||||
|
||||
def merge(self, other: "RunStats") -> None:
|
||||
@@ -27,10 +28,12 @@ class RunStats:
|
||||
self.scans_failed += other.scans_failed
|
||||
self.metadata_written += other.metadata_written
|
||||
self.mosaics_downloaded += other.mosaics_downloaded
|
||||
self.mosaics_failed += other.mosaics_failed
|
||||
self.tiles_downloaded += other.tiles_downloaded
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
from spruce.exif import write_mosaic_exif
|
||||
from spruce.paths import machine_dir_name, tile_dest, mosaic_dest, _extract_date
|
||||
from spruce.progress import ProgressTracker, CsvWriter
|
||||
from spruce.session import MachineSession
|
||||
@@ -50,6 +53,7 @@ def _download_mosaic(
|
||||
mosaic_path: Path,
|
||||
progress: ProgressTracker,
|
||||
machine: dict[str, Any],
|
||||
config: dict[str, Any],
|
||||
dry_run: bool,
|
||||
) -> bool:
|
||||
"""Download the scan mosaic if not already done. Returns True if downloaded."""
|
||||
@@ -62,6 +66,13 @@ def _download_mosaic(
|
||||
log.info("[%s] Downloading mosaic for scan %d …", machine["label"], scan_id)
|
||||
size = sess.download_file(url, mosaic_path)
|
||||
if size:
|
||||
if config.get("write_exif", True):
|
||||
mmeta: dict[str, Any] | None = config.get("machine_metadata", {}).get(
|
||||
machine["label"]
|
||||
)
|
||||
write_mosaic_exif(
|
||||
mosaic_path, scan_meta, machine, scan_id, mmeta
|
||||
)
|
||||
progress.mark_done(url)
|
||||
progress.save()
|
||||
log.info(
|
||||
@@ -283,10 +294,24 @@ def process_scan(
|
||||
mosaic_just_downloaded = False
|
||||
else:
|
||||
mosaic_just_downloaded = _download_mosaic(
|
||||
sess, scan_meta, scan_id, mosaic_path, progress, machine, dry_run
|
||||
sess,
|
||||
scan_meta,
|
||||
scan_id,
|
||||
mosaic_path,
|
||||
progress,
|
||||
machine,
|
||||
config,
|
||||
dry_run,
|
||||
)
|
||||
if mosaic_just_downloaded:
|
||||
if not metadata_only and mosaic_just_downloaded:
|
||||
stats.mosaics_downloaded += 1
|
||||
elif (
|
||||
not metadata_only
|
||||
and not dry_run
|
||||
and not mosaic_already_done
|
||||
and not mosaic_just_downloaded
|
||||
):
|
||||
stats.mosaics_failed += 1
|
||||
|
||||
# Write scan-level CSV row only if this scan hasn't been recorded before.
|
||||
if mosaic_already_done and not metadata_only:
|
||||
|
||||
@@ -105,5 +105,7 @@ def load_config(path: str) -> dict:
|
||||
cfg.setdefault("timeout", 60)
|
||||
cfg.setdefault("request_delay", 0.5)
|
||||
cfg.setdefault("tile_scale", 1)
|
||||
cfg.setdefault("write_exif", True)
|
||||
cfg.setdefault("machine_metadata", {})
|
||||
cfg["workers"] = _clamp_workers(cfg["workers"])
|
||||
return cfg
|
||||
|
||||
Vendored
BIN
Binary file not shown.
|
After Width: | Height: | Size: 631 B |
@@ -0,0 +1,138 @@
|
||||
"""Tests for spruce.exif — mosaic EXIF injection."""
|
||||
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
import piexif
|
||||
import pytest
|
||||
from piexif import ExifIFD, GPSIFD, ImageIFD
|
||||
|
||||
from spruce.exif import USER_COMMENT_ASCII, write_mosaic_exif
|
||||
|
||||
FIXTURES = Path(__file__).parent / "fixtures"
|
||||
BLANK_JPEG = FIXTURES / "blank.jpg"
|
||||
|
||||
|
||||
def _dms_pair_to_float(num: int, den: int) -> float:
|
||||
return num / den if den else 0.0
|
||||
|
||||
|
||||
def _dms_to_deg(
    ref: bytes, dms: list[tuple[tuple[int, int], tuple[int, int], tuple[int, int]]]
) -> float:
    """Turn an EXIF (degrees, minutes, seconds) rational triple into signed degrees.

    The result is negative when *ref* is ``b"S"`` or ``b"W"``.
    """
    degrees, minutes, seconds = dms
    magnitude = (
        _dms_pair_to_float(degrees[0], degrees[1])
        + _dms_pair_to_float(minutes[0], minutes[1]) / 60.0
        + _dms_pair_to_float(seconds[0], seconds[1]) / 3600.0
    )
    return -magnitude if ref in (b"S", b"W") else magnitude
|
||||
|
||||
|
||||
@pytest.fixture
def tmp_jpeg(tmp_path: Path) -> Path:
    """Copy the blank fixture JPEG into the per-test temp dir as ``mosaic.jpg``."""
    target = tmp_path / "mosaic.jpg"
    shutil.copy(BLANK_JPEG, target)
    return target
|
||||
|
||||
|
||||
@pytest.fixture
def scan_meta() -> dict:
    """Sample scan metadata: identity fields plus grid counts, steps and extents."""
    return dict(
        scan_id=157743,
        name="Plot 7 AMR18 Full tube scan",
        scan_time="2024-06-28 11:00:00",
        user="Joanne",
        nx=103,
        ny=328,
        dx=3.01,
        dy=2.26,
        end_x=310.0,
        end_y=740.0,
    )
|
||||
|
||||
|
||||
@pytest.fixture
def machine() -> dict:
    """Sample machine record with label, server version and machine id."""
    return dict(label="BW1-7 [AMR-18]", version="3.0.0.18", machine_id="7")
|
||||
|
||||
|
||||
def test_write_mosaic_exif_round_trip(tmp_jpeg: Path, scan_meta: dict, machine: dict):
    """Without machine metadata, core tags round-trip and no GPS/keywords appear."""
    written = write_mosaic_exif(tmp_jpeg, scan_meta, machine, 157743, None)
    assert written

    loaded = piexif.load(str(tmp_jpeg))
    zeroth = loaded["0th"]
    exif_ifd = loaded["Exif"]

    assert exif_ifd[ExifIFD.DateTimeOriginal] == b"2024:06:28 11:00:00"
    expected_desc = b"BW1-7 [AMR-18] scan 157743 (Plot 7 AMR18 Full tube scan)"
    assert zeroth[ImageIFD.ImageDescription] == expected_desc
    assert zeroth[ImageIFD.Artist] == b"Joanne"
    assert zeroth[ImageIFD.Software] == b"RootView 3.0.0.18"
    assert zeroth[ImageIFD.ProcessingSoftware] == b"spruce-scraper/1.0"

    comment = exif_ifd[ExifIFD.UserComment]
    assert comment.startswith(USER_COMMENT_ASCII)
    body = comment[len(USER_COMMENT_ASCII) :]
    for needle in (b"SPRUCE scan 157743", b"103x328", b"see metadata.json"):
        assert needle in body
    assert b"plot_number" not in body

    assert ImageIFD.XPKeywords not in zeroth
    assert "GPS" not in loaded or not loaded["GPS"]
|
||||
|
||||
|
||||
def test_gps_decode(tmp_jpeg: Path, scan_meta: dict, machine: dict):
    """Latitude, longitude and altitude decode back to the configured values."""
    machine_meta = dict(
        latitude_wgs_84=47.5047,
        longitude_wgs_84=-93.4530,
        elevation_masl=418.0,
    )
    assert write_mosaic_exif(tmp_jpeg, scan_meta, machine, 157743, machine_meta)

    gps_ifd = piexif.load(str(tmp_jpeg))["GPS"]
    decoded_lat = _dms_to_deg(
        gps_ifd[GPSIFD.GPSLatitudeRef], gps_ifd[GPSIFD.GPSLatitude]
    )
    decoded_lon = _dms_to_deg(
        gps_ifd[GPSIFD.GPSLongitudeRef], gps_ifd[GPSIFD.GPSLongitude]
    )
    assert abs(decoded_lat - 47.5047) < 1e-5
    assert abs(decoded_lon - (-93.4530)) < 1e-5

    alt_num, alt_den = gps_ifd[GPSIFD.GPSAltitude]
    assert alt_num / alt_den == pytest.approx(418.0)
|
||||
|
||||
|
||||
def test_no_gps_no_keywords_when_meta_none(
    tmp_jpeg: Path, scan_meta: dict, machine: dict
):
    """Passing ``None`` as machine metadata writes neither XPKeywords nor GPS."""
    written = write_mosaic_exif(tmp_jpeg, scan_meta, machine, 157743, None)
    assert written

    loaded = piexif.load(str(tmp_jpeg))
    assert ImageIFD.XPKeywords not in loaded["0th"]
    assert "GPS" not in loaded or not loaded["GPS"]
|
||||
|
||||
|
||||
def test_xp_keywords_treatment(tmp_jpeg: Path, scan_meta: dict, machine: dict):
    """Treatment fields appear in both XPKeywords and the user comment."""
    machine_meta = dict(
        plot_number=7,
        enclosure=False,
        temp_treatment=2.25,
        co2_treatment="ambient",
    )
    assert write_mosaic_exif(tmp_jpeg, scan_meta, machine, 157743, machine_meta)

    loaded = piexif.load(str(tmp_jpeg))

    # XPKeywords is decoded from UTF-16LE; trailing NULs are stripped.
    keywords = bytes(loaded["0th"][ImageIFD.XPKeywords]).decode("utf-16le")
    keywords = keywords.rstrip("\x00")
    for expected in ("SPRUCE", "plot 7", "no enclosure", "temp", "2.25", "aCO2"):
        assert expected in keywords

    raw_comment = loaded["Exif"][ExifIFD.UserComment]
    comment = raw_comment[len(USER_COMMENT_ASCII) :].decode("ascii", errors="replace")
    for expected in (
        "plot_number 7",
        "enclosure no",
        "temp_treatment 2.25",
        "co2_treatment ambient",
    ):
        assert expected in comment
|
||||
@@ -143,7 +143,11 @@ def test_recheck_archive_skips_mosaic_urls(tmp_path):
|
||||
mosaic_url = "http://192.0.2.1:8011/RootView_Database/158374/mosaic.jpg"
|
||||
p.mark_done(mosaic_url)
|
||||
p.save()
|
||||
# recheck_archive verifies that a non-empty mosaic exists under */*/<scan_id>/mosaic.jpg
|
||||
mpath = tmp_path / "M" / "2020-01-01" / "158374" / "mosaic.jpg"
|
||||
mpath.parent.mkdir(parents=True)
|
||||
mpath.write_bytes(b"\xff\xd8\xff\xd9") # minimal JPEG soff + eoi
|
||||
|
||||
removed = recheck_archive(tmp_path, p)
|
||||
assert removed == 0
|
||||
assert p.is_done(mosaic_url) # mosaics are never touched
|
||||
assert p.is_done(mosaic_url)
|
||||
|
||||
@@ -60,6 +60,8 @@ def test_load_config_defaults(tmp_path):
|
||||
assert cfg["timeout"] == 60
|
||||
assert cfg["request_delay"] == 0.5
|
||||
assert cfg["output_dir"] == "archives"
|
||||
assert cfg["write_exif"] is True
|
||||
assert cfg["machine_metadata"] == {}
|
||||
|
||||
|
||||
def test_load_config_overrides(tmp_path):
|
||||
|
||||
Reference in New Issue
Block a user