From e8d3bf718020e3f5586dfcf74c64f13eee6c2145 Mon Sep 17 00:00:00 2001 From: James Kolpack Date: Fri, 24 Apr 2026 18:21:37 -0400 Subject: [PATCH] Add EXIF writing and machine metadata support --- README.md | 21 +++- config.example.yaml | 114 ++++++++++++++++++ requirements.txt | 1 + spruce/cli.py | 18 ++- spruce/exif.py | 254 +++++++++++++++++++++++++++++++++++++++ spruce/orchestrator.py | 33 ++++- spruce/settings.py | 2 + tests/fixtures/blank.jpg | Bin 0 -> 631 bytes tests/test_exif.py | 138 +++++++++++++++++++++ tests/test_recheck.py | 6 +- tests/test_settings.py | 2 + 11 files changed, 577 insertions(+), 12 deletions(-) create mode 100644 spruce/exif.py create mode 100644 tests/fixtures/blank.jpg create mode 100644 tests/test_exif.py diff --git a/README.md b/README.md index 4d865d7..4da9bd9 100644 --- a/README.md +++ b/README.md @@ -117,6 +117,15 @@ python scraper.py --machine "BW3-20 [AMR-26]" --scan-id 158374 --workers 4 | `--list-scans` | Print all scans for `--machine` and exit | | `--verbose` / `-v` | Debug logging | +### `config.yaml` (optional keys) + +| Key | Description | +|---|---| +| `write_exif` | If true (default), write EXIF to each `mosaic.jpg` after download. Set to false to skip. | +| `machine_metadata` | Map of machine label → optional fields for mosaic EXIF: `plot_number`, `enclosure` (bool), `temp_treatment` (number or string), `co2_treatment` (`ambient` / `elevated`), `latitude_wgs_84`, `longitude_wgs_84`, `elevation_masl`. Omitted keys are not written. | + +`config.example.yaml` lists all 12 machine labels with full `machine_metadata` (plot, enclosure, treatments, WGS84 coordinates, elevation) and an optional `machines` filter (commented). + --- ## Output layout @@ -131,7 +140,7 @@ archives/ └── 2024-07-29/ └── 158374/ ├── metadata.json # full scan parameters (grid, timestamps, etc.) 
- ├── mosaic.jpg # pre-stitched full image (~16 MB) + ├── mosaic.jpg # pre-stitched full image (~16 MB), EXIF after download └── tiles/ ├── tile_r000_c000.jpg # row 0, column 0 (zero-padding matches grid size) ├── tile_r000_c001.jpg @@ -140,6 +149,8 @@ archives/ Tile filenames encode position: `tile_r{row}_c{col}.jpg` where row increases with depth (Y in mm) and column increases along the tube circumference (X in mm). +**Mosaic `mosaic.jpg` EXIF** (when `write_exif` is true in `config.yaml`, default on): set immediately after a successful download via `piexif` (no re-encoding). Includes `DateTime` / `DateTimeOriginal` / `DateTimeDigitized` (from scan time), `ImageDescription` (machine, scan id, name), `Make` = RootView, `Model` = machine label, `Software` = RootView + server version, `ProcessingSoftware` = this scraper, `Artist` (user), a one-line `UserComment` (grid size, pointer to `metadata.json`, and when set in `machine_metadata`: `plot_number`, `enclosure`, `temp_treatment`, `co2_treatment`), `XPKeywords` with the same treatment fields when any of those four are set, and GPS position when both `latitude_wgs_84` and `longitude_wgs_84` are set (plus GPS altitude when `elevation_masl` is also set). See `config.example.yaml` for the `machine_metadata` layout. 
+ ### Metadata files **`scans.csv`** columns: `machine`, `machine_id`, `scan_id`, `name`, `scan_time`, `start_x`, `start_y`, `end_x`, `end_y`, `dx`, `dy`, `nx`, `ny`, `total_tiles`, `scan_lines`, `scan_mode`, `start_datetime`, `end_datetime`, `status`, `user`, `disk_space_mb`, `mosaic_url`, `mosaic_local_path`, `mosaic_on_disk` @@ -195,7 +206,7 @@ Every run prints a summary table on completion: Run complete ────────────────────────────────────────────── Machines: 1 - Scans fetched: 428 (2 already cached, 0 failed) + Scans (metadata) fetched: 428 (2 already cached, 0 metadata failed) Metadata written: 428 (new JSON files) ────────────────────────────────────────────── Scans CSV: archives/scans.csv @@ -203,10 +214,11 @@ Every run prints a summary table on completion: ────────────────────────────────────────────── ``` -- **Scans fetched**: metadata detail page was retrieved from the server this run. +- **Scans (metadata) fetched**: RootView scan detail page was retrieved (grid params, etc.). This does not mean the mosaic downloaded successfully; use **Mosaics downloaded** / **Mosaics failed** when not in `--metadata-only` mode. - **Already cached**: `metadata.json` already existed on disk; no HTTP request was made. -- **Failed**: fetch error or scan missing required grid parameters. +- **metadata failed**: metadata fetch error or scan missing required grid parameters. - **Metadata written**: new `metadata.json` files created (shown in `--metadata-only` mode). +- **Mosaics failed** (when present): mosaic URL was requested but the file was not saved (e.g. HTTP 404, or empty body). Check the log for the exact URL. - Mosaic and tile counts appear in their respective modes. 
--- @@ -219,3 +231,4 @@ Every run prints a summary table on completion: | `beautifulsoup4` + `lxml` | HTML parsing | | `pyyaml` | Config file | | `tqdm` | Progress bars | +| `piexif` | EXIF for downloaded mosaics | diff --git a/config.example.yaml b/config.example.yaml index 7df15f2..755212d 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -23,7 +23,121 @@ timeout: 60 # Delay between requests to a single machine (seconds, float ok) request_delay: 0.5 +# Write EXIF to each mosaic JPEG after download (GPS, keywords, user comment, etc.) +write_exif: true + +# Per-machine plot, enclosure, treatments, WGS84 GPS, and elevation (masl) for mosaic EXIF. +# co2_treatment: ambient | elevated. enclosure: true/false (aboveground enclosure). +# Omitted or missing machines: no GPS/keywords in EXIF for that machine. +machine_metadata: + "BW1-4 [AMR-15]": + plot_number: 4 + enclosure: true + temp_treatment: 4.5 + co2_treatment: elevated + latitude_wgs_84: 47.5051942 + longitude_wgs_84: -93.4539794 + elevation_masl: 412.77 + "BW1-6 [AMR-19]": + plot_number: 6 + enclosure: true + temp_treatment: "+0 Blowers only" + co2_treatment: ambient + latitude_wgs_84: 47.5050653 + longitude_wgs_84: -93.4534703 + elevation_masl: 412.763 + "BW1-7 [AMR-18]": + plot_number: 7 + enclosure: false + temp_treatment: ambient + co2_treatment: ambient + latitude_wgs_84: 47.5049308 + longitude_wgs_84: -93.4531358 + elevation_masl: 412.807 + "BW2-8 [AMR-25]": + plot_number: 8 + enclosure: true + temp_treatment: 6.75 + co2_treatment: ambient + latitude_wgs_84: 47.5055697 + longitude_wgs_84: -93.4538811 + elevation_masl: 412.778 + "BW2-10 [AMR-22]": + plot_number: 10 + enclosure: true + temp_treatment: 9 + co2_treatment: elevated + latitude_wgs_84: 47.5054358 + longitude_wgs_84: -93.4532131 + elevation_masl: 412.835 + "BW2-11 [AMR-23]": + plot_number: 11 + enclosure: true + temp_treatment: 2.25 + co2_treatment: elevated + latitude_wgs_84: 47.5053442 + longitude_wgs_84: -93.4526772 + 
elevation_masl: 412.858 + "BW2-13 [AMR-24]": + plot_number: 13 + enclosure: true + temp_treatment: 4.5 + co2_treatment: ambient + latitude_wgs_84: 47.5057086 + longitude_wgs_84: -93.4530022 + elevation_masl: 412.856 + "BW3-16 [AMR-16]": + plot_number: 16 + enclosure: true + temp_treatment: 6.75 + co2_treatment: elevated + latitude_wgs_84: 47.5060981 + longitude_wgs_84: -93.4531872 + elevation_masl: 412.953 + "BW3-17 [AMR-20]": + plot_number: 17 + enclosure: true + temp_treatment: 9 + co2_treatment: ambient + latitude_wgs_84: 47.5059761 + longitude_wgs_84: -93.4527164 + elevation_masl: 412.978 + "BW3-19 [AMR-21]": + plot_number: 19 + enclosure: true + temp_treatment: "+0 Blowers only" + co2_treatment: elevated + latitude_wgs_84: 47.5064783 + longitude_wgs_84: -93.4534736 + elevation_masl: 412.977 + "BW3-20 [AMR-26]": + plot_number: 20 + enclosure: true + temp_treatment: 2.25 + co2_treatment: ambient + latitude_wgs_84: 47.5063689 + longitude_wgs_84: -93.4531658 + elevation_masl: 412.944 + "BW3-21 [AMR-17]": + plot_number: 21 + enclosure: false + temp_treatment: ambient + co2_treatment: ambient + latitude_wgs_84: 47.5062539 + longitude_wgs_84: -93.4527486 + elevation_masl: 412.908 + # Optional: limit to specific machines by label (comment out to scrape all) # machines: # - "BW1-4 [AMR-15]" # - "BW1-6 [AMR-19]" +# - "BW1-7 [AMR-18]" +# - "BW2-8 [AMR-25]" +# - "BW2-10 [AMR-22]" +# - "BW2-11 [AMR-23]" +# - "BW2-13 [AMR-24]" +# - "BW3-16 [AMR-16]" +# - "BW3-17 [AMR-20]" +# - "BW3-19 [AMR-21]" +# - "BW3-20 [AMR-26]" +# - "BW3-21 [AMR-17]" diff --git a/requirements.txt b/requirements.txt index 8093dd9..26e62e8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,5 @@ beautifulsoup4>=4.12.0 lxml>=5.0.0 pyyaml>=6.0.1 tqdm>=4.66.0 +piexif>=1.1.3 pytest>=8.0 diff --git a/spruce/cli.py b/spruce/cli.py index 9232104..853c7e3 100644 --- a/spruce/cli.py +++ b/spruce/cli.py @@ -297,13 +297,25 @@ def _print_summary( log.info(sep) log.info(row("Machines:", 
str(len(machines)))) log.info( - row("Scans fetched:", str(totals.scans_fetched), + row( + "Scans (metadata) fetched:", + str(totals.scans_fetched), f"{totals.scans_skipped} already cached, " - f"{totals.scans_failed} failed" - if totals.scans_skipped or totals.scans_failed else "") + f"{totals.scans_failed} metadata failed" + if totals.scans_skipped or totals.scans_failed + else "", + ) ) if not metadata_only: log.info(row("Mosaics downloaded:", str(totals.mosaics_downloaded))) + if totals.mosaics_failed: + log.info( + row( + "Mosaics failed:", + str(totals.mosaics_failed), + "0 bytes or HTTP error; see log above", + ) + ) if not metadata_only and not mosaic_only: log.info(row("Tiles downloaded:", str(totals.tiles_downloaded))) if metadata_only: diff --git a/spruce/exif.py b/spruce/exif.py new file mode 100644 index 0000000..c94b158 --- /dev/null +++ b/spruce/exif.py @@ -0,0 +1,254 @@ +""" +Write EXIF metadata into downloaded mosaic JPEGs (piexif, no re-encode). +""" + +import logging +import re +from pathlib import Path +from typing import Any + +import piexif +from piexif import ExifIFD, GPSIFD, ImageIFD + +log = logging.getLogger(__name__) + +USER_COMMENT_ASCII = b"ASCII\x00\x00\x00" + + +def _fmt_exif_datetime(scan_time: str) -> str: + """`YYYY-MM-DD HH:MM:SS` -> `YYYY:MM:DD HH:MM:SS` for EXIF; empty on failure.""" + m = re.match( + r"^(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})$", scan_time.strip() + ) + if not m: + return "" + y, mo, d, h, mi, s = m.groups() + return f"{y}:{mo}:{d} {h}:{mi}:{s}" + + +def _fmt_dim(v: Any) -> str: + if v is None: + return "?" 
+ if isinstance(v, float) and v == int(v): + return str(int(v)) + return str(v) + + +def _fmt_machine_meta_scalar(v: Any) -> str: + if isinstance(v, float) and v == int(v): + return str(int(v)) + if isinstance(v, float): + return format(v, "g") + return str(v).strip() + + +def _user_comment_treatment_suffix(machine_meta: dict[str, Any] | None) -> str: + if not machine_meta: + return "" + parts: list[str] = [] + pn = machine_meta.get("plot_number") + if pn is not None and str(pn).strip() != "": + parts.append(f"plot_number {_fmt_machine_meta_scalar(pn)}") + if machine_meta.get("enclosure") is not None: + enc = machine_meta["enclosure"] + if isinstance(enc, bool): + parts.append("enclosure yes" if enc else "enclosure no") + else: + parts.append(f"enclosure {_fmt_machine_meta_scalar(enc)}") + if machine_meta.get("temp_treatment") is not None: + tt = machine_meta["temp_treatment"] + parts.append(f"temp_treatment {_fmt_machine_meta_scalar(tt)}") + if machine_meta.get("co2_treatment") is not None: + c = str(machine_meta["co2_treatment"]).strip().lower() + parts.append(f"co2_treatment {c}") + if not parts: + return "" + return " | " + " | ".join(parts) + + +def _build_user_comment( + scan_meta: dict[str, Any], + machine: dict[str, Any], + scan_id: int, + machine_meta: dict[str, Any] | None = None, +) -> bytes: + nx = scan_meta.get("nx", "?") + ny = scan_meta.get("ny", "?") + dx = _fmt_dim(scan_meta.get("dx")) + dy = _fmt_dim(scan_meta.get("dy")) + ex = _fmt_dim(scan_meta.get("end_x")) + ey = _fmt_dim(scan_meta.get("end_y")) + text = ( + f"SPRUCE scan {scan_id} | machine {machine['label']} | " + f"grid {nx}x{ny} @ {dx}x{dy}mm over {ex}x{ey}mm | " + f"see metadata.json" + ) + text += _user_comment_treatment_suffix(machine_meta) + return USER_COMMENT_ASCII + text.encode("ascii", errors="replace") + + +def _co2_keyword(co2: str) -> str: + c = (co2 or "").strip().lower() + if c == "ambient": + return "aCO2" + if c == "elevated": + return "eCO2" + return c or "co2" + + +def 
_enclosure_keyword(enclosure: Any) -> str | None: + if enclosure is None: + return None + if isinstance(enclosure, bool): + return "enclosed" if enclosure else "no enclosure" + s = str(enclosure).strip().lower() + if s in ("yes", "y", "true", "1"): + return "enclosed" + if s in ("no", "n", "false", "0"): + return "no enclosure" + return None + + +def _temp_treatment_keyword(temp: Any) -> str | None: + if temp is None or isinstance(temp, bool): + return None + if isinstance(temp, (int, float)): + w = float(temp) + s = f"{w:g}C" if w != int(w) else f"{int(w)}C" + return f"temp +{s}" + t = str(temp).strip() + if not t: + return None + try: + w = float(t) + s = f"{w:g}C" if w != int(w) else f"{int(w)}C" + return f"temp +{s}" + except (TypeError, ValueError): + return f"temp {t}" + + +def _build_xp_keywords(machine_meta: dict[str, Any] | None) -> bytes | None: + if not machine_meta: + return None + parts: list[str] = [] + pn = machine_meta.get("plot_number") + if pn is not None and str(pn).strip() != "": + parts.append(f"plot {pn}") + enc = _enclosure_keyword(machine_meta.get("enclosure")) + if enc: + parts.append(enc) + if machine_meta.get("temp_treatment") is not None: + tk = _temp_treatment_keyword(machine_meta["temp_treatment"]) + if tk: + parts.append(tk) + if machine_meta.get("co2_treatment") is not None: + parts.append(_co2_keyword(str(machine_meta["co2_treatment"]))) + if not parts: + return None + text = "SPRUCE; " + "; ".join(parts) + return text.encode("utf-16le") + b"\x00\x00" + +def _decimal_to_dms_rational(deg: float) -> list[tuple[tuple[int, int], ...]]: + abs_deg = abs(deg) + d = int(abs_deg) + t_min = (abs_deg - d) * 60.0 + m = int(t_min) + s = (t_min - m) * 60.0 + sec = round(s * 1_000_000) + return [(d, 1), (m, 1), (sec, 1_000_000)] + + +def write_mosaic_exif( + jpeg_path: Path, + scan_meta: dict[str, Any], + machine: dict[str, Any], + scan_id: int, + machine_meta: dict[str, Any] | None, + processing_software: str = "spruce-scraper/1.0", +) -> bool: + 
""" + Insert EXIF into a mosaic JPEG. Returns True on success. + On failure, logs a warning and returns False. + """ + try: + name = (scan_meta.get("name") or "").strip() + desc = f"{machine['label']} scan {scan_id}" + if name: + desc = f"{desc} ({name})" + desc_b = desc.encode("utf-8", errors="replace") + + make_b = b"RootView" + ver = (machine.get("version") or "").strip() + software_b = f"RootView {ver}".encode("utf-8") if ver else b"RootView" + model_b = str(machine.get("label", "")).encode("utf-8", errors="replace") + proc_b = processing_software.encode("utf-8", errors="replace") + artist = (scan_meta.get("user") or "").strip() + artist_b = artist.encode("utf-8", errors="replace") if artist else b"" + + scan_time = (scan_meta.get("scan_time") or "").strip() + dt = _fmt_exif_datetime(scan_time) + dt_b = dt.encode("ascii") if dt else b"" + + zeroth: dict[int, Any] = { + ImageIFD.ImageDescription: desc_b, + ImageIFD.Make: make_b, + ImageIFD.Model: model_b, + ImageIFD.Software: software_b, + ImageIFD.ProcessingSoftware: proc_b, + } + if artist_b: + zeroth[ImageIFD.Artist] = artist_b + if dt_b: + zeroth[ImageIFD.DateTime] = dt_b + + wp = _build_xp_keywords(machine_meta) + if wp is not None: + zeroth[ImageIFD.XPKeywords] = wp + + exif_ifd: dict[int, Any] = { + ExifIFD.UserComment: _build_user_comment( + scan_meta, machine, scan_id, machine_meta + ), + } + if dt: + bdt = dt.encode("ascii") + exif_ifd[ExifIFD.DateTimeOriginal] = bdt + exif_ifd[ExifIFD.DateTimeDigitized] = bdt + + exif_dict: dict[str, Any] = { + "0th": zeroth, + "Exif": exif_ifd, + } + + lat_raw = None if not machine_meta else machine_meta.get("latitude_wgs_84") + lon_raw = None if not machine_meta else machine_meta.get("longitude_wgs_84") + if machine_meta and lat_raw is not None and lon_raw is not None: + lat = float(lat_raw) + lon = float(lon_raw) + if not (-90 <= lat <= 90) or not (-180 <= lon <= 180): + log.warning("Invalid lat/lon for EXIF GPS, skipping GPS: %s", jpeg_path) + else: + gps: 
dict[int, Any] = { + GPSIFD.GPSVersionID: (2, 0, 0, 0), + GPSIFD.GPSLatitudeRef: b"N" if lat >= 0 else b"S", + GPSIFD.GPSLatitude: _decimal_to_dms_rational(abs(lat)), + GPSIFD.GPSLongitudeRef: b"E" if lon >= 0 else b"W", + GPSIFD.GPSLongitude: _decimal_to_dms_rational(abs(lon)), + } + if machine_meta.get("elevation_masl") is not None: + alt = float(machine_meta["elevation_masl"]) + alt_abs = abs(alt) + gps[GPSIFD.GPSAltitudeRef] = 0 if alt >= 0 else 1 + if alt_abs == int(alt_abs): + gps[GPSIFD.GPSAltitude] = (int(alt_abs), 1) + else: + num = round(alt_abs * 1000) + gps[GPSIFD.GPSAltitude] = (num, 1000) + exif_dict["GPS"] = gps + + exif_bytes = piexif.dump(exif_dict) + piexif.insert(exif_bytes, str(jpeg_path)) + except Exception as exc: + log.warning("EXIF write failed for %s: %s", jpeg_path, exc) + return False + return True diff --git a/spruce/orchestrator.py b/spruce/orchestrator.py index dff2371..1294ee8 100644 --- a/spruce/orchestrator.py +++ b/spruce/orchestrator.py @@ -14,11 +14,12 @@ from typing import Any class RunStats: """Accumulated counters for one or more machines.""" - scans_fetched: int = 0 # metadata fetched from server this run + scans_fetched: int = 0 # scan detail page fetched (metadata), not tiles/mosaics scans_skipped: int = 0 # metadata.json already on disk; no HTTP request - scans_failed: int = 0 # fetch error or missing grid params + scans_failed: int = 0 # metadata fetch error or missing grid params metadata_written: int = 0 # new metadata.json files created mosaics_downloaded: int = 0 + mosaics_failed: int = 0 # mosaic URL attempted but 0 bytes / HTTP error tiles_downloaded: int = 0 def merge(self, other: "RunStats") -> None: @@ -27,10 +28,12 @@ class RunStats: self.scans_failed += other.scans_failed self.metadata_written += other.metadata_written self.mosaics_downloaded += other.mosaics_downloaded + self.mosaics_failed += other.mosaics_failed self.tiles_downloaded += other.tiles_downloaded from tqdm import tqdm +from spruce.exif import 
write_mosaic_exif from spruce.paths import machine_dir_name, tile_dest, mosaic_dest, _extract_date from spruce.progress import ProgressTracker, CsvWriter from spruce.session import MachineSession @@ -50,6 +53,7 @@ def _download_mosaic( mosaic_path: Path, progress: ProgressTracker, machine: dict[str, Any], + config: dict[str, Any], dry_run: bool, ) -> bool: """Download the scan mosaic if not already done. Returns True if downloaded.""" @@ -62,6 +66,13 @@ def _download_mosaic( log.info("[%s] Downloading mosaic for scan %d …", machine["label"], scan_id) size = sess.download_file(url, mosaic_path) if size: + if config.get("write_exif", True): + mmeta: dict[str, Any] | None = config.get("machine_metadata", {}).get( + machine["label"] + ) + write_mosaic_exif( + mosaic_path, scan_meta, machine, scan_id, mmeta + ) progress.mark_done(url) progress.save() log.info( @@ -283,10 +294,24 @@ def process_scan( mosaic_just_downloaded = False else: mosaic_just_downloaded = _download_mosaic( - sess, scan_meta, scan_id, mosaic_path, progress, machine, dry_run + sess, + scan_meta, + scan_id, + mosaic_path, + progress, + machine, + config, + dry_run, ) - if mosaic_just_downloaded: + if not metadata_only and mosaic_just_downloaded: stats.mosaics_downloaded += 1 + elif ( + not metadata_only + and not dry_run + and not mosaic_already_done + and not mosaic_just_downloaded + ): + stats.mosaics_failed += 1 # Write scan-level CSV row only if this scan hasn't been recorded before. 
if mosaic_already_done and not metadata_only: diff --git a/spruce/settings.py b/spruce/settings.py index 55fc292..33ed042 100644 --- a/spruce/settings.py +++ b/spruce/settings.py @@ -105,5 +105,7 @@ def load_config(path: str) -> dict: cfg.setdefault("timeout", 60) cfg.setdefault("request_delay", 0.5) cfg.setdefault("tile_scale", 1) + cfg.setdefault("write_exif", True) + cfg.setdefault("machine_metadata", {}) cfg["workers"] = _clamp_workers(cfg["workers"]) return cfg diff --git a/tests/fixtures/blank.jpg b/tests/fixtures/blank.jpg new file mode 100644 index 0000000000000000000000000000000000000000..adfd5a59250e23f0cc776152346a87fabd6ddfee GIT binary patch literal 631 zcmex=iF;N$`UAd82aiwDF383NJD#LCRf%Eivc4pu@E@&5pWAP2}%%#2D5OoEKe zf{g!%nVEr(W(B$!sJa#?&%h$cDx_%W z$R-?^$gWfnAuRebI{N?Mn z?>~P20{M%Pff?d0Ac@sqfWKS#c@69{;yl(wme1fGL-^|!0}nGJF!GoM8SEMU{#n$e I!TkRw02Noi+5i9m literal 0 HcmV?d00001 diff --git a/tests/test_exif.py b/tests/test_exif.py new file mode 100644 index 0000000..0bd082e --- /dev/null +++ b/tests/test_exif.py @@ -0,0 +1,138 @@ +"""Tests for spruce.exif — mosaic EXIF injection.""" + +import shutil +from pathlib import Path + +import piexif +import pytest +from piexif import ExifIFD, GPSIFD, ImageIFD + +from spruce.exif import USER_COMMENT_ASCII, write_mosaic_exif + +FIXTURES = Path(__file__).parent / "fixtures" +BLANK_JPEG = FIXTURES / "blank.jpg" + + +def _dms_pair_to_float(num: int, den: int) -> float: + return num / den if den else 0.0 + + +def _dms_to_deg( + ref: bytes, dms: list[tuple[tuple[int, int], tuple[int, int], tuple[int, int]]] +) -> float: + d0, m0, s0 = dms + deg = ( + _dms_pair_to_float(d0[0], d0[1]) + + _dms_pair_to_float(m0[0], m0[1]) / 60.0 + + _dms_pair_to_float(s0[0], s0[1]) / 3600.0 + ) + if ref in (b"S", b"W"): + return -deg + return deg + + +@pytest.fixture +def tmp_jpeg(tmp_path: Path) -> Path: + dest = tmp_path / "mosaic.jpg" + shutil.copy(BLANK_JPEG, dest) + return dest + + +@pytest.fixture +def scan_meta() -> dict: + return { + 
"scan_id": 157743, + "name": "Plot 7 AMR18 Full tube scan", + "scan_time": "2024-06-28 11:00:00", + "user": "Joanne", + "nx": 103, + "ny": 328, + "dx": 3.01, + "dy": 2.26, + "end_x": 310.0, + "end_y": 740.0, + } + + +@pytest.fixture +def machine() -> dict: + return {"label": "BW1-7 [AMR-18]", "version": "3.0.0.18", "machine_id": "7"} + + +def test_write_mosaic_exif_round_trip(tmp_jpeg: Path, scan_meta: dict, machine: dict): + assert write_mosaic_exif( + tmp_jpeg, + scan_meta, + machine, + 157743, + None, + ) + exif = piexif.load(str(tmp_jpeg)) + assert exif["Exif"][ExifIFD.DateTimeOriginal] == b"2024:06:28 11:00:00" + assert ( + exif["0th"][ImageIFD.ImageDescription] + == b"BW1-7 [AMR-18] scan 157743 (Plot 7 AMR18 Full tube scan)" + ) + assert exif["0th"][ImageIFD.Artist] == b"Joanne" + assert exif["0th"][ImageIFD.Software] == b"RootView 3.0.0.18" + assert exif["0th"][ImageIFD.ProcessingSoftware] == b"spruce-scraper/1.0" + uc = exif["Exif"][ExifIFD.UserComment] + assert uc.startswith(USER_COMMENT_ASCII) + tail = uc[len(USER_COMMENT_ASCII) :] + assert b"SPRUCE scan 157743" in tail + assert b"103x328" in tail + assert b"see metadata.json" in tail + assert b"plot_number" not in tail + assert ImageIFD.XPKeywords not in exif["0th"] + assert "GPS" not in exif or not exif["GPS"] + + +def test_gps_decode(tmp_jpeg: Path, scan_meta: dict, machine: dict): + mmeta = { + "latitude_wgs_84": 47.5047, + "longitude_wgs_84": -93.4530, + "elevation_masl": 418.0, + } + assert write_mosaic_exif(tmp_jpeg, scan_meta, machine, 157743, mmeta) + exif = piexif.load(str(tmp_jpeg)) + gps = exif["GPS"] + lat = _dms_to_deg(gps[GPSIFD.GPSLatitudeRef], gps[GPSIFD.GPSLatitude]) + lon = _dms_to_deg(gps[GPSIFD.GPSLongitudeRef], gps[GPSIFD.GPSLongitude]) + assert abs(lat - 47.5047) < 1e-5 + assert abs(lon - (-93.4530)) < 1e-5 + alt = gps[GPSIFD.GPSAltitude] + assert alt[0] / alt[1] == pytest.approx(418.0) + + +def test_no_gps_no_keywords_when_meta_none( + tmp_jpeg: Path, scan_meta: dict, machine: dict 
+): + assert write_mosaic_exif(tmp_jpeg, scan_meta, machine, 157743, None) + exif = piexif.load(str(tmp_jpeg)) + assert ImageIFD.XPKeywords not in exif["0th"] + assert "GPS" not in exif or not exif["GPS"] + + +def test_xp_keywords_treatment(tmp_jpeg: Path, scan_meta: dict, machine: dict): + mmeta = { + "plot_number": 7, + "enclosure": False, + "temp_treatment": 2.25, + "co2_treatment": "ambient", + } + assert write_mosaic_exif(tmp_jpeg, scan_meta, machine, 157743, mmeta) + exif = piexif.load(str(tmp_jpeg)) + raw = exif["0th"][ImageIFD.XPKeywords] + text = bytes(raw).decode("utf-16le").rstrip("\x00") + assert "SPRUCE" in text + assert "plot 7" in text + assert "no enclosure" in text + assert "temp" in text + assert "2.25" in text + assert "aCO2" in text + uc = exif["Exif"][ExifIFD.UserComment] + uct = uc[len(USER_COMMENT_ASCII) :].decode("ascii", errors="replace") + assert "plot_number 7" in uct + assert "enclosure no" in uct + assert "temp_treatment 2.25" in uct + assert "co2_treatment ambient" in uct diff --git a/tests/test_recheck.py b/tests/test_recheck.py index 998997a..bce8713 100644 --- a/tests/test_recheck.py +++ b/tests/test_recheck.py @@ -143,7 +143,11 @@ def test_recheck_archive_skips_mosaic_urls(tmp_path): mosaic_url = "http://192.0.2.1:8011/RootView_Database/158374/mosaic.jpg" p.mark_done(mosaic_url) p.save() + # recheck_verifies a non-zero mosaic exists under */*//mosaic.jpg + mpath = tmp_path / "M" / "2020-01-01" / "158374" / "mosaic.jpg" + mpath.parent.mkdir(parents=True) + mpath.write_bytes(b"\xff\xd8\xff\xd9") # minimal JPEG soff + eoi removed = recheck_archive(tmp_path, p) assert removed == 0 - assert p.is_done(mosaic_url) # mosaics are never touched + assert p.is_done(mosaic_url) diff --git a/tests/test_settings.py b/tests/test_settings.py index 6c7cb59..6b49ee3 100644 --- a/tests/test_settings.py +++ b/tests/test_settings.py @@ -60,6 +60,8 @@ def test_load_config_defaults(tmp_path): assert cfg["timeout"] == 60 assert cfg["request_delay"] == 0.5 
assert cfg["output_dir"] == "archives" + assert cfg["write_exif"] is True + assert cfg["machine_metadata"] == {} def test_load_config_overrides(tmp_path):