From 97dffe9b338e4050078b1dc17fdb64cf91323394 Mon Sep 17 00:00:00 2001 From: TheSensinator Date: Fri, 5 Jun 2026 01:16:39 -0500 Subject: [PATCH 1/8] Add 'inat' key with value 40 to globals --- oz_tree_build/_OZglobals.py | 1 + 1 file changed, 1 insertion(+) diff --git a/oz_tree_build/_OZglobals.py b/oz_tree_build/_OZglobals.py index 7105663..e3ba2a0 100644 --- a/oz_tree_build/_OZglobals.py +++ b/oz_tree_build/_OZglobals.py @@ -57,6 +57,7 @@ "arkive": 5, "wiki": 20, "eol": 30, + "inat": 40, "short_imprecise_name": 50, "eol_old": 99, }, From ad870aefdd9746f95d3dfad6cb1dcef4af4483d6 Mon Sep 17 00:00:00 2001 From: TheSensinator Date: Fri, 5 Jun 2026 01:17:59 -0500 Subject: [PATCH 2/8] Add README for iNaturalist image harvesting This README provides detailed instructions for using the iNaturalist image harvesting script, including mapping strategies, license rules, and modes of operation. --- .../README_inaturalist_images.md | 131 ++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 oz_tree_build/images_and_vernaculars/README_inaturalist_images.md diff --git a/oz_tree_build/images_and_vernaculars/README_inaturalist_images.md b/oz_tree_build/images_and_vernaculars/README_inaturalist_images.md new file mode 100644 index 0000000..072f557 --- /dev/null +++ b/oz_tree_build/images_and_vernaculars/README_inaturalist_images.md @@ -0,0 +1,131 @@ +# iNaturalist image harvesting for OneZoom + +This directory now includes `get_inat_images.py`, a sibling of `get_wiki_images.py` for harvesting iNaturalist photos. + +## Mapping strategy + +The preferred mapping is: + +```text +OneZoom leaf -> ordered_leaves.wikidata -> Wikidata P3151 -> iNaturalist taxon ID +``` + +`P3151` is the Wikidata property for the iNaturalist taxon ID. The existing Wikidata filtering code already keeps this property, so a filtered OneZoom Wikidata dump can be used in clade mode. 
+ +## License rule + +The harvester **must only accept** iNaturalist photos whose license is one of: + +- `CC0` +- `CC-BY` +- `CC-BY-SA` + +The code enforces this with `ALLOWED_INAT_PHOTO_LICENSES = {"cc0", "cc-by", "cc-by-sa"}`. It rejects non-commercial (`NC`) and no-derivatives (`ND`) licenses. + +## Image sources + +The script has two modes. + +### API mode + +Use this for prototypes, small clades, and the "most voted" behavior: + +```bash +python -m oz_tree_build.images_and_vernaculars.get_inat_images leaf "Xestospongia testudinaria" \ + --image-source api \ + --no-azure-crop \ + -o /path/to/OZtree/static/FinalOutputs/img \ + -c /path/to/appconfig.ini +``` + +API mode calls the iNaturalist v2 observations endpoint with: + +```text +taxon_id=<iNaturalist taxon ID> +photos=true +quality_grade=research +photo_license=cc-by,cc-by-sa,cc0 +order_by=votes +order=desc +``` + +Then it chooses the best usable photo from the returned observations. This is the only mode that can approximate "most voted," because vote totals are not included in the open-data metadata dump. + +### Metadata mode + +Use this for bulk harvesting from the iNaturalist Open Data metadata dump after loading it into a local database: + +```bash +python -m oz_tree_build.images_and_vernaculars.get_inat_images clade OneZoom_latest-all.json Porifera \ + --image-source metadata \ + --inat-db-uri postgres://user:password@localhost/inaturalist-open-data \ + --no-azure-crop \ + -o /path/to/OZtree/static/FinalOutputs/img \ + -c /path/to/appconfig.ini +``` + +Metadata mode joins the dump tables: + +```text +observations -> photos -> observers +``` + +It filters to research-grade observations and the three allowed licenses. Since the metadata dump has no vote columns, it chooses by: + +1. lowest photo position, because the observer's first photo is usually the representative image; +2. largest pixel area; +3. lowest photo ID as a deterministic tie-breaker. 
+ +## Expected iNaturalist metadata database + +The metadata database should contain the four iNaturalist Open Data tables named exactly as in the official documentation: + +- `observations` +- `photos` +- `taxa` +- `observers` + +The script currently needs these columns: + +```text +observations: observation_uuid, taxon_id, quality_grade, observed_on +photos: photo_id, observation_uuid, observer_id, extension, license, width, height, position +observers: observer_id, login, name +``` + +Useful indexes for this workflow: + +```sql +CREATE INDEX index_photos_observation_uuid ON photos USING btree (observation_uuid); +CREATE INDEX index_photos_observer_id ON photos USING btree (observer_id); +CREATE INDEX index_observers_observer_id ON observers USING btree (observer_id); +CREATE INDEX index_observations_taxon_id ON observations USING btree (taxon_id); +``` + +## Saved image layout + +The new source flag is: + +```python +src_flags["inat"] = 40 +``` + +Files are saved like existing image harvesters: + +```text +FinalOutputs/img/40/<last-three-digits>/<photo_id>.jpg +FinalOutputs/img/40/<last-three-digits>/<photo_id>_uncropped.jpg +FinalOutputs/img/40/<last-three-digits>/<photo_id>_cropinfo.txt +``` + +The database `images_by_ott` row uses: + +- `src = 40` +- `src_id = iNaturalist photo_id` +- `url = iNaturalist photo page or observation URL` +- `rights = iNaturalist-style attribution string` +- `licence = CC0, CC-BY, or CC-BY-SA` + +## Important limitation + +The iNaturalist Open Data metadata dump is scalable, but it does not include vote totals. If you specifically need "most voted" photos, use API mode for candidate selection, then cache the selected photo IDs before downloading in bulk. 
From 52dd02c953a5ec27df4c9d785e5b895607c680b0 Mon Sep 17 00:00:00 2001 From: TheSensinator Date: Fri, 5 Jun 2026 01:18:21 -0500 Subject: [PATCH 3/8] Add get_inat_images.py for iNaturalist image harvesting This module harvests iNaturalist images for OneZoom leaves using either API mode or Metadata mode, filtering for specific open licenses and providing usage examples. --- .../images_and_vernaculars/get_inat_images.py | 853 ++++++++++++++++++ 1 file changed, 853 insertions(+) create mode 100644 oz_tree_build/images_and_vernaculars/get_inat_images.py diff --git a/oz_tree_build/images_and_vernaculars/get_inat_images.py b/oz_tree_build/images_and_vernaculars/get_inat_images.py new file mode 100644 index 0000000..5ecdb9e --- /dev/null +++ b/oz_tree_build/images_and_vernaculars/get_inat_images.py @@ -0,0 +1,853 @@ +""" +Harvest iNaturalist images for OneZoom leaves. + +This module mirrors oz_tree_build.images_and_vernaculars.get_wiki_images, but +uses Wikidata property P3151 (iNaturalist taxon ID) to map OneZoom leaves onto +iNaturalist taxa, then retrieves open-licensed iNaturalist photos. + +Only the following photo licences are accepted: +- CC0 +- CC-BY +- CC-BY-SA + +There are two image-selection routes: + +1. API mode, best for prototypes and small clades. It asks the iNaturalist v2 + observations API for research-grade observations with photos, sorted by votes, + then picks the first usable open-licensed photo. This is the only route that + can approximate "most voted" because the open-data metadata dump does not + include observation/photo vote totals. + +2. Metadata mode, best for bulk work with the iNaturalist Open Data dump loaded + into a local database. It joins observations, photos, and observers, filters to + research-grade observations and the three allowed licences, then chooses the + earliest-positioned / highest-resolution photo. This route scales, but cannot + rank by votes because the metadata dump does not provide vote fields. 
+ +Usage examples: + + python -m oz_tree_build.images_and_vernaculars.get_inat_images leaf 563151 \ + --image-source api --no-azure-crop + + python -m oz_tree_build.images_and_vernaculars.get_inat_images clade \ + OneZoom_latest-all.json 563151 --image-source metadata \ + --inat-db-uri postgres://user:password@localhost/inaturalist-open-data +""" + +import argparse +import datetime +import json +import logging +import os +import re +import sys +import time +from pathlib import Path +from urllib.parse import urlparse + +import requests +from PIL import Image +from pydal import DAL + +from .._OZglobals import src_flags +from ..utilities.db_helper import ( + connect_to_database, + default_appconfig, + placeholder, + read_config, +) +from ..utilities.file_utils import enumerate_lines_from_file +from . import process_image_bits +from .get_wiki_images import default_outdir, get_qid_from_taxa_data, get_wikidata_json_for_qid, subdir_name +from .image_cropping import AzureImageCropper, CenterImageCropper + +logger = logging.getLogger(Path(__file__).name) + +# Keep this intentionally strict. Cropping/resizing is part of the pipeline, so +# no-derivatives licences are excluded, and non-commercial licences are excluded +# because OneZoom usage may not be strictly non-commercial in every context. 
+ALLOWED_INAT_PHOTO_LICENSES = frozenset({"cc0", "cc-by", "cc-by-sa"}) +ALLOWED_INAT_PHOTO_LICENSES_SQL = tuple(sorted(ALLOWED_INAT_PHOTO_LICENSES)) + +INAT_SRC = src_flags["inat"] +DEFAULT_INAT_IMAGE_RATING = 34000 +DEFAULT_API_PER_PAGE = 30 +DEFAULT_IMAGE_SIZE = "medium" +INAT_API_OBSERVATIONS_URL = "https://api.inaturalist.org/v2/observations" +INAT_OPEN_DATA_PHOTO_PREFIX = "https://inaturalist-open-data.s3.amazonaws.com/photos" +INAT_OBSERVATION_PREFIX = "https://www.inaturalist.org/observations" +INAT_PHOTO_PREFIX = "https://www.inaturalist.org/photos" + +inat_http_headers = { + "User-Agent": "OneZoomBot/0.1 (https://www.onezoom.org/; mail@onezoom.org) get-inat-images/0.1" +} + + +class InatImageError(Exception): + """Raised for recoverable iNaturalist image harvesting problems.""" + + +def normalise_inat_license(license_code): + """Return a normalized iNaturalist license code, e.g. 'cc-by-sa'.""" + if license_code is None: + return None + normalized = str(license_code).strip().lower().replace("_", "-") + if normalized in {"cc0-1.0", "cc0 1.0", "public-domain", "pd"}: + return "cc0" + # iNaturalist metadata sometimes stores an upper-case code, and API values + # sometimes include version suffixes. Keep only the family we explicitly allow. 
+ for allowed in ALLOWED_INAT_PHOTO_LICENSES: + if normalized == allowed or normalized.startswith(allowed + "-"): + return allowed + return normalized + + +def is_allowed_inat_license(license_code): + return normalise_inat_license(license_code) in ALLOWED_INAT_PHOTO_LICENSES + + +def inat_license_string(license_code): + normalized = normalise_inat_license(license_code) + return normalized.upper() if normalized else None + + +def inat_photo_url_from_open_data(photo_id, extension, size=DEFAULT_IMAGE_SIZE): + """Build an S3 URL for an iNaturalist Open Data photo.""" + if not photo_id: + raise ValueError("photo_id is required") + if not extension: + raise ValueError("extension is required") + extension = str(extension).lower().lstrip(".") + return f"{INAT_OPEN_DATA_PHOTO_PREFIX}/{photo_id}/{size}.{extension}" + + +def inat_photo_page_url(photo_id): + return f"{INAT_PHOTO_PREFIX}/{photo_id}" if photo_id else None + + +def inat_observation_url(observation_uuid): + return f"{INAT_OBSERVATION_PREFIX}/{observation_uuid}" if observation_uuid else None + + +def make_http_request_with_retries(url, *, params=None, stream=False): + """Make an HTTP GET request with basic retry/backoff for rate limits.""" + retries = 6 + delay = 1 + for i in range(retries): + r = requests.get(url, params=params, headers=inat_http_headers, stream=stream) + if r.status_code == 200: + return r + if r.status_code in (429, 500, 502, 503, 504): + logger.warning("HTTP %s on attempt %s for %s", r.status_code, i + 1, url) + time.sleep(delay) + delay *= 2 + else: + raise InatImageError(f"Error requesting {url}: {r.status_code} {r.text}") + raise InatImageError(f"Failed to get {url} after {retries} attempts") + + +def get_inat_taxon_id_from_json_item(json_item): + """ + Extract Wikidata P3151 (iNaturalist taxon ID) from a Wikidata JSON item. + + Uses the preferred claim if present, otherwise the first normal claim. 
+ """ + claims = json_item.get("claims", {}).get("P3151", []) + if not claims: + return None + preferred = [claim for claim in claims if claim.get("rank") == "preferred"] + claim = preferred[0] if preferred else claims[0] + try: + value = claim["mainsnak"]["datavalue"]["value"] + except (KeyError, TypeError): + return None + try: + return int(value) + except (TypeError, ValueError): + logger.warning("Invalid P3151 value in Q%s: %r", json_item.get("id"), value) + return None + + +def enumerate_wikidata_dump_items_with_inat_ids(wikidata_dump_file): + """Yield (qid, inat_taxon_id) for dump items containing P3151.""" + for _, line in enumerate_lines_from_file(wikidata_dump_file): + if not line.startswith('{"type":'): + continue + json_item = json.loads(line.rstrip().rstrip(",")) + inat_taxon_id = get_inat_taxon_id_from_json_item(json_item) + if inat_taxon_id is None: + continue + qid = int(json_item["id"][1:]) + yield qid, inat_taxon_id + + +def get_inat_taxon_id_from_taxa_data(taxa_data, taxon): + """Read an optional cached iNat taxon ID from a taxa-data JSON object.""" + if taxa_data is None or taxon not in taxa_data: + return None + data = taxa_data[taxon] + if not data: + return None + if "redirect" in data: + data = taxa_data[data["redirect"]] + for prop in ("inat", "inat_taxon", "inat_taxon_id", "inaturalist", "inaturalist_taxon_id"): + if prop in data and data[prop]: + return int(data[prop]) + return None + + +def get_inat_taxon_id_for_qid(qid): + json_item = get_wikidata_json_for_qid(qid) + return get_inat_taxon_id_from_json_item(json_item) + + +def preferred_api_photo_url(photo, size=DEFAULT_IMAGE_SIZE): + """Return a usable API photo URL, preferring medium/large URLs over square thumbnails.""" + # Some API responses expose explicit size keys. 
+ if size == "large" and photo.get("large_url"): + return photo["large_url"] + if size == "medium" and photo.get("medium_url"): + return photo["medium_url"] + for key in ("medium_url", "large_url", "url"): + url = photo.get(key) + if url: + break + else: + return None + + # iNaturalist API photo URLs often end in /square.ext by default. Replace + # the size segment with the requested size when possible. + return re.sub(r"/(square|thumb|small|medium|large)\.([A-Za-z0-9]+)(\?.*)?$", f"/{size}.\\2", url) + + +def photo_id_from_api_photo(photo): + return photo.get("id") or photo.get("photo_id") + + +def dimensions_from_api_photo(photo): + dims = photo.get("original_dimensions") or photo.get("dimensions") or {} + return int(dims.get("width") or photo.get("width") or 0), int(dims.get("height") or photo.get("height") or 0) + + +def observer_name_from_api_observation(observation): + user = observation.get("user") or {} + return user.get("name") or user.get("login") or "Unknown iNaturalist observer" + + +def attribution_for_inat_photo(license_code, observer_name): + license_string = inat_license_string(license_code) + observer_name = observer_name or "Unknown iNaturalist observer" + if normalise_inat_license(license_code) == "cc0": + return f"{observer_name}, no rights reserved ({license_string})" + return f"© {observer_name}, some rights reserved ({license_string})" + + +def candidate_from_api_observation(observation, photo, position=0, image_size=DEFAULT_IMAGE_SIZE): + license_code = normalise_inat_license(photo.get("license_code") or photo.get("license")) + if license_code not in ALLOWED_INAT_PHOTO_LICENSES: + return None + + photo_id = photo_id_from_api_photo(photo) + image_url = preferred_api_photo_url(photo, image_size) + if not image_url: + return None + + width, height = dimensions_from_api_photo(photo) + observer_name = photo.get("attribution_name") or observer_name_from_api_observation(observation) + observation_uuid = observation.get("uuid") or 
observation.get("observation_uuid") + observation_url = observation.get("uri") or inat_observation_url(observation_uuid) or observation.get("url") + page_url = photo.get("native_page_url") or inat_photo_page_url(photo_id) or observation_url + + votes = observation.get("cached_votes_total") or observation.get("votes_count") or observation.get("votes") or 0 + try: + votes = int(votes) + except (TypeError, ValueError): + votes = 0 + + return { + "photo_id": int(photo_id) if photo_id is not None else None, + "src_id": int(photo_id) if photo_id is not None else None, + "image_url": image_url, + "page_url": page_url, + "observation_url": observation_url, + "observation_uuid": observation_uuid, + "license": license_code, + "license_string": inat_license_string(license_code), + "rights": attribution_for_inat_photo(license_code, observer_name), + "observer_name": observer_name, + "width": width, + "height": height, + "position": position, + "votes": votes, + "quality_grade": observation.get("quality_grade"), + "verified": str(observation.get("quality_grade", "")).lower() in {"research", "research grade"}, + "source": "api", + } + + +def score_candidate(candidate): + """Score candidates deterministically; API results are already vote-ordered.""" + if not candidate: + return -1 + score = 0 + score += int(candidate.get("votes") or 0) * 1_000_000 + if str(candidate.get("quality_grade", "")).lower() in {"research", "research grade"}: + score += 100_000 + if candidate.get("license") == "cc0": + score += 3_000 + elif candidate.get("license") == "cc-by": + score += 2_000 + elif candidate.get("license") == "cc-by-sa": + score += 1_000 + position = candidate.get("position") + if position is None: + position = 999 + score -= int(position) * 100 + score += min(int(candidate.get("width") or 0) * int(candidate.get("height") or 0), 20_000_000) // 10_000 + return score + + +def get_best_photo_from_inat_api(inat_taxon_id, *, per_page=DEFAULT_API_PER_PAGE, image_size=DEFAULT_IMAGE_SIZE): + 
""" + Query iNaturalist v2 observations and choose the best allowed photo. + + This uses order_by=votes because the open-data metadata dump does not expose + vote totals. Only CC0, CC-BY, and CC-BY-SA photo licences are requested and + accepted. + """ + params = { + "taxon_id": str(inat_taxon_id), + "photos": "true", + "quality_grade": "research", + "photo_license": ",".join(sorted(ALLOWED_INAT_PHOTO_LICENSES)), + "order_by": "votes", + "order": "desc", + "per_page": str(per_page), + # Keep fields broad because iNat v2 nested-field syntax can change. The + # selector functions above are defensive and ignore missing fields. + "fields": "all", + } + response = make_http_request_with_retries(INAT_API_OBSERVATIONS_URL, params=params) + data = response.json() + candidates = [] + for observation in data.get("results", []): + for position, photo in enumerate(observation.get("photos") or []): + candidate = candidate_from_api_observation(observation, photo, position, image_size=image_size) + if candidate: + candidates.append(candidate) + if not candidates: + return None + return max(candidates, key=score_candidate) + + +def connect_to_metadata_database(metadata_db_uri): + """Connect to the local iNaturalist metadata database.""" + if not metadata_db_uri: + raise ValueError("metadata_db_uri is required for --image-source metadata") + return DAL(metadata_db_uri) + + +def get_inat_metadata_db_uri(config, args): + if args.inat_db_uri: + return args.inat_db_uri + if config.has_section("inat"): + for key in ("metadata_uri", "metadata_db_uri", "uri"): + if config.has_option("inat", key): + return config.get("inat", key) + return None + + +def rows_to_dicts(columns, rows): + return [dict(zip(columns, row)) for row in rows] + + +def get_best_photo_from_metadata_db(inat_db, inat_taxon_id, *, image_size=DEFAULT_IMAGE_SIZE): + """ + Query a local iNaturalist Open Data metadata database for the best photo. + + The metadata dump has no vote columns, so this is not a "most voted" ranking. 
+ It is the scalable fallback: research-grade, allowed licence, first photo + position, then largest image. + """ + s = placeholder(inat_db) + columns = [ + "photo_id", + "extension", + "license", + "width", + "height", + "position", + "observation_uuid", + "quality_grade", + "observed_on", + "observer_login", + "observer_name", + ] + sql = f""" + SELECT + p.photo_id, + p.extension, + p.license, + p.width, + p.height, + p.position, + obs.observation_uuid, + obs.quality_grade, + obs.observed_on, + o.login AS observer_login, + o.name AS observer_name + FROM observations obs + JOIN photos p ON obs.observation_uuid = p.observation_uuid + LEFT JOIN observers o ON p.observer_id = o.observer_id + WHERE obs.taxon_id = {s} + AND LOWER(p.license) IN ({s},{s},{s}) + AND LOWER(obs.quality_grade) IN ('research', 'research grade') + AND p.photo_id IS NOT NULL + AND p.extension IS NOT NULL + ORDER BY + COALESCE(p.position, 9999) ASC, + (COALESCE(p.width, 0) * COALESCE(p.height, 0)) DESC, + p.photo_id ASC + LIMIT 1; + """ + rows = rows_to_dicts( + columns, + inat_db.executesql(sql, (inat_taxon_id, *ALLOWED_INAT_PHOTO_LICENSES_SQL)), + ) + if not rows: + return None + row = rows[0] + license_code = normalise_inat_license(row["license"]) + if license_code not in ALLOWED_INAT_PHOTO_LICENSES: + return None + observer_name = row.get("observer_name") or row.get("observer_login") or "Unknown iNaturalist observer" + photo_id = row["photo_id"] + observation_uuid = str(row["observation_uuid"]) if row.get("observation_uuid") else None + return { + "photo_id": int(photo_id), + "src_id": int(photo_id), + "image_url": inat_photo_url_from_open_data(photo_id, row["extension"], image_size), + "page_url": inat_photo_page_url(photo_id), + "observation_url": inat_observation_url(observation_uuid), + "observation_uuid": observation_uuid, + "license": license_code, + "license_string": inat_license_string(license_code), + "rights": attribution_for_inat_photo(license_code, observer_name), + 
"observer_name": observer_name, + "width": int(row.get("width") or 0), + "height": int(row.get("height") or 0), + "position": int(row.get("position") or 0), + "votes": None, + "quality_grade": row.get("quality_grade"), + "verified": True, + "source": "metadata", + } + + +def get_best_photo(inat_taxon_id, *, image_source, inat_db=None, per_page=DEFAULT_API_PER_PAGE, image_size=DEFAULT_IMAGE_SIZE): + if image_source == "api": + return get_best_photo_from_inat_api(inat_taxon_id, per_page=per_page, image_size=image_size) + if image_source == "metadata": + if inat_db is None: + raise ValueError("inat_db must be provided for metadata image source") + return get_best_photo_from_metadata_db(inat_db, inat_taxon_id, image_size=image_size) + raise ValueError(f"Unknown image_source: {image_source}") + + +def safe_src_id(candidate): + src_id = candidate.get("src_id") or candidate.get("photo_id") + if src_id is None: + # API responses should include an ID; this is only a fallback to keep the + # file-system/database logic from receiving None. + parsed = urlparse(candidate.get("image_url") or "") + match = re.search(r"/photos/(\d+)/", parsed.path) + if match: + src_id = int(match.group(1)) + if src_id is None: + raise InatImageError("Cannot save iNaturalist image without a photo_id/src_id") + return int(src_id) + + +def save_inat_image(db, leaf_data, candidate, rating, output_dir, cropper): + """ + Download, crop, save, and insert an iNaturalist image into images_by_ott. + """ + s = placeholder(db) + ott = leaf_data["ott"] + if not ott: + logger.warning("No OTT for iNaturalist photo %s. 
Can't save image", candidate.get("photo_id")) + return False + + if not is_allowed_inat_license(candidate.get("license")): + logger.warning("Rejecting iNaturalist photo %s because of license %r", candidate.get("photo_id"), candidate.get("license")) + return False + + src = INAT_SRC + src_id = safe_src_id(candidate) + page_url = candidate.get("page_url") or candidate.get("observation_url") or inat_photo_page_url(src_id) + image_url = candidate.get("image_url") + if not image_url: + logger.warning("No downloadable image URL for iNaturalist photo %s", src_id) + return False + + image_dir = os.path.normpath(os.path.join(output_dir, str(src), subdir_name(src_id))) + image_path = f"{image_dir}/{src_id}.jpg" + + if leaf_data.get("img") == page_url and os.path.isfile(image_path): + logger.debug("iNaturalist image %s for ott=%s is already present", src_id, ott) + return True + + logger.info("Processing iNaturalist image for ott=%s, taxon=%s, photo_id=%s", ott, leaf_data.get("taxon"), src_id) + + if not os.path.exists(image_dir): + os.makedirs(image_dir) + + uncropped_image_path = f"{image_dir}/{src_id}_uncropped.jpg" + response = make_http_request_with_retries(image_url, stream=True) + with open(uncropped_image_path, "wb") as f: + for chunk in response.iter_content(1024): + if chunk: + f.write(chunk) + + if cropper is None: + cropper = CenterImageCropper() + + crop_box = cropper.crop(image_url, uncropped_image_path) + + im = Image.open(uncropped_image_path) + if im.mode in ("RGBA", "P", "LA"): + im = im.convert("RGB") + im = im.resize( + (300, 300), + box=( + crop_box.x, + crop_box.y, + crop_box.x + crop_box.width, + crop_box.y + crop_box.height, + ), + ) + try: + im.save(image_path) + except Exception as e: + logger.warning("Error saving %s: %s", image_path, e) + return False + + crop_info_path = f"{image_dir}/{src_id}_cropinfo.txt" + with open(crop_info_path, "w") as f: + f.write(f"{crop_box.x},{crop_box.y},{crop_box.width},{crop_box.height}") + + # Keep one iNaturalist 
image per OTT for this source. + db.executesql(f"DELETE FROM images_by_ott WHERE ott={s} and src={s};", (ott, src)) + + is_public_domain = normalise_inat_license(candidate.get("license")) == "cc0" + verified = 1 if candidate.get("verified") else 0 + + db.executesql( + "INSERT INTO images_by_ott " + "(ott,src,src_id,url,rating,rating_confidence,best_any,best_verified,best_pd," + "overall_best_any,overall_best_verified,overall_best_pd,rights,licence,updated) " + f"VALUES ({s},{s},{s},{s},{s},{s},{s},{s},{s},{s},{s},{s},{s},{s},{s});", + ( + ott, + src, + src_id, + page_url, + rating, + None, + 1, + verified, + (1 if is_public_domain else 0), + 1, + verified, + (1 if is_public_domain else 0), + candidate.get("rights"), + candidate.get("license_string") or inat_license_string(candidate.get("license")), + datetime.datetime.now().isoformat(), + ), + ) + db.commit() + + process_image_bits.resolve(db, ott) + logger.info("Saved iNaturalist photo %s for ott=%s in %s", src_id, ott, image_path) + return True + + +def get_leaf_record(db, ott_or_taxon): + s = placeholder(db) + sql = "SELECT ott,wikidata,name FROM ordered_leaves WHERE " + if ott_or_taxon.lstrip("-").isnumeric(): + ott_or_taxon_type = "ott" + sql += f"ott={s};" + else: + ott_or_taxon_type = "name" + sql += f"name={s};" + + result = db.executesql(sql, (ott_or_taxon,)) + if len(result) > 1: + logger.error("Multiple results for '%s'", ott_or_taxon) + return None + if len(result) == 0: + logger.error("%s '%s' not found in ordered_leaves table", ott_or_taxon_type, ott_or_taxon) + return None + ott, qid, name = result[0] + return {"ott": ott, "qid": qid, "taxon": name, "img": None} + + +def process_leaf( + db, + ott_or_taxon, + taxa_data=None, + rating=None, + output_dir=None, + cropper=None, + image_source="api", + inat_db=None, + per_page=DEFAULT_API_PER_PAGE, + image_size=DEFAULT_IMAGE_SIZE, + inat_taxon_id=None, +): + leaf_data = get_leaf_record(db, ott_or_taxon) + if leaf_data is None: + return False + + qid = 
leaf_data["qid"] or get_qid_from_taxa_data(taxa_data, leaf_data["taxon"]) + inat_taxon_id = inat_taxon_id or get_inat_taxon_id_from_taxa_data(taxa_data, leaf_data["taxon"]) + if not inat_taxon_id: + if not qid: + logger.warning("No Wikidata QID or iNaturalist taxon ID for %s. Skipping.", leaf_data["taxon"]) + return False + inat_taxon_id = get_inat_taxon_id_for_qid(qid) + if not inat_taxon_id: + logger.warning("No Wikidata P3151/iNaturalist taxon ID for %s. Skipping.", leaf_data["taxon"]) + return False + + if rating is None: + rating = DEFAULT_INAT_IMAGE_RATING + + candidate = get_best_photo( + inat_taxon_id, + image_source=image_source, + inat_db=inat_db, + per_page=per_page, + image_size=image_size, + ) + if not candidate: + logger.warning("No allowed iNaturalist image found for %s (iNat taxon %s)", leaf_data["taxon"], inat_taxon_id) + return False + + return save_inat_image(db, leaf_data, candidate, rating, output_dir, cropper) + + +def process_clade( + db, + ott_or_taxon, + dump_file, + taxa_data=None, + rating=None, + output_dir=None, + cropper=None, + image_source="metadata", + inat_db=None, + per_page=DEFAULT_API_PER_PAGE, + image_size=DEFAULT_IMAGE_SIZE, +): + s = placeholder(db) + sql = "SELECT leaf_lft,leaf_rgt,ott FROM ordered_nodes WHERE " + if ott_or_taxon.isnumeric(): + sql += f"ott={s};" + else: + sql += f"name={s};" + rows = db.executesql(sql, (ott_or_taxon,)) + if len(rows) == 0: + raise ValueError(f"'{ott_or_taxon}' not found in ordered_nodes table") + if len(rows) > 1: + logger.error("Multiple results for '%s', choose out of these OTTs: %s", ott_or_taxon, [r[2] for r in rows]) + return + leaf_lft, leaf_rgt, _ = rows[0] + + sql = f""" + SELECT wikidata, ordered_leaves.ott, name, url FROM ordered_leaves + LEFT OUTER JOIN (SELECT ott,src,url FROM images_by_ott + WHERE src={s}) as inat_images_by_ott ON ordered_leaves.ott=inat_images_by_ott.ott + WHERE ordered_leaves.id >= {s} AND ordered_leaves.id <= {s}; + """ + rows = db.executesql(sql, (INAT_SRC, 
leaf_lft, leaf_rgt)) + + leaves_data = {} + for qid, ott, name, url in rows: + if ott is None: + continue + if not qid: + qid = get_qid_from_taxa_data(taxa_data, name) + if not qid: + logger.warning("No qid for %s. Skipping it.", name) + continue + leaves_data[qid] = {"ott": ott, "taxon": name, "img": url} + logger.info("Found %s leaves in the database", len(leaves_data)) + + if rating is None: + rating = DEFAULT_INAT_IMAGE_RATING + + leaves_that_got_images = set() + for qid, inat_taxon_id in enumerate_wikidata_dump_items_with_inat_ids(dump_file): + if qid not in leaves_data: + continue + candidate = get_best_photo( + inat_taxon_id, + image_source=image_source, + inat_db=inat_db, + per_page=per_page, + image_size=image_size, + ) + if candidate and save_inat_image(db, leaves_data[qid], candidate, rating, output_dir, cropper): + leaves_that_got_images.add(qid) + + missing = "" + for qid, leaf_data in leaves_data.items(): + if qid not in leaves_that_got_images: + missing += f"\n ott={leaf_data['ott']} qid={qid} {leaf_data['taxon']}" + if missing: + logger.info("Taxa for which we couldn't find an allowed iNaturalist image:%s", missing) + + +def process_args(args): + outdir = args.output_dir + config = read_config(args.conf_file) + database = config.get("db", "uri") + + if outdir is None: + outdir = os.path.join(os.path.dirname(os.path.abspath(__file__)), default_outdir) + if not os.path.exists(outdir): + logger.error("Output directory '%s' does not exist", outdir) + return + + db = connect_to_database(database) + cropper = None if args.no_azure_crop else AzureImageCropper(config) + + inat_db = None + if args.image_source == "metadata": + inat_db_uri = get_inat_metadata_db_uri(config, args) + inat_db = connect_to_metadata_database(inat_db_uri) + + taxa_data = {} + if args.taxa_data_file: + with open(args.taxa_data_file) as f: + taxa_data = json.load(f) + + if args.subcommand == "leaf": + if len(args.ott_or_taxa) > 1 and args.inat_taxon_id is not None: + raise 
ValueError("Cannot specify --inat-taxon-id when processing multiple taxa") + for name in args.ott_or_taxa: + process_leaf( + db, + name, + taxa_data=taxa_data, + rating=args.rating, + output_dir=outdir, + cropper=cropper, + image_source=args.image_source, + inat_db=inat_db, + per_page=args.api_per_page, + image_size=args.image_size, + inat_taxon_id=args.inat_taxon_id, + ) + elif args.subcommand == "clade": + for name in args.ott_or_taxa: + process_clade( + db, + name, + args.wd_dump, + taxa_data=taxa_data, + rating=args.rating, + output_dir=outdir, + cropper=cropper, + image_source=args.image_source, + inat_db=inat_db, + per_page=args.api_per_page, + image_size=args.image_size, + ) + + if inat_db is not None: + inat_db.close() + db.close() + + +def setup_logging(args): + log_level = "WARN" + if args.quiet > 0: + log_level = "ERROR" + if args.quiet > 1: + log_level = "CRITICAL" + if args.quiet > 2: + log_level = logging.CRITICAL + 1 + else: + if args.verbosity > 0: + log_level = "INFO" + if args.verbosity > 1: + log_level = "DEBUG" + logging.basicConfig(level=log_level) + return log_level + + +def main(): + parser = argparse.ArgumentParser(description=__doc__.split("\n\n")[0]) + subparsers = parser.add_subparsers(help="help for subcommand", dest="subcommand") + + def add_common_args(subparser): + subparser.add_argument("-v", "--verbosity", action="count", default=0) + subparser.add_argument("-q", "--quiet", action="count", default=0) + subparser.add_argument("--taxa-data-file", default=None, help="JSON file with persisted data about various taxa") + subparser.add_argument( + "--no-azure-crop", + action="store_true", + help="Do not use the Azure Vision API to crop images; use a centered crop instead.", + ) + subparser.add_argument( + "-o", + "--output-dir", + default=None, + help=( + "The location to save image files (e.g. FinalOutputs/img). 
" + "Files are saved to output_dir/{src_flag}/{last-three-digits}/{photo_id}.jpg" + ), + ) + subparser.add_argument("-c", "--conf-file", default=None, help=f"The configuration file. Defaults to {default_appconfig}") + subparser.add_argument( + "--image-source", + choices=("api", "metadata"), + default="api", + help=( + "api = use iNaturalist API ordered by votes; metadata = use local Open Data metadata DB. " + "Metadata mode cannot rank by votes because the dump does not include vote counts." + ), + ) + subparser.add_argument( + "--inat-db-uri", + default=None, + help="pydal URI for local iNaturalist metadata DB, e.g. postgres://user:password@host/inaturalist-open-data", + ) + subparser.add_argument("--api-per-page", type=int, default=DEFAULT_API_PER_PAGE) + subparser.add_argument("--image-size", choices=("medium", "large"), default=DEFAULT_IMAGE_SIZE) + subparser.add_argument("-r", "--rating", type=int, help=f"Image rating; defaults to {DEFAULT_INAT_IMAGE_RATING}") + + parser_leaf = subparsers.add_parser("leaf", help="Process one or more leaves") + parser_leaf.add_argument("ott_or_taxa", nargs="+", type=str, help="Leaf OTTs or names to process") + parser_leaf.add_argument("--inat-taxon-id", type=int, default=None, help="Manual iNaturalist taxon ID override for one leaf") + add_common_args(parser_leaf) + + parser_clade = subparsers.add_parser("clade", help="Process a full clade") + parser_clade.add_argument("wd_dump", type=str, help="Filtered Wikidata JSON dump containing P3151 claims") + parser_clade.add_argument("ott_or_taxa", nargs="+", type=str, help="Root node OTT or name") + add_common_args(parser_clade) + + args = parser.parse_args() + if not args.subcommand: + parser.print_help() + sys.exit() + + setup_logging(args) + process_args(args) + + +if __name__ == "__main__": + main() From 2753df6800e6aca99c0e7c632de9540ea648d921 Mon Sep 17 00:00:00 2001 From: TheSensinator Date: Fri, 5 Jun 2026 01:19:05 -0500 Subject: [PATCH 4/8] Add get_inat_images function to 
pyproject.toml --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 2138c83..3273878 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,7 @@ discover_latest_enwiki_sql_url = "oz_tree_build.utilities.filter_wikipedia_sql:d download_opentree = "oz_tree_build.utilities.download_opentree:main" CSV_base_table_creator = "oz_tree_build.taxon_mapping_and_popularity.CSV_base_table_creator:main" get_wiki_images = "oz_tree_build.images_and_vernaculars.get_wiki_images:main" +get_inat_images = "oz_tree_build.images_and_vernaculars.get_inat_images:main" process_image_bits = "oz_tree_build.images_and_vernaculars.process_image_bits:main" make_js_treefiles = "oz_tree_build.utilities.make_js_treefiles:main" format_newick = "oz_tree_build.newick.format_newick:main" From fed96df47203d72d240787857c9bf6ec4e320fc8 Mon Sep 17 00:00:00 2001 From: TheSensinator Date: Fri, 5 Jun 2026 01:19:40 -0500 Subject: [PATCH 5/8] Implement tests for get_inat_images functionality Add unit tests for the get_inat_images module, covering various scenarios including taxon ID extraction, license filtering, and API selection. 
--- tests/test_get_inat_images.py | 161 ++++++++++++++++++++++++++++++++++ 1 file changed, 161 insertions(+) create mode 100644 tests/test_get_inat_images.py diff --git a/tests/test_get_inat_images.py b/tests/test_get_inat_images.py new file mode 100644 index 0000000..8234e4d --- /dev/null +++ b/tests/test_get_inat_images.py @@ -0,0 +1,161 @@ +from types import SimpleNamespace +from unittest import mock + +import pytest + +from oz_tree_build.images_and_vernaculars import get_inat_images +from oz_tree_build.utilities.db_helper import connect_to_database, placeholder + + +class MockResponse: + def __init__(self, status_code=200, json_data=None, content=b""): + self.status_code = status_code + self._json_data = json_data or {} + self.content = content + self.text = "" + + def json(self): + return self._json_data + + def iter_content(self, chunk_size): + return [self.content] + + +def test_get_inat_taxon_id_from_json_item_prefers_preferred_claim(): + json_item = { + "id": "Q2355832", + "claims": { + "P3151": [ + {"rank": "normal", "mainsnak": {"datavalue": {"value": "111"}}}, + {"rank": "preferred", "mainsnak": {"datavalue": {"value": "319598"}}}, + ] + }, + } + assert get_inat_images.get_inat_taxon_id_from_json_item(json_item) == 319598 + + +@pytest.mark.parametrize( + "licence,expected", + [ + ("CC0", True), + ("cc-by", True), + ("CC-BY-SA", True), + ("cc-by-nc", False), + ("cc-by-nd", False), + ("all-rights-reserved", False), + (None, False), + ], +) +def test_license_filter_is_strict(licence, expected): + assert get_inat_images.is_allowed_inat_license(licence) is expected + + +def test_inat_open_data_url_uses_medium_not_square(): + url = get_inat_images.inat_photo_url_from_open_data(12345, "jpg") + assert url == "https://inaturalist-open-data.s3.amazonaws.com/photos/12345/medium.jpg" + + +def test_api_selection_rejects_non_allowed_licences(): + api_response = { + "results": [ + { + "uuid": "obs-1", + "quality_grade": "research", + "cached_votes_total": 999, + 
"user": {"login": "badlicence"}, + "photos": [ + { + "id": 1, + "url": "https://static.inaturalist.org/photos/1/square.jpg", + "license_code": "cc-by-nc", + } + ], + }, + { + "uuid": "obs-2", + "quality_grade": "research", + "cached_votes_total": 10, + "user": {"login": "goodlicence"}, + "photos": [ + { + "id": 2, + "url": "https://static.inaturalist.org/photos/2/square.jpg", + "license_code": "cc-by", + "original_dimensions": {"width": 1000, "height": 800}, + } + ], + }, + ] + } + + def fake_get(url, params=None, headers=None, stream=False): + assert params["photo_license"] == "cc-by,cc-by-sa,cc0" + assert params["order_by"] == "votes" + return MockResponse(json_data=api_response) + + with mock.patch("requests.get", side_effect=fake_get): + candidate = get_inat_images.get_best_photo_from_inat_api(319598) + + assert candidate["photo_id"] == 2 + assert candidate["license"] == "cc-by" + assert candidate["image_url"].endswith("/medium.jpg") + + +def test_metadata_selection_filters_to_allowed_license_and_research_grade(tmp_path): + db_path = tmp_path / "inat.sqlite" + inat_db = connect_to_database(database=f"sqlite://{db_path}") + s = placeholder(inat_db) + inat_db.executesql( + """CREATE TABLE observations ( + observation_uuid TEXT NOT NULL, + observer_id INTEGER, + taxon_id INTEGER, + quality_grade TEXT, + observed_on TEXT + );""" + ) + inat_db.executesql( + """CREATE TABLE photos ( + photo_id INTEGER NOT NULL, + observation_uuid TEXT NOT NULL, + observer_id INTEGER, + extension TEXT, + license TEXT, + width INTEGER, + height INTEGER, + position INTEGER + );""" + ) + inat_db.executesql( + """CREATE TABLE observers ( + observer_id INTEGER NOT NULL, + login TEXT, + name TEXT + );""" + ) + inat_db.executesql("INSERT INTO observers VALUES (1, 'observer_login', 'Observer Name');") + inat_db.executesql( + f"INSERT INTO observations VALUES ({s},{s},{s},{s},{s});", + ("obs-bad", 1, 319598, "research", "2024-01-01"), + ) + inat_db.executesql( + f"INSERT INTO observations 
VALUES ({s},{s},{s},{s},{s});", + ("obs-good", 1, 319598, "research", "2024-01-02"), + ) + inat_db.executesql( + f"INSERT INTO photos VALUES ({s},{s},{s},{s},{s},{s},{s},{s});", + (1, "obs-bad", 1, "jpg", "cc-by-nc", 4000, 4000, 0), + ) + inat_db.executesql( + f"INSERT INTO photos VALUES ({s},{s},{s},{s},{s},{s},{s},{s});", + (2, "obs-good", 1, "jpeg", "CC-BY-SA", 2000, 1000, 0), + ) + inat_db.commit() + + candidate = get_inat_images.get_best_photo_from_metadata_db(inat_db, 319598) + assert candidate["photo_id"] == 2 + assert candidate["license"] == "cc-by-sa" + assert candidate["license_string"] == "CC-BY-SA" + assert candidate["rights"] == "© Observer Name, some rights reserved (CC-BY-SA)" + assert candidate["image_url"].endswith("/2/medium.jpeg") + inat_db.close() From 7a945eee2f3164015c7436f351538bdd12357ac5 Mon Sep 17 00:00:00 2001 From: TheSensinator Date: Fri, 5 Jun 2026 13:52:05 -0500 Subject: [PATCH 6/8] Add duckdb as a dependency in pyproject.toml --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 3273878..d6fb0ec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,7 @@ dependencies = [ "cryptography>=42.0", "mwparserfromhell>=0.6.6", "requests-cache>=1.2.1", + "duckdb>=1.0", "dvc[s3]>=3.0", ] From 2144b9fd93def4d71d64c0667a3035164253b8fc Mon Sep 17 00:00:00 2001 From: TheSensinator Date: Fri, 5 Jun 2026 14:06:18 -0500 Subject: [PATCH 7/8] Switch iNaturalist metadata handling to DuckDB Refactor to use DuckDB for iNaturalist metadata, updating database connection methods and SQL queries accordingly. 
--- .../images_and_vernaculars/get_inat_images.py | 192 ++++++++++++++---- 1 file changed, 148 insertions(+), 44 deletions(-) diff --git a/oz_tree_build/images_and_vernaculars/get_inat_images.py b/oz_tree_build/images_and_vernaculars/get_inat_images.py index 5ecdb9e..9269e71 100644 --- a/oz_tree_build/images_and_vernaculars/get_inat_images.py +++ b/oz_tree_build/images_and_vernaculars/get_inat_images.py @@ -31,7 +31,10 @@ python -m oz_tree_build.images_and_vernaculars.get_inat_images clade \ OneZoom_latest-all.json 563151 --image-source metadata \ - --inat-db-uri postgres://user:password@localhost/inaturalist-open-data + --inat-duckdb-path data/iNaturalist/inaturalist.duckdb + + python -m oz_tree_build.images_and_vernaculars.get_inat_images stats \ + --inat-duckdb-path data/iNaturalist/inaturalist.duckdb """ import argparse @@ -45,9 +48,9 @@ from pathlib import Path from urllib.parse import urlparse +import duckdb import requests from PIL import Image -from pydal import DAL from .._OZglobals import src_flags from ..utilities.db_helper import ( @@ -68,6 +71,13 @@ # because OneZoom usage may not be strictly non-commercial in every context. ALLOWED_INAT_PHOTO_LICENSES = frozenset({"cc0", "cc-by", "cc-by-sa"}) ALLOWED_INAT_PHOTO_LICENSES_SQL = tuple(sorted(ALLOWED_INAT_PHOTO_LICENSES)) +# SQL predicate used against the iNaturalist metadata dump. Keep it aligned +# with normalise_inat_license()/is_allowed_inat_license(), but do the cheap +# filtering inside DuckDB so the full dump does not have to be loaded into +# Python. Non-commercial and no-derivatives variants are deliberately excluded. +INAT_USABLE_LICENSE_SQL = """ + normalized_license IN ('cc0', 'cc-by', 'cc-by-sa') +""" INAT_SRC = src_flags["inat"] DEFAULT_INAT_IMAGE_RATING = 34000 @@ -96,7 +106,7 @@ def normalise_inat_license(license_code): return "cc0" # iNaturalist metadata sometimes stores an upper-case code, and API values # sometimes include version suffixes. Keep only the family we explicitly allow. 
- for allowed in ALLOWED_INAT_PHOTO_LICENSES: + for allowed in sorted(ALLOWED_INAT_PHOTO_LICENSES, key=len, reverse=True): if normalized == allowed or normalized.startswith(allowed + "-"): return allowed return normalized @@ -340,23 +350,27 @@ def get_best_photo_from_inat_api(inat_taxon_id, *, per_page=DEFAULT_API_PER_PAGE return max(candidates, key=score_candidate) -def connect_to_metadata_database(metadata_db_uri): - """Connect to the local iNaturalist metadata database.""" - if not metadata_db_uri: - raise ValueError("metadata_db_uri is required for --image-source metadata") - return DAL(metadata_db_uri) +def connect_to_metadata_database(metadata_db_path): + """Connect to the local iNaturalist DuckDB metadata database.""" + if not metadata_db_path: + raise ValueError("--inat-duckdb-path is required for --image-source metadata and stats") + return duckdb.connect(str(metadata_db_path), read_only=True) -def get_inat_metadata_db_uri(config, args): - if args.inat_db_uri: - return args.inat_db_uri +def get_inat_metadata_db_path(config, args): + if args.inat_duckdb_path: + return args.inat_duckdb_path if config.has_section("inat"): - for key in ("metadata_uri", "metadata_db_uri", "uri"): + for key in ("duckdb_path", "metadata_duckdb_path", "metadata_db_path", "metadata_uri", "metadata_db_uri", "uri"): if config.has_option("inat", key): return config.get("inat", key) return None +def run_duckdb_query(inat_db, sql, params=None): + return inat_db.execute(sql, params or ()).fetchall() + + def rows_to_dicts(columns, rows): return [dict(zip(columns, row)) for row in rows] @@ -369,7 +383,6 @@ def get_best_photo_from_metadata_db(inat_db, inat_taxon_id, *, image_size=DEFAUL It is the scalable fallback: research-grade, allowed licence, first photo position, then largest image. 
""" - s = placeholder(inat_db) columns = [ "photo_id", "extension", @@ -384,36 +397,49 @@ def get_best_photo_from_metadata_db(inat_db, inat_taxon_id, *, image_size=DEFAUL "observer_name", ] sql = f""" + WITH candidate_photos AS ( + SELECT + p.photo_id, + p.extension, + p.license, + LOWER(REPLACE(COALESCE(p.license, ''), '_', '-')) AS normalized_license, + p.width, + p.height, + p.position, + obs.observation_uuid, + obs.quality_grade, + obs.observed_on, + o.login AS observer_login, + o.name AS observer_name + FROM observations obs + JOIN photos p ON obs.observation_uuid = p.observation_uuid + LEFT JOIN observers o ON p.observer_id = o.observer_id + WHERE obs.taxon_id = ? + AND LOWER(obs.quality_grade) IN ('research', 'research grade') + AND p.photo_id IS NOT NULL + AND p.extension IS NOT NULL + ) SELECT - p.photo_id, - p.extension, - p.license, - p.width, - p.height, - p.position, - obs.observation_uuid, - obs.quality_grade, - obs.observed_on, - o.login AS observer_login, - o.name AS observer_name - FROM observations obs - JOIN photos p ON obs.observation_uuid = p.observation_uuid - LEFT JOIN observers o ON p.observer_id = o.observer_id - WHERE obs.taxon_id = {s} - AND LOWER(p.license) IN ({s},{s},{s}) - AND LOWER(obs.quality_grade) IN ('research', 'research grade') - AND p.photo_id IS NOT NULL - AND p.extension IS NOT NULL + photo_id, + extension, + license, + width, + height, + position, + observation_uuid, + quality_grade, + observed_on, + observer_login, + observer_name + FROM candidate_photos + WHERE {INAT_USABLE_LICENSE_SQL} ORDER BY - COALESCE(p.position, 9999) ASC, - (COALESCE(p.width, 0) * COALESCE(p.height, 0)) DESC, - p.photo_id ASC + COALESCE(position, 9999) ASC, + (COALESCE(width, 0) * COALESCE(height, 0)) DESC, + photo_id ASC LIMIT 1; """ - rows = rows_to_dicts( - columns, - inat_db.executesql(sql, (inat_taxon_id, *ALLOWED_INAT_PHOTO_LICENSES_SQL)), - ) + rows = rows_to_dicts(columns, run_duckdb_query(inat_db, sql, (inat_taxon_id,))) if not rows: 
return None row = rows[0] @@ -454,6 +480,60 @@ def get_best_photo(inat_taxon_id, *, image_source, inat_db=None, per_page=DEFAUL raise ValueError(f"Unknown image_source: {image_source}") +def get_usable_photo_species_stats(inat_db): + """Return counts of iNaturalist species that have at least one usable photo.""" + columns = [ + "inat_species", + "inat_species_with_usable_photos", + "inat_species_with_research_grade_usable_photos", + ] + sql = f""" + WITH photo_observations AS ( + SELECT + obs.taxon_id, + obs.quality_grade, + LOWER(REPLACE(COALESCE(p.license, ''), '_', '-')) AS normalized_license + FROM observations obs + JOIN photos p ON obs.observation_uuid = p.observation_uuid + WHERE p.photo_id IS NOT NULL + ), + species_taxa AS ( + SELECT taxon_id + FROM taxa + WHERE LOWER(rank) = 'species' + ), + usable_species AS ( + SELECT DISTINCT po.taxon_id + FROM photo_observations po + JOIN species_taxa st ON po.taxon_id = st.taxon_id + WHERE {INAT_USABLE_LICENSE_SQL} + ), + usable_research_grade_species AS ( + SELECT DISTINCT po.taxon_id + FROM photo_observations po + JOIN species_taxa st ON po.taxon_id = st.taxon_id + WHERE {INAT_USABLE_LICENSE_SQL} + AND LOWER(po.quality_grade) IN ('research', 'research grade') + ) + SELECT + (SELECT COUNT(*) FROM species_taxa) AS inat_species, + (SELECT COUNT(*) FROM usable_species) AS inat_species_with_usable_photos, + (SELECT COUNT(*) FROM usable_research_grade_species) AS inat_species_with_research_grade_usable_photos; + """ + rows = rows_to_dicts(columns, run_duckdb_query(inat_db, sql)) + return rows[0] if rows else dict.fromkeys(columns, 0) + + +def print_usable_photo_species_stats(inat_db): + stats = get_usable_photo_species_stats(inat_db) + print(f"iNaturalist species: {stats['inat_species']:,}") + print(f"iNaturalist species with >=1 usable photo: {stats['inat_species_with_usable_photos']:,}") + print( + "iNaturalist species with >=1 usable research-grade photo: " + 
f"{stats['inat_species_with_research_grade_usable_photos']:,}" + ) + + def safe_src_id(candidate): src_id = candidate.get("src_id") or candidate.get("photo_id") if src_id is None: @@ -711,8 +791,18 @@ def process_clade( def process_args(args): - outdir = args.output_dir config = read_config(args.conf_file) + + if args.subcommand == "stats": + inat_db_path = get_inat_metadata_db_path(config, args) + inat_db = connect_to_metadata_database(inat_db_path) + try: + print_usable_photo_species_stats(inat_db) + finally: + inat_db.close() + return + + outdir = args.output_dir database = config.get("db", "uri") if outdir is None: @@ -726,8 +816,8 @@ def process_args(args): inat_db = None if args.image_source == "metadata": - inat_db_uri = get_inat_metadata_db_uri(config, args) - inat_db = connect_to_metadata_database(inat_db_uri) + inat_db_path = get_inat_metadata_db_path(config, args) + inat_db = connect_to_metadata_database(inat_db_path) taxa_data = {} if args.taxa_data_file: @@ -820,11 +910,13 @@ def add_common_args(subparser): "api = use iNaturalist API ordered by votes; metadata = use local Open Data metadata DB. " "Metadata mode cannot rank by votes because the dump does not include vote counts." ), - ) + subparser.add_argument( + "--inat-duckdb-path", "--inat-db-uri", + dest="inat_duckdb_path", default=None, - help="pydal URI for local iNaturalist metadata DB, e.g. postgres://user:password@host/inaturalist-open-data", + help="Path to local iNaturalist DuckDB metadata DB, e.g. 
data/iNaturalist/inaturalist.duckdb", ) subparser.add_argument("--api-per-page", type=int, default=DEFAULT_API_PER_PAGE) subparser.add_argument("--image-size", choices=("medium", "large"), default=DEFAULT_IMAGE_SIZE) @@ -840,6 +932,18 @@ def add_common_args(subparser): parser_clade.add_argument("ott_or_taxa", nargs="+", type=str, help="Root node OTT or name") add_common_args(parser_clade) + parser_stats = subparsers.add_parser("stats", help="Print usable-photo species counts from the iNaturalist DuckDB metadata DB") + parser_stats.add_argument("-v", "--verbosity", action="count", default=0) + parser_stats.add_argument("-q", "--quiet", action="count", default=0) + parser_stats.add_argument("-c", "--conf-file", default=None, help=f"The configuration file. Defaults to {default_appconfig}") + parser_stats.add_argument( + "--inat-duckdb-path", + "--inat-db-uri", + dest="inat_duckdb_path", + default=None, + help="Path to local iNaturalist DuckDB metadata DB, e.g. data/iNaturalist/inaturalist.duckdb", + ) + args = parser.parse_args() if not args.subcommand: parser.print_help() From 2be8d1833e50467ccc3102be08108929a76292c9 Mon Sep 17 00:00:00 2001 From: TheSensinator Date: Fri, 5 Jun 2026 14:12:56 -0500 Subject: [PATCH 8/8] Switch to DuckDB for test database setup Refactor database creation to use DuckDB and update test cases accordingly. 
--- tests/test_get_inat_images.py | 64 +++++++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 18 deletions(-) diff --git a/tests/test_get_inat_images.py b/tests/test_get_inat_images.py index 8234e4d..236803e 100644 --- a/tests/test_get_inat_images.py +++ b/tests/test_get_inat_images.py @@ -1,10 +1,10 @@ from types import SimpleNamespace from unittest import mock +import duckdb import pytest from oz_tree_build.images_and_vernaculars import get_inat_images -from oz_tree_build.utilities.db_helper import connect_to_database, placeholder class MockResponse: @@ -101,11 +101,10 @@ def fake_get(url, params=None, headers=None, stream=False): assert candidate["image_url"].endswith("/medium.jpg") -def test_metadata_selection_filters_to_allowed_license_and_research_grade(tmp_path): - db_path = tmp_path / "inat.sqlite" - inat_db = connect_to_database(database=f"sqlite://{db_path}") - s = placeholder(inat_db) - inat_db.executesql( +def create_test_inat_duckdb(tmp_path): + db_path = tmp_path / "inat.duckdb" + inat_db = duckdb.connect(str(db_path)) + inat_db.execute( """CREATE TABLE observations ( observation_uuid TEXT NOT NULL, observer_id INTEGER, @@ -114,7 +113,7 @@ def test_metadata_selection_filters_to_allowed_license_and_research_grade(tmp_pa observed_on TEXT );""" ) - inat_db.executesql( + inat_db.execute( """CREATE TABLE photos ( photo_id INTEGER NOT NULL, observation_uuid TEXT NOT NULL, @@ -126,32 +125,51 @@ def test_metadata_selection_filters_to_allowed_license_and_research_grade(tmp_pa position INTEGER );""" ) - inat_db.executesql( + inat_db.execute( """CREATE TABLE observers ( observer_id INTEGER NOT NULL, login TEXT, name TEXT );""" ) - inat_db.executesql("INSERT INTO observers VALUES (1, 'observer_login', 'Observer Name');") - inat_db.executesql( - f"INSERT INTO observations VALUES ({s},{s},{s},{s},{s});", + inat_db.execute( + """CREATE TABLE taxa ( + taxon_id INTEGER NOT NULL, + rank TEXT + );""" + ) + inat_db.execute("INSERT INTO observers VALUES (1, 
'observer_login', 'Observer Name');") + inat_db.execute("INSERT INTO taxa VALUES (319598, 'species'), (999, 'species'), (111, 'genus');") + inat_db.execute( + "INSERT INTO observations VALUES (?, ?, ?, ?, ?);", ("obs-bad", 1, 319598, "research", "2024-01-01"), ) - inat_db.executesql( - f"INSERT INTO observations VALUES ({s},{s},{s},{s},{s});", + inat_db.execute( + "INSERT INTO observations VALUES (?, ?, ?, ?, ?);", ("obs-good", 1, 319598, "research", "2024-01-02"), ) - inat_db.executesql( - f"INSERT INTO photos VALUES ({s},{s},{s},{s},{s},{s},{s},{s});", + inat_db.execute( + "INSERT INTO observations VALUES (?, ?, ?, ?, ?);", + ("obs-nonresearch", 1, 999, "casual", "2024-01-03"), + ) + inat_db.execute( + "INSERT INTO photos VALUES (?, ?, ?, ?, ?, ?, ?, ?);", (1, "obs-bad", 1, "jpg", "cc-by-nc", 4000, 4000, 0), ) - inat_db.executesql( - f"INSERT INTO photos VALUES ({s},{s},{s},{s},{s},{s},{s},{s});", + inat_db.execute( + "INSERT INTO photos VALUES (?, ?, ?, ?, ?, ?, ?, ?);", (2, "obs-good", 1, "jpeg", "CC-BY-SA", 2000, 1000, 0), ) - inat_db.commit() + inat_db.execute( + "INSERT INTO photos VALUES (?, ?, ?, ?, ?, ?, ?, ?);", + (3, "obs-nonresearch", 1, "jpg", "CC0", 1000, 1000, 0), + ) + return inat_db + +def test_metadata_selection_filters_to_allowed_license_and_research_grade(tmp_path): + inat_db = create_test_inat_duckdb(tmp_path) + candidate = get_inat_images.get_best_photo_from_metadata_db(inat_db, 319598) assert candidate["photo_id"] == 2 assert candidate["license"] == "cc-by-sa" @@ -159,3 +177,13 @@ def test_metadata_selection_filters_to_allowed_license_and_research_grade(tmp_pa assert candidate["rights"] == "© Observer Name, some rights reserved (CC-BY-SA)" assert candidate["image_url"].endswith("/2/medium.jpeg") inat_db.close() + + +def test_usable_photo_species_stats(tmp_path): + inat_db = create_test_inat_duckdb(tmp_path) + + stats = get_inat_images.get_usable_photo_species_stats(inat_db) + assert stats["inat_species"] == 2 + assert 
stats["inat_species_with_usable_photos"] == 2 + assert stats["inat_species_with_research_grade_usable_photos"] == 1 + inat_db.close()