From 5ae500a13add23b3235655beefdc625993f58749 Mon Sep 17 00:00:00 2001
From: Koen Vossen
Date: Wed, 8 Apr 2026 16:08:31 +0200
Subject: [PATCH] Truncate long identifier values in file paths

Values longer than 40 chars are truncated and suffixed with an 8-char MD5
hash to preserve uniqueness. Truncation happens before URL-encoding to
avoid cutting encoded sequences like %2F.
---
 .../services/identifier_key_transformer.py    |  9 ++-
 .../tests/test_identifier_transformer.py      | 60 +++++++++++++++++++
 2 files changed, 67 insertions(+), 2 deletions(-)
 create mode 100644 ingestify/tests/test_identifier_transformer.py

diff --git a/ingestify/domain/services/identifier_key_transformer.py b/ingestify/domain/services/identifier_key_transformer.py
index b4c939a..d3687a2 100644
--- a/ingestify/domain/services/identifier_key_transformer.py
+++ b/ingestify/domain/services/identifier_key_transformer.py
@@ -143,8 +143,13 @@ def to_path(self, provider: str, dataset_type: str, identifier: dict) -> str:
                 path_parts.append(f"{key}_{suffix}={transformed_value}")
 
             # Append the original value (either standalone for identity or alongside transformed).
-            # URL-encode the value so special characters, spaces, etc. are safe in paths.
-            path_parts.append(f"{key}={quote(str(value), safe='')}")
+            # Truncate long values before encoding to keep paths under
+            # filesystem/GCS limits. Append a short hash to preserve uniqueness.
+            str_value = str(value)
+            if len(str_value) > 40:
+                short_hash = hashlib.md5(str_value.encode()).hexdigest()[:8]
+                str_value = f"{str_value[:40]}_{short_hash}"
+            path_parts.append(f"{key}={quote(str_value, safe='')}")
 
         # Join the parts with `/` to form the full path
         return "/".join(path_parts)
diff --git a/ingestify/tests/test_identifier_transformer.py b/ingestify/tests/test_identifier_transformer.py
new file mode 100644
index 0000000..509c3c7
--- /dev/null
+++ b/ingestify/tests/test_identifier_transformer.py
@@ -0,0 +1,60 @@
+"""Tests for IdentifierTransformer.to_path."""
+from ingestify.domain.services.identifier_key_transformer import IdentifierTransformer
+
+
+def test_to_path_short_value_unchanged():
+    t = IdentifierTransformer()
+    path = t.to_path("p", "d", {"key": "short"})
+    assert path == "key=short"
+
+
+def test_to_path_special_chars_url_encoded():
+    t = IdentifierTransformer()
+    path = t.to_path("p", "d", {"key": "$99 mattress"})
+    assert path == "key=%2499%20mattress"
+
+
+def test_to_path_long_value_truncated_with_hash():
+    t = IdentifierTransformer()
+    long_value = "a" * 50
+    path = t.to_path("p", "d", {"key": long_value})
+    # Truncated at 40 chars + _ + 8-char hash
+    assert path.startswith("key=" + "a" * 40 + "_")
+    assert len(path.split("=")[1]) == 40 + 1 + 8  # value_hash
+
+
+def test_to_path_long_value_hash_is_stable():
+    t = IdentifierTransformer()
+    long_value = "keyword " * 10
+    path1 = t.to_path("p", "d", {"key": long_value})
+    path2 = t.to_path("p", "d", {"key": long_value})
+    assert path1 == path2
+
+
+def test_to_path_integer_value_unchanged():
+    t = IdentifierTransformer()
+    path = t.to_path("p", "d", {"id": 12345})
+    assert path == "id=12345"
+
+
+def test_to_path_value_at_limit_unchanged():
+    t = IdentifierTransformer()
+    value = "a" * 40
+    path = t.to_path("p", "d", {"key": value})
+    assert path == f"key={value}"
+
+
+def test_to_path_long_values_with_shared_prefix_differ():
+    t = IdentifierTransformer()
+    prefix = "a" * 40
+    path1 = t.to_path("p", "d", {"key": prefix + "x"})
+    path2 = t.to_path("p", "d", {"key": prefix + "y"})
+    assert path1 != path2
+
+
+def test_to_path_long_value_truncated_before_encoding():
+    t = IdentifierTransformer()
+    path = t.to_path("p", "d", {"key": "/" * 50})
+    # 40 raw chars are kept first, then each is encoded as a complete %2F
+    assert path.startswith("key=" + "%2F" * 40 + "_")
+    assert len(path.split("=")[1]) == 3 * 40 + 1 + 8