Skip to content

Commit 0e103a4

Browse files
authored
Merge pull request #72 from PySport/feature/truncate-long-path-values
Truncate long identifier values in file paths
2 parents 4a564a5 + 5ae500a commit 0e103a4

2 files changed

Lines changed: 44 additions & 2 deletions

File tree

ingestify/domain/services/identifier_key_transformer.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -143,8 +143,13 @@ def to_path(self, provider: str, dataset_type: str, identifier: dict) -> str:
143143
path_parts.append(f"{key}_{suffix}={transformed_value}")
144144

145145
# Append the original value (either standalone for identity or alongside transformed).
146-
# URL-encode the value so special characters, spaces, etc. are safe in paths.
147-
path_parts.append(f"{key}={quote(str(value), safe='')}")
146+
# Truncate long values before encoding to keep paths under
147+
# filesystem/GCS limits. Append a short hash to preserve uniqueness.
148+
str_value = str(value)
149+
if len(str_value) > 40:
150+
short_hash = hashlib.md5(str_value.encode()).hexdigest()[:8]
151+
str_value = f"{str_value[:40]}_{short_hash}"
152+
path_parts.append(f"{key}={quote(str_value, safe='')}")
148153

149154
# Join the parts with `/` to form the full path
150155
return "/".join(path_parts)
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
"""Tests for IdentifierTransformer.to_path."""
2+
from ingestify.domain.services.identifier_key_transformer import IdentifierTransformer
3+
4+
5+
def test_to_path_short_value_unchanged():
    """A short string value is emitted verbatim as ``key=value``."""
    transformer = IdentifierTransformer()
    result = transformer.to_path("p", "d", {"key": "short"})
    expected = "key=short"
    assert result == expected
9+
10+
11+
def test_to_path_special_chars_url_encoded():
    """``$`` and the space in the value come out percent-encoded."""
    transformer = IdentifierTransformer()
    result = transformer.to_path("p", "d", {"key": "$99 mattress"})
    # "$" -> %24 and " " -> %20; the remaining characters pass through.
    expected = "key=%2499%20mattress"
    assert result == expected
15+
16+
17+
def test_to_path_long_value_truncated_with_hash():
    """A value longer than 40 chars is cut to 40 chars plus ``_<hash>``.

    Besides the prefix and overall length, the exact suffix is pinned: it
    must be the first 8 hex digits of the MD5 of the full, untruncated
    value. Checking the length alone would accept any 8-character suffix.
    """
    import hashlib

    t = IdentifierTransformer()
    long_value = "a" * 50
    path = t.to_path("p", "d", {"key": long_value})

    # Truncated at 40 chars + _ + 8-char hash
    assert path.startswith("key=" + "a" * 40 + "_")
    assert len(path.split("=")[1]) == 40 + 1 + 8  # value_hash

    # The suffix is derived from the whole original value, not the prefix.
    expected_hash = hashlib.md5(long_value.encode()).hexdigest()[:8]
    assert path == "key=" + "a" * 40 + "_" + expected_hash
24+
25+
26+
def test_to_path_long_value_hash_is_stable():
    """Two calls with the same long value produce identical paths."""
    transformer = IdentifierTransformer()
    repeated = "keyword " * 10
    # Build a fresh identifier dict for each call, as two separate callers would.
    paths = [transformer.to_path("p", "d", {"key": repeated}) for _ in range(2)]
    assert paths[0] == paths[1]
32+
33+
34+
def test_to_path_integer_value_unchanged():
    """An integer value appears in the path as its decimal string."""
    transformer = IdentifierTransformer()
    identifier = {"id": 12345}
    assert transformer.to_path("p", "d", identifier) == "id=12345"

0 commit comments

Comments
 (0)