diff --git a/application/single_app/config.py b/application/single_app/config.py index a6a3bc99..7094a3c5 100644 --- a/application/single_app/config.py +++ b/application/single_app/config.py @@ -88,7 +88,7 @@ EXECUTOR_TYPE = 'thread' EXECUTOR_MAX_WORKERS = 30 SESSION_TYPE = 'filesystem' -VERSION = "0.238.024" +VERSION = "0.238.025" SECRET_KEY = os.getenv('SECRET_KEY', 'dev-secret-key-change-in-production') diff --git a/application/single_app/functions_documents.py b/application/single_app/functions_documents.py index 0078b456..ce08066d 100644 --- a/application/single_app/functions_documents.py +++ b/application/single_app/functions_documents.py @@ -6420,6 +6420,44 @@ def validate_tags(tags): return True, None, normalized +def sanitize_tags_for_filter(raw_tags): + """ + Sanitize and validate tags for use in filter/query operations. + Silently skips invalid tags since they can never match stored tags. + + Args: + raw_tags: Either a comma-separated string or a list of strings + Returns: + List of valid, normalized tag strings matching ^[a-z0-9_-]+$ + """ + import re + + if isinstance(raw_tags, str): + candidates = [t.strip() for t in raw_tags.split(',') if t.strip()] + elif isinstance(raw_tags, list): + candidates = [t for t in raw_tags if isinstance(t, str)] + else: + return [] + + valid_tags = [] + seen = set() + + for tag in candidates: + normalized = normalize_tag(tag) + if not normalized: + continue + if not re.match(r'^[a-z0-9_-]+$', normalized): + continue + if len(normalized) > 50: + continue + if normalized in seen: + continue + seen.add(normalized) + valid_tags.append(normalized) + + return valid_tags + + def get_workspace_tags(user_id, group_id=None, public_workspace_id=None): """ Get all unique tags used in a workspace with document counts. diff --git a/application/single_app/functions_search.py b/application/single_app/functions_search.py index 65406bd6..4ea75404 100644 --- a/application/single_app/functions_search.py +++ b/application/single_app/functions_search.py @@ -77,18 +77,21 @@ def build_tags_filter(tags_filter): """ Build OData filter clause for tags. tags_filter: List of tag names (already normalized) - Returns: String like "document_tags/any(t: t in ('tag1', 'tag2'))" or empty string + Returns: String like "document_tags/any(t: t eq 'tag1') and ..." or empty string + + Tags are validated to contain only [a-z0-9_-] characters before + being interpolated into the OData expression. """ if not tags_filter or not isinstance(tags_filter, list) or len(tags_filter) == 0: return "" - - # Escape single quotes in tag names - escaped_tags = [tag.replace("'", "''") for tag in tags_filter] - tags_list_str = "', '".join(escaped_tags) - - # For AND logic (all tags must be present), we need multiple any() clauses - # document_tags/any(t: t eq 'tag1') and document_tags/any(t: t eq 'tag2') - tag_conditions = [f"document_tags/any(t: t eq '{tag}')" for tag in escaped_tags] + + from functions_documents import sanitize_tags_for_filter + safe_tags = sanitize_tags_for_filter(tags_filter) + + if not safe_tags: + return "" + + tag_conditions = [f"document_tags/any(t: t eq '{tag}')" for tag in safe_tags] return " and ".join(tag_conditions) def hybrid_search(query, user_id, document_id=None, document_ids=None, top_n=12, doc_scope="all", active_group_id=None, active_group_ids=None, active_public_workspace_id=None, enable_file_sharing=True, tags_filter=None): diff --git a/application/single_app/route_backend_documents.py b/application/single_app/route_backend_documents.py index 568b3b7d..fb4eb19b 100644 --- a/application/single_app/route_backend_documents.py +++ b/application/single_app/route_backend_documents.py @@ -362,9 +362,9 @@ def api_get_user_documents(): # Tags Filter (comma-separated, AND logic - document must have all specified tags) if tags_filter: - from functions_documents import normalize_tag - tags_list = [normalize_tag(t.strip()) for t in tags_filter.split(',') if t.strip()] - + from functions_documents import sanitize_tags_for_filter + tags_list = sanitize_tags_for_filter(tags_filter) + if tags_list: # Each tag must exist in the document's tags array for idx, tag in enumerate(tags_list): diff --git a/application/single_app/route_backend_group_documents.py b/application/single_app/route_backend_group_documents.py index 0d4fa6eb..957c3ee4 100644 --- a/application/single_app/route_backend_group_documents.py +++ b/application/single_app/route_backend_group_documents.py @@ -266,8 +266,8 @@ def api_get_group_documents(): param_count += 1 if tags_filter: - from functions_documents import normalize_tag - tags_list = [normalize_tag(t.strip()) for t in tags_filter.split(',') if t.strip()] + from functions_documents import sanitize_tags_for_filter + tags_list = sanitize_tags_for_filter(tags_filter) if tags_list: for idx, tag in enumerate(tags_list): param_name = f"@tag_{param_count}_{idx}" diff --git a/application/single_app/route_backend_public_documents.py b/application/single_app/route_backend_public_documents.py index c443127c..dab2bdb8 100644 --- a/application/single_app/route_backend_public_documents.py +++ b/application/single_app/route_backend_public_documents.py @@ -196,8 +196,8 @@ def api_list_public_documents(): param_count += 1 if tags_filter: - from functions_documents import normalize_tag - tags_list = [normalize_tag(t.strip()) for t in tags_filter.split(',') if t.strip()] + from functions_documents import sanitize_tags_for_filter + tags_list = sanitize_tags_for_filter(tags_filter) if tags_list: for idx, tag in enumerate(tags_list): param_name = f"@tag_{param_count}_{idx}" diff --git a/application/single_app/tmp_vc3uki_.pdf b/application/single_app/tmp_vc3uki_.pdf deleted file mode 100644 index b4449ee5..00000000 Binary files a/application/single_app/tmp_vc3uki_.pdf and /dev/null differ diff --git a/application/single_app/tmpcoskgt2p.pdf b/application/single_app/tmpcoskgt2p.pdf deleted file mode 100644 index 2b36c364..00000000 Binary files a/application/single_app/tmpcoskgt2p.pdf and /dev/null differ diff --git a/docs/explanation/fixes/TAG_FILTER_INJECTION_FIX.md b/docs/explanation/fixes/TAG_FILTER_INJECTION_FIX.md new file mode 100644 index 00000000..77514833 --- /dev/null +++ b/docs/explanation/fixes/TAG_FILTER_INJECTION_FIX.md @@ -0,0 +1,57 @@ + + +# Tag Filter Injection Fix + +## Issue Description + +Tag filter inputs from user query parameters (`?tags=...`) and JSON request bodies were passed through `normalize_tag()` which only trims whitespace and lowercases, without validating the character set. While Cosmos DB queries used parameterized values (preventing direct SQL injection), the `build_tags_filter()` function in `functions_search.py` constructed OData filter strings via string interpolation, creating a potential OData injection vector in Azure AI Search. + +## Root Cause + +The `validate_tags()` function enforces a strict `^[a-z0-9_-]+$` character whitelist when **saving** tags, but this validation was not applied when **filtering** by tags. The filter path only used `normalize_tag()` (strip + lowercase), allowing arbitrary characters to reach query construction code. + +## Version + +- **Fixed in**: v0.238.025 +- **Affected versions**: Prior versions with tag filtering + +## Technical Details + +### Files Modified + +| File | Change | +|------|--------| +| `application/single_app/functions_documents.py` | Added `sanitize_tags_for_filter()` function | +| `application/single_app/route_backend_documents.py` | Replaced `normalize_tag` with `sanitize_tags_for_filter` in tag filter | +| `application/single_app/route_backend_group_documents.py` | Replaced `normalize_tag` with `sanitize_tags_for_filter` in tag filter | +| `application/single_app/route_backend_public_documents.py` | Replaced `normalize_tag` with `sanitize_tags_for_filter` in tag filter | +| `application/single_app/functions_search.py` | Hardened `build_tags_filter()` to validate tags before OData interpolation | +| `application/single_app/config.py` | Version bump to 0.238.025 | + +### Code Changes + +**New function `sanitize_tags_for_filter()`**: Accepts either a comma-separated string (from query params) or a list of strings (from JSON bodies). Normalizes each tag, validates against `^[a-z0-9_-]+$`, enforces the 50-character limit, deduplicates, and silently drops invalid entries. + +**Route file updates**: The inline `normalize_tag()` + split pattern was replaced with a single call to `sanitize_tags_for_filter()`, which handles splitting, normalizing, and validating internally. + +**`build_tags_filter()` hardening**: Replaced the single-quote escaping approach with `sanitize_tags_for_filter()` validation. Since validated tags can only contain `[a-z0-9_-]`, no escaping is necessary and OData injection is impossible. + +### Defense-in-Depth Layers + +1. **Character whitelist**: `^[a-z0-9_-]+$` prevents any injection-significant characters +2. **Parameterized Cosmos DB queries**: Tag values passed as parameters, not interpolated +3. **Tag normalization**: Lowercase + trim before validation +4. **Length limit**: 50-character maximum per tag + +## Testing + +- **Functional test**: `functional_tests/test_tag_filter_sanitization.py` +- Covers: valid tags, special character rejection, SQL injection attempts, OData injection attempts, edge cases (empty/None/numeric input), length limits, deduplication + +## Impact + +- No functional behavior change for valid tag filters (tags stored in the system already pass `^[a-z0-9_-]+$` validation) +- Invalid characters in tag filters are silently dropped rather than passed through to queries +- OData filter injection via `build_tags_filter()` is now prevented by input validation + + diff --git a/docs/explanation/release_notes.md b/docs/explanation/release_notes.md index c474eb28..9c3d934e 100644 --- a/docs/explanation/release_notes.md +++ b/docs/explanation/release_notes.md @@ -2,6 +2,19 @@ # Feature Release +### **(v0.238.025)** + +#### Bug Fixes + +* **Tag Filter Input Sanitization (Injection Prevention)** + * Added `sanitize_tags_for_filter()` function to validate tag filter inputs against the same `^[a-z0-9_-]+$` character whitelist enforced when saving tags. + * Previously, tag filter values from query parameters only passed through `normalize_tag()` (strip + lowercase) without character validation, allowing arbitrary characters to reach OData filter construction in `build_tags_filter()`. + * Hardened `build_tags_filter()` in `functions_search.py` to validate tags before interpolating into OData expressions, eliminating the OData injection vector. + * Updated tag filter parsing in personal, group, and public document routes to use `sanitize_tags_for_filter()` for defense-in-depth. + * Invalid tag filter values are silently dropped (they cannot match any stored tag). + * **Files Modified**: `functions_documents.py`, `functions_search.py`, `route_backend_documents.py`, `route_backend_group_documents.py`, `route_backend_public_documents.py`. + * (Ref: `TAG_FILTER_INJECTION_FIX.md`, `sanitize_tags_for_filter`) + ### **(v0.238.024)** #### New Features diff --git a/functional_tests/test_tag_filter_sanitization.py b/functional_tests/test_tag_filter_sanitization.py new file mode 100644 index 00000000..18203048 --- /dev/null +++ b/functional_tests/test_tag_filter_sanitization.py @@ -0,0 +1,213 @@ +# test_tag_filter_sanitization.py +#!/usr/bin/env python3 +""" +Functional test for tag filter sanitization. +Version: 0.238.025 +Implemented in: 0.238.025 + +This test ensures that sanitize_tags_for_filter() properly validates and +sanitizes tag inputs for filter/query operations, preventing SQL and OData +injection attacks while allowing valid tags through. +""" + +import sys +import os +sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'application', 'single_app')) + + +def test_sanitize_tags_for_filter(): + """Test the sanitize_tags_for_filter function.""" + print("Testing sanitize_tags_for_filter()...") + + from functions_documents import sanitize_tags_for_filter + + try: + # --- Valid inputs --- + # Basic valid tags (comma-separated string) + result = sanitize_tags_for_filter("valid-tag,another_tag") + assert result == ["valid-tag", "another_tag"], f"Expected ['valid-tag', 'another_tag'], got {result}" + print(" PASS: Basic valid tags (comma-separated string)") + + # Basic valid tags (list) + result = sanitize_tags_for_filter(["valid-tag", "another_tag"]) + assert result == ["valid-tag", "another_tag"], f"Expected ['valid-tag', 'another_tag'], got {result}" + print(" PASS: Basic valid tags (list)") + + # Uppercase normalization + result = sanitize_tags_for_filter("MyTag,UPPER") + assert result == ["mytag", "upper"], f"Expected ['mytag', 'upper'], got {result}" + print(" PASS: Uppercase normalization") + + # Whitespace trimming + result = sanitize_tags_for_filter(" spaced , padded ") + assert result == ["spaced", "padded"], f"Expected ['spaced', 'padded'], got {result}" + print(" PASS: Whitespace trimming") + + # --- Invalid inputs filtered out --- + # Special characters dropped + result = sanitize_tags_for_filter("good,b@d,fine") + assert result == ["good", "fine"], f"Expected ['good', 'fine'], got {result}" + print(" PASS: Special characters dropped") + + # SQL injection attempt + result = sanitize_tags_for_filter("'; DROP TABLE--") + assert result == [], f"Expected [], got {result}" + print(" PASS: SQL injection attempt blocked") + + # OData injection attempt + result = sanitize_tags_for_filter("x) or true or (t eq 'y") + assert result == [], f"Expected [], got {result}" + print(" PASS: OData injection attempt blocked") + + # Tags with spaces (list input) + result = sanitize_tags_for_filter(["tag1", "tag with spaces"]) + assert result == ["tag1"], f"Expected ['tag1'], got {result}" + print(" PASS: Tags with spaces dropped (list input)") + + # Tags with quotes + result = sanitize_tags_for_filter(["good", "it's", 'say"hello']) + assert result == ["good"], f"Expected ['good'], got {result}" + print(" PASS: Tags with quotes dropped") + + # Tags with parentheses + result = sanitize_tags_for_filter(["valid", "bad(tag)", "also)bad"]) + assert result == ["valid"], f"Expected ['valid'], got {result}" + print(" PASS: Tags with parentheses dropped") + + # --- Edge cases --- + # Empty string + result = sanitize_tags_for_filter("") + assert result == [], f"Expected [], got {result}" + print(" PASS: Empty string returns []") + + # Empty list + result = sanitize_tags_for_filter([]) + assert result == [], f"Expected [], got {result}" + print(" PASS: Empty list returns []") + + # None + result = sanitize_tags_for_filter(None) + assert result == [], f"Expected [], got {result}" + print(" PASS: None returns []") + + # Non-string type + result = sanitize_tags_for_filter(12345) + assert result == [], f"Expected [], got {result}" + print(" PASS: Non-string/list type returns []") + + # Length limit (51 chars) + long_tag = "a" * 51 + result = sanitize_tags_for_filter(long_tag) + assert result == [], f"Expected [], got {result}" + print(" PASS: 51-char tag dropped") + + # Length limit (exactly 50 chars - should pass) + tag_50 = "a" * 50 + result = sanitize_tags_for_filter(tag_50) + assert result == [tag_50], f"Expected ['{tag_50}'], got {result}" + print(" PASS: 50-char tag accepted") + + # Deduplication + result = sanitize_tags_for_filter("dup,dup,unique") + assert result == ["dup", "unique"], f"Expected ['dup', 'unique'], got {result}" + print(" PASS: Deduplication works") + + # Deduplication with case difference + result = sanitize_tags_for_filter("Tag,TAG,tag") + assert result == ["tag"], f"Expected ['tag'], got {result}" + print(" PASS: Case-insensitive deduplication works") + + # Only commas / empty segments + result = sanitize_tags_for_filter(",,,") + assert result == [], f"Expected [], got {result}" + print(" PASS: Only commas returns []") + + # Mixed valid and invalid in list + result = sanitize_tags_for_filter([None, 123, "valid", "", " ", "b@d"]) + assert result == ["valid"], f"Expected ['valid'], got {result}" + print(" PASS: Non-string items in list dropped") + + print("\nAll sanitize_tags_for_filter tests passed!") + return True + + except AssertionError as e: + print(f"\nTest failed: {e}") + import traceback + traceback.print_exc() + return False + except Exception as e: + print(f"\nTest failed with exception: {e}") + import traceback + traceback.print_exc() + return False + + +def test_build_tags_filter(): + """Test the build_tags_filter function with sanitized inputs.""" + print("\nTesting build_tags_filter()...") + + from functions_search import build_tags_filter + + try: + # Valid single tag + result = build_tags_filter(["valid"]) + assert result == "document_tags/any(t: t eq 'valid')", f"Unexpected result: {result}" + print(" PASS: Single valid tag") + + # Valid multiple tags + result = build_tags_filter(["tag1", "tag-2"]) + expected = "document_tags/any(t: t eq 'tag1') and document_tags/any(t: t eq 'tag-2')" + assert result == expected, f"Unexpected result: {result}" + print(" PASS: Multiple valid tags") + + # Mixed valid and invalid - only valid should appear + result = build_tags_filter(["good", "b@d", "fine"]) + expected = "document_tags/any(t: t eq 'good') and document_tags/any(t: t eq 'fine')" + assert result == expected, f"Unexpected result: {result}" + print(" PASS: Mixed valid/invalid tags - only valid in output") + + # All invalid tags + result = build_tags_filter(["b@d", "inv alid", "no!way"]) + assert result == "", f"Expected empty string, got: {result}" + print(" PASS: All invalid tags returns empty string") + + # Empty list + result = build_tags_filter([]) + assert result == "", f"Expected empty string, got: {result}" + print(" PASS: Empty list returns empty string") + + # None + result = build_tags_filter(None) + assert result == "", f"Expected empty string, got: {result}" + print(" PASS: None returns empty string") + + # Injection attempt via tag value + result = build_tags_filter(["x') or true or (t eq 'y"]) + assert result == "", f"Expected empty string, got: {result}" + print(" PASS: OData injection attempt returns empty string") + + print("\nAll build_tags_filter tests passed!") + return True + + except AssertionError as e: + print(f"\nTest failed: {e}") + import traceback + traceback.print_exc() + return False + except Exception as e: + print(f"\nTest failed with exception: {e}") + import traceback + traceback.print_exc() + return False + + +if __name__ == "__main__": + success1 = test_sanitize_tags_for_filter() + success2 = test_build_tags_filter() + + if success1 and success2: + print("\n=== ALL TESTS PASSED ===") + sys.exit(0) + else: + print("\n=== SOME TESTS FAILED ===") + sys.exit(1)