Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion application/single_app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@
# Flask-Executor settings: thread-based pool for background work — TODO confirm
# against the Flask-Executor extension configuration.
EXECUTOR_TYPE = 'thread'
EXECUTOR_MAX_WORKERS = 30
# Server-side session storage backend (presumably Flask-Session) — verify.
SESSION_TYPE = 'filesystem'
# Diff artifact removed: the superseded VERSION = "0.238.024" line was fused
# into this fragment alongside the new value; only the current version stays.
VERSION = "0.238.025"

# Fallback value is for local development only; production deployments must
# provide a real SECRET_KEY via the environment.
SECRET_KEY = os.getenv('SECRET_KEY', 'dev-secret-key-change-in-production')

Expand Down
38 changes: 38 additions & 0 deletions application/single_app/functions_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -6420,6 +6420,44 @@ def validate_tags(tags):
return True, None, normalized


def sanitize_tags_for_filter(raw_tags):
    """
    Normalize and validate tag values before they are used in a
    filter/query operation.

    Invalid entries are dropped silently: a tag that fails validation
    can never match any stored tag, so rejecting it loudly adds nothing.

    Args:
        raw_tags: Either a comma-separated string or a list of strings
    Returns:
        List of valid, normalized tag strings matching ^[a-z0-9_-]+$
    """
    import re

    # Accept both query-param style (comma-separated string) and
    # JSON-body style (list of strings); anything else yields no tags.
    if isinstance(raw_tags, str):
        candidates = [piece.strip() for piece in raw_tags.split(',') if piece.strip()]
    elif isinstance(raw_tags, list):
        candidates = [item for item in raw_tags if isinstance(item, str)]
    else:
        return []

    allowed = re.compile(r'^[a-z0-9_-]+$')
    result = []
    seen = set()

    for candidate in candidates:
        tag = normalize_tag(candidate)
        # Keep only non-empty, whitelisted, length-bounded, first-seen tags.
        if tag and allowed.match(tag) and len(tag) <= 50 and tag not in seen:
            seen.add(tag)
            result.append(tag)

    return result


def get_workspace_tags(user_id, group_id=None, public_workspace_id=None):
"""
Get all unique tags used in a workspace with document counts.
Expand Down
21 changes: 12 additions & 9 deletions application/single_app/functions_search.py
Original file line number Diff line number Diff line change
def build_tags_filter(tags_filter):
    """
    Build OData filter clause for tags.

    tags_filter: List of tag names (already normalized)
    Returns: String like "document_tags/any(t: t eq 'tag1') and ..." or empty string

    Tags are validated to contain only [a-z0-9_-] characters before
    being interpolated into the OData expression, so no escaping is
    required and OData injection is prevented.
    """
    if not tags_filter or not isinstance(tags_filter, list) or len(tags_filter) == 0:
        return ""

    # Whitelist-validate every tag (^[a-z0-9_-]+$, <=50 chars) before any
    # string interpolation; invalid tags can never match stored tags anyway.
    # (Replaces the earlier quote-escaping approach, whose dead remnants were
    # fused into this block by the stripped diff.)
    from functions_documents import sanitize_tags_for_filter
    safe_tags = sanitize_tags_for_filter(tags_filter)

    if not safe_tags:
        return ""

    # AND logic (all tags must be present) requires one any() clause per tag:
    # document_tags/any(t: t eq 'tag1') and document_tags/any(t: t eq 'tag2')
    tag_conditions = [f"document_tags/any(t: t eq '{tag}')" for tag in safe_tags]
    return " and ".join(tag_conditions)

def hybrid_search(query, user_id, document_id=None, document_ids=None, top_n=12, doc_scope="all", active_group_id=None, active_group_ids=None, active_public_workspace_id=None, enable_file_sharing=True, tags_filter=None):
Expand Down
6 changes: 3 additions & 3 deletions application/single_app/route_backend_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,9 +362,9 @@ def api_get_user_documents():

# Tags Filter (comma-separated, AND logic - document must have all specified tags)
if tags_filter:
from functions_documents import normalize_tag
tags_list = [normalize_tag(t.strip()) for t in tags_filter.split(',') if t.strip()]
from functions_documents import sanitize_tags_for_filter
tags_list = sanitize_tags_for_filter(tags_filter)

if tags_list:
# Each tag must exist in the document's tags array
for idx, tag in enumerate(tags_list):
Expand Down
4 changes: 2 additions & 2 deletions application/single_app/route_backend_group_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,8 +266,8 @@ def api_get_group_documents():
param_count += 1

if tags_filter:
from functions_documents import normalize_tag
tags_list = [normalize_tag(t.strip()) for t in tags_filter.split(',') if t.strip()]
from functions_documents import sanitize_tags_for_filter
tags_list = sanitize_tags_for_filter(tags_filter)
if tags_list:
for idx, tag in enumerate(tags_list):
param_name = f"@tag_{param_count}_{idx}"
Expand Down
4 changes: 2 additions & 2 deletions application/single_app/route_backend_public_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,8 +196,8 @@ def api_list_public_documents():
param_count += 1

if tags_filter:
from functions_documents import normalize_tag
tags_list = [normalize_tag(t.strip()) for t in tags_filter.split(',') if t.strip()]
from functions_documents import sanitize_tags_for_filter
tags_list = sanitize_tags_for_filter(tags_filter)
if tags_list:
for idx, tag in enumerate(tags_list):
param_name = f"@tag_{param_count}_{idx}"
Expand Down
Binary file removed application/single_app/tmp_vc3uki_.pdf
Binary file not shown.
Binary file removed application/single_app/tmpcoskgt2p.pdf
Binary file not shown.
57 changes: 57 additions & 0 deletions docs/explanation/fixes/TAG_FILTER_INJECTION_FIX.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
<!-- BEGIN TAG_FILTER_INJECTION_FIX.md BLOCK -->

# Tag Filter Injection Fix

## Issue Description

Tag filter inputs from user query parameters (`?tags=...`) and JSON request bodies were passed through `normalize_tag()` which only trims whitespace and lowercases, without validating the character set. While Cosmos DB queries used parameterized values (preventing direct SQL injection), the `build_tags_filter()` function in `functions_search.py` constructed OData filter strings via string interpolation, creating a potential OData injection vector in Azure AI Search.

## Root Cause

The `validate_tags()` function enforces a strict `^[a-z0-9_-]+$` character whitelist when **saving** tags, but this validation was not applied when **filtering** by tags. The filter path only used `normalize_tag()` (strip + lowercase), allowing arbitrary characters to reach query construction code.

## Version

- **Fixed in**: v0.238.025
- **Affected versions**: Prior versions with tag filtering

## Technical Details

### Files Modified

| File | Change |
|------|--------|
| `application/single_app/functions_documents.py` | Added `sanitize_tags_for_filter()` function |
| `application/single_app/route_backend_documents.py` | Replaced `normalize_tag` with `sanitize_tags_for_filter` in tag filter |
| `application/single_app/route_backend_group_documents.py` | Replaced `normalize_tag` with `sanitize_tags_for_filter` in tag filter |
| `application/single_app/route_backend_public_documents.py` | Replaced `normalize_tag` with `sanitize_tags_for_filter` in tag filter |
| `application/single_app/functions_search.py` | Hardened `build_tags_filter()` to validate tags before OData interpolation |
| `application/single_app/config.py` | Version bump to 0.238.025 |

### Code Changes

**New function `sanitize_tags_for_filter()`**: Accepts either a comma-separated string (from query params) or a list of strings (from JSON bodies). Normalizes each tag, validates against `^[a-z0-9_-]+$`, enforces the 50-character limit, deduplicates, and silently drops invalid entries.

**Route file updates**: The inline `normalize_tag()` + split pattern was replaced with a single call to `sanitize_tags_for_filter()`, which handles splitting, normalizing, and validating internally.

**`build_tags_filter()` hardening**: Replaced the single-quote escaping approach with `sanitize_tags_for_filter()` validation. Since validated tags can only contain `[a-z0-9_-]`, no escaping is necessary and OData injection is impossible.

### Defense-in-Depth Layers

1. **Character whitelist**: `^[a-z0-9_-]+$` prevents any injection-significant characters
2. **Parameterized Cosmos DB queries**: Tag values passed as parameters, not interpolated
3. **Tag normalization**: Lowercase + trim before validation
4. **Length limit**: 50-character maximum per tag

## Testing

- **Functional test**: `functional_tests/test_tag_filter_sanitization.py`
- Covers: valid tags, special character rejection, SQL injection attempts, OData injection attempts, edge cases (empty/None/numeric input), length limits, deduplication

## Impact

- No functional behavior change for valid tag filters (tags stored in the system already pass `^[a-z0-9_-]+$` validation)
- Invalid characters in tag filters are silently dropped rather than passed through to queries
- OData filter injection via `build_tags_filter()` is now prevented by input validation

<!-- END TAG_FILTER_INJECTION_FIX.md BLOCK -->
13 changes: 13 additions & 0 deletions docs/explanation/release_notes.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,19 @@

# Feature Release

### **(v0.238.025)**

#### Bug Fixes

* **Tag Filter Input Sanitization (Injection Prevention)**
* Added `sanitize_tags_for_filter()` function to validate tag filter inputs against the same `^[a-z0-9_-]+$` character whitelist enforced when saving tags.
* Previously, tag filter values from query parameters only passed through `normalize_tag()` (strip + lowercase) without character validation, allowing arbitrary characters to reach OData filter construction in `build_tags_filter()`.
* Hardened `build_tags_filter()` in `functions_search.py` to validate tags before interpolating into OData expressions, eliminating the OData injection vector.
* Updated tag filter parsing in personal, group, and public document routes to use `sanitize_tags_for_filter()` for defense-in-depth.
* Invalid tag filter values are silently dropped (they cannot match any stored tag).
* **Files Modified**: `functions_documents.py`, `functions_search.py`, `route_backend_documents.py`, `route_backend_group_documents.py`, `route_backend_public_documents.py`.
* (Ref: `TAG_FILTER_INJECTION_FIX.md`, `sanitize_tags_for_filter`)

### **(v0.238.024)**

#### New Features
Expand Down
Loading