From 5ad24e4f13be440d307592faf81d5597c273aec0 Mon Sep 17 00:00:00 2001 From: phact Date: Tue, 2 Jun 2026 13:59:41 -0400 Subject: [PATCH 1/2] shards and replicas --- .env.example | 5 ++ docker-compose.yml | 4 + docs/docs/reference/configuration.mdx | 2 + flows/components/opensearch_multimodal.py | 35 +++++--- flows/ingestion_flow.json | 2 +- flows/openrag_agent.json | 2 +- flows/openrag_nudges.json | 2 +- flows/openrag_url_mcp.json | 2 +- .../templates/backend/backend-dotenv.yaml | 2 + .../templates/langflow/langflow-dotenv.yaml | 2 + kubernetes/helm/openrag/values.yaml | 2 + kubernetes/operator/README.md | 2 + .../operator/api/v1alpha1/openrag_types.go | 12 +++ .../config/crd/bases/openr.ag_openrags.yaml | 14 ++++ .../samples/kind-cluster-openrag-cr.yaml | 2 + .../samples/openrag_v1alpha1_openrag.yaml | 2 + .../operator/internal/controller/env.go | 12 ++- .../internal/controller/openrag_controller.go | 12 +++ .../controller/openrag_controller_test.go | 22 +++++ src/config/settings.py | 21 +++-- src/tui/config_fields.py | 10 +++ src/tui/managers/env_manager.py | 12 +++ src/utils/embeddings.py | 8 +- src/utils/opensearch_init.py | 84 +++++++++---------- tests/unit/test_embedding_fields.py | 14 ++++ 25 files changed, 221 insertions(+), 66 deletions(-) diff --git a/.env.example b/.env.example index a44ce3c70..9c50312e3 100644 --- a/.env.example +++ b/.env.example @@ -119,6 +119,11 @@ OPENSEARCH_USERNAME=admin # Change this if you want to use a different index name or avoid conflicts OPENSEARCH_INDEX_NAME=documents +# OpenSearch index layout for newly-created OpenRAG indices +# Shard count cannot be changed on an existing index without reindexing. +OPENRAG_OPENSEARCH_NUMBER_OF_SHARDS=1 +OPENRAG_OPENSEARCH_NUMBER_OF_REPLICAS=0 + # IBM AMS Authentication (IBM Watsonx Data embedded mode) # Set IBM_AUTH_ENABLED=true to authenticate via the ibm-openrag-session cookie # instead of Google OAuth. The raw IBM JWT is also passed directly to OpenSearch. 
diff --git a/docker-compose.yml b/docker-compose.yml index 16f8d48d7..1b3b26c8e 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -95,6 +95,8 @@ services: - IBM_COS_HMAC_SECRET_ACCESS_KEY=${IBM_COS_HMAC_SECRET_ACCESS_KEY} - IBM_COS_AUTH_ENDPOINT=${IBM_COS_AUTH_ENDPOINT} - OPENSEARCH_INDEX_NAME=${OPENSEARCH_INDEX_NAME:-documents} + - OPENRAG_OPENSEARCH_NUMBER_OF_SHARDS=${OPENRAG_OPENSEARCH_NUMBER_OF_SHARDS:-1} + - OPENRAG_OPENSEARCH_NUMBER_OF_REPLICAS=${OPENRAG_OPENSEARCH_NUMBER_OF_REPLICAS:-0} - LANGFLOW_KEY=${LANGFLOW_KEY} - SEGMENT_WRITE_KEY=${SEGMENT_WRITE_KEY:-} - ENVIRONMENT=${ENVIRONMENT:-production} @@ -195,6 +197,8 @@ services: - OPENSEARCH_PORT=${LANGFLOW_OPENSEARCH_PORT:-${OPENSEARCH_PORT:-9200}} - OPENSEARCH_URL=https://${LANGFLOW_OPENSEARCH_HOST:-${OPENSEARCH_HOST:-opensearch}}:${LANGFLOW_OPENSEARCH_PORT:-${OPENSEARCH_PORT:-9200}} - OPENSEARCH_INDEX_NAME=${OPENSEARCH_INDEX_NAME:-documents} + - OPENRAG_OPENSEARCH_NUMBER_OF_SHARDS=${OPENRAG_OPENSEARCH_NUMBER_OF_SHARDS:-1} + - OPENRAG_OPENSEARCH_NUMBER_OF_REPLICAS=${OPENRAG_OPENSEARCH_NUMBER_OF_REPLICAS:-0} - DOCLING_SERVE_URL=${DOCLING_SERVE_URL:-http://host.docker.internal:5001} - DOCLING_TASK_ID=None - FILENAME=None diff --git a/docs/docs/reference/configuration.mdx b/docs/docs/reference/configuration.mdx index d7b6e3c8f..43fca694d 100644 --- a/docs/docs/reference/configuration.mdx +++ b/docs/docs/reference/configuration.mdx @@ -152,6 +152,8 @@ Configure OpenSearch database authentication. | `OPENSEARCH_HOST` | `localhost` | OpenSearch service host. | | `OPENSEARCH_PORT` | `9200` | OpenSearch service port. | | `OPENSEARCH_USERNAME` | `admin` | OpenSearch administrator username. | +| `OPENRAG_OPENSEARCH_NUMBER_OF_SHARDS` | `1` | Primary shard count for newly-created OpenRAG indices. Existing indices must be reindexed to change shard count. | +| `OPENRAG_OPENSEARCH_NUMBER_OF_REPLICAS` | `0` | Replica shard count for OpenRAG indices. 
OpenRAG reconciles existing managed indices to this value at startup. | | `LANGFLOW_OPENSEARCH_HOST` | Not set | By default, OpenRAG passes the `OPENSEARCH_HOST` value to Langflow. Use the `LANGFLOW_OPENSEARCH_*` variables to set a different OpenSearch endpoint for Langflow specifically. OpenRAG itself still uses the `OPENSEARCH_HOST` value. | | `LANGFLOW_OPENSEARCH_PORT` | Not set | By default, OpenRAG passes the `OPENSEARCH_PORT` value to Langflow. Use the `LANGFLOW_OPENSEARCH_*` variables to set a different OpenSearch endpoint for Langflow specifically. OpenRAG itself still uses the `OPENSEARCH_PORT` value. | diff --git a/flows/components/opensearch_multimodal.py b/flows/components/opensearch_multimodal.py index ed609526c..c5b8be318 100644 --- a/flows/components/opensearch_multimodal.py +++ b/flows/components/opensearch_multimodal.py @@ -2,6 +2,7 @@ import copy import json +import os import uuid from concurrent.futures import ThreadPoolExecutor, as_completed from typing import Any @@ -29,6 +30,18 @@ REQUEST_TIMEOUT = 60 MAX_RETRIES = 5 + +def _get_min_env_int(key: str, default: int, minimum: int) -> int: + try: + value = int(os.getenv(key, default)) + except (TypeError, ValueError): + value = default + return max(value, minimum) + + +OPENSEARCH_NUMBER_OF_SHARDS = _get_min_env_int("OPENRAG_OPENSEARCH_NUMBER_OF_SHARDS", 1, 1) +OPENSEARCH_NUMBER_OF_REPLICAS = _get_min_env_int("OPENRAG_OPENSEARCH_NUMBER_OF_REPLICAS", 0, 0) + # watsonx.ai surfaces rate-limit state via these (mostly non-standard) response # headers. The IBM SDK acts on the x-requests-limit-* family directly; we log # them on a failed embedding call to aid plan/region tuning. 
@@ -53,7 +66,9 @@ def _log_watsonx_rate_limit_headers(error: Exception) -> None: if not headers: return status = getattr(response, "status_code", "unknown") - observed = {h: headers.get(h) for h in _WATSONX_RATE_LIMIT_HEADERS if headers.get(h) is not None} + observed = { + h: headers.get(h) for h in _WATSONX_RATE_LIMIT_HEADERS if headers.get(h) is not None + } if str(status) == "429" or observed: logger.warning(f"watsonx rate-limit response (status={status}): {observed}") except Exception as log_error: # never let diagnostics mask the real error @@ -371,7 +386,7 @@ class OpenSearchVectorStoreComponentMultimodalMultiEmbedding(LCVectorStoreCompon "Valid JSON Web Token for authentication. " "Will be sent in the Authorization header (with optional 'Bearer ' prefix)." ), - required=False + required=False, ), StrInput( name="jwt_header", @@ -536,10 +551,8 @@ def raw_search(self, query: str | dict | None = None) -> Data: # Apply score_threshold / scoreThreshold as min_score if not already set if "min_score" not in query_body: - score_threshold = self._resolve_score_threshold(filter_obj) if score_threshold is not None: - query_body["min_score"] = score_threshold client = self.build_client() @@ -664,7 +677,11 @@ def _default_text_mapping( Dictionary containing OpenSearch index mapping configuration """ return { - "settings": {"index": {"knn": True, "knn.algo_param.ef_search": ef_search}}, + "settings": { + "index": {"knn": True, "knn.algo_param.ef_search": ef_search}, + "number_of_shards": OPENSEARCH_NUMBER_OF_SHARDS, + "number_of_replicas": OPENSEARCH_NUMBER_OF_REPLICAS, + }, "mappings": { "properties": { vector_field: { @@ -1446,7 +1463,6 @@ def _add_documents_to_vector_store(self, client: OpenSearch) -> None: logger.debug(f"Is IBM/watsonx embedding: {is_ibm}") if is_ibm: - # Hand the full batch to the SDK and let it batch/throttle/retry. # Retry attempts and base backoff are tunable via the SDK's own # WATSONX_MAX_RETRIES / WATSONX_DELAY_TIME environment variables. 
@@ -1722,7 +1738,6 @@ def _coerce_filter_clauses(self, filter_obj: dict | None) -> list[dict]: context_clauses.append({"terms": {field: values}}) return context_clauses - def _parse_filter_expression(self) -> dict | None: """Parse and validate optional filter_expression JSON. @@ -1777,8 +1792,9 @@ def _resolve_score_threshold(self, filter_obj: dict | None) -> float | None: return None return float(score_threshold) - def _detect_available_models(self, client: OpenSearch, filter_clauses: list[dict] | None = None) -> list[str]: - + def _detect_available_models( + self, client: OpenSearch, filter_clauses: list[dict] | None = None + ) -> list[str]: """Detect which embedding models have documents in the index. Uses aggregation to find all unique embedding_model values, optionally @@ -2401,7 +2417,6 @@ def search(self, query: str | None = None) -> list[dict[str, Any]]: ] def search_documents(self) -> Table: - """Search documents and return results as a Table. This is the main interface method that performs the multi-model search using the diff --git a/flows/ingestion_flow.json b/flows/ingestion_flow.json index 5fc809c84..13ea9de34 100644 --- a/flows/ingestion_flow.json +++ b/flows/ingestion_flow.json @@ -3280,7 +3280,7 @@ "show": true, "title_case": false, "type": "code", - "value": "from __future__ import annotations\n\nimport copy\nimport json\nimport uuid\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\nfrom typing import Any\n\nimport httpx\nfrom lfx.base.vectorstores.model import LCVectorStoreComponent, check_cached_vector_store\nfrom lfx.base.vectorstores.vector_store_connection_decorator import vector_store_connection\nfrom lfx.io import (\n BoolInput,\n DropdownInput,\n HandleInput,\n IntInput,\n MultilineInput,\n Output,\n SecretStrInput,\n StrInput,\n TableInput,\n)\nfrom lfx.log import logger\nfrom lfx.schema.data import Data\nfrom lfx.schema.dataframe import Table\nfrom opensearchpy import OpenSearch, helpers\nfrom opensearchpy.exceptions 
import OpenSearchException, RequestError\n\nREQUEST_TIMEOUT = 60\nMAX_RETRIES = 5\n\n# watsonx.ai surfaces rate-limit state via these (mostly non-standard) response\n# headers. The IBM SDK acts on the x-requests-limit-* family directly; we log\n# them on a failed embedding call to aid plan/region tuning.\n_WATSONX_RATE_LIMIT_HEADERS = (\n \"x-requests-limit-rate\",\n \"x-requests-limit-remaining\",\n \"x-requests-limit-reset\",\n \"Retry-After\",\n)\n\n\ndef _log_watsonx_rate_limit_headers(error: Exception) -> None:\n \"\"\"Best-effort diagnostic: log watsonx rate-limit headers from a failed call.\n\n The watsonx SDK raises ``ApiRequestFailure``, which carries the originating\n httpx/requests ``Response`` as ``.response``. On a 429 exhaustion we surface\n the documented rate-limit headers so operators can tune throughput.\n \"\"\"\n try:\n response = getattr(error, \"response\", None)\n headers = getattr(response, \"headers\", None)\n if not headers:\n return\n status = getattr(response, \"status_code\", \"unknown\")\n observed = {h: headers.get(h) for h in _WATSONX_RATE_LIMIT_HEADERS if headers.get(h) is not None}\n if str(status) == \"429\" or observed:\n logger.warning(f\"watsonx rate-limit response (status={status}): {observed}\")\n except Exception as log_error: # never let diagnostics mask the real error\n logger.debug(f\"Could not extract watsonx rate-limit headers: {log_error}\")\n\n\ndef normalize_model_name(model_name: str) -> str:\n \"\"\"Normalize embedding model name for use as field suffix.\n\n Converts model names to valid OpenSearch field names by replacing\n special characters and ensuring alphanumeric format.\n\n Args:\n model_name: Original embedding model name (e.g., \"text-embedding-3-small\")\n\n Returns:\n Normalized field suffix (e.g., \"text_embedding_3_small\")\n \"\"\"\n normalized = model_name.lower()\n # Replace common separators with underscores\n normalized = normalized.replace(\"-\", \"_\").replace(\":\", \"_\").replace(\"/\", 
\"_\").replace(\".\", \"_\")\n # Remove any non-alphanumeric characters except underscores\n normalized = \"\".join(c if c.isalnum() or c == \"_\" else \"_\" for c in normalized)\n # Remove duplicate underscores\n while \"__\" in normalized:\n normalized = normalized.replace(\"__\", \"_\")\n return normalized.strip(\"_\")\n\n\ndef get_embedding_field_name(model_name: str) -> str:\n \"\"\"Get the dynamic embedding field name for a model.\n\n Args:\n model_name: Embedding model name\n\n Returns:\n Field name in format: chunk_embedding_{normalized_model_name}\n \"\"\"\n logger.info(f\"chunk_embedding_{normalize_model_name(model_name)}\")\n return f\"chunk_embedding_{normalize_model_name(model_name)}\"\n\n\n@vector_store_connection\nclass OpenSearchVectorStoreComponentMultimodalMultiEmbedding(LCVectorStoreComponent):\n \"\"\"OpenSearch Vector Store Component with Multi-Model Hybrid Search Capabilities.\n\n This component provides vector storage and retrieval using OpenSearch, combining semantic\n similarity search (KNN) with keyword-based search for optimal results. 
It supports:\n - Multiple embedding models per index with dynamic field names\n - Automatic detection and querying of all available embedding models\n - Parallel embedding generation for multi-model search\n - Document ingestion with model tracking\n - Advanced filtering and aggregations\n - Flexible authentication options\n\n Features:\n - Multi-model vector storage with dynamic fields (chunk_embedding_{model_name})\n - Hybrid search combining multiple KNN queries (dis_max) + keyword matching\n - Auto-detection of available models in the index\n - Parallel query embedding generation for all detected models\n - Vector storage with configurable engines (jvector, nmslib, faiss, lucene)\n - Flexible authentication (Basic auth, JWT tokens)\n\n Model Name Resolution:\n - Priority: deployment > model > model_name attributes\n - This ensures correct matching between embedding objects and index fields\n - When multiple embeddings are provided, specify embedding_model_name to select which one to use\n - During search, each detected model in the index is matched to its corresponding embedding object\n \"\"\"\n\n display_name: str = \"OpenSearch (Multi-Model Multi-Embedding)\"\n icon: str = \"OpenSearch\"\n description: str = (\n \"Store and search documents using OpenSearch with multi-model hybrid semantic and keyword search. \"\n \"To search use the tools search_documents and raw_search. 
\"\n \"Search documents takes a query for vector search, for example\\n\"\n ' {search_query: \"components in openrag\"}'\n )\n\n # Keys we consider baseline\n default_keys: list[str] = [\n \"opensearch_url\",\n \"index_name\",\n *[i.name for i in LCVectorStoreComponent.inputs], # search_query, add_documents, etc.\n \"embedding\",\n \"embedding_model_name\",\n \"vector_field\",\n \"number_of_results\",\n \"auth_mode\",\n \"username\",\n \"password\",\n \"jwt_token\",\n \"jwt_header\",\n \"bearer_prefix\",\n \"use_ssl\",\n \"verify_certs\",\n \"filter_expression\",\n \"engine\",\n \"space_type\",\n \"ef_construction\",\n \"m\",\n \"num_candidates\",\n \"docs_metadata\",\n \"request_timeout\",\n \"max_retries\",\n \"openrag_ingest_url\",\n \"openrag_ingest_token\",\n \"openrag_ingest_run_id\",\n \"openrag_ingest_batch_size\",\n ]\n _openrag_ingest_global_placeholders = {\n \"openrag_ingest_url\": \"OPENRAG_INGEST_URL\",\n \"openrag_ingest_token\": \"OPENRAG_INGEST_TOKEN\",\n \"openrag_ingest_run_id\": \"OPENRAG_INGEST_RUN_ID\",\n }\n\n inputs = [\n TableInput(\n name=\"docs_metadata\",\n display_name=\"Document Metadata\",\n info=(\n \"Additional metadata key-value pairs to be added to all ingested documents. 
\"\n \"Useful for tagging documents with source information, categories, or other custom attributes.\"\n ),\n table_schema=[\n {\n \"name\": \"key\",\n \"display_name\": \"Key\",\n \"type\": \"str\",\n \"description\": \"Key name\",\n },\n {\n \"name\": \"value\",\n \"display_name\": \"Value\",\n \"type\": \"str\",\n \"description\": \"Value of the metadata\",\n },\n ],\n value=[],\n input_types=[\"Data\", \"JSON\"],\n ),\n StrInput(\n name=\"openrag_ingest_token\",\n display_name=\"OpenRAG Ingest Token\",\n value=\"OPENRAG_INGEST_TOKEN\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n info=\"Short-lived token used only for OpenRAG ingest callbacks.\",\n ),\n StrInput(\n name=\"openrag_ingest_run_id\",\n display_name=\"OpenRAG Ingest Run ID\",\n value=\"OPENRAG_INGEST_RUN_ID\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n ),\n StrInput(\n name=\"opensearch_url\",\n display_name=\"OpenSearch URL\",\n value=\"http://localhost:9200\",\n info=(\n \"The connection URL for your OpenSearch cluster \"\n \"(e.g., http://localhost:9200 for local development or your cloud endpoint).\"\n ),\n ),\n StrInput(\n name=\"index_name\",\n display_name=\"Index Name\",\n value=\"langflow\",\n info=(\n \"The OpenSearch index name where documents will be stored and searched. \"\n \"Will be created automatically if it doesn't exist.\"\n ),\n ),\n DropdownInput(\n name=\"engine\",\n display_name=\"Vector Engine\",\n options=[\"nmslib\", \"faiss\", \"lucene\", \"jvector\"],\n value=\"jvector\",\n info=(\n \"Vector search engine for similarity calculations. 'nmslib' works with standard \"\n \"OpenSearch. 'jvector' requires OpenSearch 2.9+. 'lucene' requires index.knn: true. 
\"\n \"Amazon OpenSearch Serverless only supports 'nmslib' or 'faiss'.\"\n ),\n advanced=True,\n ),\n DropdownInput(\n name=\"space_type\",\n display_name=\"Distance Metric\",\n options=[\"l2\", \"l1\", \"cosinesimil\", \"linf\", \"innerproduct\"],\n value=\"l2\",\n info=(\n \"Distance metric for calculating vector similarity. 'l2' (Euclidean) is most common, \"\n \"'cosinesimil' for cosine similarity, 'innerproduct' for dot product.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"ef_construction\",\n display_name=\"EF Construction\",\n value=512,\n info=(\n \"Size of the dynamic candidate list during index construction. \"\n \"Higher values improve recall but increase indexing time and memory usage.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"m\",\n display_name=\"M Parameter\",\n value=16,\n info=(\n \"Number of bidirectional connections for each vector in the HNSW graph. \"\n \"Higher values improve search quality but increase memory usage and indexing time.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"num_candidates\",\n display_name=\"Candidate Pool Size\",\n value=1000,\n info=(\n \"Number of approximate neighbors to consider for each KNN query. \"\n \"Some OpenSearch deployments do not support this parameter; set to 0 to disable.\"\n ),\n advanced=True,\n ),\n *LCVectorStoreComponent.inputs, # includes search_query, add_documents, etc.\n HandleInput(\n name=\"embedding\", display_name=\"Embedding\", input_types=[\"Embeddings\"], is_list=True\n ),\n StrInput(\n name=\"embedding_model_name\",\n display_name=\"Embedding Model Name\",\n value=\"\",\n info=(\n \"Name of the embedding model to use for ingestion. This selects which embedding from the list \"\n \"will be used to embed documents. Matches on deployment, model, model_id, or model_name. \"\n \"For duplicate deployments, use combined format: 'deployment:model' \"\n \"(e.g., 'text-embedding-ada-002:text-embedding-3-large'). \"\n \"Leave empty to use the first embedding. 
Error message will show all available identifiers.\"\n ),\n advanced=False,\n ),\n StrInput(\n name=\"vector_field\",\n display_name=\"Legacy Vector Field Name\",\n value=\"chunk_embedding\",\n advanced=True,\n info=(\n \"Legacy field name for backward compatibility. New documents use dynamic fields \"\n \"(chunk_embedding_{model_name}) based on the embedding_model_name.\"\n ),\n ),\n IntInput(\n name=\"number_of_results\",\n display_name=\"Default Result Limit\",\n value=10,\n advanced=True,\n info=(\n \"Default maximum number of search results to return when no limit is \"\n \"specified in the filter expression.\"\n ),\n ),\n MultilineInput(\n name=\"filter_expression\",\n display_name=\"Search Filters (JSON)\",\n value=\"\",\n info=(\n \"Optional JSON configuration for search filtering, result limits, and score thresholds.\\n\\n\"\n \"Format 1 - Explicit filters:\\n\"\n '{\"filter\": [{\"term\": {\"filename\":\"doc.pdf\"}}, '\n '{\"terms\":{\"owner\":[\"user1\",\"user2\"]}}], \"limit\": 10, \"score_threshold\": 1.6}\\n\\n'\n \"Format 2 - Context-style mapping:\\n\"\n '{\"data_sources\":[\"file.pdf\"], \"document_types\":[\"application/pdf\"], \"owners\":[\"user123\"]}\\n\\n'\n \"Use __IMPOSSIBLE_VALUE__ as placeholder to ignore specific filters.\"\n ),\n ),\n # ----- Auth controls (dynamic) -----\n DropdownInput(\n name=\"auth_mode\",\n display_name=\"Authentication Mode\",\n value=\"openrag\",\n options=[\"basic\", \"jwt\", \"openrag\"],\n info=(\n \"Authentication method: 'basic' for username/password authentication, \"\n \"'jwt' for JSON Web Token (Bearer) authentication, or 'openrag' to \"\n \"delegate writes to the OpenRAG backend ingest callback (no direct \"\n \"OpenSearch credentials required — only OPENRAG_* fields).\"\n ),\n real_time_refresh=True,\n advanced=False,\n ),\n StrInput(\n name=\"username\",\n display_name=\"Username\",\n value=\"admin\",\n show=True,\n ),\n SecretStrInput(\n name=\"password\",\n display_name=\"OpenSearch Password\",\n 
value=\"admin\",\n show=True,\n ),\n SecretStrInput(\n name=\"jwt_token\",\n display_name=\"JWT Token\",\n value=\"JWT\",\n load_from_db=False,\n show=False,\n info=(\n \"Valid JSON Web Token for authentication. \"\n \"Will be sent in the Authorization header (with optional 'Bearer ' prefix).\"\n ),\n required=False\n ),\n StrInput(\n name=\"jwt_header\",\n display_name=\"JWT Header Name\",\n value=\"Authorization\",\n show=False,\n advanced=True,\n ),\n BoolInput(\n name=\"bearer_prefix\",\n display_name=\"Prefix 'Bearer '\",\n value=False,\n show=False,\n advanced=True,\n ),\n # ----- TLS -----\n BoolInput(\n name=\"use_ssl\",\n display_name=\"Use SSL/TLS\",\n value=True,\n advanced=True,\n info=\"Enable SSL/TLS encryption for secure connections to OpenSearch.\",\n ),\n BoolInput(\n name=\"verify_certs\",\n display_name=\"Verify SSL Certificates\",\n value=False,\n advanced=True,\n info=(\n \"Verify SSL certificates when connecting. \"\n \"Disable for self-signed certificates in development environments.\"\n ),\n ),\n # ----- Timeout / Retry -----\n StrInput(\n name=\"request_timeout\",\n display_name=\"Request Timeout (seconds)\",\n value=\"60\",\n advanced=True,\n info=(\n \"Time in seconds to wait for a response from OpenSearch. 
\"\n \"Increase for large bulk ingestion or complex hybrid queries.\"\n ),\n ),\n StrInput(\n name=\"max_retries\",\n display_name=\"Max Retries\",\n value=\"3\",\n advanced=True,\n info=\"Number of retries for failed connections before raising an error.\",\n ),\n StrInput(\n name=\"openrag_ingest_url\",\n display_name=\"OpenRAG Ingest URL\",\n value=\"OPENRAG_INGEST_URL\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n info=\"Internal OpenRAG callback URL for backend-owned document indexing.\",\n ),\n StrInput(\n name=\"openrag_ingest_token\",\n display_name=\"OpenRAG Ingest Token\",\n value=\"OPENRAG_INGEST_TOKEN\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n info=\"Short-lived token used only for OpenRAG ingest callbacks.\",\n ),\n StrInput(\n name=\"openrag_ingest_run_id\",\n display_name=\"OpenRAG Ingest Run ID\",\n value=\"OPENRAG_INGEST_RUN_ID\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n ),\n IntInput(\n name=\"openrag_ingest_batch_size\",\n display_name=\"OpenRAG Ingest Batch Size\",\n value=100,\n advanced=True,\n ),\n ]\n outputs = [\n Output(\n display_name=\"Search Results\",\n name=\"search_results\",\n method=\"search_documents\",\n ),\n Output(display_name=\"Raw Search\", name=\"raw_search\", method=\"raw_search\"),\n ]\n\n def raw_search(self, query: str | dict | None = None) -> Data:\n \"\"\"Execute a raw OpenSearch query against the target index.\n\n Args:\n query (dict[str, Any]): The OpenSearch query DSL dictionary.\n\n Returns:\n Data: Search results as a Data object.\n\n Raises:\n ValueError: If 'query' is not a valid OpenSearch query (must be a non-empty dict).\n \"\"\"\n raw_query = query if query is not None else self.search_query\n\n if raw_query is None or (isinstance(raw_query, str) and not raw_query.strip()):\n self.log(\"No query provided for raw search - returning empty results\")\n return Data(data={})\n\n if isinstance(raw_query, 
dict):\n query_body = copy.deepcopy(raw_query)\n elif isinstance(raw_query, str):\n s = raw_query.strip()\n\n # First, optimistically try to parse as JSON DSL\n try:\n query_body = json.loads(s)\n except json.JSONDecodeError:\n # Fallback: treat as a basic text query over common fields\n query_body = {\n \"query\": {\n \"multi_match\": {\n \"query\": s,\n \"fields\": [\"text^2\", \"filename^1.5\"],\n \"type\": \"best_fields\",\n \"fuzziness\": \"AUTO\",\n }\n }\n }\n else:\n msg = f\"Unsupported raw_search query type: {type(raw_query)!r}\"\n raise TypeError(msg)\n\n filter_obj = self._parse_filter_expression()\n filter_clauses = self._coerce_filter_clauses(filter_obj)\n\n if filter_clauses:\n if \"query\" in query_body:\n original_query = query_body[\"query\"]\n query_body[\"query\"] = {\n \"bool\": {\n \"must\": [original_query],\n \"filter\": filter_clauses,\n }\n }\n else:\n query_body[\"query\"] = {\n \"bool\": {\n \"must\": [{\"match_all\": {}}],\n \"filter\": filter_clauses,\n }\n }\n\n if filter_obj:\n # Apply limit if not already set in the raw query\n if \"size\" not in query_body:\n limit = self._resolve_limit(filter_obj, default_limit=None)\n if limit is not None:\n query_body[\"size\"] = limit\n\n # Apply score_threshold / scoreThreshold as min_score if not already set\n if \"min_score\" not in query_body:\n\n score_threshold = self._resolve_score_threshold(filter_obj)\n if score_threshold is not None:\n\n query_body[\"min_score\"] = score_threshold\n\n client = self.build_client()\n logger.info(f\"query: {query_body}\")\n resp = client.search(\n index=self.index_name,\n body=query_body,\n params={\"terminate_after\": 0},\n )\n # Remove any _source keys whose value is a list of floats (embedding vectors)\n # Minimum length threshold to identify embedding vectors\n min_vector_length = 100\n\n def is_vector(val):\n # Accepts if it's a list of numbers (float or int) and has reasonable vector length\n return (\n isinstance(val, list)\n and len(val) > 
min_vector_length\n and all(isinstance(x, (float, int)) for x in val)\n )\n\n if \"hits\" in resp and \"hits\" in resp[\"hits\"]:\n for hit in resp[\"hits\"][\"hits\"]:\n source = hit.get(\"_source\")\n if isinstance(source, dict):\n keys_to_remove = [k for k, v in source.items() if is_vector(v)]\n for k in keys_to_remove:\n source.pop(k)\n logger.info(f\"Raw search response (all embedding vectors removed): {resp}\")\n return Data(**resp)\n\n def _get_embedding_model_name(self, embedding_obj=None) -> str:\n \"\"\"Get the embedding model name from component config or embedding object.\n\n Priority: deployment > model > model_id > model_name\n This ensures we use the actual model being deployed, not just the configured model.\n Supports multiple embedding providers (OpenAI, Watsonx, Cohere, etc.)\n\n Args:\n embedding_obj: Specific embedding object to get name from (optional)\n\n Returns:\n Embedding model name\n\n Raises:\n ValueError: If embedding model name cannot be determined\n \"\"\"\n # First try explicit embedding_model_name input\n if hasattr(self, \"embedding_model_name\") and self.embedding_model_name:\n return self.embedding_model_name.strip()\n\n # Try to get from provided embedding object\n if embedding_obj:\n # Priority: deployment > model > model_id > model_name\n if hasattr(embedding_obj, \"deployment\") and embedding_obj.deployment:\n return str(embedding_obj.deployment)\n if hasattr(embedding_obj, \"model\") and embedding_obj.model:\n return str(embedding_obj.model)\n if hasattr(embedding_obj, \"model_id\") and embedding_obj.model_id:\n return str(embedding_obj.model_id)\n if hasattr(embedding_obj, \"model_name\") and embedding_obj.model_name:\n return str(embedding_obj.model_name)\n\n # Try to get from embedding component (legacy single embedding)\n if hasattr(self, \"embedding\") and self.embedding:\n # Handle list of embeddings\n if isinstance(self.embedding, list) and len(self.embedding) > 0:\n first_emb = self.embedding[0]\n if 
hasattr(first_emb, \"deployment\") and first_emb.deployment:\n return str(first_emb.deployment)\n if hasattr(first_emb, \"model\") and first_emb.model:\n return str(first_emb.model)\n if hasattr(first_emb, \"model_id\") and first_emb.model_id:\n return str(first_emb.model_id)\n if hasattr(first_emb, \"model_name\") and first_emb.model_name:\n return str(first_emb.model_name)\n # Handle single embedding\n elif not isinstance(self.embedding, list):\n if hasattr(self.embedding, \"deployment\") and self.embedding.deployment:\n return str(self.embedding.deployment)\n if hasattr(self.embedding, \"model\") and self.embedding.model:\n return str(self.embedding.model)\n if hasattr(self.embedding, \"model_id\") and self.embedding.model_id:\n return str(self.embedding.model_id)\n if hasattr(self.embedding, \"model_name\") and self.embedding.model_name:\n return str(self.embedding.model_name)\n\n msg = (\n \"Could not determine embedding model name. \"\n \"Please set the 'embedding_model_name' field or ensure the embedding component \"\n \"has a 'deployment', 'model', 'model_id', or 'model_name' attribute.\"\n )\n raise ValueError(msg)\n\n # ---------- helper functions for index management ----------\n def _default_text_mapping(\n self,\n dim: int,\n engine: str = \"jvector\",\n space_type: str = \"l2\",\n ef_search: int = 512,\n ef_construction: int = 100,\n m: int = 16,\n vector_field: str = \"vector_field\",\n ) -> dict[str, Any]:\n \"\"\"Create the default OpenSearch index mapping for vector search.\n\n This method generates the index configuration with k-NN settings optimized\n for approximate nearest neighbor search using the specified vector engine.\n Includes the embedding_model keyword field for tracking which model was used.\n\n Args:\n dim: Dimensionality of the vector embeddings\n engine: Vector search engine (jvector, nmslib, faiss, lucene)\n space_type: Distance metric for similarity calculation\n ef_search: Size of dynamic list used during search\n 
ef_construction: Size of dynamic list used during index construction\n m: Number of bidirectional links for each vector\n vector_field: Name of the field storing vector embeddings\n\n Returns:\n Dictionary containing OpenSearch index mapping configuration\n \"\"\"\n return {\n \"settings\": {\"index\": {\"knn\": True, \"knn.algo_param.ef_search\": ef_search}},\n \"mappings\": {\n \"properties\": {\n vector_field: {\n \"type\": \"knn_vector\",\n \"dimension\": dim,\n \"method\": {\n \"name\": \"disk_ann\",\n \"space_type\": space_type,\n \"engine\": engine,\n \"parameters\": {\"ef_construction\": ef_construction, \"m\": m},\n },\n },\n \"embedding_model\": {\"type\": \"keyword\"}, # Track which model was used\n \"embedding_dimensions\": {\"type\": \"integer\"},\n }\n },\n }\n\n def _ensure_embedding_field_mapping(\n self,\n client: OpenSearch,\n index_name: str,\n field_name: str,\n dim: int,\n engine: str,\n space_type: str,\n ef_construction: int,\n m: int,\n ) -> None:\n \"\"\"Lazily add a dynamic embedding field to the index if it doesn't exist.\n\n This allows adding new embedding models without recreating the entire index.\n Also ensures the embedding_model tracking field exists.\n\n Note: Some OpenSearch versions/configurations have issues with dynamically adding\n knn_vector mappings (NullPointerException). 
This method checks if the field\n already exists before attempting to add it, and gracefully skips if the field\n is already properly configured.\n\n Args:\n client: OpenSearch client instance\n index_name: Target index name\n field_name: Dynamic field name for this embedding model\n dim: Vector dimensionality\n engine: Vector search engine\n space_type: Distance metric\n ef_construction: Construction parameter\n m: HNSW parameter\n \"\"\"\n # First, check if the field already exists and is properly mapped\n properties = self._get_index_properties(client)\n if self._is_knn_vector_field(properties, field_name):\n # Field already exists as knn_vector - verify dimensions match\n existing_dim = self._get_field_dimension(properties, field_name)\n if existing_dim is not None and existing_dim != dim:\n logger.warning(\n f\"Field '{field_name}' exists with dimension {existing_dim}, \"\n f\"but current embedding has dimension {dim}. Using existing mapping.\"\n )\n else:\n logger.info(\n f\"[OpenSearchMultimodel] Field '{field_name}' already exists\"\n f\"as knn_vector with matching dimensions - skipping mapping update\"\n )\n return\n\n # Field doesn't exist, try to add the mapping\n try:\n mapping = {\n \"properties\": {\n field_name: {\n \"type\": \"knn_vector\",\n \"dimension\": dim,\n \"method\": {\n \"name\": \"disk_ann\",\n \"space_type\": space_type,\n \"engine\": engine,\n \"parameters\": {\"ef_construction\": ef_construction, \"m\": m},\n },\n },\n # Also ensure the embedding_model tracking field exists as keyword\n \"embedding_model\": {\"type\": \"keyword\"},\n \"embedding_dimensions\": {\"type\": \"integer\"},\n }\n }\n client.indices.put_mapping(index=index_name, body=mapping)\n logger.info(f\"Added/updated embedding field mapping: {field_name}\")\n except RequestError as e:\n error_str = str(e).lower()\n if \"invalid engine\" in error_str and \"jvector\" in error_str:\n msg = (\n \"The 'jvector' engine is not available in your OpenSearch installation. 
\"\n \"Use 'nmslib' or 'faiss' for standard OpenSearch, or upgrade to OpenSearch 2.9+.\"\n )\n raise ValueError(msg) from e\n if \"index.knn\" in error_str:\n msg = (\n \"The index has index.knn: false. Delete the existing index and let the \"\n \"component recreate it, or create a new index with a different name.\"\n )\n raise ValueError(msg) from e\n raise\n except Exception as e:\n # Check if this is the known OpenSearch k-NN NullPointerException issue\n error_str = str(e).lower()\n if \"null\" in error_str or \"nullpointerexception\" in error_str:\n logger.warning(\n f\"[OpenSearchMultimodel] Could not add embedding field mapping for {field_name}\"\n f\"due to OpenSearch k-NN plugin issue: {e}. \"\n f\"This is a known issue with some OpenSearch versions. \"\n f\"[OpenSearchMultimodel] Skipping mapping update. \"\n f\"Please ensure the index has the correct mapping for KNN search to work.\"\n )\n # Skip and continue - ingestion will proceed, but KNN search may fail if mapping doesn't exist\n return\n logger.warning(\n f\"[OpenSearchMultimodel] Could not add embedding field mapping for {field_name}: {e}\"\n )\n raise\n\n # Verify the field was added correctly\n properties = self._get_index_properties(client)\n if not self._is_knn_vector_field(properties, field_name):\n msg = f\"Field '{field_name}' is not mapped as knn_vector. Current mapping: {properties.get(field_name)}\"\n logger.error(msg)\n raise ValueError(msg)\n\n def _validate_aoss_with_engines(self, *, is_aoss: bool, engine: str) -> None:\n \"\"\"Validate engine compatibility with Amazon OpenSearch Serverless (AOSS).\n\n Amazon OpenSearch Serverless has restrictions on which vector engines\n can be used. 
This method ensures the selected engine is compatible.\n\n Args:\n is_aoss: Whether the connection is to Amazon OpenSearch Serverless\n engine: The selected vector search engine\n\n Raises:\n ValueError: If AOSS is used with an incompatible engine\n \"\"\"\n if is_aoss and engine not in {\"nmslib\", \"faiss\"}:\n msg = \"Amazon OpenSearch Service Serverless only supports `nmslib` or `faiss` engines\"\n raise ValueError(msg)\n\n def _is_aoss_enabled(self, http_auth: Any) -> bool:\n \"\"\"Determine if Amazon OpenSearch Serverless (AOSS) is being used.\n\n Args:\n http_auth: The HTTP authentication object\n\n Returns:\n True if AOSS is enabled, False otherwise\n \"\"\"\n return (\n http_auth is not None and hasattr(http_auth, \"service\") and http_auth.service == \"aoss\"\n )\n\n @staticmethod\n def _openrag_input_to_str(value: Any) -> str:\n if value is None:\n return \"\"\n if hasattr(value, \"get_secret_value\"):\n value = value.get_secret_value()\n if hasattr(value, \"text\"):\n value = value.text\n return str(value or \"\").strip()\n\n def _openrag_callback_value(self, attr_name: str) -> str:\n value = self._openrag_input_to_str(getattr(self, attr_name, \"\"))\n if value == self._openrag_ingest_global_placeholders.get(attr_name):\n return \"\"\n return value\n\n def _openrag_ingest_callback_config(self) -> tuple[str, str, str] | None:\n url = self._openrag_callback_value(\"openrag_ingest_url\")\n token = self._openrag_callback_value(\"openrag_ingest_token\")\n ingest_run_id = self._openrag_callback_value(\"openrag_ingest_run_id\")\n\n masked_token = (\n f\"{token[:4]}...{token[-4:]}\" if len(token) >= 8 else (\"\" if token else \"\")\n )\n debug_payload = {\n \"openrag_ingest_url\": url,\n \"openrag_ingest_url_len\": len(url),\n \"openrag_ingest_token_masked\": masked_token,\n \"openrag_ingest_token_len\": len(token),\n \"openrag_ingest_run_id\": ingest_run_id,\n \"raw_url_type\": type(self.openrag_ingest_url).__name__,\n \"raw_token_type\": 
type(self.openrag_ingest_token).__name__,\n \"raw_run_id_type\": type(self.openrag_ingest_run_id).__name__,\n }\n logger.warning(f\"[OpenRAG callback config] {debug_payload}\")\n try:\n self.log(f\"[OpenRAG callback config] {debug_payload}\")\n except Exception:\n pass\n\n if not url and not token and not ingest_run_id:\n return None\n if not url or not token or not ingest_run_id:\n msg = \"OpenRAG ingest callback requires url, token, and ingest_run_id.\"\n raise ValueError(msg)\n return url, token, ingest_run_id\n\n def _post_openrag_ingest_batches(\n self,\n *,\n requests: list[dict],\n vector_field: str,\n text_field: str,\n ) -> None:\n callback_config = self._openrag_ingest_callback_config()\n if callback_config is None:\n return\n\n url, token, ingest_run_id = callback_config\n batch_size = max(self._parse_int_param(\"openrag_ingest_batch_size\", 100), 1)\n timeout = self._parse_int_param(\"request_timeout\", REQUEST_TIMEOUT)\n headers = {\"Authorization\": f\"Bearer {token}\"}\n\n masked_token = (\n f\"{token[:4]}...{token[-4:]}\" if len(token) >= 8 else (\"\" if token else \"\")\n )\n request_summary = {\n \"url\": url,\n \"ingest_run_id\": ingest_run_id,\n \"token_masked\": masked_token,\n \"total_chunks\": len(requests),\n \"batch_size\": batch_size,\n \"timeout_s\": timeout,\n }\n logger.warning(f\"[OpenRAG ingest POST] {request_summary}\")\n try:\n self.log(f\"[OpenRAG ingest POST] {request_summary}\")\n except Exception:\n pass\n\n with httpx.Client(timeout=timeout) as client:\n total_batches = (len(requests) + batch_size - 1) // batch_size\n for batch_number, start in enumerate(range(0, len(requests), batch_size), start=1):\n batch = requests[start : start + batch_size]\n final = batch_number == total_batches\n payload = {\n \"ingest_run_id\": ingest_run_id,\n \"batch_id\": batch_number,\n \"final\": final,\n \"chunks\": [\n self._openrag_chunk_payload(\n request,\n vector_field=vector_field,\n text_field=text_field,\n )\n for request in batch\n ],\n 
}\n logger.warning(\n f\"[OpenRAG ingest POST] -> batch={batch_number}/{total_batches} \"\n f\"url={url} chunks={len(payload['chunks'])} final={final}\"\n )\n response = client.post(url, json=payload, headers=headers)\n response_summary = {\n \"batch\": batch_number,\n \"url\": url,\n \"status\": response.status_code,\n \"final_url\": str(response.request.url),\n \"response_headers\": dict(response.headers),\n \"body_preview\": response.text[:500],\n }\n logger.warning(f\"[OpenRAG ingest POST resp] {response_summary}\")\n try:\n self.log(f\"[OpenRAG ingest POST resp] {response_summary}\")\n except Exception:\n pass\n if response.status_code >= 400:\n msg = (\n \"OpenRAG ingest callback failed \"\n f\"(batch={batch_number}, status={response.status_code}, \"\n f\"url={url}): {response.text[:1000]}\"\n )\n raise RuntimeError(msg)\n\n self.log(f\"Posted {len(requests)} chunks to OpenRAG backend ingest callback.\")\n\n @staticmethod\n def _openrag_chunk_payload(\n request: dict,\n *,\n vector_field: str,\n text_field: str,\n ) -> dict:\n metadata = {\n key: value\n for key, value in request.items()\n if key not in {\"_op_type\", \"_index\", \"_id\", \"id\", vector_field, text_field}\n }\n page = metadata.get(\"page\")\n if isinstance(page, str) and page.isdigit():\n page = int(page)\n return {\n \"id\": request.get(\"_id\") or request.get(\"id\"),\n \"text\": request.get(text_field, \"\"),\n \"vector\": request[vector_field],\n \"page\": page if isinstance(page, int) else None,\n \"metadata\": metadata,\n }\n\n def _bulk_ingest_embeddings(\n self,\n client: OpenSearch,\n index_name: str,\n embeddings: list[list[float]],\n texts: list[str],\n metadatas: list[dict] | None = None,\n ids: list[str] | None = None,\n vector_field: str = \"vector_field\",\n text_field: str = \"text\",\n embedding_model: str = \"unknown\",\n mapping: dict | None = None,\n max_chunk_bytes: int | None = 1 * 1024 * 1024,\n *,\n is_aoss: bool = False,\n ) -> list[str]:\n \"\"\"Efficiently ingest 
multiple documents with embeddings into OpenSearch.\n\n This method uses bulk operations to insert documents with their vector\n embeddings and metadata into the specified OpenSearch index. Each document\n is tagged with the embedding_model name for tracking.\n\n Args:\n client: OpenSearch client instance\n index_name: Target index for document storage\n embeddings: List of vector embeddings for each document\n texts: List of document texts\n metadatas: Optional metadata dictionaries for each document\n ids: Optional document IDs (UUIDs generated if not provided)\n vector_field: Field name for storing vector embeddings\n text_field: Field name for storing document text\n embedding_model: Name of the embedding model used\n mapping: Optional index mapping configuration\n max_chunk_bytes: Maximum size per bulk request chunk\n is_aoss: Whether using Amazon OpenSearch Serverless\n\n Returns:\n List of document IDs that were successfully ingested\n \"\"\"\n logger.debug(f\"[OpenSearchMultimodel] Bulk ingesting embeddings for {index_name}\")\n if not mapping:\n mapping = {}\n\n requests = []\n return_ids = []\n vector_dimensions = len(embeddings[0]) if embeddings else None\n\n for i, text in enumerate(texts):\n metadata = metadatas[i] if metadatas else {}\n if vector_dimensions is not None and \"embedding_dimensions\" not in metadata:\n metadata = {**metadata, \"embedding_dimensions\": vector_dimensions}\n\n # Normalize ACL fields that may arrive as JSON strings from flows\n for key in (\"allowed_users\", \"allowed_groups\", \"allowed_principals\"):\n value = metadata.get(key)\n if isinstance(value, str):\n try:\n parsed = json.loads(value)\n if isinstance(parsed, list):\n metadata[key] = parsed\n except (json.JSONDecodeError, TypeError):\n # Leave value as-is if it isn't valid JSON\n pass\n\n metadata_document_id = str(metadata.get(\"document_id\") or \"\").strip()\n if metadata_document_id and metadata_document_id.lower() != \"none\":\n generated_id = 
f\"{metadata_document_id}_{i}\"\n else:\n generated_id = str(uuid.uuid4())\n _id = ids[i] if ids else generated_id\n request = {\n \"_op_type\": \"index\",\n \"_index\": index_name,\n vector_field: embeddings[i],\n text_field: text,\n \"embedding_model\": embedding_model, # Track which model was used\n **metadata,\n }\n if is_aoss:\n request[\"id\"] = _id\n else:\n request[\"_id\"] = _id\n requests.append(request)\n return_ids.append(_id)\n if metadatas:\n self.log(f\"Sample metadata: {metadatas[0] if metadatas else {}}\")\n if self._openrag_ingest_callback_config() is not None:\n self._post_openrag_ingest_batches(\n requests=requests,\n vector_field=vector_field,\n text_field=text_field,\n )\n return return_ids\n try:\n helpers.bulk(client, requests, max_chunk_bytes=max_chunk_bytes)\n except Exception as bulk_error:\n if \"Unsupported request type for filter level DLS\" not in str(bulk_error):\n raise\n logger.warning(\n \"[OpenSearchMultimodel] Bulk ingest is blocked by filter-level DLS; \"\n \"falling back to per-document index requests.\"\n )\n self._index_embeddings_individually(client, requests)\n return return_ids\n\n def _index_embeddings_individually(\n self,\n client: OpenSearch,\n requests: list[dict],\n ) -> None:\n \"\"\"Index documents one at a time when OpenSearch DLS rejects bulk writes.\"\"\"\n for request in requests:\n document_id = request.get(\"_id\") or request.get(\"id\")\n body = {\n key: value\n for key, value in request.items()\n if key not in {\"_op_type\", \"_index\", \"_id\", \"id\"}\n }\n client.index(index=request[\"_index\"], id=document_id, body=body)\n\n def _log_index_admin_skip(self, operation: str, error: Exception) -> None:\n \"\"\"Log index-admin operations that may be blocked under filter-level DLS.\"\"\"\n logger.warning(\n f\"[OpenSearchMultimodel] Could not run index-admin operation '{operation}': {error}. 
\"\n \"Assuming the backend pre-created the required index/mapping and continuing.\"\n )\n\n # ---------- param helpers ----------\n def _parse_int_param(self, attr_name: str, default: int) -> int:\n \"\"\"Parse a string attribute to int, returning *default* on failure.\"\"\"\n raw = getattr(self, attr_name, None)\n if raw is None or str(raw).strip() == \"\":\n return default\n try:\n value = int(str(raw).strip())\n except ValueError:\n logger.warning(\n f\"Invalid integer value '{raw}' for {attr_name}, using default {default}\"\n )\n return default\n\n if value < 0:\n logger.warning(f\"Negative value '{raw}' for {attr_name}, using default {default}\")\n return default\n\n return value\n\n # ---------- auth / client ----------\n def _build_auth_kwargs(self) -> dict[str, Any]:\n \"\"\"Build authentication configuration for OpenSearch client.\n\n Constructs the appropriate authentication parameters based on the\n selected auth mode (basic username/password or JWT token).\n\n Returns:\n Dictionary containing authentication configuration\n\n Raises:\n ValueError: If required authentication parameters are missing\n \"\"\"\n mode = (self.auth_mode or \"basic\").strip().lower()\n if mode == \"jwt\":\n token = (self.jwt_token or \"\").strip()\n if not token:\n msg = \"Auth Mode is 'jwt' but no jwt_token was provided.\"\n raise ValueError(msg)\n header_name = (self.jwt_header or \"Authorization\").strip()\n header_value = f\"Bearer {token}\" if self.bearer_prefix else token\n return {\"headers\": {header_name: header_value}}\n if mode == \"openrag\":\n # Writes are delegated to the OpenRAG backend ingest callback,\n # so no direct OpenSearch credentials are needed. 
Only the\n # OPENRAG_* fields are required for ingestion to function.\n missing = [\n name\n for name, value in (\n (\"openrag_ingest_url\", self.openrag_ingest_url),\n (\"openrag_ingest_token\", self.openrag_ingest_token),\n (\"openrag_ingest_run_id\", self.openrag_ingest_run_id),\n )\n if not (value or \"\").strip()\n ]\n if missing:\n msg = (\n \"Auth Mode is 'openrag' but required OPENRAG_* fields are \"\n f\"missing: {', '.join(missing)}.\"\n )\n raise ValueError(msg)\n return {}\n user = (self.username or \"\").strip()\n pwd = (self.password or \"\").strip()\n if not user or not pwd:\n msg = \"Auth Mode is 'basic' but username/password are missing.\"\n raise ValueError(msg)\n return {\"http_auth\": (user, pwd)}\n\n def build_client(self) -> OpenSearch:\n \"\"\"Create and configure an OpenSearch client instance.\n\n Returns:\n Configured OpenSearch client ready for operations\n \"\"\"\n logger.debug(\"[OpenSearchMultimodel] Building OpenSearch client\")\n auth_kwargs = self._build_auth_kwargs()\n return OpenSearch(\n hosts=[self.opensearch_url],\n use_ssl=self.use_ssl,\n verify_certs=self.verify_certs,\n ssl_assert_hostname=False,\n ssl_show_warn=False,\n timeout=self._parse_int_param(\"request_timeout\", REQUEST_TIMEOUT),\n max_retries=self._parse_int_param(\"max_retries\", MAX_RETRIES),\n retry_on_timeout=True,\n **auth_kwargs,\n )\n\n @check_cached_vector_store\n def build_vector_store(self) -> OpenSearch:\n # Return raw OpenSearch client as our \"vector store.\"\n client = self.build_client()\n\n # Check if we're in ingestion-only mode (no search query)\n has_search_query = bool((self.search_query or \"\").strip())\n if not has_search_query:\n logger.debug(\n \"[OpenSearchMultimodel] Ingestion-only mode activated: search operations will be skipped\"\n )\n logger.debug(\"[OpenSearchMultimodel] Starting ingestion mode...\")\n\n logger.debug(f\"[OpenSearchMultimodel] Embedding: {self.embedding}\")\n self._add_documents_to_vector_store(client=client)\n return 
client\n\n # ---------- ingest ----------\n def _add_documents_to_vector_store(self, client: OpenSearch) -> None:\n \"\"\"Process and ingest documents into the OpenSearch vector store.\n\n This method handles the complete document ingestion pipeline:\n - Prepares document data and metadata\n - Generates vector embeddings using the selected model\n - Creates appropriate index mappings with dynamic field names\n - Bulk inserts documents with vectors and model tracking\n\n Args:\n client: OpenSearch client for performing operations\n \"\"\"\n logger.debug(\"[OpenSearchMultimodel][INGESTION] _add_documents_to_vector_store called\")\n # Convert DataFrame to Data if needed using parent's method\n self.ingest_data = self._prepare_ingest_data()\n\n logger.debug(\n f\"[OpenSearchMultimodel][INGESTION] ingest_data type: \"\n f\"{type(self.ingest_data)}, length: {len(self.ingest_data) if self.ingest_data else 0}\"\n )\n logger.debug(\n f\"[OpenSearchMultimodel][INGESTION] ingest_data content: \"\n f\"{self.ingest_data[:2] if self.ingest_data and len(self.ingest_data) > 0 else 'empty'}\"\n )\n\n docs = self.ingest_data or []\n if not docs:\n logger.debug(\"Ingestion complete: No documents provided\")\n return\n\n if not self.embedding:\n msg = \"Embedding handle is required to embed documents.\"\n raise ValueError(msg)\n\n # Normalize embedding to list first\n embeddings_list = self.embedding if isinstance(self.embedding, list) else [self.embedding]\n\n # Filter out None values (fail-safe mode) - do this BEFORE checking if empty\n embeddings_list = [e for e in embeddings_list if e is not None]\n\n # NOW check if we have any valid embeddings left after filtering\n if not embeddings_list:\n logger.warning(\n \"All embeddings returned None (fail-safe mode enabled). Skipping document ingestion.\"\n )\n self.log(\n \"Embedding returned None (fail-safe mode enabled). 
Skipping document ingestion.\"\n )\n return\n\n logger.debug(\n f\"[OpenSearchMultimodel][INGESTION] Valid embeddings after filtering: {len(embeddings_list)}\"\n )\n self.log(\n f\"[OpenSearchMultimodel][INGESTION] Available embedding models: {len(embeddings_list)}\"\n )\n\n # Select the embedding to use for ingestion\n selected_embedding = None\n embedding_model = None\n\n # If embedding_model_name is specified, find matching embedding\n if (\n hasattr(self, \"embedding_model_name\")\n and self.embedding_model_name\n and self.embedding_model_name.strip()\n ):\n target_model_name = self.embedding_model_name.strip()\n self.log(f\"Looking for embedding model: {target_model_name}\")\n\n for emb_obj in embeddings_list:\n # Check all possible model identifiers (deployment, model, model_id, model_name)\n # Also check available_models list from EmbeddingsWithModels\n possible_names = []\n deployment = getattr(emb_obj, \"deployment\", None)\n model = getattr(emb_obj, \"model\", None)\n model_id = getattr(emb_obj, \"model_id\", None)\n model_name = getattr(emb_obj, \"model_name\", None)\n available_models_attr = getattr(emb_obj, \"available_models\", None)\n\n if deployment:\n possible_names.append(str(deployment))\n if model:\n possible_names.append(str(model))\n if model_id:\n possible_names.append(str(model_id))\n if model_name:\n possible_names.append(str(model_name))\n\n # Also add combined identifier\n if deployment and model and deployment != model:\n possible_names.append(f\"{deployment}:{model}\")\n\n # Add all models from available_models dict\n if available_models_attr and isinstance(available_models_attr, dict):\n possible_names.extend(\n str(model_key).strip()\n for model_key in available_models_attr\n if model_key and str(model_key).strip()\n )\n\n # Match if target matches any of the possible names\n if target_model_name in possible_names:\n # Check if target is in available_models dict - use dedicated instance\n if (\n available_models_attr\n and 
isinstance(available_models_attr, dict)\n and target_model_name in available_models_attr\n ):\n # Use the dedicated embedding instance from the dict\n selected_embedding = available_models_attr[target_model_name]\n embedding_model = target_model_name\n self.log(\n f\"Found dedicated embedding instance for '{embedding_model}' in available_models dict\"\n )\n else:\n # Traditional identifier match\n selected_embedding = emb_obj\n embedding_model = self._get_embedding_model_name(emb_obj)\n self.log(\n f\"Found matching embedding model: {embedding_model} (matched on: {target_model_name})\"\n )\n break\n\n if not selected_embedding:\n # Build detailed list of available embeddings with all their identifiers\n available_info = []\n for idx, emb in enumerate(embeddings_list):\n emb_type = type(emb).__name__\n identifiers = []\n deployment = getattr(emb, \"deployment\", None)\n model = getattr(emb, \"model\", None)\n model_id = getattr(emb, \"model_id\", None)\n model_name = getattr(emb, \"model_name\", None)\n available_models_attr = getattr(emb, \"available_models\", None)\n\n if deployment:\n identifiers.append(f\"deployment='{deployment}'\")\n if model:\n identifiers.append(f\"model='{model}'\")\n if model_id:\n identifiers.append(f\"model_id='{model_id}'\")\n if model_name:\n identifiers.append(f\"model_name='{model_name}'\")\n\n # Add combined identifier as an option\n if deployment and model and deployment != model:\n identifiers.append(f\"combined='{deployment}:{model}'\")\n\n # Add available_models dict if present\n if available_models_attr and isinstance(available_models_attr, dict):\n identifiers.append(f\"available_models={list(available_models_attr.keys())}\")\n\n available_info.append(\n f\" [{idx}] {emb_type}: {', '.join(identifiers) if identifiers else 'No identifiers'}\"\n )\n\n msg = (\n f\"Embedding model '{target_model_name}' not found in available embeddings.\\n\\n\"\n f\"Available embeddings:\\n\" + \"\\n\".join(available_info) + \"\\n\\n\"\n \"Please 
set 'embedding_model_name' to one of the identifier values shown above \"\n \"(use the value after the '=' sign, without quotes).\\n\"\n \"For duplicate deployments, use the 'combined' format.\\n\"\n \"Or leave it empty to use the first embedding.\"\n )\n raise ValueError(msg)\n else:\n # Use first embedding if no model name specified\n selected_embedding = embeddings_list[0]\n embedding_model = self._get_embedding_model_name(selected_embedding)\n self.log(f\"No embedding_model_name specified, using first embedding: {embedding_model}\")\n\n dynamic_field_name = get_embedding_field_name(embedding_model)\n\n logger.info(f\"Selected embedding model for ingestion: '{embedding_model}'\")\n self.log(f\"Using embedding model for ingestion: {embedding_model}\")\n self.log(f\"Dynamic vector field: {dynamic_field_name}\")\n\n # Log embedding details for debugging\n if hasattr(selected_embedding, \"deployment\"):\n logger.info(f\"Embedding deployment: {selected_embedding.deployment}\")\n if hasattr(selected_embedding, \"model\"):\n logger.info(f\"Embedding model: {selected_embedding.model}\")\n if hasattr(selected_embedding, \"model_id\"):\n logger.info(f\"Embedding model_id: {selected_embedding.model_id}\")\n if hasattr(selected_embedding, \"dimensions\"):\n logger.info(f\"Embedding dimensions: {selected_embedding.dimensions}\")\n if hasattr(selected_embedding, \"available_models\"):\n logger.info(f\"Embedding available_models: {selected_embedding.available_models}\")\n\n # No model switching needed - each model in available_models has its own dedicated instance\n # The selected_embedding is already configured correctly for the target model\n logger.info(\n f\"Using embedding instance for '{embedding_model}' - pre-configured and ready to use\"\n )\n\n # Extract texts and metadata from documents\n texts = []\n metadatas = []\n # Process docs_metadata table input into a dict\n additional_metadata = {}\n logger.debug(f\"[LF] Docs metadata {self.docs_metadata}\")\n if 
hasattr(self, \"docs_metadata\") and self.docs_metadata:\n logger.info(f\"[LF] Docs metadata {self.docs_metadata}\")\n if isinstance(self.docs_metadata[-1], Data):\n logger.info(f\"[LF] Docs metadata is a Data object {self.docs_metadata}\")\n self.docs_metadata = self.docs_metadata[-1].data\n logger.info(f\"[LF] Docs metadata is a Data object {self.docs_metadata}\")\n additional_metadata.update(self.docs_metadata)\n else:\n for item in self.docs_metadata:\n if isinstance(item, dict) and \"key\" in item and \"value\" in item:\n additional_metadata[item[\"key\"]] = item[\"value\"]\n # Replace string \"None\" values with actual None\n for key, value in additional_metadata.items():\n if value == \"None\":\n additional_metadata[key] = None\n logger.info(f\"[LF] Additional metadata {additional_metadata}\")\n for doc_obj in docs:\n data_copy = json.loads(doc_obj.model_dump_json())\n text = data_copy.pop(doc_obj.text_key, doc_obj.default_value)\n texts.append(text)\n\n # Merge additional metadata from table input\n data_copy.update(additional_metadata)\n\n metadatas.append(data_copy)\n self.log(metadatas)\n\n # Determine whether the selected embedding is watsonx/IBM. The watsonx\n # SDK ships its own rate-limit machinery (input batching, proactive\n # x-requests-limit-* TokenBucket throttling, and jittered exponential\n # backoff on 429), so we lean on it instead of retrying on top of it.\n # The type-name check also covers watsonx-hosted, non-\"ibm/\" models\n # (e.g. 
intfloat/multilingual-e5-large).\n is_ibm = (embedding_model and \"ibm\" in str(embedding_model).lower()) or (\n selected_embedding and \"watsonx\" in type(selected_embedding).__name__.lower()\n )\n logger.debug(f\"Is IBM/watsonx embedding: {is_ibm}\")\n\n if is_ibm:\n\n # Hand the full batch to the SDK and let it batch/throttle/retry.\n # Retry attempts and base backoff are tunable via the SDK's own\n # WATSONX_MAX_RETRIES / WATSONX_DELAY_TIME environment variables.\n logger.info(\n f\"Embedding {len(texts)} chunks via watsonx SDK batch (SDK-managed throttle + 429 retry)\"\n )\n try:\n vectors: list[list[float]] = selected_embedding.embed_documents(texts)\n logger.info(f\"Successfully embedded {len(vectors)} chunks via watsonx SDK\")\n except Exception as embed_error:\n _log_watsonx_rate_limit_headers(embed_error)\n logger.error(\n f\"Failed to embed {len(texts)} chunks via watsonx SDK. Error: {embed_error}\",\n )\n raise\n\n else:\n # Non-watsonx providers (OpenAI, Ollama) lack the watsonx SDK's\n # built-in rate-limit handling, so embed per chunk in parallel with\n # a generic rate-limit-aware tenacity retry.\n vectors: list[list[float]] = [None] * len(texts)\n from tenacity import (\n retry,\n retry_if_exception,\n stop_after_attempt,\n wait_exponential,\n )\n\n def is_rate_limit_error(exception: Exception) -> bool:\n \"\"\"Check if exception is a rate limit error (429).\"\"\"\n error_str = str(exception).lower()\n return \"429\" in error_str or \"rate_limit\" in error_str or \"rate limit\" in error_str\n\n def is_other_retryable_error(exception: Exception) -> bool:\n \"\"\"Check if exception is a transient network error worth retrying.\"\"\"\n if is_rate_limit_error(exception):\n return False\n return isinstance(exception, (ConnectionError, TimeoutError, OSError))\n\n # Retry decorator for rate limit errors (longer backoff)\n retry_on_rate_limit = retry(\n retry=retry_if_exception(is_rate_limit_error),\n stop=stop_after_attempt(5),\n 
wait=wait_exponential(multiplier=2, min=2, max=30),\n reraise=True,\n before_sleep=lambda retry_state: logger.warning(\n f\"Rate limit hit for chunk (attempt {retry_state.attempt_number}/5), \"\n f\"backing off for {retry_state.next_action.sleep:.1f}s\"\n ),\n )\n\n # Retry decorator for other errors (shorter backoff)\n retry_on_other_errors = retry(\n retry=retry_if_exception(is_other_retryable_error),\n stop=stop_after_attempt(3),\n wait=wait_exponential(multiplier=1, min=1, max=8),\n reraise=True,\n before_sleep=lambda retry_state: logger.warning(\n f\"Error embedding chunk (attempt {retry_state.attempt_number}/3), \"\n f\"retrying in {retry_state.next_action.sleep:.1f}s: {retry_state.outcome.exception()}\"\n ),\n )\n\n def embed_chunk_with_retry(chunk_text: str, chunk_idx: int) -> list[float]:\n \"\"\"Embed a single chunk with rate-limit-aware retry logic.\"\"\"\n\n @retry_on_rate_limit\n @retry_on_other_errors\n def _embed(text: str) -> list[float]:\n return selected_embedding.embed_documents([text])[0]\n\n try:\n return _embed(chunk_text)\n except Exception as e:\n logger.error(\n f\"Failed to embed chunk {chunk_idx} after all retries: {e}\",\n error=str(e),\n )\n raise\n\n max_workers = min(max(len(texts), 1), 8)\n logger.debug(f\"Using parallel processing with {max_workers} workers\")\n\n with ThreadPoolExecutor(max_workers=max_workers) as executor:\n futures = {\n executor.submit(embed_chunk_with_retry, chunk, idx): idx\n for idx, chunk in enumerate(texts)\n }\n for future in as_completed(futures):\n idx = futures[future]\n vectors[idx] = future.result()\n\n if not vectors:\n self.log(f\"No vectors generated from documents for model {embedding_model}.\")\n return\n\n # Get vector dimension for mapping\n dim = len(vectors[0]) if vectors else 768 # default fallback\n use_openrag_ingest_callback = self._openrag_ingest_callback_config() is not None\n\n is_aoss = False\n mapping: dict | None = None\n\n engine = getattr(self, \"engine\", \"jvector\")\n\n if 
use_openrag_ingest_callback:\n self.log(\"Using OpenRAG backend ingest callback; skipping direct OpenSearch writes.\")\n else:\n # Check for AOSS\n auth_kwargs = self._build_auth_kwargs()\n is_aoss = self._is_aoss_enabled(auth_kwargs.get(\"http_auth\"))\n\n # Validate engine with AOSS\n self._validate_aoss_with_engines(is_aoss=is_aoss, engine=engine)\n\n # Create mapping with proper KNN settings\n space_type = getattr(self, \"space_type\", \"l2\")\n ef_construction = getattr(self, \"ef_construction\", 512)\n m = getattr(self, \"m\", 16)\n\n mapping = self._default_text_mapping(\n dim=dim,\n engine=engine,\n space_type=space_type,\n ef_construction=ef_construction,\n m=m,\n vector_field=dynamic_field_name, # Use dynamic field name\n )\n\n # Ensure index exists with baseline mapping (index.knn: true is required for vector search)\n index_exists = True\n try:\n index_exists = bool(client.indices.exists(index=self.index_name))\n except OpenSearchException as exists_error:\n self._log_index_admin_skip(\"indices.exists\", exists_error)\n\n try:\n if not index_exists:\n self.log(f\"Creating index '{self.index_name}' with base mapping\")\n client.indices.create(index=self.index_name, body=mapping)\n except RequestError as creation_error:\n if creation_error.error == \"resource_already_exists_exception\":\n pass # Index was created concurrently\n else:\n error_msg = str(creation_error).lower()\n if \"invalid engine\" in error_msg or \"illegal_argument\" in error_msg:\n if \"jvector\" in error_msg:\n msg = (\n \"The 'jvector' engine is not available in your OpenSearch installation. \"\n \"Use 'nmslib' or 'faiss' for standard OpenSearch, or upgrade to 2.9+.\"\n )\n raise ValueError(msg) from creation_error\n if \"index.knn\" in error_msg:\n msg = (\n \"The index has index.knn: false. 
Delete the existing index and let the \"\n \"component recreate it, or create a new index with a different name.\"\n )\n raise ValueError(msg) from creation_error\n logger.warning(f\"Failed to create index '{self.index_name}': {creation_error}\")\n raise\n\n # Ensure the dynamic field exists in the index\n self._ensure_embedding_field_mapping(\n client=client,\n index_name=self.index_name,\n field_name=dynamic_field_name,\n dim=dim,\n engine=engine,\n space_type=space_type,\n ef_construction=ef_construction,\n m=m,\n )\n\n self.log(\n f\"Indexing {len(texts)} documents into '{self.index_name}' with model '{embedding_model}'...\"\n )\n logger.info(f\"Will store embeddings in field: {dynamic_field_name}\")\n logger.info(f\"Will tag documents with embedding_model: {embedding_model}\")\n\n # Use the bulk ingestion with model tracking\n return_ids = self._bulk_ingest_embeddings(\n client=client,\n index_name=self.index_name,\n embeddings=vectors,\n texts=texts,\n metadatas=metadatas,\n vector_field=dynamic_field_name, # Use dynamic field name\n text_field=\"text\",\n embedding_model=embedding_model, # Track the model\n mapping=mapping,\n is_aoss=is_aoss,\n )\n self.log(metadatas)\n\n logger.info(\n f\"Ingestion complete: Successfully indexed {len(return_ids)} documents with model '{embedding_model}'\"\n )\n self.log(f\"Successfully indexed {len(return_ids)} documents with model {embedding_model}.\")\n\n # ---------- helpers for filters ----------\n def _is_placeholder_term(self, term_obj: dict) -> bool:\n # term_obj like {\"filename\": \"__IMPOSSIBLE_VALUE__\"}\n return any(v == \"__IMPOSSIBLE_VALUE__\" for v in term_obj.values())\n\n def _coerce_filter_clauses(self, filter_obj: dict | None) -> list[dict]:\n \"\"\"Convert filter expressions into OpenSearch-compatible filter clauses.\n\n This method accepts two filter formats and converts them to standardized\n OpenSearch query clauses:\n\n Format A - Explicit filters:\n {\"filter\": [{\"term\": {\"field\": \"value\"}}, 
{\"terms\": {\"field\": [\"val1\", \"val2\"]}}],\n \"limit\": 10, \"score_threshold\": 1.5}\n\n Format B - Context-style mapping:\n {\"data_sources\": [\"file1.pdf\"], \"document_types\": [\"pdf\"], \"owners\": [\"user1\"]}\n\n Args:\n filter_obj: Filter configuration dictionary or None\n\n Returns:\n List of OpenSearch filter clauses (term/terms objects)\n Placeholder values with \"__IMPOSSIBLE_VALUE__\" are ignored\n \"\"\"\n if not filter_obj:\n return []\n\n # If it is a string, try to parse it once\n if isinstance(filter_obj, str):\n try:\n filter_obj = json.loads(filter_obj)\n except json.JSONDecodeError:\n # Not valid JSON - treat as no filters\n return []\n\n # Case A: already an explicit list/dict under \"filter\"\n if \"filter\" in filter_obj:\n raw = filter_obj[\"filter\"]\n if isinstance(raw, dict):\n raw = [raw]\n explicit_clauses: list[dict] = []\n for f in raw or []:\n if (\n \"term\" in f\n and isinstance(f[\"term\"], dict)\n and not self._is_placeholder_term(f[\"term\"])\n ):\n explicit_clauses.append(f)\n elif \"terms\" in f and isinstance(f[\"terms\"], dict):\n field, vals = next(iter(f[\"terms\"].items()))\n if isinstance(vals, list) and len(vals) > 0:\n explicit_clauses.append(f)\n return explicit_clauses\n\n # Case B: convert context-style maps into clauses\n field_mapping = {\n \"data_sources\": \"filename\",\n \"document_types\": \"mimetype\",\n \"owners\": \"owner\",\n }\n context_clauses: list[dict] = []\n for k, values in filter_obj.items():\n if not isinstance(values, list):\n continue\n field = field_mapping.get(k, k)\n if len(values) == 0:\n # Match-nothing placeholder (kept to mirror your tool semantics)\n context_clauses.append({\"term\": {field: \"__IMPOSSIBLE_VALUE__\"}})\n elif len(values) == 1:\n if values[0] != \"__IMPOSSIBLE_VALUE__\":\n context_clauses.append({\"term\": {field: values[0]}})\n else:\n context_clauses.append({\"terms\": {field: values}})\n return context_clauses\n\n\n def _parse_filter_expression(self) -> dict | 
None:\n \"\"\"Parse and validate optional filter_expression JSON.\n\n Returns:\n Parsed JSON object as a dict, or None when unset/blank.\n\n Raises:\n ValueError: If JSON is invalid or does not decode to an object.\n \"\"\"\n filter_expression = getattr(self, \"filter_expression\", \"\")\n if not isinstance(filter_expression, str) or not filter_expression.strip():\n return None\n try:\n filter_obj = json.loads(filter_expression)\n except json.JSONDecodeError as e:\n msg = f\"Invalid filter_expression JSON: {e}\"\n raise ValueError(msg) from e\n\n if not isinstance(filter_obj, dict):\n msg = \"Invalid filter_expression JSON type: expected a JSON object.\"\n raise TypeError(msg)\n return filter_obj\n\n def _resolve_limit(self, filter_obj: dict | None, default_limit: int | None) -> int | None:\n \"\"\"Resolve an integer result limit from filter settings.\"\"\"\n if not filter_obj:\n return default_limit\n raw_limit = filter_obj.get(\"limit\", default_limit)\n if raw_limit is None:\n return None\n if isinstance(raw_limit, bool):\n msg = \"Invalid filter_expression.limit: expected a positive integer.\"\n raise TypeError(msg)\n try:\n limit = int(raw_limit)\n except (TypeError, ValueError) as e:\n msg = \"Invalid filter_expression.limit: expected a positive integer.\"\n raise ValueError(msg) from e\n if limit <= 0:\n msg = \"Invalid filter_expression.limit: expected a positive integer.\"\n raise ValueError(msg)\n return limit\n\n def _resolve_score_threshold(self, filter_obj: dict | None) -> float | None:\n \"\"\"Resolve optional positive min score from filter settings.\"\"\"\n if not filter_obj:\n return None\n score_threshold = filter_obj.get(\"score_threshold\")\n if score_threshold is None:\n score_threshold = filter_obj.get(\"scoreThreshold\")\n if not isinstance(score_threshold, (int, float)) or score_threshold <= 0:\n return None\n return float(score_threshold)\n\n def _detect_available_models(self, client: OpenSearch, filter_clauses: list[dict] | None = None) -> 
list[str]:\n\n \"\"\"Detect which embedding models have documents in the index.\n\n Uses aggregation to find all unique embedding_model values, optionally\n filtered to only documents matching the user's filter criteria.\n\n Args:\n client: OpenSearch client instance\n filter_clauses: Optional filter clauses to scope model detection\n\n Returns:\n List of embedding model names found in the index\n \"\"\"\n try:\n agg_query = {\n \"size\": 0,\n \"aggs\": {\"embedding_models\": {\"terms\": {\"field\": \"embedding_model\", \"size\": 10}}},\n }\n\n # Apply filters to model detection if any exist\n if filter_clauses:\n agg_query[\"query\"] = {\"bool\": {\"filter\": filter_clauses}}\n\n logger.debug(f\"Model detection query: {agg_query}\")\n result = client.search(\n index=self.index_name,\n body=agg_query,\n params={\"terminate_after\": 0},\n )\n buckets = result.get(\"aggregations\", {}).get(\"embedding_models\", {}).get(\"buckets\", [])\n models = [b[\"key\"] for b in buckets if b[\"key\"]]\n\n # Log detailed bucket info for debugging\n logger.info(\n f\"Detected embedding models in corpus: {models}\"\n + (f\" (with {len(filter_clauses)} filters)\" if filter_clauses else \"\")\n )\n if not models:\n total_hits = result.get(\"hits\", {}).get(\"total\", {})\n total_count = (\n total_hits.get(\"value\", 0) if isinstance(total_hits, dict) else total_hits\n )\n logger.warning(\n f\"No embedding_model values found in index '{self.index_name}'. \"\n f\"Total docs in index: {total_count}. 
\"\n f\"This may indicate documents were indexed without the embedding_model field.\"\n )\n except (OpenSearchException, KeyError, ValueError) as e:\n logger.warning(f\"Failed to detect embedding models: {e}\")\n # Fallback to current model\n fallback_model = self._get_embedding_model_name()\n logger.info(f\"Using fallback model: {fallback_model}\")\n return [fallback_model]\n else:\n return models\n\n def _get_index_properties(self, client: OpenSearch) -> dict[str, Any] | None:\n \"\"\"Retrieve flattened mapping properties for the current index.\"\"\"\n try:\n mapping = client.indices.get_mapping(index=self.index_name)\n except OpenSearchException as e:\n logger.warning(\n f\"Failed to fetch mapping for index '{self.index_name}': {e}. Proceeding without mapping metadata.\"\n )\n return None\n\n properties: dict[str, Any] = {}\n for index_data in mapping.values():\n props = index_data.get(\"mappings\", {}).get(\"properties\", {})\n if isinstance(props, dict):\n properties.update(props)\n return properties\n\n def _is_knn_vector_field(self, properties: dict[str, Any] | None, field_name: str) -> bool:\n \"\"\"Check whether the field is mapped as a knn_vector.\"\"\"\n if not field_name:\n return False\n if properties is None:\n logger.warning(\n f\"Mapping metadata unavailable; assuming field '{field_name}' is usable.\"\n )\n return True\n field_def = properties.get(field_name)\n if not isinstance(field_def, dict):\n return False\n if field_def.get(\"type\") == \"knn_vector\":\n return True\n\n nested_props = field_def.get(\"properties\")\n return bool(isinstance(nested_props, dict) and nested_props.get(\"type\") == \"knn_vector\")\n\n def _get_field_dimension(\n self, properties: dict[str, Any] | None, field_name: str\n ) -> int | None:\n \"\"\"Get the dimension of a knn_vector field from the index mapping.\n\n Args:\n properties: Index properties from mapping\n field_name: Name of the vector field\n\n Returns:\n Dimension of the field, or None if not found\n 
\"\"\"\n if not field_name or properties is None:\n return None\n\n field_def = properties.get(field_name)\n if not isinstance(field_def, dict):\n return None\n\n # Check direct knn_vector field\n if field_def.get(\"type\") == \"knn_vector\":\n return field_def.get(\"dimension\")\n\n # Check nested properties\n nested_props = field_def.get(\"properties\")\n if isinstance(nested_props, dict) and nested_props.get(\"type\") == \"knn_vector\":\n return nested_props.get(\"dimension\")\n\n return None\n\n def _get_filename_agg_field(self, index_properties: dict[str, Any] | None) -> str:\n \"\"\"Choose the appropriate field for filename aggregations.\"\"\"\n if not index_properties:\n return \"filename.keyword\"\n\n filename_def = index_properties.get(\"filename\")\n if not isinstance(filename_def, dict):\n return \"filename.keyword\"\n\n field_type = filename_def.get(\"type\")\n fields_def = filename_def.get(\"fields\", {})\n\n # Top-level keyword with no subfields\n if field_type == \"keyword\" and not isinstance(fields_def, dict):\n return \"filename\"\n\n # Text field with keyword subfield\n if isinstance(fields_def, dict) and \"keyword\" in fields_def:\n return \"filename.keyword\"\n\n # Fallback: aggregate on filename directly\n return \"filename\"\n\n # ---------- search (multi-model hybrid) ----------\n def search(self, query: str | None = None) -> list[dict[str, Any]]:\n \"\"\"Perform multi-model hybrid search combining multiple vector similarities and keyword matching.\n\n This method executes a sophisticated search that:\n 1. Auto-detects all embedding models present in the index\n 2. Generates query embeddings for ALL detected models in parallel\n 3. Combines multiple KNN queries using dis_max (picks best match)\n 4. Adds keyword search with fuzzy matching (30% weight)\n 5. Applies optional filtering and score thresholds\n 6. 
Returns aggregations for faceted search\n\n Search weights:\n - Semantic search (dis_max across all models): 70%\n - Keyword search: 30%\n\n Args:\n query: Search query string (used for both vector embedding and keyword search)\n\n Returns:\n List of search results with page_content, metadata, and relevance scores\n\n Raises:\n ValueError: If embedding component is not provided or filter JSON is invalid\n \"\"\"\n logger.info(self.ingest_data)\n client = self.build_client()\n q = (query or \"\").strip()\n\n # Parse optional filter expression\n filter_obj = self._parse_filter_expression()\n\n if not self.embedding:\n msg = \"Embedding is required to run hybrid search (KNN + keyword).\"\n raise ValueError(msg)\n\n # Check if embedding is None (fail-safe mode)\n if self.embedding is None or (\n isinstance(self.embedding, list) and all(e is None for e in self.embedding)\n ):\n logger.error(\"Embedding returned None (fail-safe mode enabled). Cannot perform search.\")\n return []\n\n # Build filter clauses first so we can use them in model detection\n filter_clauses = self._coerce_filter_clauses(filter_obj)\n\n # Detect available embedding models in the index (scoped by filters)\n available_models = self._detect_available_models(client, filter_clauses)\n\n if not available_models:\n logger.warning(\"No embedding models found in index, using current model\")\n available_models = [self._get_embedding_model_name()]\n\n # Generate embeddings for ALL detected models\n query_embeddings = {}\n\n # Normalize embedding to list\n embeddings_list = self.embedding if isinstance(self.embedding, list) else [self.embedding]\n # Filter out None values (fail-safe mode)\n embeddings_list = [e for e in embeddings_list if e is not None]\n\n if not embeddings_list:\n logger.error(\n \"No valid embeddings available after filtering None values (fail-safe mode). 
Cannot perform search.\"\n )\n return []\n\n # Create a comprehensive map of model names to embedding objects\n # Check all possible identifiers (deployment, model, model_id, model_name)\n # Also leverage available_models list from EmbeddingsWithModels\n # Handle duplicate identifiers by creating combined keys\n embedding_by_model = {}\n identifier_conflicts = {} # Track which identifiers have conflicts\n\n for idx, emb_obj in enumerate(embeddings_list):\n # Get all possible identifiers for this embedding\n identifiers = []\n deployment = getattr(emb_obj, \"deployment\", None)\n model = getattr(emb_obj, \"model\", None)\n model_id = getattr(emb_obj, \"model_id\", None)\n model_name = getattr(emb_obj, \"model_name\", None)\n dimensions = getattr(emb_obj, \"dimensions\", None)\n available_models_attr = getattr(emb_obj, \"available_models\", None)\n\n logger.info(\n f\"Embedding object {idx}: deployment={deployment}, model={model}, \"\n f\"model_id={model_id}, model_name={model_name}, dimensions={dimensions}, \"\n f\"available_models={available_models_attr}\"\n )\n\n # If this embedding has available_models dict, map all models to their dedicated instances\n if available_models_attr and isinstance(available_models_attr, dict):\n logger.info(\n f\"Embedding object {idx} provides {len(available_models_attr)} models via available_models dict\"\n )\n for model_name_key, dedicated_embedding in available_models_attr.items():\n if model_name_key and str(model_name_key).strip():\n model_str = str(model_name_key).strip()\n if model_str not in embedding_by_model:\n # Use the dedicated embedding instance from the dict\n embedding_by_model[model_str] = dedicated_embedding\n logger.info(\n f\"Mapped available model '{model_str}' to dedicated embedding instance\"\n )\n else:\n # Conflict detected - track it\n if model_str not in identifier_conflicts:\n identifier_conflicts[model_str] = [embedding_by_model[model_str]]\n identifier_conflicts[model_str].append(dedicated_embedding)\n 
logger.warning(\n f\"Available model '{model_str}' has conflict - used by multiple embeddings\"\n )\n\n # Also map traditional identifiers (for backward compatibility)\n if deployment:\n identifiers.append(str(deployment))\n if model:\n identifiers.append(str(model))\n if model_id:\n identifiers.append(str(model_id))\n if model_name:\n identifiers.append(str(model_name))\n\n # Map all identifiers to this embedding object\n for identifier in identifiers:\n if identifier not in embedding_by_model:\n embedding_by_model[identifier] = emb_obj\n logger.info(f\"Mapped identifier '{identifier}' to embedding object {idx}\")\n else:\n # Conflict detected - track it\n if identifier not in identifier_conflicts:\n identifier_conflicts[identifier] = [embedding_by_model[identifier]]\n identifier_conflicts[identifier].append(emb_obj)\n logger.warning(\n f\"Identifier '{identifier}' has conflict - used by multiple embeddings\"\n )\n\n # For embeddings with model+deployment, create combined identifier\n # This helps when deployment is the same but model differs\n if deployment and model and deployment != model:\n combined_id = f\"{deployment}:{model}\"\n if combined_id not in embedding_by_model:\n embedding_by_model[combined_id] = emb_obj\n logger.info(\n f\"Created combined identifier '{combined_id}' for embedding object {idx}\"\n )\n\n # Log conflicts\n if identifier_conflicts:\n logger.warning(\n f\"Found {len(identifier_conflicts)} conflicting identifiers. 
\"\n f\"Consider using combined format 'deployment:model' or specifying unique model names.\"\n )\n for conflict_id, emb_list in identifier_conflicts.items():\n logger.warning(\n f\" Conflict on '{conflict_id}': {len(emb_list)} embeddings use this identifier\"\n )\n\n logger.info(f\"Generating embeddings for {len(available_models)} models in index\")\n logger.info(f\"Available embedding identifiers: {list(embedding_by_model.keys())}\")\n self.log(f\"[SEARCH] Models detected in index: {available_models}\")\n self.log(f\"[SEARCH] Available embedding identifiers: {list(embedding_by_model.keys())}\")\n\n # Track matching status for debugging\n matched_models = []\n unmatched_models = []\n\n for model_name in available_models:\n try:\n # Check if we have an embedding object for this model\n if model_name in embedding_by_model:\n # Use the matching embedding object directly\n emb_obj = embedding_by_model[model_name]\n emb_deployment = getattr(emb_obj, \"deployment\", None)\n emb_model = getattr(emb_obj, \"model\", None)\n emb_model_id = getattr(emb_obj, \"model_id\", None)\n emb_dimensions = getattr(emb_obj, \"dimensions\", None)\n emb_available_models = getattr(emb_obj, \"available_models\", None)\n\n logger.info(\n f\"Using embedding object for model '{model_name}': \"\n f\"deployment={emb_deployment}, model={emb_model}, model_id={emb_model_id}, \"\n f\"dimensions={emb_dimensions}\"\n )\n\n # Check if this is a dedicated instance from available_models dict\n if emb_available_models and isinstance(emb_available_models, dict):\n logger.info(\n f\"Model '{model_name}' using dedicated instance from available_models dict \"\n f\"(pre-configured with correct model and dimensions)\"\n )\n\n # Use the embedding instance directly - no model switching needed!\n vec = emb_obj.embed_query(q)\n query_embeddings[model_name] = vec\n matched_models.append(model_name)\n logger.info(\n f\"Generated embedding for model: {model_name} (actual dimensions: {len(vec)})\"\n )\n 
self.log(f\"[MATCH] Model '{model_name}' - generated {len(vec)}-dim embedding\")\n else:\n # No matching embedding found for this model\n unmatched_models.append(model_name)\n logger.warning(\n f\"No matching embedding found for model '{model_name}'. \"\n f\"This model will be skipped. Available identifiers: {list(embedding_by_model.keys())}\"\n )\n self.log(\n f\"[NO MATCH] Model '{model_name}' - available: {list(embedding_by_model.keys())}\"\n )\n except (\n RuntimeError,\n ValueError,\n ConnectionError,\n TimeoutError,\n AttributeError,\n KeyError,\n ) as e:\n logger.warning(f\"Failed to generate embedding for {model_name}: {e}\")\n self.log(f\"[ERROR] Embedding generation failed for '{model_name}': {e}\")\n\n # Log summary of model matching\n logger.info(\n f\"Model matching summary: {len(matched_models)} matched, {len(unmatched_models)} unmatched\"\n )\n self.log(\n f\"[SUMMARY] Model matching: {len(matched_models)} matched, {len(unmatched_models)} unmatched\"\n )\n if unmatched_models:\n self.log(f\"[WARN] Unmatched models in index: {unmatched_models}\")\n\n if not query_embeddings:\n msg = (\n f\"Failed to generate embeddings for any model. \"\n f\"Index has models: {available_models}, but no matching embedding objects found. 
\"\n f\"Available embedding identifiers: {list(embedding_by_model.keys())}\"\n )\n self.log(f\"[FAIL] Search failed: {msg}\")\n raise ValueError(msg)\n\n index_properties = self._get_index_properties(client)\n legacy_vector_field = getattr(self, \"vector_field\", \"chunk_embedding\")\n\n # Build KNN queries for each model\n embedding_fields: list[str] = []\n knn_queries_with_candidates = []\n knn_queries_without_candidates = []\n\n raw_num_candidates = getattr(self, \"num_candidates\", 1000)\n try:\n num_candidates = int(raw_num_candidates) if raw_num_candidates is not None else 0\n except (TypeError, ValueError):\n num_candidates = 0\n use_num_candidates = num_candidates > 0\n\n for model_name, embedding_vector in query_embeddings.items():\n field_name = get_embedding_field_name(model_name)\n selected_field = field_name\n vector_dim = len(embedding_vector)\n\n # Only use the expected dynamic field - no legacy fallback\n # This prevents dimension mismatches between models\n if not self._is_knn_vector_field(index_properties, selected_field):\n logger.warning(\n f\"Skipping model {model_name}: field '{field_name}' is not mapped as knn_vector. \"\n f\"Documents must be indexed with this embedding model before querying.\"\n )\n self.log(\n f\"[SKIP] Field '{selected_field}' not a knn_vector - skipping model '{model_name}'\"\n )\n continue\n\n # Validate vector dimensions match the field dimensions\n field_dim = self._get_field_dimension(index_properties, selected_field)\n if field_dim is not None and field_dim != vector_dim:\n logger.error(\n f\"Dimension mismatch for model '{model_name}': \"\n f\"Query vector has {vector_dim} dimensions but field '{selected_field}' expects {field_dim}. 
\"\n f\"Skipping this model to prevent search errors.\"\n )\n self.log(\n f\"[DIM MISMATCH] Model '{model_name}': query={vector_dim} vs field={field_dim} - skipping\"\n )\n continue\n\n logger.info(\n f\"Adding KNN query for model '{model_name}': field='{selected_field}', \"\n f\"query_dims={vector_dim}, field_dims={field_dim or 'unknown'}\"\n )\n embedding_fields.append(selected_field)\n\n base_query = {\n \"knn\": {\n selected_field: {\n \"vector\": embedding_vector,\n \"k\": 50,\n }\n }\n }\n\n if use_num_candidates:\n query_with_candidates = copy.deepcopy(base_query)\n query_with_candidates[\"knn\"][selected_field][\"num_candidates\"] = num_candidates\n else:\n query_with_candidates = base_query\n\n knn_queries_with_candidates.append(query_with_candidates)\n knn_queries_without_candidates.append(base_query)\n\n if not knn_queries_with_candidates:\n # No valid fields found - this can happen when:\n # 1. Index is empty (no documents yet)\n # 2. Embedding model has changed and field doesn't exist yet\n # Return empty results instead of failing\n logger.warning(\n \"No valid knn_vector fields found for embedding models. \"\n \"This may indicate an empty index or missing field mappings. \"\n \"Returning empty search results.\"\n )\n self.log(\n f\"[WARN] No valid KNN queries could be built. 
\"\n f\"Query embeddings generated: {list(query_embeddings.keys())}, \"\n f\"but no matching knn_vector fields found in index.\"\n )\n return []\n\n # Build exists filter - document must have at least one embedding field\n exists_any_embedding = {\n \"bool\": {\n \"should\": [{\"exists\": {\"field\": f}} for f in set(embedding_fields)],\n \"minimum_should_match\": 1,\n }\n }\n\n # Combine user filters with exists filter\n all_filters = [*filter_clauses, exists_any_embedding]\n\n # Get limit and score threshold\n limit = self._resolve_limit(filter_obj, default_limit=self.number_of_results)\n score_threshold = self._resolve_score_threshold(filter_obj)\n\n # Determine the best aggregation field for filename based on index mapping\n filename_agg_field = self._get_filename_agg_field(index_properties)\n\n # Build multi-model hybrid query\n body = {\n \"query\": {\n \"bool\": {\n \"should\": [\n {\n \"dis_max\": {\n \"tie_breaker\": 0.0, # Take only the best match, no blending\n \"boost\": 0.7, # 70% weight for semantic search\n \"queries\": knn_queries_with_candidates,\n }\n },\n {\n \"multi_match\": {\n \"query\": q,\n \"fields\": [\"text^2\", \"filename^1.5\"],\n \"type\": \"best_fields\",\n \"fuzziness\": \"AUTO\",\n \"boost\": 0.3, # 30% weight for keyword search\n }\n },\n ],\n \"minimum_should_match\": 1,\n \"filter\": all_filters,\n }\n },\n \"aggs\": {\n \"data_sources\": {\"terms\": {\"field\": filename_agg_field, \"size\": 20}},\n \"document_types\": {\"terms\": {\"field\": \"mimetype\", \"size\": 10}},\n \"owners\": {\"terms\": {\"field\": \"owner\", \"size\": 10}},\n \"embedding_models\": {\"terms\": {\"field\": \"embedding_model\", \"size\": 10}},\n },\n \"_source\": [\n \"filename\",\n \"mimetype\",\n \"page\",\n \"text\",\n \"source_url\",\n \"owner\",\n \"embedding_model\",\n \"allowed_users\",\n \"allowed_groups\",\n \"allowed_principals\",\n ],\n \"size\": limit,\n }\n\n if score_threshold is not None:\n body[\"min_score\"] = score_threshold\n\n 
logger.info(\n f\"Executing multi-model hybrid search with {len(knn_queries_with_candidates)} embedding models: \"\n f\"{list(query_embeddings.keys())}\"\n )\n self.log(\n f\"[EXEC] Executing search with {len(knn_queries_with_candidates)} KNN queries, limit={limit}\"\n )\n self.log(f\"[EXEC] Embedding models used: {list(query_embeddings.keys())}\")\n self.log(f\"[EXEC] KNN fields being queried: {embedding_fields}\")\n\n try:\n resp = client.search(index=self.index_name, body=body, params={\"terminate_after\": 0})\n except RequestError as e:\n error_message = str(e)\n lowered = error_message.lower()\n if use_num_candidates and \"num_candidates\" in lowered:\n logger.warning(\n \"Retrying search without num_candidates parameter due to cluster capabilities\",\n error=error_message,\n )\n fallback_body = copy.deepcopy(body)\n try:\n fallback_body[\"query\"][\"bool\"][\"should\"][0][\"dis_max\"][\"queries\"] = (\n knn_queries_without_candidates\n )\n except (KeyError, IndexError, TypeError) as inner_err:\n raise e from inner_err\n resp = client.search(\n index=self.index_name,\n body=fallback_body,\n params={\"terminate_after\": 0},\n )\n elif \"knn_vector\" in lowered or (\"field\" in lowered and \"knn\" in lowered):\n fallback_vector = next(iter(query_embeddings.values()), None)\n if fallback_vector is None:\n raise\n fallback_field = legacy_vector_field or \"chunk_embedding\"\n logger.warning(\n \"KNN search failed for dynamic fields; falling back to legacy field '%s'.\",\n fallback_field,\n )\n fallback_body = copy.deepcopy(body)\n fallback_body[\"query\"][\"bool\"][\"filter\"] = filter_clauses\n knn_fallback = {\n \"knn\": {\n fallback_field: {\n \"vector\": fallback_vector,\n \"k\": 50,\n }\n }\n }\n if use_num_candidates:\n knn_fallback[\"knn\"][fallback_field][\"num_candidates\"] = num_candidates\n fallback_body[\"query\"][\"bool\"][\"should\"][0][\"dis_max\"][\"queries\"] = [knn_fallback]\n resp = client.search(\n index=self.index_name,\n body=fallback_body,\n 
params={\"terminate_after\": 0},\n )\n else:\n raise\n hits = resp.get(\"hits\", {}).get(\"hits\", [])\n\n logger.info(f\"Found {len(hits)} results\")\n self.log(f\"[RESULT] Search complete: {len(hits)} results found\")\n\n if len(hits) == 0:\n self.log(\n f\"[EMPTY] Debug info: \"\n f\"models_in_index={available_models}, \"\n f\"matched_models={matched_models}, \"\n f\"knn_fields={embedding_fields}, \"\n f\"filters={len(filter_clauses)} clauses\"\n )\n\n return [\n {\n \"page_content\": hit[\"_source\"].get(\"text\", \"\"),\n \"metadata\": {k: v for k, v in hit[\"_source\"].items() if k != \"text\"},\n \"score\": hit.get(\"_score\"),\n }\n for hit in hits\n ]\n\n def search_documents(self) -> Table:\n\n \"\"\"Search documents and return results as a Table.\n\n This is the main interface method that performs the multi-model search using the\n configured search_query and returns results in Langflow's Table (DataFrame) format\n so downstream Parser components can consume them directly.\n\n Always builds the vector store (triggering ingestion if needed), then performs\n search only if a query is provided.\n\n Returns:\n Table containing search results with text and metadata\n\n Raises:\n Exception: If search operation fails\n \"\"\"\n\n try:\n # Always build/cache the vector store to ensure ingestion happens\n logger.info(f\"Search query: {self.search_query}\")\n if self._cached_vector_store is None:\n self.build_vector_store()\n\n # Only perform search if query is provided\n search_query = (self.search_query or \"\").strip()\n if not search_query:\n self.log(\"No search query provided - ingestion completed, returning empty results\")\n\n return Table(data=[])\n\n # Perform search with the provided query\n raw = self.search(search_query)\n raw_list = [Data(text=hit[\"page_content\"], **hit[\"metadata\"]) for hit in raw]\n return Table(data=raw_list)\n\n except Exception as e:\n self.log(f\"search_documents error: {e}\")\n raise\n\n # -------- dynamic UI handling (auth 
switch) --------\n async def update_build_config(\n self, build_config: dict, field_value: str, field_name: str | None = None\n ) -> dict:\n \"\"\"Dynamically update component configuration based on field changes.\n\n This method handles real-time UI updates, particularly for authentication\n mode changes that show/hide relevant input fields.\n\n Args:\n build_config: Current component configuration\n field_value: New value for the changed field\n field_name: Name of the field that changed\n\n Returns:\n Updated build configuration with appropriate field visibility\n \"\"\"\n try:\n if field_name == \"auth_mode\":\n mode = (field_value or \"basic\").strip().lower()\n is_basic = mode == \"basic\"\n is_jwt = mode == \"jwt\"\n is_openrag = mode == \"openrag\"\n\n build_config[\"username\"][\"show\"] = is_basic\n build_config[\"password\"][\"show\"] = is_basic\n\n build_config[\"jwt_token\"][\"show\"] = is_jwt\n build_config[\"jwt_header\"][\"show\"] = is_jwt\n build_config[\"bearer_prefix\"][\"show\"] = is_jwt\n\n build_config[\"username\"][\"required\"] = is_basic\n build_config[\"password\"][\"required\"] = is_basic\n\n # build_config[\"jwt_token\"][\"required\"] = is_jwt\n build_config[\"jwt_header\"][\"required\"] = is_jwt\n build_config[\"bearer_prefix\"][\"required\"] = False\n\n # In 'openrag' mode, expose the OPENRAG_* fields up front\n # since they are the only credentials required.\n for openrag_field in (\n \"openrag_ingest_url\",\n \"openrag_ingest_token\",\n \"openrag_ingest_run_id\",\n \"openrag_ingest_batch_size\",\n ):\n if openrag_field in build_config:\n build_config[openrag_field][\"advanced\"] = not is_openrag\n build_config[openrag_field][\"required\"] = (\n is_openrag and openrag_field != \"openrag_ingest_batch_size\"\n )\n\n if is_basic or is_openrag:\n build_config[\"jwt_token\"][\"value\"] = \"\"\n\n return build_config\n\n except (KeyError, ValueError) as e:\n self.log(f\"update_build_config error: {e}\")\n\n return build_config\n" + "value": 
"from __future__ import annotations\n\nimport copy\nimport json\nimport os\nimport uuid\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\nfrom typing import Any\n\nimport httpx\nfrom lfx.base.vectorstores.model import LCVectorStoreComponent, check_cached_vector_store\nfrom lfx.base.vectorstores.vector_store_connection_decorator import vector_store_connection\nfrom lfx.io import (\n BoolInput,\n DropdownInput,\n HandleInput,\n IntInput,\n MultilineInput,\n Output,\n SecretStrInput,\n StrInput,\n TableInput,\n)\nfrom lfx.log import logger\nfrom lfx.schema.data import Data\nfrom lfx.schema.dataframe import Table\nfrom opensearchpy import OpenSearch, helpers\nfrom opensearchpy.exceptions import OpenSearchException, RequestError\n\nREQUEST_TIMEOUT = 60\nMAX_RETRIES = 5\n\n\ndef _get_min_env_int(key: str, default: int, minimum: int) -> int:\n try:\n value = int(os.getenv(key, default))\n except (TypeError, ValueError):\n value = default\n return max(value, minimum)\n\n\nOPENSEARCH_NUMBER_OF_SHARDS = _get_min_env_int(\"OPENRAG_OPENSEARCH_NUMBER_OF_SHARDS\", 1, 1)\nOPENSEARCH_NUMBER_OF_REPLICAS = _get_min_env_int(\"OPENRAG_OPENSEARCH_NUMBER_OF_REPLICAS\", 0, 0)\n\n# watsonx.ai surfaces rate-limit state via these (mostly non-standard) response\n# headers. The IBM SDK acts on the x-requests-limit-* family directly; we log\n# them on a failed embedding call to aid plan/region tuning.\n_WATSONX_RATE_LIMIT_HEADERS = (\n \"x-requests-limit-rate\",\n \"x-requests-limit-remaining\",\n \"x-requests-limit-reset\",\n \"Retry-After\",\n)\n\n\ndef _log_watsonx_rate_limit_headers(error: Exception) -> None:\n \"\"\"Best-effort diagnostic: log watsonx rate-limit headers from a failed call.\n\n The watsonx SDK raises ``ApiRequestFailure``, which carries the originating\n httpx/requests ``Response`` as ``.response``. 
On a 429 exhaustion we surface\n the documented rate-limit headers so operators can tune throughput.\n \"\"\"\n try:\n response = getattr(error, \"response\", None)\n headers = getattr(response, \"headers\", None)\n if not headers:\n return\n status = getattr(response, \"status_code\", \"unknown\")\n observed = {\n h: headers.get(h) for h in _WATSONX_RATE_LIMIT_HEADERS if headers.get(h) is not None\n }\n if str(status) == \"429\" or observed:\n logger.warning(f\"watsonx rate-limit response (status={status}): {observed}\")\n except Exception as log_error: # never let diagnostics mask the real error\n logger.debug(f\"Could not extract watsonx rate-limit headers: {log_error}\")\n\n\ndef normalize_model_name(model_name: str) -> str:\n \"\"\"Normalize embedding model name for use as field suffix.\n\n Converts model names to valid OpenSearch field names by replacing\n special characters and ensuring alphanumeric format.\n\n Args:\n model_name: Original embedding model name (e.g., \"text-embedding-3-small\")\n\n Returns:\n Normalized field suffix (e.g., \"text_embedding_3_small\")\n \"\"\"\n normalized = model_name.lower()\n # Replace common separators with underscores\n normalized = normalized.replace(\"-\", \"_\").replace(\":\", \"_\").replace(\"/\", \"_\").replace(\".\", \"_\")\n # Remove any non-alphanumeric characters except underscores\n normalized = \"\".join(c if c.isalnum() or c == \"_\" else \"_\" for c in normalized)\n # Remove duplicate underscores\n while \"__\" in normalized:\n normalized = normalized.replace(\"__\", \"_\")\n return normalized.strip(\"_\")\n\n\ndef get_embedding_field_name(model_name: str) -> str:\n \"\"\"Get the dynamic embedding field name for a model.\n\n Args:\n model_name: Embedding model name\n\n Returns:\n Field name in format: chunk_embedding_{normalized_model_name}\n \"\"\"\n logger.info(f\"chunk_embedding_{normalize_model_name(model_name)}\")\n return 
f\"chunk_embedding_{normalize_model_name(model_name)}\"\n\n\n@vector_store_connection\nclass OpenSearchVectorStoreComponentMultimodalMultiEmbedding(LCVectorStoreComponent):\n \"\"\"OpenSearch Vector Store Component with Multi-Model Hybrid Search Capabilities.\n\n This component provides vector storage and retrieval using OpenSearch, combining semantic\n similarity search (KNN) with keyword-based search for optimal results. It supports:\n - Multiple embedding models per index with dynamic field names\n - Automatic detection and querying of all available embedding models\n - Parallel embedding generation for multi-model search\n - Document ingestion with model tracking\n - Advanced filtering and aggregations\n - Flexible authentication options\n\n Features:\n - Multi-model vector storage with dynamic fields (chunk_embedding_{model_name})\n - Hybrid search combining multiple KNN queries (dis_max) + keyword matching\n - Auto-detection of available models in the index\n - Parallel query embedding generation for all detected models\n - Vector storage with configurable engines (jvector, nmslib, faiss, lucene)\n - Flexible authentication (Basic auth, JWT tokens)\n\n Model Name Resolution:\n - Priority: deployment > model > model_name attributes\n - This ensures correct matching between embedding objects and index fields\n - When multiple embeddings are provided, specify embedding_model_name to select which one to use\n - During search, each detected model in the index is matched to its corresponding embedding object\n \"\"\"\n\n display_name: str = \"OpenSearch (Multi-Model Multi-Embedding)\"\n icon: str = \"OpenSearch\"\n description: str = (\n \"Store and search documents using OpenSearch with multi-model hybrid semantic and keyword search. \"\n \"To search use the tools search_documents and raw_search. 
\"\n \"Search documents takes a query for vector search, for example\\n\"\n ' {search_query: \"components in openrag\"}'\n )\n\n # Keys we consider baseline\n default_keys: list[str] = [\n \"opensearch_url\",\n \"index_name\",\n *[i.name for i in LCVectorStoreComponent.inputs], # search_query, add_documents, etc.\n \"embedding\",\n \"embedding_model_name\",\n \"vector_field\",\n \"number_of_results\",\n \"auth_mode\",\n \"username\",\n \"password\",\n \"jwt_token\",\n \"jwt_header\",\n \"bearer_prefix\",\n \"use_ssl\",\n \"verify_certs\",\n \"filter_expression\",\n \"engine\",\n \"space_type\",\n \"ef_construction\",\n \"m\",\n \"num_candidates\",\n \"docs_metadata\",\n \"request_timeout\",\n \"max_retries\",\n \"openrag_ingest_url\",\n \"openrag_ingest_token\",\n \"openrag_ingest_run_id\",\n \"openrag_ingest_batch_size\",\n ]\n _openrag_ingest_global_placeholders = {\n \"openrag_ingest_url\": \"OPENRAG_INGEST_URL\",\n \"openrag_ingest_token\": \"OPENRAG_INGEST_TOKEN\",\n \"openrag_ingest_run_id\": \"OPENRAG_INGEST_RUN_ID\",\n }\n\n inputs = [\n TableInput(\n name=\"docs_metadata\",\n display_name=\"Document Metadata\",\n info=(\n \"Additional metadata key-value pairs to be added to all ingested documents. 
\"\n \"Useful for tagging documents with source information, categories, or other custom attributes.\"\n ),\n table_schema=[\n {\n \"name\": \"key\",\n \"display_name\": \"Key\",\n \"type\": \"str\",\n \"description\": \"Key name\",\n },\n {\n \"name\": \"value\",\n \"display_name\": \"Value\",\n \"type\": \"str\",\n \"description\": \"Value of the metadata\",\n },\n ],\n value=[],\n input_types=[\"Data\", \"JSON\"],\n ),\n StrInput(\n name=\"openrag_ingest_token\",\n display_name=\"OpenRAG Ingest Token\",\n value=\"OPENRAG_INGEST_TOKEN\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n info=\"Short-lived token used only for OpenRAG ingest callbacks.\",\n ),\n StrInput(\n name=\"openrag_ingest_run_id\",\n display_name=\"OpenRAG Ingest Run ID\",\n value=\"OPENRAG_INGEST_RUN_ID\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n ),\n StrInput(\n name=\"opensearch_url\",\n display_name=\"OpenSearch URL\",\n value=\"http://localhost:9200\",\n info=(\n \"The connection URL for your OpenSearch cluster \"\n \"(e.g., http://localhost:9200 for local development or your cloud endpoint).\"\n ),\n ),\n StrInput(\n name=\"index_name\",\n display_name=\"Index Name\",\n value=\"langflow\",\n info=(\n \"The OpenSearch index name where documents will be stored and searched. \"\n \"Will be created automatically if it doesn't exist.\"\n ),\n ),\n DropdownInput(\n name=\"engine\",\n display_name=\"Vector Engine\",\n options=[\"nmslib\", \"faiss\", \"lucene\", \"jvector\"],\n value=\"jvector\",\n info=(\n \"Vector search engine for similarity calculations. 'nmslib' works with standard \"\n \"OpenSearch. 'jvector' requires OpenSearch 2.9+. 'lucene' requires index.knn: true. 
\"\n \"Amazon OpenSearch Serverless only supports 'nmslib' or 'faiss'.\"\n ),\n advanced=True,\n ),\n DropdownInput(\n name=\"space_type\",\n display_name=\"Distance Metric\",\n options=[\"l2\", \"l1\", \"cosinesimil\", \"linf\", \"innerproduct\"],\n value=\"l2\",\n info=(\n \"Distance metric for calculating vector similarity. 'l2' (Euclidean) is most common, \"\n \"'cosinesimil' for cosine similarity, 'innerproduct' for dot product.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"ef_construction\",\n display_name=\"EF Construction\",\n value=512,\n info=(\n \"Size of the dynamic candidate list during index construction. \"\n \"Higher values improve recall but increase indexing time and memory usage.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"m\",\n display_name=\"M Parameter\",\n value=16,\n info=(\n \"Number of bidirectional connections for each vector in the HNSW graph. \"\n \"Higher values improve search quality but increase memory usage and indexing time.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"num_candidates\",\n display_name=\"Candidate Pool Size\",\n value=1000,\n info=(\n \"Number of approximate neighbors to consider for each KNN query. \"\n \"Some OpenSearch deployments do not support this parameter; set to 0 to disable.\"\n ),\n advanced=True,\n ),\n *LCVectorStoreComponent.inputs, # includes search_query, add_documents, etc.\n HandleInput(\n name=\"embedding\", display_name=\"Embedding\", input_types=[\"Embeddings\"], is_list=True\n ),\n StrInput(\n name=\"embedding_model_name\",\n display_name=\"Embedding Model Name\",\n value=\"\",\n info=(\n \"Name of the embedding model to use for ingestion. This selects which embedding from the list \"\n \"will be used to embed documents. Matches on deployment, model, model_id, or model_name. \"\n \"For duplicate deployments, use combined format: 'deployment:model' \"\n \"(e.g., 'text-embedding-ada-002:text-embedding-3-large'). \"\n \"Leave empty to use the first embedding. 
Error message will show all available identifiers.\"\n ),\n advanced=False,\n ),\n StrInput(\n name=\"vector_field\",\n display_name=\"Legacy Vector Field Name\",\n value=\"chunk_embedding\",\n advanced=True,\n info=(\n \"Legacy field name for backward compatibility. New documents use dynamic fields \"\n \"(chunk_embedding_{model_name}) based on the embedding_model_name.\"\n ),\n ),\n IntInput(\n name=\"number_of_results\",\n display_name=\"Default Result Limit\",\n value=10,\n advanced=True,\n info=(\n \"Default maximum number of search results to return when no limit is \"\n \"specified in the filter expression.\"\n ),\n ),\n MultilineInput(\n name=\"filter_expression\",\n display_name=\"Search Filters (JSON)\",\n value=\"\",\n info=(\n \"Optional JSON configuration for search filtering, result limits, and score thresholds.\\n\\n\"\n \"Format 1 - Explicit filters:\\n\"\n '{\"filter\": [{\"term\": {\"filename\":\"doc.pdf\"}}, '\n '{\"terms\":{\"owner\":[\"user1\",\"user2\"]}}], \"limit\": 10, \"score_threshold\": 1.6}\\n\\n'\n \"Format 2 - Context-style mapping:\\n\"\n '{\"data_sources\":[\"file.pdf\"], \"document_types\":[\"application/pdf\"], \"owners\":[\"user123\"]}\\n\\n'\n \"Use __IMPOSSIBLE_VALUE__ as placeholder to ignore specific filters.\"\n ),\n ),\n # ----- Auth controls (dynamic) -----\n DropdownInput(\n name=\"auth_mode\",\n display_name=\"Authentication Mode\",\n value=\"openrag\",\n options=[\"basic\", \"jwt\", \"openrag\"],\n info=(\n \"Authentication method: 'basic' for username/password authentication, \"\n \"'jwt' for JSON Web Token (Bearer) authentication, or 'openrag' to \"\n \"delegate writes to the OpenRAG backend ingest callback (no direct \"\n \"OpenSearch credentials required — only OPENRAG_* fields).\"\n ),\n real_time_refresh=True,\n advanced=False,\n ),\n StrInput(\n name=\"username\",\n display_name=\"Username\",\n value=\"admin\",\n show=True,\n ),\n SecretStrInput(\n name=\"password\",\n display_name=\"OpenSearch Password\",\n 
value=\"admin\",\n show=True,\n ),\n SecretStrInput(\n name=\"jwt_token\",\n display_name=\"JWT Token\",\n value=\"JWT\",\n load_from_db=False,\n show=False,\n info=(\n \"Valid JSON Web Token for authentication. \"\n \"Will be sent in the Authorization header (with optional 'Bearer ' prefix).\"\n ),\n required=False,\n ),\n StrInput(\n name=\"jwt_header\",\n display_name=\"JWT Header Name\",\n value=\"Authorization\",\n show=False,\n advanced=True,\n ),\n BoolInput(\n name=\"bearer_prefix\",\n display_name=\"Prefix 'Bearer '\",\n value=False,\n show=False,\n advanced=True,\n ),\n # ----- TLS -----\n BoolInput(\n name=\"use_ssl\",\n display_name=\"Use SSL/TLS\",\n value=True,\n advanced=True,\n info=\"Enable SSL/TLS encryption for secure connections to OpenSearch.\",\n ),\n BoolInput(\n name=\"verify_certs\",\n display_name=\"Verify SSL Certificates\",\n value=False,\n advanced=True,\n info=(\n \"Verify SSL certificates when connecting. \"\n \"Disable for self-signed certificates in development environments.\"\n ),\n ),\n # ----- Timeout / Retry -----\n StrInput(\n name=\"request_timeout\",\n display_name=\"Request Timeout (seconds)\",\n value=\"60\",\n advanced=True,\n info=(\n \"Time in seconds to wait for a response from OpenSearch. 
\"\n \"Increase for large bulk ingestion or complex hybrid queries.\"\n ),\n ),\n StrInput(\n name=\"max_retries\",\n display_name=\"Max Retries\",\n value=\"3\",\n advanced=True,\n info=\"Number of retries for failed connections before raising an error.\",\n ),\n StrInput(\n name=\"openrag_ingest_url\",\n display_name=\"OpenRAG Ingest URL\",\n value=\"OPENRAG_INGEST_URL\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n info=\"Internal OpenRAG callback URL for backend-owned document indexing.\",\n ),\n StrInput(\n name=\"openrag_ingest_token\",\n display_name=\"OpenRAG Ingest Token\",\n value=\"OPENRAG_INGEST_TOKEN\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n info=\"Short-lived token used only for OpenRAG ingest callbacks.\",\n ),\n StrInput(\n name=\"openrag_ingest_run_id\",\n display_name=\"OpenRAG Ingest Run ID\",\n value=\"OPENRAG_INGEST_RUN_ID\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n ),\n IntInput(\n name=\"openrag_ingest_batch_size\",\n display_name=\"OpenRAG Ingest Batch Size\",\n value=100,\n advanced=True,\n ),\n ]\n outputs = [\n Output(\n display_name=\"Search Results\",\n name=\"search_results\",\n method=\"search_documents\",\n ),\n Output(display_name=\"Raw Search\", name=\"raw_search\", method=\"raw_search\"),\n ]\n\n def raw_search(self, query: str | dict | None = None) -> Data:\n \"\"\"Execute a raw OpenSearch query against the target index.\n\n Args:\n query (dict[str, Any]): The OpenSearch query DSL dictionary.\n\n Returns:\n Data: Search results as a Data object.\n\n Raises:\n ValueError: If 'query' is not a valid OpenSearch query (must be a non-empty dict).\n \"\"\"\n raw_query = query if query is not None else self.search_query\n\n if raw_query is None or (isinstance(raw_query, str) and not raw_query.strip()):\n self.log(\"No query provided for raw search - returning empty results\")\n return Data(data={})\n\n if isinstance(raw_query, 
dict):\n query_body = copy.deepcopy(raw_query)\n elif isinstance(raw_query, str):\n s = raw_query.strip()\n\n # First, optimistically try to parse as JSON DSL\n try:\n query_body = json.loads(s)\n except json.JSONDecodeError:\n # Fallback: treat as a basic text query over common fields\n query_body = {\n \"query\": {\n \"multi_match\": {\n \"query\": s,\n \"fields\": [\"text^2\", \"filename^1.5\"],\n \"type\": \"best_fields\",\n \"fuzziness\": \"AUTO\",\n }\n }\n }\n else:\n msg = f\"Unsupported raw_search query type: {type(raw_query)!r}\"\n raise TypeError(msg)\n\n filter_obj = self._parse_filter_expression()\n filter_clauses = self._coerce_filter_clauses(filter_obj)\n\n if filter_clauses:\n if \"query\" in query_body:\n original_query = query_body[\"query\"]\n query_body[\"query\"] = {\n \"bool\": {\n \"must\": [original_query],\n \"filter\": filter_clauses,\n }\n }\n else:\n query_body[\"query\"] = {\n \"bool\": {\n \"must\": [{\"match_all\": {}}],\n \"filter\": filter_clauses,\n }\n }\n\n if filter_obj:\n # Apply limit if not already set in the raw query\n if \"size\" not in query_body:\n limit = self._resolve_limit(filter_obj, default_limit=None)\n if limit is not None:\n query_body[\"size\"] = limit\n\n # Apply score_threshold / scoreThreshold as min_score if not already set\n if \"min_score\" not in query_body:\n score_threshold = self._resolve_score_threshold(filter_obj)\n if score_threshold is not None:\n query_body[\"min_score\"] = score_threshold\n\n client = self.build_client()\n logger.info(f\"query: {query_body}\")\n resp = client.search(\n index=self.index_name,\n body=query_body,\n params={\"terminate_after\": 0},\n )\n # Remove any _source keys whose value is a list of floats (embedding vectors)\n # Minimum length threshold to identify embedding vectors\n min_vector_length = 100\n\n def is_vector(val):\n # Accepts if it's a list of numbers (float or int) and has reasonable vector length\n return (\n isinstance(val, list)\n and len(val) > 
min_vector_length\n and all(isinstance(x, (float, int)) for x in val)\n )\n\n if \"hits\" in resp and \"hits\" in resp[\"hits\"]:\n for hit in resp[\"hits\"][\"hits\"]:\n source = hit.get(\"_source\")\n if isinstance(source, dict):\n keys_to_remove = [k for k, v in source.items() if is_vector(v)]\n for k in keys_to_remove:\n source.pop(k)\n logger.info(f\"Raw search response (all embedding vectors removed): {resp}\")\n return Data(**resp)\n\n def _get_embedding_model_name(self, embedding_obj=None) -> str:\n \"\"\"Get the embedding model name from component config or embedding object.\n\n Priority: deployment > model > model_id > model_name\n This ensures we use the actual model being deployed, not just the configured model.\n Supports multiple embedding providers (OpenAI, Watsonx, Cohere, etc.)\n\n Args:\n embedding_obj: Specific embedding object to get name from (optional)\n\n Returns:\n Embedding model name\n\n Raises:\n ValueError: If embedding model name cannot be determined\n \"\"\"\n # First try explicit embedding_model_name input\n if hasattr(self, \"embedding_model_name\") and self.embedding_model_name:\n return self.embedding_model_name.strip()\n\n # Try to get from provided embedding object\n if embedding_obj:\n # Priority: deployment > model > model_id > model_name\n if hasattr(embedding_obj, \"deployment\") and embedding_obj.deployment:\n return str(embedding_obj.deployment)\n if hasattr(embedding_obj, \"model\") and embedding_obj.model:\n return str(embedding_obj.model)\n if hasattr(embedding_obj, \"model_id\") and embedding_obj.model_id:\n return str(embedding_obj.model_id)\n if hasattr(embedding_obj, \"model_name\") and embedding_obj.model_name:\n return str(embedding_obj.model_name)\n\n # Try to get from embedding component (legacy single embedding)\n if hasattr(self, \"embedding\") and self.embedding:\n # Handle list of embeddings\n if isinstance(self.embedding, list) and len(self.embedding) > 0:\n first_emb = self.embedding[0]\n if 
hasattr(first_emb, \"deployment\") and first_emb.deployment:\n return str(first_emb.deployment)\n if hasattr(first_emb, \"model\") and first_emb.model:\n return str(first_emb.model)\n if hasattr(first_emb, \"model_id\") and first_emb.model_id:\n return str(first_emb.model_id)\n if hasattr(first_emb, \"model_name\") and first_emb.model_name:\n return str(first_emb.model_name)\n # Handle single embedding\n elif not isinstance(self.embedding, list):\n if hasattr(self.embedding, \"deployment\") and self.embedding.deployment:\n return str(self.embedding.deployment)\n if hasattr(self.embedding, \"model\") and self.embedding.model:\n return str(self.embedding.model)\n if hasattr(self.embedding, \"model_id\") and self.embedding.model_id:\n return str(self.embedding.model_id)\n if hasattr(self.embedding, \"model_name\") and self.embedding.model_name:\n return str(self.embedding.model_name)\n\n msg = (\n \"Could not determine embedding model name. \"\n \"Please set the 'embedding_model_name' field or ensure the embedding component \"\n \"has a 'deployment', 'model', 'model_id', or 'model_name' attribute.\"\n )\n raise ValueError(msg)\n\n # ---------- helper functions for index management ----------\n def _default_text_mapping(\n self,\n dim: int,\n engine: str = \"jvector\",\n space_type: str = \"l2\",\n ef_search: int = 512,\n ef_construction: int = 100,\n m: int = 16,\n vector_field: str = \"vector_field\",\n ) -> dict[str, Any]:\n \"\"\"Create the default OpenSearch index mapping for vector search.\n\n This method generates the index configuration with k-NN settings optimized\n for approximate nearest neighbor search using the specified vector engine.\n Includes the embedding_model keyword field for tracking which model was used.\n\n Args:\n dim: Dimensionality of the vector embeddings\n engine: Vector search engine (jvector, nmslib, faiss, lucene)\n space_type: Distance metric for similarity calculation\n ef_search: Size of dynamic list used during search\n 
ef_construction: Size of dynamic list used during index construction\n m: Number of bidirectional links for each vector\n vector_field: Name of the field storing vector embeddings\n\n Returns:\n Dictionary containing OpenSearch index mapping configuration\n \"\"\"\n return {\n \"settings\": {\n \"index\": {\"knn\": True, \"knn.algo_param.ef_search\": ef_search},\n \"number_of_shards\": OPENSEARCH_NUMBER_OF_SHARDS,\n \"number_of_replicas\": OPENSEARCH_NUMBER_OF_REPLICAS,\n },\n \"mappings\": {\n \"properties\": {\n vector_field: {\n \"type\": \"knn_vector\",\n \"dimension\": dim,\n \"method\": {\n \"name\": \"disk_ann\",\n \"space_type\": space_type,\n \"engine\": engine,\n \"parameters\": {\"ef_construction\": ef_construction, \"m\": m},\n },\n },\n \"embedding_model\": {\"type\": \"keyword\"}, # Track which model was used\n \"embedding_dimensions\": {\"type\": \"integer\"},\n }\n },\n }\n\n def _ensure_embedding_field_mapping(\n self,\n client: OpenSearch,\n index_name: str,\n field_name: str,\n dim: int,\n engine: str,\n space_type: str,\n ef_construction: int,\n m: int,\n ) -> None:\n \"\"\"Lazily add a dynamic embedding field to the index if it doesn't exist.\n\n This allows adding new embedding models without recreating the entire index.\n Also ensures the embedding_model tracking field exists.\n\n Note: Some OpenSearch versions/configurations have issues with dynamically adding\n knn_vector mappings (NullPointerException). 
This method checks if the field\n already exists before attempting to add it, and gracefully skips if the field\n is already properly configured.\n\n Args:\n client: OpenSearch client instance\n index_name: Target index name\n field_name: Dynamic field name for this embedding model\n dim: Vector dimensionality\n engine: Vector search engine\n space_type: Distance metric\n ef_construction: Construction parameter\n m: HNSW parameter\n \"\"\"\n # First, check if the field already exists and is properly mapped\n properties = self._get_index_properties(client)\n if self._is_knn_vector_field(properties, field_name):\n # Field already exists as knn_vector - verify dimensions match\n existing_dim = self._get_field_dimension(properties, field_name)\n if existing_dim is not None and existing_dim != dim:\n logger.warning(\n f\"Field '{field_name}' exists with dimension {existing_dim}, \"\n f\"but current embedding has dimension {dim}. Using existing mapping.\"\n )\n else:\n logger.info(\n f\"[OpenSearchMultimodel] Field '{field_name}' already exists\"\n f\"as knn_vector with matching dimensions - skipping mapping update\"\n )\n return\n\n # Field doesn't exist, try to add the mapping\n try:\n mapping = {\n \"properties\": {\n field_name: {\n \"type\": \"knn_vector\",\n \"dimension\": dim,\n \"method\": {\n \"name\": \"disk_ann\",\n \"space_type\": space_type,\n \"engine\": engine,\n \"parameters\": {\"ef_construction\": ef_construction, \"m\": m},\n },\n },\n # Also ensure the embedding_model tracking field exists as keyword\n \"embedding_model\": {\"type\": \"keyword\"},\n \"embedding_dimensions\": {\"type\": \"integer\"},\n }\n }\n client.indices.put_mapping(index=index_name, body=mapping)\n logger.info(f\"Added/updated embedding field mapping: {field_name}\")\n except RequestError as e:\n error_str = str(e).lower()\n if \"invalid engine\" in error_str and \"jvector\" in error_str:\n msg = (\n \"The 'jvector' engine is not available in your OpenSearch installation. 
\"\n \"Use 'nmslib' or 'faiss' for standard OpenSearch, or upgrade to OpenSearch 2.9+.\"\n )\n raise ValueError(msg) from e\n if \"index.knn\" in error_str:\n msg = (\n \"The index has index.knn: false. Delete the existing index and let the \"\n \"component recreate it, or create a new index with a different name.\"\n )\n raise ValueError(msg) from e\n raise\n except Exception as e:\n # Check if this is the known OpenSearch k-NN NullPointerException issue\n error_str = str(e).lower()\n if \"null\" in error_str or \"nullpointerexception\" in error_str:\n logger.warning(\n f\"[OpenSearchMultimodel] Could not add embedding field mapping for {field_name}\"\n f\"due to OpenSearch k-NN plugin issue: {e}. \"\n f\"This is a known issue with some OpenSearch versions. \"\n f\"[OpenSearchMultimodel] Skipping mapping update. \"\n f\"Please ensure the index has the correct mapping for KNN search to work.\"\n )\n # Skip and continue - ingestion will proceed, but KNN search may fail if mapping doesn't exist\n return\n logger.warning(\n f\"[OpenSearchMultimodel] Could not add embedding field mapping for {field_name}: {e}\"\n )\n raise\n\n # Verify the field was added correctly\n properties = self._get_index_properties(client)\n if not self._is_knn_vector_field(properties, field_name):\n msg = f\"Field '{field_name}' is not mapped as knn_vector. Current mapping: {properties.get(field_name)}\"\n logger.error(msg)\n raise ValueError(msg)\n\n def _validate_aoss_with_engines(self, *, is_aoss: bool, engine: str) -> None:\n \"\"\"Validate engine compatibility with Amazon OpenSearch Serverless (AOSS).\n\n Amazon OpenSearch Serverless has restrictions on which vector engines\n can be used. 
This method ensures the selected engine is compatible.\n\n Args:\n is_aoss: Whether the connection is to Amazon OpenSearch Serverless\n engine: The selected vector search engine\n\n Raises:\n ValueError: If AOSS is used with an incompatible engine\n \"\"\"\n if is_aoss and engine not in {\"nmslib\", \"faiss\"}:\n msg = \"Amazon OpenSearch Service Serverless only supports `nmslib` or `faiss` engines\"\n raise ValueError(msg)\n\n def _is_aoss_enabled(self, http_auth: Any) -> bool:\n \"\"\"Determine if Amazon OpenSearch Serverless (AOSS) is being used.\n\n Args:\n http_auth: The HTTP authentication object\n\n Returns:\n True if AOSS is enabled, False otherwise\n \"\"\"\n return (\n http_auth is not None and hasattr(http_auth, \"service\") and http_auth.service == \"aoss\"\n )\n\n @staticmethod\n def _openrag_input_to_str(value: Any) -> str:\n if value is None:\n return \"\"\n if hasattr(value, \"get_secret_value\"):\n value = value.get_secret_value()\n if hasattr(value, \"text\"):\n value = value.text\n return str(value or \"\").strip()\n\n def _openrag_callback_value(self, attr_name: str) -> str:\n value = self._openrag_input_to_str(getattr(self, attr_name, \"\"))\n if value == self._openrag_ingest_global_placeholders.get(attr_name):\n return \"\"\n return value\n\n def _openrag_ingest_callback_config(self) -> tuple[str, str, str] | None:\n url = self._openrag_callback_value(\"openrag_ingest_url\")\n token = self._openrag_callback_value(\"openrag_ingest_token\")\n ingest_run_id = self._openrag_callback_value(\"openrag_ingest_run_id\")\n\n masked_token = (\n f\"{token[:4]}...{token[-4:]}\" if len(token) >= 8 else (\"\" if token else \"\")\n )\n debug_payload = {\n \"openrag_ingest_url\": url,\n \"openrag_ingest_url_len\": len(url),\n \"openrag_ingest_token_masked\": masked_token,\n \"openrag_ingest_token_len\": len(token),\n \"openrag_ingest_run_id\": ingest_run_id,\n \"raw_url_type\": type(self.openrag_ingest_url).__name__,\n \"raw_token_type\": 
type(self.openrag_ingest_token).__name__,\n \"raw_run_id_type\": type(self.openrag_ingest_run_id).__name__,\n }\n logger.warning(f\"[OpenRAG callback config] {debug_payload}\")\n try:\n self.log(f\"[OpenRAG callback config] {debug_payload}\")\n except Exception:\n pass\n\n if not url and not token and not ingest_run_id:\n return None\n if not url or not token or not ingest_run_id:\n msg = \"OpenRAG ingest callback requires url, token, and ingest_run_id.\"\n raise ValueError(msg)\n return url, token, ingest_run_id\n\n def _post_openrag_ingest_batches(\n self,\n *,\n requests: list[dict],\n vector_field: str,\n text_field: str,\n ) -> None:\n callback_config = self._openrag_ingest_callback_config()\n if callback_config is None:\n return\n\n url, token, ingest_run_id = callback_config\n batch_size = max(self._parse_int_param(\"openrag_ingest_batch_size\", 100), 1)\n timeout = self._parse_int_param(\"request_timeout\", REQUEST_TIMEOUT)\n headers = {\"Authorization\": f\"Bearer {token}\"}\n\n masked_token = (\n f\"{token[:4]}...{token[-4:]}\" if len(token) >= 8 else (\"\" if token else \"\")\n )\n request_summary = {\n \"url\": url,\n \"ingest_run_id\": ingest_run_id,\n \"token_masked\": masked_token,\n \"total_chunks\": len(requests),\n \"batch_size\": batch_size,\n \"timeout_s\": timeout,\n }\n logger.warning(f\"[OpenRAG ingest POST] {request_summary}\")\n try:\n self.log(f\"[OpenRAG ingest POST] {request_summary}\")\n except Exception:\n pass\n\n with httpx.Client(timeout=timeout) as client:\n total_batches = (len(requests) + batch_size - 1) // batch_size\n for batch_number, start in enumerate(range(0, len(requests), batch_size), start=1):\n batch = requests[start : start + batch_size]\n final = batch_number == total_batches\n payload = {\n \"ingest_run_id\": ingest_run_id,\n \"batch_id\": batch_number,\n \"final\": final,\n \"chunks\": [\n self._openrag_chunk_payload(\n request,\n vector_field=vector_field,\n text_field=text_field,\n )\n for request in batch\n ],\n 
}\n logger.warning(\n f\"[OpenRAG ingest POST] -> batch={batch_number}/{total_batches} \"\n f\"url={url} chunks={len(payload['chunks'])} final={final}\"\n )\n response = client.post(url, json=payload, headers=headers)\n response_summary = {\n \"batch\": batch_number,\n \"url\": url,\n \"status\": response.status_code,\n \"final_url\": str(response.request.url),\n \"response_headers\": dict(response.headers),\n \"body_preview\": response.text[:500],\n }\n logger.warning(f\"[OpenRAG ingest POST resp] {response_summary}\")\n try:\n self.log(f\"[OpenRAG ingest POST resp] {response_summary}\")\n except Exception:\n pass\n if response.status_code >= 400:\n msg = (\n \"OpenRAG ingest callback failed \"\n f\"(batch={batch_number}, status={response.status_code}, \"\n f\"url={url}): {response.text[:1000]}\"\n )\n raise RuntimeError(msg)\n\n self.log(f\"Posted {len(requests)} chunks to OpenRAG backend ingest callback.\")\n\n @staticmethod\n def _openrag_chunk_payload(\n request: dict,\n *,\n vector_field: str,\n text_field: str,\n ) -> dict:\n metadata = {\n key: value\n for key, value in request.items()\n if key not in {\"_op_type\", \"_index\", \"_id\", \"id\", vector_field, text_field}\n }\n page = metadata.get(\"page\")\n if isinstance(page, str) and page.isdigit():\n page = int(page)\n return {\n \"id\": request.get(\"_id\") or request.get(\"id\"),\n \"text\": request.get(text_field, \"\"),\n \"vector\": request[vector_field],\n \"page\": page if isinstance(page, int) else None,\n \"metadata\": metadata,\n }\n\n def _bulk_ingest_embeddings(\n self,\n client: OpenSearch,\n index_name: str,\n embeddings: list[list[float]],\n texts: list[str],\n metadatas: list[dict] | None = None,\n ids: list[str] | None = None,\n vector_field: str = \"vector_field\",\n text_field: str = \"text\",\n embedding_model: str = \"unknown\",\n mapping: dict | None = None,\n max_chunk_bytes: int | None = 1 * 1024 * 1024,\n *,\n is_aoss: bool = False,\n ) -> list[str]:\n \"\"\"Efficiently ingest 
multiple documents with embeddings into OpenSearch.\n\n This method uses bulk operations to insert documents with their vector\n embeddings and metadata into the specified OpenSearch index. Each document\n is tagged with the embedding_model name for tracking.\n\n Args:\n client: OpenSearch client instance\n index_name: Target index for document storage\n embeddings: List of vector embeddings for each document\n texts: List of document texts\n metadatas: Optional metadata dictionaries for each document\n ids: Optional document IDs (UUIDs generated if not provided)\n vector_field: Field name for storing vector embeddings\n text_field: Field name for storing document text\n embedding_model: Name of the embedding model used\n mapping: Optional index mapping configuration\n max_chunk_bytes: Maximum size per bulk request chunk\n is_aoss: Whether using Amazon OpenSearch Serverless\n\n Returns:\n List of document IDs that were successfully ingested\n \"\"\"\n logger.debug(f\"[OpenSearchMultimodel] Bulk ingesting embeddings for {index_name}\")\n if not mapping:\n mapping = {}\n\n requests = []\n return_ids = []\n vector_dimensions = len(embeddings[0]) if embeddings else None\n\n for i, text in enumerate(texts):\n metadata = metadatas[i] if metadatas else {}\n if vector_dimensions is not None and \"embedding_dimensions\" not in metadata:\n metadata = {**metadata, \"embedding_dimensions\": vector_dimensions}\n\n # Normalize ACL fields that may arrive as JSON strings from flows\n for key in (\"allowed_users\", \"allowed_groups\", \"allowed_principals\"):\n value = metadata.get(key)\n if isinstance(value, str):\n try:\n parsed = json.loads(value)\n if isinstance(parsed, list):\n metadata[key] = parsed\n except (json.JSONDecodeError, TypeError):\n # Leave value as-is if it isn't valid JSON\n pass\n\n metadata_document_id = str(metadata.get(\"document_id\") or \"\").strip()\n if metadata_document_id and metadata_document_id.lower() != \"none\":\n generated_id = 
f\"{metadata_document_id}_{i}\"\n else:\n generated_id = str(uuid.uuid4())\n _id = ids[i] if ids else generated_id\n request = {\n \"_op_type\": \"index\",\n \"_index\": index_name,\n vector_field: embeddings[i],\n text_field: text,\n \"embedding_model\": embedding_model, # Track which model was used\n **metadata,\n }\n if is_aoss:\n request[\"id\"] = _id\n else:\n request[\"_id\"] = _id\n requests.append(request)\n return_ids.append(_id)\n if metadatas:\n self.log(f\"Sample metadata: {metadatas[0] if metadatas else {}}\")\n if self._openrag_ingest_callback_config() is not None:\n self._post_openrag_ingest_batches(\n requests=requests,\n vector_field=vector_field,\n text_field=text_field,\n )\n return return_ids\n try:\n helpers.bulk(client, requests, max_chunk_bytes=max_chunk_bytes)\n except Exception as bulk_error:\n if \"Unsupported request type for filter level DLS\" not in str(bulk_error):\n raise\n logger.warning(\n \"[OpenSearchMultimodel] Bulk ingest is blocked by filter-level DLS; \"\n \"falling back to per-document index requests.\"\n )\n self._index_embeddings_individually(client, requests)\n return return_ids\n\n def _index_embeddings_individually(\n self,\n client: OpenSearch,\n requests: list[dict],\n ) -> None:\n \"\"\"Index documents one at a time when OpenSearch DLS rejects bulk writes.\"\"\"\n for request in requests:\n document_id = request.get(\"_id\") or request.get(\"id\")\n body = {\n key: value\n for key, value in request.items()\n if key not in {\"_op_type\", \"_index\", \"_id\", \"id\"}\n }\n client.index(index=request[\"_index\"], id=document_id, body=body)\n\n def _log_index_admin_skip(self, operation: str, error: Exception) -> None:\n \"\"\"Log index-admin operations that may be blocked under filter-level DLS.\"\"\"\n logger.warning(\n f\"[OpenSearchMultimodel] Could not run index-admin operation '{operation}': {error}. 
\"\n \"Assuming the backend pre-created the required index/mapping and continuing.\"\n )\n\n # ---------- param helpers ----------\n def _parse_int_param(self, attr_name: str, default: int) -> int:\n \"\"\"Parse a string attribute to int, returning *default* on failure.\"\"\"\n raw = getattr(self, attr_name, None)\n if raw is None or str(raw).strip() == \"\":\n return default\n try:\n value = int(str(raw).strip())\n except ValueError:\n logger.warning(\n f\"Invalid integer value '{raw}' for {attr_name}, using default {default}\"\n )\n return default\n\n if value < 0:\n logger.warning(f\"Negative value '{raw}' for {attr_name}, using default {default}\")\n return default\n\n return value\n\n # ---------- auth / client ----------\n def _build_auth_kwargs(self) -> dict[str, Any]:\n \"\"\"Build authentication configuration for OpenSearch client.\n\n Constructs the appropriate authentication parameters based on the\n selected auth mode (basic username/password or JWT token).\n\n Returns:\n Dictionary containing authentication configuration\n\n Raises:\n ValueError: If required authentication parameters are missing\n \"\"\"\n mode = (self.auth_mode or \"basic\").strip().lower()\n if mode == \"jwt\":\n token = (self.jwt_token or \"\").strip()\n if not token:\n msg = \"Auth Mode is 'jwt' but no jwt_token was provided.\"\n raise ValueError(msg)\n header_name = (self.jwt_header or \"Authorization\").strip()\n header_value = f\"Bearer {token}\" if self.bearer_prefix else token\n return {\"headers\": {header_name: header_value}}\n if mode == \"openrag\":\n # Writes are delegated to the OpenRAG backend ingest callback,\n # so no direct OpenSearch credentials are needed. 
Only the\n # OPENRAG_* fields are required for ingestion to function.\n missing = [\n name\n for name, value in (\n (\"openrag_ingest_url\", self.openrag_ingest_url),\n (\"openrag_ingest_token\", self.openrag_ingest_token),\n (\"openrag_ingest_run_id\", self.openrag_ingest_run_id),\n )\n if not (value or \"\").strip()\n ]\n if missing:\n msg = (\n \"Auth Mode is 'openrag' but required OPENRAG_* fields are \"\n f\"missing: {', '.join(missing)}.\"\n )\n raise ValueError(msg)\n return {}\n user = (self.username or \"\").strip()\n pwd = (self.password or \"\").strip()\n if not user or not pwd:\n msg = \"Auth Mode is 'basic' but username/password are missing.\"\n raise ValueError(msg)\n return {\"http_auth\": (user, pwd)}\n\n def build_client(self) -> OpenSearch:\n \"\"\"Create and configure an OpenSearch client instance.\n\n Returns:\n Configured OpenSearch client ready for operations\n \"\"\"\n logger.debug(\"[OpenSearchMultimodel] Building OpenSearch client\")\n auth_kwargs = self._build_auth_kwargs()\n return OpenSearch(\n hosts=[self.opensearch_url],\n use_ssl=self.use_ssl,\n verify_certs=self.verify_certs,\n ssl_assert_hostname=False,\n ssl_show_warn=False,\n timeout=self._parse_int_param(\"request_timeout\", REQUEST_TIMEOUT),\n max_retries=self._parse_int_param(\"max_retries\", MAX_RETRIES),\n retry_on_timeout=True,\n **auth_kwargs,\n )\n\n @check_cached_vector_store\n def build_vector_store(self) -> OpenSearch:\n # Return raw OpenSearch client as our \"vector store.\"\n client = self.build_client()\n\n # Check if we're in ingestion-only mode (no search query)\n has_search_query = bool((self.search_query or \"\").strip())\n if not has_search_query:\n logger.debug(\n \"[OpenSearchMultimodel] Ingestion-only mode activated: search operations will be skipped\"\n )\n logger.debug(\"[OpenSearchMultimodel] Starting ingestion mode...\")\n\n logger.debug(f\"[OpenSearchMultimodel] Embedding: {self.embedding}\")\n self._add_documents_to_vector_store(client=client)\n return 
client\n\n # ---------- ingest ----------\n def _add_documents_to_vector_store(self, client: OpenSearch) -> None:\n \"\"\"Process and ingest documents into the OpenSearch vector store.\n\n This method handles the complete document ingestion pipeline:\n - Prepares document data and metadata\n - Generates vector embeddings using the selected model\n - Creates appropriate index mappings with dynamic field names\n - Bulk inserts documents with vectors and model tracking\n\n Args:\n client: OpenSearch client for performing operations\n \"\"\"\n logger.debug(\"[OpenSearchMultimodel][INGESTION] _add_documents_to_vector_store called\")\n # Convert DataFrame to Data if needed using parent's method\n self.ingest_data = self._prepare_ingest_data()\n\n logger.debug(\n f\"[OpenSearchMultimodel][INGESTION] ingest_data type: \"\n f\"{type(self.ingest_data)}, length: {len(self.ingest_data) if self.ingest_data else 0}\"\n )\n logger.debug(\n f\"[OpenSearchMultimodel][INGESTION] ingest_data content: \"\n f\"{self.ingest_data[:2] if self.ingest_data and len(self.ingest_data) > 0 else 'empty'}\"\n )\n\n docs = self.ingest_data or []\n if not docs:\n logger.debug(\"Ingestion complete: No documents provided\")\n return\n\n if not self.embedding:\n msg = \"Embedding handle is required to embed documents.\"\n raise ValueError(msg)\n\n # Normalize embedding to list first\n embeddings_list = self.embedding if isinstance(self.embedding, list) else [self.embedding]\n\n # Filter out None values (fail-safe mode) - do this BEFORE checking if empty\n embeddings_list = [e for e in embeddings_list if e is not None]\n\n # NOW check if we have any valid embeddings left after filtering\n if not embeddings_list:\n logger.warning(\n \"All embeddings returned None (fail-safe mode enabled). Skipping document ingestion.\"\n )\n self.log(\n \"Embedding returned None (fail-safe mode enabled). 
Skipping document ingestion.\"\n )\n return\n\n logger.debug(\n f\"[OpenSearchMultimodel][INGESTION] Valid embeddings after filtering: {len(embeddings_list)}\"\n )\n self.log(\n f\"[OpenSearchMultimodel][INGESTION] Available embedding models: {len(embeddings_list)}\"\n )\n\n # Select the embedding to use for ingestion\n selected_embedding = None\n embedding_model = None\n\n # If embedding_model_name is specified, find matching embedding\n if (\n hasattr(self, \"embedding_model_name\")\n and self.embedding_model_name\n and self.embedding_model_name.strip()\n ):\n target_model_name = self.embedding_model_name.strip()\n self.log(f\"Looking for embedding model: {target_model_name}\")\n\n for emb_obj in embeddings_list:\n # Check all possible model identifiers (deployment, model, model_id, model_name)\n # Also check available_models list from EmbeddingsWithModels\n possible_names = []\n deployment = getattr(emb_obj, \"deployment\", None)\n model = getattr(emb_obj, \"model\", None)\n model_id = getattr(emb_obj, \"model_id\", None)\n model_name = getattr(emb_obj, \"model_name\", None)\n available_models_attr = getattr(emb_obj, \"available_models\", None)\n\n if deployment:\n possible_names.append(str(deployment))\n if model:\n possible_names.append(str(model))\n if model_id:\n possible_names.append(str(model_id))\n if model_name:\n possible_names.append(str(model_name))\n\n # Also add combined identifier\n if deployment and model and deployment != model:\n possible_names.append(f\"{deployment}:{model}\")\n\n # Add all models from available_models dict\n if available_models_attr and isinstance(available_models_attr, dict):\n possible_names.extend(\n str(model_key).strip()\n for model_key in available_models_attr\n if model_key and str(model_key).strip()\n )\n\n # Match if target matches any of the possible names\n if target_model_name in possible_names:\n # Check if target is in available_models dict - use dedicated instance\n if (\n available_models_attr\n and 
isinstance(available_models_attr, dict)\n and target_model_name in available_models_attr\n ):\n # Use the dedicated embedding instance from the dict\n selected_embedding = available_models_attr[target_model_name]\n embedding_model = target_model_name\n self.log(\n f\"Found dedicated embedding instance for '{embedding_model}' in available_models dict\"\n )\n else:\n # Traditional identifier match\n selected_embedding = emb_obj\n embedding_model = self._get_embedding_model_name(emb_obj)\n self.log(\n f\"Found matching embedding model: {embedding_model} (matched on: {target_model_name})\"\n )\n break\n\n if not selected_embedding:\n # Build detailed list of available embeddings with all their identifiers\n available_info = []\n for idx, emb in enumerate(embeddings_list):\n emb_type = type(emb).__name__\n identifiers = []\n deployment = getattr(emb, \"deployment\", None)\n model = getattr(emb, \"model\", None)\n model_id = getattr(emb, \"model_id\", None)\n model_name = getattr(emb, \"model_name\", None)\n available_models_attr = getattr(emb, \"available_models\", None)\n\n if deployment:\n identifiers.append(f\"deployment='{deployment}'\")\n if model:\n identifiers.append(f\"model='{model}'\")\n if model_id:\n identifiers.append(f\"model_id='{model_id}'\")\n if model_name:\n identifiers.append(f\"model_name='{model_name}'\")\n\n # Add combined identifier as an option\n if deployment and model and deployment != model:\n identifiers.append(f\"combined='{deployment}:{model}'\")\n\n # Add available_models dict if present\n if available_models_attr and isinstance(available_models_attr, dict):\n identifiers.append(f\"available_models={list(available_models_attr.keys())}\")\n\n available_info.append(\n f\" [{idx}] {emb_type}: {', '.join(identifiers) if identifiers else 'No identifiers'}\"\n )\n\n msg = (\n f\"Embedding model '{target_model_name}' not found in available embeddings.\\n\\n\"\n f\"Available embeddings:\\n\" + \"\\n\".join(available_info) + \"\\n\\n\"\n \"Please 
set 'embedding_model_name' to one of the identifier values shown above \"\n \"(use the value after the '=' sign, without quotes).\\n\"\n \"For duplicate deployments, use the 'combined' format.\\n\"\n \"Or leave it empty to use the first embedding.\"\n )\n raise ValueError(msg)\n else:\n # Use first embedding if no model name specified\n selected_embedding = embeddings_list[0]\n embedding_model = self._get_embedding_model_name(selected_embedding)\n self.log(f\"No embedding_model_name specified, using first embedding: {embedding_model}\")\n\n dynamic_field_name = get_embedding_field_name(embedding_model)\n\n logger.info(f\"Selected embedding model for ingestion: '{embedding_model}'\")\n self.log(f\"Using embedding model for ingestion: {embedding_model}\")\n self.log(f\"Dynamic vector field: {dynamic_field_name}\")\n\n # Log embedding details for debugging\n if hasattr(selected_embedding, \"deployment\"):\n logger.info(f\"Embedding deployment: {selected_embedding.deployment}\")\n if hasattr(selected_embedding, \"model\"):\n logger.info(f\"Embedding model: {selected_embedding.model}\")\n if hasattr(selected_embedding, \"model_id\"):\n logger.info(f\"Embedding model_id: {selected_embedding.model_id}\")\n if hasattr(selected_embedding, \"dimensions\"):\n logger.info(f\"Embedding dimensions: {selected_embedding.dimensions}\")\n if hasattr(selected_embedding, \"available_models\"):\n logger.info(f\"Embedding available_models: {selected_embedding.available_models}\")\n\n # No model switching needed - each model in available_models has its own dedicated instance\n # The selected_embedding is already configured correctly for the target model\n logger.info(\n f\"Using embedding instance for '{embedding_model}' - pre-configured and ready to use\"\n )\n\n # Extract texts and metadata from documents\n texts = []\n metadatas = []\n # Process docs_metadata table input into a dict\n additional_metadata = {}\n logger.debug(f\"[LF] Docs metadata {self.docs_metadata}\")\n if 
hasattr(self, \"docs_metadata\") and self.docs_metadata:\n logger.info(f\"[LF] Docs metadata {self.docs_metadata}\")\n if isinstance(self.docs_metadata[-1], Data):\n logger.info(f\"[LF] Docs metadata is a Data object {self.docs_metadata}\")\n self.docs_metadata = self.docs_metadata[-1].data\n logger.info(f\"[LF] Docs metadata is a Data object {self.docs_metadata}\")\n additional_metadata.update(self.docs_metadata)\n else:\n for item in self.docs_metadata:\n if isinstance(item, dict) and \"key\" in item and \"value\" in item:\n additional_metadata[item[\"key\"]] = item[\"value\"]\n # Replace string \"None\" values with actual None\n for key, value in additional_metadata.items():\n if value == \"None\":\n additional_metadata[key] = None\n logger.info(f\"[LF] Additional metadata {additional_metadata}\")\n for doc_obj in docs:\n data_copy = json.loads(doc_obj.model_dump_json())\n text = data_copy.pop(doc_obj.text_key, doc_obj.default_value)\n texts.append(text)\n\n # Merge additional metadata from table input\n data_copy.update(additional_metadata)\n\n metadatas.append(data_copy)\n self.log(metadatas)\n\n # Determine whether the selected embedding is watsonx/IBM. The watsonx\n # SDK ships its own rate-limit machinery (input batching, proactive\n # x-requests-limit-* TokenBucket throttling, and jittered exponential\n # backoff on 429), so we lean on it instead of retrying on top of it.\n # The type-name check also covers watsonx-hosted, non-\"ibm/\" models\n # (e.g. 
intfloat/multilingual-e5-large).\n is_ibm = (embedding_model and \"ibm\" in str(embedding_model).lower()) or (\n selected_embedding and \"watsonx\" in type(selected_embedding).__name__.lower()\n )\n logger.debug(f\"Is IBM/watsonx embedding: {is_ibm}\")\n\n if is_ibm:\n # Hand the full batch to the SDK and let it batch/throttle/retry.\n # Retry attempts and base backoff are tunable via the SDK's own\n # WATSONX_MAX_RETRIES / WATSONX_DELAY_TIME environment variables.\n logger.info(\n f\"Embedding {len(texts)} chunks via watsonx SDK batch (SDK-managed throttle + 429 retry)\"\n )\n try:\n vectors: list[list[float]] = selected_embedding.embed_documents(texts)\n logger.info(f\"Successfully embedded {len(vectors)} chunks via watsonx SDK\")\n except Exception as embed_error:\n _log_watsonx_rate_limit_headers(embed_error)\n logger.error(\n f\"Failed to embed {len(texts)} chunks via watsonx SDK. Error: {embed_error}\",\n )\n raise\n\n else:\n # Non-watsonx providers (OpenAI, Ollama) lack the watsonx SDK's\n # built-in rate-limit handling, so embed per chunk in parallel with\n # a generic rate-limit-aware tenacity retry.\n vectors: list[list[float]] = [None] * len(texts)\n from tenacity import (\n retry,\n retry_if_exception,\n stop_after_attempt,\n wait_exponential,\n )\n\n def is_rate_limit_error(exception: Exception) -> bool:\n \"\"\"Check if exception is a rate limit error (429).\"\"\"\n error_str = str(exception).lower()\n return \"429\" in error_str or \"rate_limit\" in error_str or \"rate limit\" in error_str\n\n def is_other_retryable_error(exception: Exception) -> bool:\n \"\"\"Check if exception is a transient network error worth retrying.\"\"\"\n if is_rate_limit_error(exception):\n return False\n return isinstance(exception, (ConnectionError, TimeoutError, OSError))\n\n # Retry decorator for rate limit errors (longer backoff)\n retry_on_rate_limit = retry(\n retry=retry_if_exception(is_rate_limit_error),\n stop=stop_after_attempt(5),\n 
wait=wait_exponential(multiplier=2, min=2, max=30),\n reraise=True,\n before_sleep=lambda retry_state: logger.warning(\n f\"Rate limit hit for chunk (attempt {retry_state.attempt_number}/5), \"\n f\"backing off for {retry_state.next_action.sleep:.1f}s\"\n ),\n )\n\n # Retry decorator for other errors (shorter backoff)\n retry_on_other_errors = retry(\n retry=retry_if_exception(is_other_retryable_error),\n stop=stop_after_attempt(3),\n wait=wait_exponential(multiplier=1, min=1, max=8),\n reraise=True,\n before_sleep=lambda retry_state: logger.warning(\n f\"Error embedding chunk (attempt {retry_state.attempt_number}/3), \"\n f\"retrying in {retry_state.next_action.sleep:.1f}s: {retry_state.outcome.exception()}\"\n ),\n )\n\n def embed_chunk_with_retry(chunk_text: str, chunk_idx: int) -> list[float]:\n \"\"\"Embed a single chunk with rate-limit-aware retry logic.\"\"\"\n\n @retry_on_rate_limit\n @retry_on_other_errors\n def _embed(text: str) -> list[float]:\n return selected_embedding.embed_documents([text])[0]\n\n try:\n return _embed(chunk_text)\n except Exception as e:\n logger.error(\n f\"Failed to embed chunk {chunk_idx} after all retries: {e}\",\n error=str(e),\n )\n raise\n\n max_workers = min(max(len(texts), 1), 8)\n logger.debug(f\"Using parallel processing with {max_workers} workers\")\n\n with ThreadPoolExecutor(max_workers=max_workers) as executor:\n futures = {\n executor.submit(embed_chunk_with_retry, chunk, idx): idx\n for idx, chunk in enumerate(texts)\n }\n for future in as_completed(futures):\n idx = futures[future]\n vectors[idx] = future.result()\n\n if not vectors:\n self.log(f\"No vectors generated from documents for model {embedding_model}.\")\n return\n\n # Get vector dimension for mapping\n dim = len(vectors[0]) if vectors else 768 # default fallback\n use_openrag_ingest_callback = self._openrag_ingest_callback_config() is not None\n\n is_aoss = False\n mapping: dict | None = None\n\n engine = getattr(self, \"engine\", \"jvector\")\n\n if 
use_openrag_ingest_callback:\n self.log(\"Using OpenRAG backend ingest callback; skipping direct OpenSearch writes.\")\n else:\n # Check for AOSS\n auth_kwargs = self._build_auth_kwargs()\n is_aoss = self._is_aoss_enabled(auth_kwargs.get(\"http_auth\"))\n\n # Validate engine with AOSS\n self._validate_aoss_with_engines(is_aoss=is_aoss, engine=engine)\n\n # Create mapping with proper KNN settings\n space_type = getattr(self, \"space_type\", \"l2\")\n ef_construction = getattr(self, \"ef_construction\", 512)\n m = getattr(self, \"m\", 16)\n\n mapping = self._default_text_mapping(\n dim=dim,\n engine=engine,\n space_type=space_type,\n ef_construction=ef_construction,\n m=m,\n vector_field=dynamic_field_name, # Use dynamic field name\n )\n\n # Ensure index exists with baseline mapping (index.knn: true is required for vector search)\n index_exists = True\n try:\n index_exists = bool(client.indices.exists(index=self.index_name))\n except OpenSearchException as exists_error:\n self._log_index_admin_skip(\"indices.exists\", exists_error)\n\n try:\n if not index_exists:\n self.log(f\"Creating index '{self.index_name}' with base mapping\")\n client.indices.create(index=self.index_name, body=mapping)\n except RequestError as creation_error:\n if creation_error.error == \"resource_already_exists_exception\":\n pass # Index was created concurrently\n else:\n error_msg = str(creation_error).lower()\n if \"invalid engine\" in error_msg or \"illegal_argument\" in error_msg:\n if \"jvector\" in error_msg:\n msg = (\n \"The 'jvector' engine is not available in your OpenSearch installation. \"\n \"Use 'nmslib' or 'faiss' for standard OpenSearch, or upgrade to 2.9+.\"\n )\n raise ValueError(msg) from creation_error\n if \"index.knn\" in error_msg:\n msg = (\n \"The index has index.knn: false. 
Delete the existing index and let the \"\n \"component recreate it, or create a new index with a different name.\"\n )\n raise ValueError(msg) from creation_error\n logger.warning(f\"Failed to create index '{self.index_name}': {creation_error}\")\n raise\n\n # Ensure the dynamic field exists in the index\n self._ensure_embedding_field_mapping(\n client=client,\n index_name=self.index_name,\n field_name=dynamic_field_name,\n dim=dim,\n engine=engine,\n space_type=space_type,\n ef_construction=ef_construction,\n m=m,\n )\n\n self.log(\n f\"Indexing {len(texts)} documents into '{self.index_name}' with model '{embedding_model}'...\"\n )\n logger.info(f\"Will store embeddings in field: {dynamic_field_name}\")\n logger.info(f\"Will tag documents with embedding_model: {embedding_model}\")\n\n # Use the bulk ingestion with model tracking\n return_ids = self._bulk_ingest_embeddings(\n client=client,\n index_name=self.index_name,\n embeddings=vectors,\n texts=texts,\n metadatas=metadatas,\n vector_field=dynamic_field_name, # Use dynamic field name\n text_field=\"text\",\n embedding_model=embedding_model, # Track the model\n mapping=mapping,\n is_aoss=is_aoss,\n )\n self.log(metadatas)\n\n logger.info(\n f\"Ingestion complete: Successfully indexed {len(return_ids)} documents with model '{embedding_model}'\"\n )\n self.log(f\"Successfully indexed {len(return_ids)} documents with model {embedding_model}.\")\n\n # ---------- helpers for filters ----------\n def _is_placeholder_term(self, term_obj: dict) -> bool:\n # term_obj like {\"filename\": \"__IMPOSSIBLE_VALUE__\"}\n return any(v == \"__IMPOSSIBLE_VALUE__\" for v in term_obj.values())\n\n def _coerce_filter_clauses(self, filter_obj: dict | None) -> list[dict]:\n \"\"\"Convert filter expressions into OpenSearch-compatible filter clauses.\n\n This method accepts two filter formats and converts them to standardized\n OpenSearch query clauses:\n\n Format A - Explicit filters:\n {\"filter\": [{\"term\": {\"field\": \"value\"}}, 
{\"terms\": {\"field\": [\"val1\", \"val2\"]}}],\n \"limit\": 10, \"score_threshold\": 1.5}\n\n Format B - Context-style mapping:\n {\"data_sources\": [\"file1.pdf\"], \"document_types\": [\"pdf\"], \"owners\": [\"user1\"]}\n\n Args:\n filter_obj: Filter configuration dictionary or None\n\n Returns:\n List of OpenSearch filter clauses (term/terms objects)\n Placeholder values with \"__IMPOSSIBLE_VALUE__\" are ignored\n \"\"\"\n if not filter_obj:\n return []\n\n # If it is a string, try to parse it once\n if isinstance(filter_obj, str):\n try:\n filter_obj = json.loads(filter_obj)\n except json.JSONDecodeError:\n # Not valid JSON - treat as no filters\n return []\n\n # Case A: already an explicit list/dict under \"filter\"\n if \"filter\" in filter_obj:\n raw = filter_obj[\"filter\"]\n if isinstance(raw, dict):\n raw = [raw]\n explicit_clauses: list[dict] = []\n for f in raw or []:\n if (\n \"term\" in f\n and isinstance(f[\"term\"], dict)\n and not self._is_placeholder_term(f[\"term\"])\n ):\n explicit_clauses.append(f)\n elif \"terms\" in f and isinstance(f[\"terms\"], dict):\n field, vals = next(iter(f[\"terms\"].items()))\n if isinstance(vals, list) and len(vals) > 0:\n explicit_clauses.append(f)\n return explicit_clauses\n\n # Case B: convert context-style maps into clauses\n field_mapping = {\n \"data_sources\": \"filename\",\n \"document_types\": \"mimetype\",\n \"owners\": \"owner\",\n }\n context_clauses: list[dict] = []\n for k, values in filter_obj.items():\n if not isinstance(values, list):\n continue\n field = field_mapping.get(k, k)\n if len(values) == 0:\n # Match-nothing placeholder (kept to mirror your tool semantics)\n context_clauses.append({\"term\": {field: \"__IMPOSSIBLE_VALUE__\"}})\n elif len(values) == 1:\n if values[0] != \"__IMPOSSIBLE_VALUE__\":\n context_clauses.append({\"term\": {field: values[0]}})\n else:\n context_clauses.append({\"terms\": {field: values}})\n return context_clauses\n\n def _parse_filter_expression(self) -> dict | 
None:\n \"\"\"Parse and validate optional filter_expression JSON.\n\n Returns:\n Parsed JSON object as a dict, or None when unset/blank.\n\n Raises:\n ValueError: If JSON is invalid or does not decode to an object.\n \"\"\"\n filter_expression = getattr(self, \"filter_expression\", \"\")\n if not isinstance(filter_expression, str) or not filter_expression.strip():\n return None\n try:\n filter_obj = json.loads(filter_expression)\n except json.JSONDecodeError as e:\n msg = f\"Invalid filter_expression JSON: {e}\"\n raise ValueError(msg) from e\n\n if not isinstance(filter_obj, dict):\n msg = \"Invalid filter_expression JSON type: expected a JSON object.\"\n raise TypeError(msg)\n return filter_obj\n\n def _resolve_limit(self, filter_obj: dict | None, default_limit: int | None) -> int | None:\n \"\"\"Resolve an integer result limit from filter settings.\"\"\"\n if not filter_obj:\n return default_limit\n raw_limit = filter_obj.get(\"limit\", default_limit)\n if raw_limit is None:\n return None\n if isinstance(raw_limit, bool):\n msg = \"Invalid filter_expression.limit: expected a positive integer.\"\n raise TypeError(msg)\n try:\n limit = int(raw_limit)\n except (TypeError, ValueError) as e:\n msg = \"Invalid filter_expression.limit: expected a positive integer.\"\n raise ValueError(msg) from e\n if limit <= 0:\n msg = \"Invalid filter_expression.limit: expected a positive integer.\"\n raise ValueError(msg)\n return limit\n\n def _resolve_score_threshold(self, filter_obj: dict | None) -> float | None:\n \"\"\"Resolve optional positive min score from filter settings.\"\"\"\n if not filter_obj:\n return None\n score_threshold = filter_obj.get(\"score_threshold\")\n if score_threshold is None:\n score_threshold = filter_obj.get(\"scoreThreshold\")\n if not isinstance(score_threshold, (int, float)) or score_threshold <= 0:\n return None\n return float(score_threshold)\n\n def _detect_available_models(\n self, client: OpenSearch, filter_clauses: list[dict] | None = None\n 
) -> list[str]:\n \"\"\"Detect which embedding models have documents in the index.\n\n Uses aggregation to find all unique embedding_model values, optionally\n filtered to only documents matching the user's filter criteria.\n\n Args:\n client: OpenSearch client instance\n filter_clauses: Optional filter clauses to scope model detection\n\n Returns:\n List of embedding model names found in the index\n \"\"\"\n try:\n agg_query = {\n \"size\": 0,\n \"aggs\": {\"embedding_models\": {\"terms\": {\"field\": \"embedding_model\", \"size\": 10}}},\n }\n\n # Apply filters to model detection if any exist\n if filter_clauses:\n agg_query[\"query\"] = {\"bool\": {\"filter\": filter_clauses}}\n\n logger.debug(f\"Model detection query: {agg_query}\")\n result = client.search(\n index=self.index_name,\n body=agg_query,\n params={\"terminate_after\": 0},\n )\n buckets = result.get(\"aggregations\", {}).get(\"embedding_models\", {}).get(\"buckets\", [])\n models = [b[\"key\"] for b in buckets if b[\"key\"]]\n\n # Log detailed bucket info for debugging\n logger.info(\n f\"Detected embedding models in corpus: {models}\"\n + (f\" (with {len(filter_clauses)} filters)\" if filter_clauses else \"\")\n )\n if not models:\n total_hits = result.get(\"hits\", {}).get(\"total\", {})\n total_count = (\n total_hits.get(\"value\", 0) if isinstance(total_hits, dict) else total_hits\n )\n logger.warning(\n f\"No embedding_model values found in index '{self.index_name}'. \"\n f\"Total docs in index: {total_count}. 
\"\n f\"This may indicate documents were indexed without the embedding_model field.\"\n )\n except (OpenSearchException, KeyError, ValueError) as e:\n logger.warning(f\"Failed to detect embedding models: {e}\")\n # Fallback to current model\n fallback_model = self._get_embedding_model_name()\n logger.info(f\"Using fallback model: {fallback_model}\")\n return [fallback_model]\n else:\n return models\n\n def _get_index_properties(self, client: OpenSearch) -> dict[str, Any] | None:\n \"\"\"Retrieve flattened mapping properties for the current index.\"\"\"\n try:\n mapping = client.indices.get_mapping(index=self.index_name)\n except OpenSearchException as e:\n logger.warning(\n f\"Failed to fetch mapping for index '{self.index_name}': {e}. Proceeding without mapping metadata.\"\n )\n return None\n\n properties: dict[str, Any] = {}\n for index_data in mapping.values():\n props = index_data.get(\"mappings\", {}).get(\"properties\", {})\n if isinstance(props, dict):\n properties.update(props)\n return properties\n\n def _is_knn_vector_field(self, properties: dict[str, Any] | None, field_name: str) -> bool:\n \"\"\"Check whether the field is mapped as a knn_vector.\"\"\"\n if not field_name:\n return False\n if properties is None:\n logger.warning(\n f\"Mapping metadata unavailable; assuming field '{field_name}' is usable.\"\n )\n return True\n field_def = properties.get(field_name)\n if not isinstance(field_def, dict):\n return False\n if field_def.get(\"type\") == \"knn_vector\":\n return True\n\n nested_props = field_def.get(\"properties\")\n return bool(isinstance(nested_props, dict) and nested_props.get(\"type\") == \"knn_vector\")\n\n def _get_field_dimension(\n self, properties: dict[str, Any] | None, field_name: str\n ) -> int | None:\n \"\"\"Get the dimension of a knn_vector field from the index mapping.\n\n Args:\n properties: Index properties from mapping\n field_name: Name of the vector field\n\n Returns:\n Dimension of the field, or None if not found\n 
\"\"\"\n if not field_name or properties is None:\n return None\n\n field_def = properties.get(field_name)\n if not isinstance(field_def, dict):\n return None\n\n # Check direct knn_vector field\n if field_def.get(\"type\") == \"knn_vector\":\n return field_def.get(\"dimension\")\n\n # Check nested properties\n nested_props = field_def.get(\"properties\")\n if isinstance(nested_props, dict) and nested_props.get(\"type\") == \"knn_vector\":\n return nested_props.get(\"dimension\")\n\n return None\n\n def _get_filename_agg_field(self, index_properties: dict[str, Any] | None) -> str:\n \"\"\"Choose the appropriate field for filename aggregations.\"\"\"\n if not index_properties:\n return \"filename.keyword\"\n\n filename_def = index_properties.get(\"filename\")\n if not isinstance(filename_def, dict):\n return \"filename.keyword\"\n\n field_type = filename_def.get(\"type\")\n fields_def = filename_def.get(\"fields\", {})\n\n # Top-level keyword with no subfields\n if field_type == \"keyword\" and not isinstance(fields_def, dict):\n return \"filename\"\n\n # Text field with keyword subfield\n if isinstance(fields_def, dict) and \"keyword\" in fields_def:\n return \"filename.keyword\"\n\n # Fallback: aggregate on filename directly\n return \"filename\"\n\n # ---------- search (multi-model hybrid) ----------\n def search(self, query: str | None = None) -> list[dict[str, Any]]:\n \"\"\"Perform multi-model hybrid search combining multiple vector similarities and keyword matching.\n\n This method executes a sophisticated search that:\n 1. Auto-detects all embedding models present in the index\n 2. Generates query embeddings for ALL detected models in parallel\n 3. Combines multiple KNN queries using dis_max (picks best match)\n 4. Adds keyword search with fuzzy matching (30% weight)\n 5. Applies optional filtering and score thresholds\n 6. 
Returns aggregations for faceted search\n\n Search weights:\n - Semantic search (dis_max across all models): 70%\n - Keyword search: 30%\n\n Args:\n query: Search query string (used for both vector embedding and keyword search)\n\n Returns:\n List of search results with page_content, metadata, and relevance scores\n\n Raises:\n ValueError: If embedding component is not provided or filter JSON is invalid\n \"\"\"\n logger.info(self.ingest_data)\n client = self.build_client()\n q = (query or \"\").strip()\n\n # Parse optional filter expression\n filter_obj = self._parse_filter_expression()\n\n if not self.embedding:\n msg = \"Embedding is required to run hybrid search (KNN + keyword).\"\n raise ValueError(msg)\n\n # Check if embedding is None (fail-safe mode)\n if self.embedding is None or (\n isinstance(self.embedding, list) and all(e is None for e in self.embedding)\n ):\n logger.error(\"Embedding returned None (fail-safe mode enabled). Cannot perform search.\")\n return []\n\n # Build filter clauses first so we can use them in model detection\n filter_clauses = self._coerce_filter_clauses(filter_obj)\n\n # Detect available embedding models in the index (scoped by filters)\n available_models = self._detect_available_models(client, filter_clauses)\n\n if not available_models:\n logger.warning(\"No embedding models found in index, using current model\")\n available_models = [self._get_embedding_model_name()]\n\n # Generate embeddings for ALL detected models\n query_embeddings = {}\n\n # Normalize embedding to list\n embeddings_list = self.embedding if isinstance(self.embedding, list) else [self.embedding]\n # Filter out None values (fail-safe mode)\n embeddings_list = [e for e in embeddings_list if e is not None]\n\n if not embeddings_list:\n logger.error(\n \"No valid embeddings available after filtering None values (fail-safe mode). 
Cannot perform search.\"\n )\n return []\n\n # Create a comprehensive map of model names to embedding objects\n # Check all possible identifiers (deployment, model, model_id, model_name)\n # Also leverage available_models list from EmbeddingsWithModels\n # Handle duplicate identifiers by creating combined keys\n embedding_by_model = {}\n identifier_conflicts = {} # Track which identifiers have conflicts\n\n for idx, emb_obj in enumerate(embeddings_list):\n # Get all possible identifiers for this embedding\n identifiers = []\n deployment = getattr(emb_obj, \"deployment\", None)\n model = getattr(emb_obj, \"model\", None)\n model_id = getattr(emb_obj, \"model_id\", None)\n model_name = getattr(emb_obj, \"model_name\", None)\n dimensions = getattr(emb_obj, \"dimensions\", None)\n available_models_attr = getattr(emb_obj, \"available_models\", None)\n\n logger.info(\n f\"Embedding object {idx}: deployment={deployment}, model={model}, \"\n f\"model_id={model_id}, model_name={model_name}, dimensions={dimensions}, \"\n f\"available_models={available_models_attr}\"\n )\n\n # If this embedding has available_models dict, map all models to their dedicated instances\n if available_models_attr and isinstance(available_models_attr, dict):\n logger.info(\n f\"Embedding object {idx} provides {len(available_models_attr)} models via available_models dict\"\n )\n for model_name_key, dedicated_embedding in available_models_attr.items():\n if model_name_key and str(model_name_key).strip():\n model_str = str(model_name_key).strip()\n if model_str not in embedding_by_model:\n # Use the dedicated embedding instance from the dict\n embedding_by_model[model_str] = dedicated_embedding\n logger.info(\n f\"Mapped available model '{model_str}' to dedicated embedding instance\"\n )\n else:\n # Conflict detected - track it\n if model_str not in identifier_conflicts:\n identifier_conflicts[model_str] = [embedding_by_model[model_str]]\n identifier_conflicts[model_str].append(dedicated_embedding)\n 
logger.warning(\n f\"Available model '{model_str}' has conflict - used by multiple embeddings\"\n )\n\n # Also map traditional identifiers (for backward compatibility)\n if deployment:\n identifiers.append(str(deployment))\n if model:\n identifiers.append(str(model))\n if model_id:\n identifiers.append(str(model_id))\n if model_name:\n identifiers.append(str(model_name))\n\n # Map all identifiers to this embedding object\n for identifier in identifiers:\n if identifier not in embedding_by_model:\n embedding_by_model[identifier] = emb_obj\n logger.info(f\"Mapped identifier '{identifier}' to embedding object {idx}\")\n else:\n # Conflict detected - track it\n if identifier not in identifier_conflicts:\n identifier_conflicts[identifier] = [embedding_by_model[identifier]]\n identifier_conflicts[identifier].append(emb_obj)\n logger.warning(\n f\"Identifier '{identifier}' has conflict - used by multiple embeddings\"\n )\n\n # For embeddings with model+deployment, create combined identifier\n # This helps when deployment is the same but model differs\n if deployment and model and deployment != model:\n combined_id = f\"{deployment}:{model}\"\n if combined_id not in embedding_by_model:\n embedding_by_model[combined_id] = emb_obj\n logger.info(\n f\"Created combined identifier '{combined_id}' for embedding object {idx}\"\n )\n\n # Log conflicts\n if identifier_conflicts:\n logger.warning(\n f\"Found {len(identifier_conflicts)} conflicting identifiers. 
\"\n f\"Consider using combined format 'deployment:model' or specifying unique model names.\"\n )\n for conflict_id, emb_list in identifier_conflicts.items():\n logger.warning(\n f\" Conflict on '{conflict_id}': {len(emb_list)} embeddings use this identifier\"\n )\n\n logger.info(f\"Generating embeddings for {len(available_models)} models in index\")\n logger.info(f\"Available embedding identifiers: {list(embedding_by_model.keys())}\")\n self.log(f\"[SEARCH] Models detected in index: {available_models}\")\n self.log(f\"[SEARCH] Available embedding identifiers: {list(embedding_by_model.keys())}\")\n\n # Track matching status for debugging\n matched_models = []\n unmatched_models = []\n\n for model_name in available_models:\n try:\n # Check if we have an embedding object for this model\n if model_name in embedding_by_model:\n # Use the matching embedding object directly\n emb_obj = embedding_by_model[model_name]\n emb_deployment = getattr(emb_obj, \"deployment\", None)\n emb_model = getattr(emb_obj, \"model\", None)\n emb_model_id = getattr(emb_obj, \"model_id\", None)\n emb_dimensions = getattr(emb_obj, \"dimensions\", None)\n emb_available_models = getattr(emb_obj, \"available_models\", None)\n\n logger.info(\n f\"Using embedding object for model '{model_name}': \"\n f\"deployment={emb_deployment}, model={emb_model}, model_id={emb_model_id}, \"\n f\"dimensions={emb_dimensions}\"\n )\n\n # Check if this is a dedicated instance from available_models dict\n if emb_available_models and isinstance(emb_available_models, dict):\n logger.info(\n f\"Model '{model_name}' using dedicated instance from available_models dict \"\n f\"(pre-configured with correct model and dimensions)\"\n )\n\n # Use the embedding instance directly - no model switching needed!\n vec = emb_obj.embed_query(q)\n query_embeddings[model_name] = vec\n matched_models.append(model_name)\n logger.info(\n f\"Generated embedding for model: {model_name} (actual dimensions: {len(vec)})\"\n )\n 
self.log(f\"[MATCH] Model '{model_name}' - generated {len(vec)}-dim embedding\")\n else:\n # No matching embedding found for this model\n unmatched_models.append(model_name)\n logger.warning(\n f\"No matching embedding found for model '{model_name}'. \"\n f\"This model will be skipped. Available identifiers: {list(embedding_by_model.keys())}\"\n )\n self.log(\n f\"[NO MATCH] Model '{model_name}' - available: {list(embedding_by_model.keys())}\"\n )\n except (\n RuntimeError,\n ValueError,\n ConnectionError,\n TimeoutError,\n AttributeError,\n KeyError,\n ) as e:\n logger.warning(f\"Failed to generate embedding for {model_name}: {e}\")\n self.log(f\"[ERROR] Embedding generation failed for '{model_name}': {e}\")\n\n # Log summary of model matching\n logger.info(\n f\"Model matching summary: {len(matched_models)} matched, {len(unmatched_models)} unmatched\"\n )\n self.log(\n f\"[SUMMARY] Model matching: {len(matched_models)} matched, {len(unmatched_models)} unmatched\"\n )\n if unmatched_models:\n self.log(f\"[WARN] Unmatched models in index: {unmatched_models}\")\n\n if not query_embeddings:\n msg = (\n f\"Failed to generate embeddings for any model. \"\n f\"Index has models: {available_models}, but no matching embedding objects found. 
\"\n f\"Available embedding identifiers: {list(embedding_by_model.keys())}\"\n )\n self.log(f\"[FAIL] Search failed: {msg}\")\n raise ValueError(msg)\n\n index_properties = self._get_index_properties(client)\n legacy_vector_field = getattr(self, \"vector_field\", \"chunk_embedding\")\n\n # Build KNN queries for each model\n embedding_fields: list[str] = []\n knn_queries_with_candidates = []\n knn_queries_without_candidates = []\n\n raw_num_candidates = getattr(self, \"num_candidates\", 1000)\n try:\n num_candidates = int(raw_num_candidates) if raw_num_candidates is not None else 0\n except (TypeError, ValueError):\n num_candidates = 0\n use_num_candidates = num_candidates > 0\n\n for model_name, embedding_vector in query_embeddings.items():\n field_name = get_embedding_field_name(model_name)\n selected_field = field_name\n vector_dim = len(embedding_vector)\n\n # Only use the expected dynamic field - no legacy fallback\n # This prevents dimension mismatches between models\n if not self._is_knn_vector_field(index_properties, selected_field):\n logger.warning(\n f\"Skipping model {model_name}: field '{field_name}' is not mapped as knn_vector. \"\n f\"Documents must be indexed with this embedding model before querying.\"\n )\n self.log(\n f\"[SKIP] Field '{selected_field}' not a knn_vector - skipping model '{model_name}'\"\n )\n continue\n\n # Validate vector dimensions match the field dimensions\n field_dim = self._get_field_dimension(index_properties, selected_field)\n if field_dim is not None and field_dim != vector_dim:\n logger.error(\n f\"Dimension mismatch for model '{model_name}': \"\n f\"Query vector has {vector_dim} dimensions but field '{selected_field}' expects {field_dim}. 
\"\n f\"Skipping this model to prevent search errors.\"\n )\n self.log(\n f\"[DIM MISMATCH] Model '{model_name}': query={vector_dim} vs field={field_dim} - skipping\"\n )\n continue\n\n logger.info(\n f\"Adding KNN query for model '{model_name}': field='{selected_field}', \"\n f\"query_dims={vector_dim}, field_dims={field_dim or 'unknown'}\"\n )\n embedding_fields.append(selected_field)\n\n base_query = {\n \"knn\": {\n selected_field: {\n \"vector\": embedding_vector,\n \"k\": 50,\n }\n }\n }\n\n if use_num_candidates:\n query_with_candidates = copy.deepcopy(base_query)\n query_with_candidates[\"knn\"][selected_field][\"num_candidates\"] = num_candidates\n else:\n query_with_candidates = base_query\n\n knn_queries_with_candidates.append(query_with_candidates)\n knn_queries_without_candidates.append(base_query)\n\n if not knn_queries_with_candidates:\n # No valid fields found - this can happen when:\n # 1. Index is empty (no documents yet)\n # 2. Embedding model has changed and field doesn't exist yet\n # Return empty results instead of failing\n logger.warning(\n \"No valid knn_vector fields found for embedding models. \"\n \"This may indicate an empty index or missing field mappings. \"\n \"Returning empty search results.\"\n )\n self.log(\n f\"[WARN] No valid KNN queries could be built. 
\"\n f\"Query embeddings generated: {list(query_embeddings.keys())}, \"\n f\"but no matching knn_vector fields found in index.\"\n )\n return []\n\n # Build exists filter - document must have at least one embedding field\n exists_any_embedding = {\n \"bool\": {\n \"should\": [{\"exists\": {\"field\": f}} for f in set(embedding_fields)],\n \"minimum_should_match\": 1,\n }\n }\n\n # Combine user filters with exists filter\n all_filters = [*filter_clauses, exists_any_embedding]\n\n # Get limit and score threshold\n limit = self._resolve_limit(filter_obj, default_limit=self.number_of_results)\n score_threshold = self._resolve_score_threshold(filter_obj)\n\n # Determine the best aggregation field for filename based on index mapping\n filename_agg_field = self._get_filename_agg_field(index_properties)\n\n # Build multi-model hybrid query\n body = {\n \"query\": {\n \"bool\": {\n \"should\": [\n {\n \"dis_max\": {\n \"tie_breaker\": 0.0, # Take only the best match, no blending\n \"boost\": 0.7, # 70% weight for semantic search\n \"queries\": knn_queries_with_candidates,\n }\n },\n {\n \"multi_match\": {\n \"query\": q,\n \"fields\": [\"text^2\", \"filename^1.5\"],\n \"type\": \"best_fields\",\n \"fuzziness\": \"AUTO\",\n \"boost\": 0.3, # 30% weight for keyword search\n }\n },\n ],\n \"minimum_should_match\": 1,\n \"filter\": all_filters,\n }\n },\n \"aggs\": {\n \"data_sources\": {\"terms\": {\"field\": filename_agg_field, \"size\": 20}},\n \"document_types\": {\"terms\": {\"field\": \"mimetype\", \"size\": 10}},\n \"owners\": {\"terms\": {\"field\": \"owner\", \"size\": 10}},\n \"embedding_models\": {\"terms\": {\"field\": \"embedding_model\", \"size\": 10}},\n },\n \"_source\": [\n \"filename\",\n \"mimetype\",\n \"page\",\n \"text\",\n \"source_url\",\n \"owner\",\n \"embedding_model\",\n \"allowed_users\",\n \"allowed_groups\",\n \"allowed_principals\",\n ],\n \"size\": limit,\n }\n\n if score_threshold is not None:\n body[\"min_score\"] = score_threshold\n\n 
logger.info(\n f\"Executing multi-model hybrid search with {len(knn_queries_with_candidates)} embedding models: \"\n f\"{list(query_embeddings.keys())}\"\n )\n self.log(\n f\"[EXEC] Executing search with {len(knn_queries_with_candidates)} KNN queries, limit={limit}\"\n )\n self.log(f\"[EXEC] Embedding models used: {list(query_embeddings.keys())}\")\n self.log(f\"[EXEC] KNN fields being queried: {embedding_fields}\")\n\n try:\n resp = client.search(index=self.index_name, body=body, params={\"terminate_after\": 0})\n except RequestError as e:\n error_message = str(e)\n lowered = error_message.lower()\n if use_num_candidates and \"num_candidates\" in lowered:\n logger.warning(\n \"Retrying search without num_candidates parameter due to cluster capabilities\",\n error=error_message,\n )\n fallback_body = copy.deepcopy(body)\n try:\n fallback_body[\"query\"][\"bool\"][\"should\"][0][\"dis_max\"][\"queries\"] = (\n knn_queries_without_candidates\n )\n except (KeyError, IndexError, TypeError) as inner_err:\n raise e from inner_err\n resp = client.search(\n index=self.index_name,\n body=fallback_body,\n params={\"terminate_after\": 0},\n )\n elif \"knn_vector\" in lowered or (\"field\" in lowered and \"knn\" in lowered):\n fallback_vector = next(iter(query_embeddings.values()), None)\n if fallback_vector is None:\n raise\n fallback_field = legacy_vector_field or \"chunk_embedding\"\n logger.warning(\n \"KNN search failed for dynamic fields; falling back to legacy field '%s'.\",\n fallback_field,\n )\n fallback_body = copy.deepcopy(body)\n fallback_body[\"query\"][\"bool\"][\"filter\"] = filter_clauses\n knn_fallback = {\n \"knn\": {\n fallback_field: {\n \"vector\": fallback_vector,\n \"k\": 50,\n }\n }\n }\n if use_num_candidates:\n knn_fallback[\"knn\"][fallback_field][\"num_candidates\"] = num_candidates\n fallback_body[\"query\"][\"bool\"][\"should\"][0][\"dis_max\"][\"queries\"] = [knn_fallback]\n resp = client.search(\n index=self.index_name,\n body=fallback_body,\n 
params={\"terminate_after\": 0},\n )\n else:\n raise\n hits = resp.get(\"hits\", {}).get(\"hits\", [])\n\n logger.info(f\"Found {len(hits)} results\")\n self.log(f\"[RESULT] Search complete: {len(hits)} results found\")\n\n if len(hits) == 0:\n self.log(\n f\"[EMPTY] Debug info: \"\n f\"models_in_index={available_models}, \"\n f\"matched_models={matched_models}, \"\n f\"knn_fields={embedding_fields}, \"\n f\"filters={len(filter_clauses)} clauses\"\n )\n\n return [\n {\n \"page_content\": hit[\"_source\"].get(\"text\", \"\"),\n \"metadata\": {k: v for k, v in hit[\"_source\"].items() if k != \"text\"},\n \"score\": hit.get(\"_score\"),\n }\n for hit in hits\n ]\n\n def search_documents(self) -> Table:\n \"\"\"Search documents and return results as a Table.\n\n This is the main interface method that performs the multi-model search using the\n configured search_query and returns results in Langflow's Table (DataFrame) format\n so downstream Parser components can consume them directly.\n\n Always builds the vector store (triggering ingestion if needed), then performs\n search only if a query is provided.\n\n Returns:\n Table containing search results with text and metadata\n\n Raises:\n Exception: If search operation fails\n \"\"\"\n\n try:\n # Always build/cache the vector store to ensure ingestion happens\n logger.info(f\"Search query: {self.search_query}\")\n if self._cached_vector_store is None:\n self.build_vector_store()\n\n # Only perform search if query is provided\n search_query = (self.search_query or \"\").strip()\n if not search_query:\n self.log(\"No search query provided - ingestion completed, returning empty results\")\n\n return Table(data=[])\n\n # Perform search with the provided query\n raw = self.search(search_query)\n raw_list = [Data(text=hit[\"page_content\"], **hit[\"metadata\"]) for hit in raw]\n return Table(data=raw_list)\n\n except Exception as e:\n self.log(f\"search_documents error: {e}\")\n raise\n\n # -------- dynamic UI handling (auth 
switch) --------\n async def update_build_config(\n self, build_config: dict, field_value: str, field_name: str | None = None\n ) -> dict:\n \"\"\"Dynamically update component configuration based on field changes.\n\n This method handles real-time UI updates, particularly for authentication\n mode changes that show/hide relevant input fields.\n\n Args:\n build_config: Current component configuration\n field_value: New value for the changed field\n field_name: Name of the field that changed\n\n Returns:\n Updated build configuration with appropriate field visibility\n \"\"\"\n try:\n if field_name == \"auth_mode\":\n mode = (field_value or \"basic\").strip().lower()\n is_basic = mode == \"basic\"\n is_jwt = mode == \"jwt\"\n is_openrag = mode == \"openrag\"\n\n build_config[\"username\"][\"show\"] = is_basic\n build_config[\"password\"][\"show\"] = is_basic\n\n build_config[\"jwt_token\"][\"show\"] = is_jwt\n build_config[\"jwt_header\"][\"show\"] = is_jwt\n build_config[\"bearer_prefix\"][\"show\"] = is_jwt\n\n build_config[\"username\"][\"required\"] = is_basic\n build_config[\"password\"][\"required\"] = is_basic\n\n # build_config[\"jwt_token\"][\"required\"] = is_jwt\n build_config[\"jwt_header\"][\"required\"] = is_jwt\n build_config[\"bearer_prefix\"][\"required\"] = False\n\n # In 'openrag' mode, expose the OPENRAG_* fields up front\n # since they are the only credentials required.\n for openrag_field in (\n \"openrag_ingest_url\",\n \"openrag_ingest_token\",\n \"openrag_ingest_run_id\",\n \"openrag_ingest_batch_size\",\n ):\n if openrag_field in build_config:\n build_config[openrag_field][\"advanced\"] = not is_openrag\n build_config[openrag_field][\"required\"] = (\n is_openrag and openrag_field != \"openrag_ingest_batch_size\"\n )\n\n if is_basic or is_openrag:\n build_config[\"jwt_token\"][\"value\"] = \"\"\n\n return build_config\n\n except (KeyError, ValueError) as e:\n self.log(f\"update_build_config error: {e}\")\n\n return build_config\n" }, 
"docs_metadata": { "_input_type": "TableInput", diff --git a/flows/openrag_agent.json b/flows/openrag_agent.json index da40458ff..300a4886c 100644 --- a/flows/openrag_agent.json +++ b/flows/openrag_agent.json @@ -2522,7 +2522,7 @@ "show": true, "title_case": false, "type": "code", - "value": "from __future__ import annotations\n\nimport copy\nimport json\nimport uuid\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\nfrom typing import Any\n\nimport httpx\nfrom lfx.base.vectorstores.model import LCVectorStoreComponent, check_cached_vector_store\nfrom lfx.base.vectorstores.vector_store_connection_decorator import vector_store_connection\nfrom lfx.io import (\n BoolInput,\n DropdownInput,\n HandleInput,\n IntInput,\n MultilineInput,\n Output,\n SecretStrInput,\n StrInput,\n TableInput,\n)\nfrom lfx.log import logger\nfrom lfx.schema.data import Data\nfrom lfx.schema.dataframe import Table\nfrom opensearchpy import OpenSearch, helpers\nfrom opensearchpy.exceptions import OpenSearchException, RequestError\n\nREQUEST_TIMEOUT = 60\nMAX_RETRIES = 5\n\n# watsonx.ai surfaces rate-limit state via these (mostly non-standard) response\n# headers. The IBM SDK acts on the x-requests-limit-* family directly; we log\n# them on a failed embedding call to aid plan/region tuning.\n_WATSONX_RATE_LIMIT_HEADERS = (\n \"x-requests-limit-rate\",\n \"x-requests-limit-remaining\",\n \"x-requests-limit-reset\",\n \"Retry-After\",\n)\n\n\ndef _log_watsonx_rate_limit_headers(error: Exception) -> None:\n \"\"\"Best-effort diagnostic: log watsonx rate-limit headers from a failed call.\n\n The watsonx SDK raises ``ApiRequestFailure``, which carries the originating\n httpx/requests ``Response`` as ``.response``. 
On a 429 exhaustion we surface\n the documented rate-limit headers so operators can tune throughput.\n \"\"\"\n try:\n response = getattr(error, \"response\", None)\n headers = getattr(response, \"headers\", None)\n if not headers:\n return\n status = getattr(response, \"status_code\", \"unknown\")\n observed = {h: headers.get(h) for h in _WATSONX_RATE_LIMIT_HEADERS if headers.get(h) is not None}\n if str(status) == \"429\" or observed:\n logger.warning(f\"watsonx rate-limit response (status={status}): {observed}\")\n except Exception as log_error: # never let diagnostics mask the real error\n logger.debug(f\"Could not extract watsonx rate-limit headers: {log_error}\")\n\n\ndef normalize_model_name(model_name: str) -> str:\n \"\"\"Normalize embedding model name for use as field suffix.\n\n Converts model names to valid OpenSearch field names by replacing\n special characters and ensuring alphanumeric format.\n\n Args:\n model_name: Original embedding model name (e.g., \"text-embedding-3-small\")\n\n Returns:\n Normalized field suffix (e.g., \"text_embedding_3_small\")\n \"\"\"\n normalized = model_name.lower()\n # Replace common separators with underscores\n normalized = normalized.replace(\"-\", \"_\").replace(\":\", \"_\").replace(\"/\", \"_\").replace(\".\", \"_\")\n # Remove any non-alphanumeric characters except underscores\n normalized = \"\".join(c if c.isalnum() or c == \"_\" else \"_\" for c in normalized)\n # Remove duplicate underscores\n while \"__\" in normalized:\n normalized = normalized.replace(\"__\", \"_\")\n return normalized.strip(\"_\")\n\n\ndef get_embedding_field_name(model_name: str) -> str:\n \"\"\"Get the dynamic embedding field name for a model.\n\n Args:\n model_name: Embedding model name\n\n Returns:\n Field name in format: chunk_embedding_{normalized_model_name}\n \"\"\"\n logger.info(f\"chunk_embedding_{normalize_model_name(model_name)}\")\n return 
f\"chunk_embedding_{normalize_model_name(model_name)}\"\n\n\n@vector_store_connection\nclass OpenSearchVectorStoreComponentMultimodalMultiEmbedding(LCVectorStoreComponent):\n \"\"\"OpenSearch Vector Store Component with Multi-Model Hybrid Search Capabilities.\n\n This component provides vector storage and retrieval using OpenSearch, combining semantic\n similarity search (KNN) with keyword-based search for optimal results. It supports:\n - Multiple embedding models per index with dynamic field names\n - Automatic detection and querying of all available embedding models\n - Parallel embedding generation for multi-model search\n - Document ingestion with model tracking\n - Advanced filtering and aggregations\n - Flexible authentication options\n\n Features:\n - Multi-model vector storage with dynamic fields (chunk_embedding_{model_name})\n - Hybrid search combining multiple KNN queries (dis_max) + keyword matching\n - Auto-detection of available models in the index\n - Parallel query embedding generation for all detected models\n - Vector storage with configurable engines (jvector, nmslib, faiss, lucene)\n - Flexible authentication (Basic auth, JWT tokens)\n\n Model Name Resolution:\n - Priority: deployment > model > model_name attributes\n - This ensures correct matching between embedding objects and index fields\n - When multiple embeddings are provided, specify embedding_model_name to select which one to use\n - During search, each detected model in the index is matched to its corresponding embedding object\n \"\"\"\n\n display_name: str = \"OpenSearch (Multi-Model Multi-Embedding)\"\n icon: str = \"OpenSearch\"\n description: str = (\n \"Store and search documents using OpenSearch with multi-model hybrid semantic and keyword search. \"\n \"To search use the tools search_documents and raw_search. 
\"\n \"Search documents takes a query for vector search, for example\\n\"\n ' {search_query: \"components in openrag\"}'\n )\n\n # Keys we consider baseline\n default_keys: list[str] = [\n \"opensearch_url\",\n \"index_name\",\n *[i.name for i in LCVectorStoreComponent.inputs], # search_query, add_documents, etc.\n \"embedding\",\n \"embedding_model_name\",\n \"vector_field\",\n \"number_of_results\",\n \"auth_mode\",\n \"username\",\n \"password\",\n \"jwt_token\",\n \"jwt_header\",\n \"bearer_prefix\",\n \"use_ssl\",\n \"verify_certs\",\n \"filter_expression\",\n \"engine\",\n \"space_type\",\n \"ef_construction\",\n \"m\",\n \"num_candidates\",\n \"docs_metadata\",\n \"request_timeout\",\n \"max_retries\",\n \"openrag_ingest_url\",\n \"openrag_ingest_token\",\n \"openrag_ingest_run_id\",\n \"openrag_ingest_batch_size\",\n ]\n _openrag_ingest_global_placeholders = {\n \"openrag_ingest_url\": \"OPENRAG_INGEST_URL\",\n \"openrag_ingest_token\": \"OPENRAG_INGEST_TOKEN\",\n \"openrag_ingest_run_id\": \"OPENRAG_INGEST_RUN_ID\",\n }\n\n inputs = [\n TableInput(\n name=\"docs_metadata\",\n display_name=\"Document Metadata\",\n info=(\n \"Additional metadata key-value pairs to be added to all ingested documents. 
\"\n \"Useful for tagging documents with source information, categories, or other custom attributes.\"\n ),\n table_schema=[\n {\n \"name\": \"key\",\n \"display_name\": \"Key\",\n \"type\": \"str\",\n \"description\": \"Key name\",\n },\n {\n \"name\": \"value\",\n \"display_name\": \"Value\",\n \"type\": \"str\",\n \"description\": \"Value of the metadata\",\n },\n ],\n value=[],\n input_types=[\"Data\", \"JSON\"],\n ),\n StrInput(\n name=\"openrag_ingest_token\",\n display_name=\"OpenRAG Ingest Token\",\n value=\"OPENRAG_INGEST_TOKEN\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n info=\"Short-lived token used only for OpenRAG ingest callbacks.\",\n ),\n StrInput(\n name=\"openrag_ingest_run_id\",\n display_name=\"OpenRAG Ingest Run ID\",\n value=\"OPENRAG_INGEST_RUN_ID\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n ),\n StrInput(\n name=\"opensearch_url\",\n display_name=\"OpenSearch URL\",\n value=\"http://localhost:9200\",\n info=(\n \"The connection URL for your OpenSearch cluster \"\n \"(e.g., http://localhost:9200 for local development or your cloud endpoint).\"\n ),\n ),\n StrInput(\n name=\"index_name\",\n display_name=\"Index Name\",\n value=\"langflow\",\n info=(\n \"The OpenSearch index name where documents will be stored and searched. \"\n \"Will be created automatically if it doesn't exist.\"\n ),\n ),\n DropdownInput(\n name=\"engine\",\n display_name=\"Vector Engine\",\n options=[\"nmslib\", \"faiss\", \"lucene\", \"jvector\"],\n value=\"jvector\",\n info=(\n \"Vector search engine for similarity calculations. 'nmslib' works with standard \"\n \"OpenSearch. 'jvector' requires OpenSearch 2.9+. 'lucene' requires index.knn: true. 
\"\n \"Amazon OpenSearch Serverless only supports 'nmslib' or 'faiss'.\"\n ),\n advanced=True,\n ),\n DropdownInput(\n name=\"space_type\",\n display_name=\"Distance Metric\",\n options=[\"l2\", \"l1\", \"cosinesimil\", \"linf\", \"innerproduct\"],\n value=\"l2\",\n info=(\n \"Distance metric for calculating vector similarity. 'l2' (Euclidean) is most common, \"\n \"'cosinesimil' for cosine similarity, 'innerproduct' for dot product.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"ef_construction\",\n display_name=\"EF Construction\",\n value=512,\n info=(\n \"Size of the dynamic candidate list during index construction. \"\n \"Higher values improve recall but increase indexing time and memory usage.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"m\",\n display_name=\"M Parameter\",\n value=16,\n info=(\n \"Number of bidirectional connections for each vector in the HNSW graph. \"\n \"Higher values improve search quality but increase memory usage and indexing time.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"num_candidates\",\n display_name=\"Candidate Pool Size\",\n value=1000,\n info=(\n \"Number of approximate neighbors to consider for each KNN query. \"\n \"Some OpenSearch deployments do not support this parameter; set to 0 to disable.\"\n ),\n advanced=True,\n ),\n *LCVectorStoreComponent.inputs, # includes search_query, add_documents, etc.\n HandleInput(\n name=\"embedding\", display_name=\"Embedding\", input_types=[\"Embeddings\"], is_list=True\n ),\n StrInput(\n name=\"embedding_model_name\",\n display_name=\"Embedding Model Name\",\n value=\"\",\n info=(\n \"Name of the embedding model to use for ingestion. This selects which embedding from the list \"\n \"will be used to embed documents. Matches on deployment, model, model_id, or model_name. \"\n \"For duplicate deployments, use combined format: 'deployment:model' \"\n \"(e.g., 'text-embedding-ada-002:text-embedding-3-large'). \"\n \"Leave empty to use the first embedding. 
Error message will show all available identifiers.\"\n ),\n advanced=False,\n ),\n StrInput(\n name=\"vector_field\",\n display_name=\"Legacy Vector Field Name\",\n value=\"chunk_embedding\",\n advanced=True,\n info=(\n \"Legacy field name for backward compatibility. New documents use dynamic fields \"\n \"(chunk_embedding_{model_name}) based on the embedding_model_name.\"\n ),\n ),\n IntInput(\n name=\"number_of_results\",\n display_name=\"Default Result Limit\",\n value=10,\n advanced=True,\n info=(\n \"Default maximum number of search results to return when no limit is \"\n \"specified in the filter expression.\"\n ),\n ),\n MultilineInput(\n name=\"filter_expression\",\n display_name=\"Search Filters (JSON)\",\n value=\"\",\n info=(\n \"Optional JSON configuration for search filtering, result limits, and score thresholds.\\n\\n\"\n \"Format 1 - Explicit filters:\\n\"\n '{\"filter\": [{\"term\": {\"filename\":\"doc.pdf\"}}, '\n '{\"terms\":{\"owner\":[\"user1\",\"user2\"]}}], \"limit\": 10, \"score_threshold\": 1.6}\\n\\n'\n \"Format 2 - Context-style mapping:\\n\"\n '{\"data_sources\":[\"file.pdf\"], \"document_types\":[\"application/pdf\"], \"owners\":[\"user123\"]}\\n\\n'\n \"Use __IMPOSSIBLE_VALUE__ as placeholder to ignore specific filters.\"\n ),\n ),\n # ----- Auth controls (dynamic) -----\n DropdownInput(\n name=\"auth_mode\",\n display_name=\"Authentication Mode\",\n value=\"openrag\",\n options=[\"basic\", \"jwt\", \"openrag\"],\n info=(\n \"Authentication method: 'basic' for username/password authentication, \"\n \"'jwt' for JSON Web Token (Bearer) authentication, or 'openrag' to \"\n \"delegate writes to the OpenRAG backend ingest callback (no direct \"\n \"OpenSearch credentials required — only OPENRAG_* fields).\"\n ),\n real_time_refresh=True,\n advanced=False,\n ),\n StrInput(\n name=\"username\",\n display_name=\"Username\",\n value=\"admin\",\n show=True,\n ),\n SecretStrInput(\n name=\"password\",\n display_name=\"OpenSearch Password\",\n 
value=\"admin\",\n show=True,\n ),\n SecretStrInput(\n name=\"jwt_token\",\n display_name=\"JWT Token\",\n value=\"JWT\",\n load_from_db=False,\n show=False,\n info=(\n \"Valid JSON Web Token for authentication. \"\n \"Will be sent in the Authorization header (with optional 'Bearer ' prefix).\"\n ),\n required=False\n ),\n StrInput(\n name=\"jwt_header\",\n display_name=\"JWT Header Name\",\n value=\"Authorization\",\n show=False,\n advanced=True,\n ),\n BoolInput(\n name=\"bearer_prefix\",\n display_name=\"Prefix 'Bearer '\",\n value=False,\n show=False,\n advanced=True,\n ),\n # ----- TLS -----\n BoolInput(\n name=\"use_ssl\",\n display_name=\"Use SSL/TLS\",\n value=True,\n advanced=True,\n info=\"Enable SSL/TLS encryption for secure connections to OpenSearch.\",\n ),\n BoolInput(\n name=\"verify_certs\",\n display_name=\"Verify SSL Certificates\",\n value=False,\n advanced=True,\n info=(\n \"Verify SSL certificates when connecting. \"\n \"Disable for self-signed certificates in development environments.\"\n ),\n ),\n # ----- Timeout / Retry -----\n StrInput(\n name=\"request_timeout\",\n display_name=\"Request Timeout (seconds)\",\n value=\"60\",\n advanced=True,\n info=(\n \"Time in seconds to wait for a response from OpenSearch. 
\"\n \"Increase for large bulk ingestion or complex hybrid queries.\"\n ),\n ),\n StrInput(\n name=\"max_retries\",\n display_name=\"Max Retries\",\n value=\"3\",\n advanced=True,\n info=\"Number of retries for failed connections before raising an error.\",\n ),\n StrInput(\n name=\"openrag_ingest_url\",\n display_name=\"OpenRAG Ingest URL\",\n value=\"OPENRAG_INGEST_URL\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n info=\"Internal OpenRAG callback URL for backend-owned document indexing.\",\n ),\n StrInput(\n name=\"openrag_ingest_token\",\n display_name=\"OpenRAG Ingest Token\",\n value=\"OPENRAG_INGEST_TOKEN\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n info=\"Short-lived token used only for OpenRAG ingest callbacks.\",\n ),\n StrInput(\n name=\"openrag_ingest_run_id\",\n display_name=\"OpenRAG Ingest Run ID\",\n value=\"OPENRAG_INGEST_RUN_ID\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n ),\n IntInput(\n name=\"openrag_ingest_batch_size\",\n display_name=\"OpenRAG Ingest Batch Size\",\n value=100,\n advanced=True,\n ),\n ]\n outputs = [\n Output(\n display_name=\"Search Results\",\n name=\"search_results\",\n method=\"search_documents\",\n ),\n Output(display_name=\"Raw Search\", name=\"raw_search\", method=\"raw_search\"),\n ]\n\n def raw_search(self, query: str | dict | None = None) -> Data:\n \"\"\"Execute a raw OpenSearch query against the target index.\n\n Args:\n query (dict[str, Any]): The OpenSearch query DSL dictionary.\n\n Returns:\n Data: Search results as a Data object.\n\n Raises:\n ValueError: If 'query' is not a valid OpenSearch query (must be a non-empty dict).\n \"\"\"\n raw_query = query if query is not None else self.search_query\n\n if raw_query is None or (isinstance(raw_query, str) and not raw_query.strip()):\n self.log(\"No query provided for raw search - returning empty results\")\n return Data(data={})\n\n if isinstance(raw_query, 
dict):\n query_body = copy.deepcopy(raw_query)\n elif isinstance(raw_query, str):\n s = raw_query.strip()\n\n # First, optimistically try to parse as JSON DSL\n try:\n query_body = json.loads(s)\n except json.JSONDecodeError:\n # Fallback: treat as a basic text query over common fields\n query_body = {\n \"query\": {\n \"multi_match\": {\n \"query\": s,\n \"fields\": [\"text^2\", \"filename^1.5\"],\n \"type\": \"best_fields\",\n \"fuzziness\": \"AUTO\",\n }\n }\n }\n else:\n msg = f\"Unsupported raw_search query type: {type(raw_query)!r}\"\n raise TypeError(msg)\n\n filter_obj = self._parse_filter_expression()\n filter_clauses = self._coerce_filter_clauses(filter_obj)\n\n if filter_clauses:\n if \"query\" in query_body:\n original_query = query_body[\"query\"]\n query_body[\"query\"] = {\n \"bool\": {\n \"must\": [original_query],\n \"filter\": filter_clauses,\n }\n }\n else:\n query_body[\"query\"] = {\n \"bool\": {\n \"must\": [{\"match_all\": {}}],\n \"filter\": filter_clauses,\n }\n }\n\n if filter_obj:\n # Apply limit if not already set in the raw query\n if \"size\" not in query_body:\n limit = self._resolve_limit(filter_obj, default_limit=None)\n if limit is not None:\n query_body[\"size\"] = limit\n\n # Apply score_threshold / scoreThreshold as min_score if not already set\n if \"min_score\" not in query_body:\n\n score_threshold = self._resolve_score_threshold(filter_obj)\n if score_threshold is not None:\n\n query_body[\"min_score\"] = score_threshold\n\n client = self.build_client()\n logger.info(f\"query: {query_body}\")\n resp = client.search(\n index=self.index_name,\n body=query_body,\n params={\"terminate_after\": 0},\n )\n # Remove any _source keys whose value is a list of floats (embedding vectors)\n # Minimum length threshold to identify embedding vectors\n min_vector_length = 100\n\n def is_vector(val):\n # Accepts if it's a list of numbers (float or int) and has reasonable vector length\n return (\n isinstance(val, list)\n and len(val) > 
min_vector_length\n and all(isinstance(x, (float, int)) for x in val)\n )\n\n if \"hits\" in resp and \"hits\" in resp[\"hits\"]:\n for hit in resp[\"hits\"][\"hits\"]:\n source = hit.get(\"_source\")\n if isinstance(source, dict):\n keys_to_remove = [k for k, v in source.items() if is_vector(v)]\n for k in keys_to_remove:\n source.pop(k)\n logger.info(f\"Raw search response (all embedding vectors removed): {resp}\")\n return Data(**resp)\n\n def _get_embedding_model_name(self, embedding_obj=None) -> str:\n \"\"\"Get the embedding model name from component config or embedding object.\n\n Priority: deployment > model > model_id > model_name\n This ensures we use the actual model being deployed, not just the configured model.\n Supports multiple embedding providers (OpenAI, Watsonx, Cohere, etc.)\n\n Args:\n embedding_obj: Specific embedding object to get name from (optional)\n\n Returns:\n Embedding model name\n\n Raises:\n ValueError: If embedding model name cannot be determined\n \"\"\"\n # First try explicit embedding_model_name input\n if hasattr(self, \"embedding_model_name\") and self.embedding_model_name:\n return self.embedding_model_name.strip()\n\n # Try to get from provided embedding object\n if embedding_obj:\n # Priority: deployment > model > model_id > model_name\n if hasattr(embedding_obj, \"deployment\") and embedding_obj.deployment:\n return str(embedding_obj.deployment)\n if hasattr(embedding_obj, \"model\") and embedding_obj.model:\n return str(embedding_obj.model)\n if hasattr(embedding_obj, \"model_id\") and embedding_obj.model_id:\n return str(embedding_obj.model_id)\n if hasattr(embedding_obj, \"model_name\") and embedding_obj.model_name:\n return str(embedding_obj.model_name)\n\n # Try to get from embedding component (legacy single embedding)\n if hasattr(self, \"embedding\") and self.embedding:\n # Handle list of embeddings\n if isinstance(self.embedding, list) and len(self.embedding) > 0:\n first_emb = self.embedding[0]\n if 
hasattr(first_emb, \"deployment\") and first_emb.deployment:\n return str(first_emb.deployment)\n if hasattr(first_emb, \"model\") and first_emb.model:\n return str(first_emb.model)\n if hasattr(first_emb, \"model_id\") and first_emb.model_id:\n return str(first_emb.model_id)\n if hasattr(first_emb, \"model_name\") and first_emb.model_name:\n return str(first_emb.model_name)\n # Handle single embedding\n elif not isinstance(self.embedding, list):\n if hasattr(self.embedding, \"deployment\") and self.embedding.deployment:\n return str(self.embedding.deployment)\n if hasattr(self.embedding, \"model\") and self.embedding.model:\n return str(self.embedding.model)\n if hasattr(self.embedding, \"model_id\") and self.embedding.model_id:\n return str(self.embedding.model_id)\n if hasattr(self.embedding, \"model_name\") and self.embedding.model_name:\n return str(self.embedding.model_name)\n\n msg = (\n \"Could not determine embedding model name. \"\n \"Please set the 'embedding_model_name' field or ensure the embedding component \"\n \"has a 'deployment', 'model', 'model_id', or 'model_name' attribute.\"\n )\n raise ValueError(msg)\n\n # ---------- helper functions for index management ----------\n def _default_text_mapping(\n self,\n dim: int,\n engine: str = \"jvector\",\n space_type: str = \"l2\",\n ef_search: int = 512,\n ef_construction: int = 100,\n m: int = 16,\n vector_field: str = \"vector_field\",\n ) -> dict[str, Any]:\n \"\"\"Create the default OpenSearch index mapping for vector search.\n\n This method generates the index configuration with k-NN settings optimized\n for approximate nearest neighbor search using the specified vector engine.\n Includes the embedding_model keyword field for tracking which model was used.\n\n Args:\n dim: Dimensionality of the vector embeddings\n engine: Vector search engine (jvector, nmslib, faiss, lucene)\n space_type: Distance metric for similarity calculation\n ef_search: Size of dynamic list used during search\n 
ef_construction: Size of dynamic list used during index construction\n m: Number of bidirectional links for each vector\n vector_field: Name of the field storing vector embeddings\n\n Returns:\n Dictionary containing OpenSearch index mapping configuration\n \"\"\"\n return {\n \"settings\": {\"index\": {\"knn\": True, \"knn.algo_param.ef_search\": ef_search}},\n \"mappings\": {\n \"properties\": {\n vector_field: {\n \"type\": \"knn_vector\",\n \"dimension\": dim,\n \"method\": {\n \"name\": \"disk_ann\",\n \"space_type\": space_type,\n \"engine\": engine,\n \"parameters\": {\"ef_construction\": ef_construction, \"m\": m},\n },\n },\n \"embedding_model\": {\"type\": \"keyword\"}, # Track which model was used\n \"embedding_dimensions\": {\"type\": \"integer\"},\n }\n },\n }\n\n def _ensure_embedding_field_mapping(\n self,\n client: OpenSearch,\n index_name: str,\n field_name: str,\n dim: int,\n engine: str,\n space_type: str,\n ef_construction: int,\n m: int,\n ) -> None:\n \"\"\"Lazily add a dynamic embedding field to the index if it doesn't exist.\n\n This allows adding new embedding models without recreating the entire index.\n Also ensures the embedding_model tracking field exists.\n\n Note: Some OpenSearch versions/configurations have issues with dynamically adding\n knn_vector mappings (NullPointerException). 
This method checks if the field\n already exists before attempting to add it, and gracefully skips if the field\n is already properly configured.\n\n Args:\n client: OpenSearch client instance\n index_name: Target index name\n field_name: Dynamic field name for this embedding model\n dim: Vector dimensionality\n engine: Vector search engine\n space_type: Distance metric\n ef_construction: Construction parameter\n m: HNSW parameter\n \"\"\"\n # First, check if the field already exists and is properly mapped\n properties = self._get_index_properties(client)\n if self._is_knn_vector_field(properties, field_name):\n # Field already exists as knn_vector - verify dimensions match\n existing_dim = self._get_field_dimension(properties, field_name)\n if existing_dim is not None and existing_dim != dim:\n logger.warning(\n f\"Field '{field_name}' exists with dimension {existing_dim}, \"\n f\"but current embedding has dimension {dim}. Using existing mapping.\"\n )\n else:\n logger.info(\n f\"[OpenSearchMultimodel] Field '{field_name}' already exists\"\n f\"as knn_vector with matching dimensions - skipping mapping update\"\n )\n return\n\n # Field doesn't exist, try to add the mapping\n try:\n mapping = {\n \"properties\": {\n field_name: {\n \"type\": \"knn_vector\",\n \"dimension\": dim,\n \"method\": {\n \"name\": \"disk_ann\",\n \"space_type\": space_type,\n \"engine\": engine,\n \"parameters\": {\"ef_construction\": ef_construction, \"m\": m},\n },\n },\n # Also ensure the embedding_model tracking field exists as keyword\n \"embedding_model\": {\"type\": \"keyword\"},\n \"embedding_dimensions\": {\"type\": \"integer\"},\n }\n }\n client.indices.put_mapping(index=index_name, body=mapping)\n logger.info(f\"Added/updated embedding field mapping: {field_name}\")\n except RequestError as e:\n error_str = str(e).lower()\n if \"invalid engine\" in error_str and \"jvector\" in error_str:\n msg = (\n \"The 'jvector' engine is not available in your OpenSearch installation. 
\"\n \"Use 'nmslib' or 'faiss' for standard OpenSearch, or upgrade to OpenSearch 2.9+.\"\n )\n raise ValueError(msg) from e\n if \"index.knn\" in error_str:\n msg = (\n \"The index has index.knn: false. Delete the existing index and let the \"\n \"component recreate it, or create a new index with a different name.\"\n )\n raise ValueError(msg) from e\n raise\n except Exception as e:\n # Check if this is the known OpenSearch k-NN NullPointerException issue\n error_str = str(e).lower()\n if \"null\" in error_str or \"nullpointerexception\" in error_str:\n logger.warning(\n f\"[OpenSearchMultimodel] Could not add embedding field mapping for {field_name}\"\n f\"due to OpenSearch k-NN plugin issue: {e}. \"\n f\"This is a known issue with some OpenSearch versions. \"\n f\"[OpenSearchMultimodel] Skipping mapping update. \"\n f\"Please ensure the index has the correct mapping for KNN search to work.\"\n )\n # Skip and continue - ingestion will proceed, but KNN search may fail if mapping doesn't exist\n return\n logger.warning(\n f\"[OpenSearchMultimodel] Could not add embedding field mapping for {field_name}: {e}\"\n )\n raise\n\n # Verify the field was added correctly\n properties = self._get_index_properties(client)\n if not self._is_knn_vector_field(properties, field_name):\n msg = f\"Field '{field_name}' is not mapped as knn_vector. Current mapping: {properties.get(field_name)}\"\n logger.error(msg)\n raise ValueError(msg)\n\n def _validate_aoss_with_engines(self, *, is_aoss: bool, engine: str) -> None:\n \"\"\"Validate engine compatibility with Amazon OpenSearch Serverless (AOSS).\n\n Amazon OpenSearch Serverless has restrictions on which vector engines\n can be used. 
This method ensures the selected engine is compatible.\n\n Args:\n is_aoss: Whether the connection is to Amazon OpenSearch Serverless\n engine: The selected vector search engine\n\n Raises:\n ValueError: If AOSS is used with an incompatible engine\n \"\"\"\n if is_aoss and engine not in {\"nmslib\", \"faiss\"}:\n msg = \"Amazon OpenSearch Service Serverless only supports `nmslib` or `faiss` engines\"\n raise ValueError(msg)\n\n def _is_aoss_enabled(self, http_auth: Any) -> bool:\n \"\"\"Determine if Amazon OpenSearch Serverless (AOSS) is being used.\n\n Args:\n http_auth: The HTTP authentication object\n\n Returns:\n True if AOSS is enabled, False otherwise\n \"\"\"\n return (\n http_auth is not None and hasattr(http_auth, \"service\") and http_auth.service == \"aoss\"\n )\n\n @staticmethod\n def _openrag_input_to_str(value: Any) -> str:\n if value is None:\n return \"\"\n if hasattr(value, \"get_secret_value\"):\n value = value.get_secret_value()\n if hasattr(value, \"text\"):\n value = value.text\n return str(value or \"\").strip()\n\n def _openrag_callback_value(self, attr_name: str) -> str:\n value = self._openrag_input_to_str(getattr(self, attr_name, \"\"))\n if value == self._openrag_ingest_global_placeholders.get(attr_name):\n return \"\"\n return value\n\n def _openrag_ingest_callback_config(self) -> tuple[str, str, str] | None:\n url = self._openrag_callback_value(\"openrag_ingest_url\")\n token = self._openrag_callback_value(\"openrag_ingest_token\")\n ingest_run_id = self._openrag_callback_value(\"openrag_ingest_run_id\")\n\n masked_token = (\n f\"{token[:4]}...{token[-4:]}\" if len(token) >= 8 else (\"\" if token else \"\")\n )\n debug_payload = {\n \"openrag_ingest_url\": url,\n \"openrag_ingest_url_len\": len(url),\n \"openrag_ingest_token_masked\": masked_token,\n \"openrag_ingest_token_len\": len(token),\n \"openrag_ingest_run_id\": ingest_run_id,\n \"raw_url_type\": type(self.openrag_ingest_url).__name__,\n \"raw_token_type\": 
type(self.openrag_ingest_token).__name__,\n \"raw_run_id_type\": type(self.openrag_ingest_run_id).__name__,\n }\n logger.warning(f\"[OpenRAG callback config] {debug_payload}\")\n try:\n self.log(f\"[OpenRAG callback config] {debug_payload}\")\n except Exception:\n pass\n\n if not url and not token and not ingest_run_id:\n return None\n if not url or not token or not ingest_run_id:\n msg = \"OpenRAG ingest callback requires url, token, and ingest_run_id.\"\n raise ValueError(msg)\n return url, token, ingest_run_id\n\n def _post_openrag_ingest_batches(\n self,\n *,\n requests: list[dict],\n vector_field: str,\n text_field: str,\n ) -> None:\n callback_config = self._openrag_ingest_callback_config()\n if callback_config is None:\n return\n\n url, token, ingest_run_id = callback_config\n batch_size = max(self._parse_int_param(\"openrag_ingest_batch_size\", 100), 1)\n timeout = self._parse_int_param(\"request_timeout\", REQUEST_TIMEOUT)\n headers = {\"Authorization\": f\"Bearer {token}\"}\n\n masked_token = (\n f\"{token[:4]}...{token[-4:]}\" if len(token) >= 8 else (\"\" if token else \"\")\n )\n request_summary = {\n \"url\": url,\n \"ingest_run_id\": ingest_run_id,\n \"token_masked\": masked_token,\n \"total_chunks\": len(requests),\n \"batch_size\": batch_size,\n \"timeout_s\": timeout,\n }\n logger.warning(f\"[OpenRAG ingest POST] {request_summary}\")\n try:\n self.log(f\"[OpenRAG ingest POST] {request_summary}\")\n except Exception:\n pass\n\n with httpx.Client(timeout=timeout) as client:\n total_batches = (len(requests) + batch_size - 1) // batch_size\n for batch_number, start in enumerate(range(0, len(requests), batch_size), start=1):\n batch = requests[start : start + batch_size]\n final = batch_number == total_batches\n payload = {\n \"ingest_run_id\": ingest_run_id,\n \"batch_id\": batch_number,\n \"final\": final,\n \"chunks\": [\n self._openrag_chunk_payload(\n request,\n vector_field=vector_field,\n text_field=text_field,\n )\n for request in batch\n ],\n 
}\n logger.warning(\n f\"[OpenRAG ingest POST] -> batch={batch_number}/{total_batches} \"\n f\"url={url} chunks={len(payload['chunks'])} final={final}\"\n )\n response = client.post(url, json=payload, headers=headers)\n response_summary = {\n \"batch\": batch_number,\n \"url\": url,\n \"status\": response.status_code,\n \"final_url\": str(response.request.url),\n \"response_headers\": dict(response.headers),\n \"body_preview\": response.text[:500],\n }\n logger.warning(f\"[OpenRAG ingest POST resp] {response_summary}\")\n try:\n self.log(f\"[OpenRAG ingest POST resp] {response_summary}\")\n except Exception:\n pass\n if response.status_code >= 400:\n msg = (\n \"OpenRAG ingest callback failed \"\n f\"(batch={batch_number}, status={response.status_code}, \"\n f\"url={url}): {response.text[:1000]}\"\n )\n raise RuntimeError(msg)\n\n self.log(f\"Posted {len(requests)} chunks to OpenRAG backend ingest callback.\")\n\n @staticmethod\n def _openrag_chunk_payload(\n request: dict,\n *,\n vector_field: str,\n text_field: str,\n ) -> dict:\n metadata = {\n key: value\n for key, value in request.items()\n if key not in {\"_op_type\", \"_index\", \"_id\", \"id\", vector_field, text_field}\n }\n page = metadata.get(\"page\")\n if isinstance(page, str) and page.isdigit():\n page = int(page)\n return {\n \"id\": request.get(\"_id\") or request.get(\"id\"),\n \"text\": request.get(text_field, \"\"),\n \"vector\": request[vector_field],\n \"page\": page if isinstance(page, int) else None,\n \"metadata\": metadata,\n }\n\n def _bulk_ingest_embeddings(\n self,\n client: OpenSearch,\n index_name: str,\n embeddings: list[list[float]],\n texts: list[str],\n metadatas: list[dict] | None = None,\n ids: list[str] | None = None,\n vector_field: str = \"vector_field\",\n text_field: str = \"text\",\n embedding_model: str = \"unknown\",\n mapping: dict | None = None,\n max_chunk_bytes: int | None = 1 * 1024 * 1024,\n *,\n is_aoss: bool = False,\n ) -> list[str]:\n \"\"\"Efficiently ingest 
multiple documents with embeddings into OpenSearch.\n\n This method uses bulk operations to insert documents with their vector\n embeddings and metadata into the specified OpenSearch index. Each document\n is tagged with the embedding_model name for tracking.\n\n Args:\n client: OpenSearch client instance\n index_name: Target index for document storage\n embeddings: List of vector embeddings for each document\n texts: List of document texts\n metadatas: Optional metadata dictionaries for each document\n ids: Optional document IDs (UUIDs generated if not provided)\n vector_field: Field name for storing vector embeddings\n text_field: Field name for storing document text\n embedding_model: Name of the embedding model used\n mapping: Optional index mapping configuration\n max_chunk_bytes: Maximum size per bulk request chunk\n is_aoss: Whether using Amazon OpenSearch Serverless\n\n Returns:\n List of document IDs that were successfully ingested\n \"\"\"\n logger.debug(f\"[OpenSearchMultimodel] Bulk ingesting embeddings for {index_name}\")\n if not mapping:\n mapping = {}\n\n requests = []\n return_ids = []\n vector_dimensions = len(embeddings[0]) if embeddings else None\n\n for i, text in enumerate(texts):\n metadata = metadatas[i] if metadatas else {}\n if vector_dimensions is not None and \"embedding_dimensions\" not in metadata:\n metadata = {**metadata, \"embedding_dimensions\": vector_dimensions}\n\n # Normalize ACL fields that may arrive as JSON strings from flows\n for key in (\"allowed_users\", \"allowed_groups\", \"allowed_principals\"):\n value = metadata.get(key)\n if isinstance(value, str):\n try:\n parsed = json.loads(value)\n if isinstance(parsed, list):\n metadata[key] = parsed\n except (json.JSONDecodeError, TypeError):\n # Leave value as-is if it isn't valid JSON\n pass\n\n metadata_document_id = str(metadata.get(\"document_id\") or \"\").strip()\n if metadata_document_id and metadata_document_id.lower() != \"none\":\n generated_id = 
f\"{metadata_document_id}_{i}\"\n else:\n generated_id = str(uuid.uuid4())\n _id = ids[i] if ids else generated_id\n request = {\n \"_op_type\": \"index\",\n \"_index\": index_name,\n vector_field: embeddings[i],\n text_field: text,\n \"embedding_model\": embedding_model, # Track which model was used\n **metadata,\n }\n if is_aoss:\n request[\"id\"] = _id\n else:\n request[\"_id\"] = _id\n requests.append(request)\n return_ids.append(_id)\n if metadatas:\n self.log(f\"Sample metadata: {metadatas[0] if metadatas else {}}\")\n if self._openrag_ingest_callback_config() is not None:\n self._post_openrag_ingest_batches(\n requests=requests,\n vector_field=vector_field,\n text_field=text_field,\n )\n return return_ids\n try:\n helpers.bulk(client, requests, max_chunk_bytes=max_chunk_bytes)\n except Exception as bulk_error:\n if \"Unsupported request type for filter level DLS\" not in str(bulk_error):\n raise\n logger.warning(\n \"[OpenSearchMultimodel] Bulk ingest is blocked by filter-level DLS; \"\n \"falling back to per-document index requests.\"\n )\n self._index_embeddings_individually(client, requests)\n return return_ids\n\n def _index_embeddings_individually(\n self,\n client: OpenSearch,\n requests: list[dict],\n ) -> None:\n \"\"\"Index documents one at a time when OpenSearch DLS rejects bulk writes.\"\"\"\n for request in requests:\n document_id = request.get(\"_id\") or request.get(\"id\")\n body = {\n key: value\n for key, value in request.items()\n if key not in {\"_op_type\", \"_index\", \"_id\", \"id\"}\n }\n client.index(index=request[\"_index\"], id=document_id, body=body)\n\n def _log_index_admin_skip(self, operation: str, error: Exception) -> None:\n \"\"\"Log index-admin operations that may be blocked under filter-level DLS.\"\"\"\n logger.warning(\n f\"[OpenSearchMultimodel] Could not run index-admin operation '{operation}': {error}. 
\"\n \"Assuming the backend pre-created the required index/mapping and continuing.\"\n )\n\n # ---------- param helpers ----------\n def _parse_int_param(self, attr_name: str, default: int) -> int:\n \"\"\"Parse a string attribute to int, returning *default* on failure.\"\"\"\n raw = getattr(self, attr_name, None)\n if raw is None or str(raw).strip() == \"\":\n return default\n try:\n value = int(str(raw).strip())\n except ValueError:\n logger.warning(\n f\"Invalid integer value '{raw}' for {attr_name}, using default {default}\"\n )\n return default\n\n if value < 0:\n logger.warning(f\"Negative value '{raw}' for {attr_name}, using default {default}\")\n return default\n\n return value\n\n # ---------- auth / client ----------\n def _build_auth_kwargs(self) -> dict[str, Any]:\n \"\"\"Build authentication configuration for OpenSearch client.\n\n Constructs the appropriate authentication parameters based on the\n selected auth mode (basic username/password or JWT token).\n\n Returns:\n Dictionary containing authentication configuration\n\n Raises:\n ValueError: If required authentication parameters are missing\n \"\"\"\n mode = (self.auth_mode or \"basic\").strip().lower()\n if mode == \"jwt\":\n token = (self.jwt_token or \"\").strip()\n if not token:\n msg = \"Auth Mode is 'jwt' but no jwt_token was provided.\"\n raise ValueError(msg)\n header_name = (self.jwt_header or \"Authorization\").strip()\n header_value = f\"Bearer {token}\" if self.bearer_prefix else token\n return {\"headers\": {header_name: header_value}}\n if mode == \"openrag\":\n # Writes are delegated to the OpenRAG backend ingest callback,\n # so no direct OpenSearch credentials are needed. 
Only the\n # OPENRAG_* fields are required for ingestion to function.\n missing = [\n name\n for name, value in (\n (\"openrag_ingest_url\", self.openrag_ingest_url),\n (\"openrag_ingest_token\", self.openrag_ingest_token),\n (\"openrag_ingest_run_id\", self.openrag_ingest_run_id),\n )\n if not (value or \"\").strip()\n ]\n if missing:\n msg = (\n \"Auth Mode is 'openrag' but required OPENRAG_* fields are \"\n f\"missing: {', '.join(missing)}.\"\n )\n raise ValueError(msg)\n return {}\n user = (self.username or \"\").strip()\n pwd = (self.password or \"\").strip()\n if not user or not pwd:\n msg = \"Auth Mode is 'basic' but username/password are missing.\"\n raise ValueError(msg)\n return {\"http_auth\": (user, pwd)}\n\n def build_client(self) -> OpenSearch:\n \"\"\"Create and configure an OpenSearch client instance.\n\n Returns:\n Configured OpenSearch client ready for operations\n \"\"\"\n logger.debug(\"[OpenSearchMultimodel] Building OpenSearch client\")\n auth_kwargs = self._build_auth_kwargs()\n return OpenSearch(\n hosts=[self.opensearch_url],\n use_ssl=self.use_ssl,\n verify_certs=self.verify_certs,\n ssl_assert_hostname=False,\n ssl_show_warn=False,\n timeout=self._parse_int_param(\"request_timeout\", REQUEST_TIMEOUT),\n max_retries=self._parse_int_param(\"max_retries\", MAX_RETRIES),\n retry_on_timeout=True,\n **auth_kwargs,\n )\n\n @check_cached_vector_store\n def build_vector_store(self) -> OpenSearch:\n # Return raw OpenSearch client as our \"vector store.\"\n client = self.build_client()\n\n # Check if we're in ingestion-only mode (no search query)\n has_search_query = bool((self.search_query or \"\").strip())\n if not has_search_query:\n logger.debug(\n \"[OpenSearchMultimodel] Ingestion-only mode activated: search operations will be skipped\"\n )\n logger.debug(\"[OpenSearchMultimodel] Starting ingestion mode...\")\n\n logger.debug(f\"[OpenSearchMultimodel] Embedding: {self.embedding}\")\n self._add_documents_to_vector_store(client=client)\n return 
client\n\n # ---------- ingest ----------\n def _add_documents_to_vector_store(self, client: OpenSearch) -> None:\n \"\"\"Process and ingest documents into the OpenSearch vector store.\n\n This method handles the complete document ingestion pipeline:\n - Prepares document data and metadata\n - Generates vector embeddings using the selected model\n - Creates appropriate index mappings with dynamic field names\n - Bulk inserts documents with vectors and model tracking\n\n Args:\n client: OpenSearch client for performing operations\n \"\"\"\n logger.debug(\"[OpenSearchMultimodel][INGESTION] _add_documents_to_vector_store called\")\n # Convert DataFrame to Data if needed using parent's method\n self.ingest_data = self._prepare_ingest_data()\n\n logger.debug(\n f\"[OpenSearchMultimodel][INGESTION] ingest_data type: \"\n f\"{type(self.ingest_data)}, length: {len(self.ingest_data) if self.ingest_data else 0}\"\n )\n logger.debug(\n f\"[OpenSearchMultimodel][INGESTION] ingest_data content: \"\n f\"{self.ingest_data[:2] if self.ingest_data and len(self.ingest_data) > 0 else 'empty'}\"\n )\n\n docs = self.ingest_data or []\n if not docs:\n logger.debug(\"Ingestion complete: No documents provided\")\n return\n\n if not self.embedding:\n msg = \"Embedding handle is required to embed documents.\"\n raise ValueError(msg)\n\n # Normalize embedding to list first\n embeddings_list = self.embedding if isinstance(self.embedding, list) else [self.embedding]\n\n # Filter out None values (fail-safe mode) - do this BEFORE checking if empty\n embeddings_list = [e for e in embeddings_list if e is not None]\n\n # NOW check if we have any valid embeddings left after filtering\n if not embeddings_list:\n logger.warning(\n \"All embeddings returned None (fail-safe mode enabled). Skipping document ingestion.\"\n )\n self.log(\n \"Embedding returned None (fail-safe mode enabled). 
Skipping document ingestion.\"\n )\n return\n\n logger.debug(\n f\"[OpenSearchMultimodel][INGESTION] Valid embeddings after filtering: {len(embeddings_list)}\"\n )\n self.log(\n f\"[OpenSearchMultimodel][INGESTION] Available embedding models: {len(embeddings_list)}\"\n )\n\n # Select the embedding to use for ingestion\n selected_embedding = None\n embedding_model = None\n\n # If embedding_model_name is specified, find matching embedding\n if (\n hasattr(self, \"embedding_model_name\")\n and self.embedding_model_name\n and self.embedding_model_name.strip()\n ):\n target_model_name = self.embedding_model_name.strip()\n self.log(f\"Looking for embedding model: {target_model_name}\")\n\n for emb_obj in embeddings_list:\n # Check all possible model identifiers (deployment, model, model_id, model_name)\n # Also check available_models list from EmbeddingsWithModels\n possible_names = []\n deployment = getattr(emb_obj, \"deployment\", None)\n model = getattr(emb_obj, \"model\", None)\n model_id = getattr(emb_obj, \"model_id\", None)\n model_name = getattr(emb_obj, \"model_name\", None)\n available_models_attr = getattr(emb_obj, \"available_models\", None)\n\n if deployment:\n possible_names.append(str(deployment))\n if model:\n possible_names.append(str(model))\n if model_id:\n possible_names.append(str(model_id))\n if model_name:\n possible_names.append(str(model_name))\n\n # Also add combined identifier\n if deployment and model and deployment != model:\n possible_names.append(f\"{deployment}:{model}\")\n\n # Add all models from available_models dict\n if available_models_attr and isinstance(available_models_attr, dict):\n possible_names.extend(\n str(model_key).strip()\n for model_key in available_models_attr\n if model_key and str(model_key).strip()\n )\n\n # Match if target matches any of the possible names\n if target_model_name in possible_names:\n # Check if target is in available_models dict - use dedicated instance\n if (\n available_models_attr\n and 
isinstance(available_models_attr, dict)\n and target_model_name in available_models_attr\n ):\n # Use the dedicated embedding instance from the dict\n selected_embedding = available_models_attr[target_model_name]\n embedding_model = target_model_name\n self.log(\n f\"Found dedicated embedding instance for '{embedding_model}' in available_models dict\"\n )\n else:\n # Traditional identifier match\n selected_embedding = emb_obj\n embedding_model = self._get_embedding_model_name(emb_obj)\n self.log(\n f\"Found matching embedding model: {embedding_model} (matched on: {target_model_name})\"\n )\n break\n\n if not selected_embedding:\n # Build detailed list of available embeddings with all their identifiers\n available_info = []\n for idx, emb in enumerate(embeddings_list):\n emb_type = type(emb).__name__\n identifiers = []\n deployment = getattr(emb, \"deployment\", None)\n model = getattr(emb, \"model\", None)\n model_id = getattr(emb, \"model_id\", None)\n model_name = getattr(emb, \"model_name\", None)\n available_models_attr = getattr(emb, \"available_models\", None)\n\n if deployment:\n identifiers.append(f\"deployment='{deployment}'\")\n if model:\n identifiers.append(f\"model='{model}'\")\n if model_id:\n identifiers.append(f\"model_id='{model_id}'\")\n if model_name:\n identifiers.append(f\"model_name='{model_name}'\")\n\n # Add combined identifier as an option\n if deployment and model and deployment != model:\n identifiers.append(f\"combined='{deployment}:{model}'\")\n\n # Add available_models dict if present\n if available_models_attr and isinstance(available_models_attr, dict):\n identifiers.append(f\"available_models={list(available_models_attr.keys())}\")\n\n available_info.append(\n f\" [{idx}] {emb_type}: {', '.join(identifiers) if identifiers else 'No identifiers'}\"\n )\n\n msg = (\n f\"Embedding model '{target_model_name}' not found in available embeddings.\\n\\n\"\n f\"Available embeddings:\\n\" + \"\\n\".join(available_info) + \"\\n\\n\"\n \"Please 
set 'embedding_model_name' to one of the identifier values shown above \"\n \"(use the value after the '=' sign, without quotes).\\n\"\n \"For duplicate deployments, use the 'combined' format.\\n\"\n \"Or leave it empty to use the first embedding.\"\n )\n raise ValueError(msg)\n else:\n # Use first embedding if no model name specified\n selected_embedding = embeddings_list[0]\n embedding_model = self._get_embedding_model_name(selected_embedding)\n self.log(f\"No embedding_model_name specified, using first embedding: {embedding_model}\")\n\n dynamic_field_name = get_embedding_field_name(embedding_model)\n\n logger.info(f\"Selected embedding model for ingestion: '{embedding_model}'\")\n self.log(f\"Using embedding model for ingestion: {embedding_model}\")\n self.log(f\"Dynamic vector field: {dynamic_field_name}\")\n\n # Log embedding details for debugging\n if hasattr(selected_embedding, \"deployment\"):\n logger.info(f\"Embedding deployment: {selected_embedding.deployment}\")\n if hasattr(selected_embedding, \"model\"):\n logger.info(f\"Embedding model: {selected_embedding.model}\")\n if hasattr(selected_embedding, \"model_id\"):\n logger.info(f\"Embedding model_id: {selected_embedding.model_id}\")\n if hasattr(selected_embedding, \"dimensions\"):\n logger.info(f\"Embedding dimensions: {selected_embedding.dimensions}\")\n if hasattr(selected_embedding, \"available_models\"):\n logger.info(f\"Embedding available_models: {selected_embedding.available_models}\")\n\n # No model switching needed - each model in available_models has its own dedicated instance\n # The selected_embedding is already configured correctly for the target model\n logger.info(\n f\"Using embedding instance for '{embedding_model}' - pre-configured and ready to use\"\n )\n\n # Extract texts and metadata from documents\n texts = []\n metadatas = []\n # Process docs_metadata table input into a dict\n additional_metadata = {}\n logger.debug(f\"[LF] Docs metadata {self.docs_metadata}\")\n if 
hasattr(self, \"docs_metadata\") and self.docs_metadata:\n logger.info(f\"[LF] Docs metadata {self.docs_metadata}\")\n if isinstance(self.docs_metadata[-1], Data):\n logger.info(f\"[LF] Docs metadata is a Data object {self.docs_metadata}\")\n self.docs_metadata = self.docs_metadata[-1].data\n logger.info(f\"[LF] Docs metadata is a Data object {self.docs_metadata}\")\n additional_metadata.update(self.docs_metadata)\n else:\n for item in self.docs_metadata:\n if isinstance(item, dict) and \"key\" in item and \"value\" in item:\n additional_metadata[item[\"key\"]] = item[\"value\"]\n # Replace string \"None\" values with actual None\n for key, value in additional_metadata.items():\n if value == \"None\":\n additional_metadata[key] = None\n logger.info(f\"[LF] Additional metadata {additional_metadata}\")\n for doc_obj in docs:\n data_copy = json.loads(doc_obj.model_dump_json())\n text = data_copy.pop(doc_obj.text_key, doc_obj.default_value)\n texts.append(text)\n\n # Merge additional metadata from table input\n data_copy.update(additional_metadata)\n\n metadatas.append(data_copy)\n self.log(metadatas)\n\n # Determine whether the selected embedding is watsonx/IBM. The watsonx\n # SDK ships its own rate-limit machinery (input batching, proactive\n # x-requests-limit-* TokenBucket throttling, and jittered exponential\n # backoff on 429), so we lean on it instead of retrying on top of it.\n # The type-name check also covers watsonx-hosted, non-\"ibm/\" models\n # (e.g. 
intfloat/multilingual-e5-large).\n is_ibm = (embedding_model and \"ibm\" in str(embedding_model).lower()) or (\n selected_embedding and \"watsonx\" in type(selected_embedding).__name__.lower()\n )\n logger.debug(f\"Is IBM/watsonx embedding: {is_ibm}\")\n\n if is_ibm:\n\n # Hand the full batch to the SDK and let it batch/throttle/retry.\n # Retry attempts and base backoff are tunable via the SDK's own\n # WATSONX_MAX_RETRIES / WATSONX_DELAY_TIME environment variables.\n logger.info(\n f\"Embedding {len(texts)} chunks via watsonx SDK batch (SDK-managed throttle + 429 retry)\"\n )\n try:\n vectors: list[list[float]] = selected_embedding.embed_documents(texts)\n logger.info(f\"Successfully embedded {len(vectors)} chunks via watsonx SDK\")\n except Exception as embed_error:\n _log_watsonx_rate_limit_headers(embed_error)\n logger.error(\n f\"Failed to embed {len(texts)} chunks via watsonx SDK. Error: {embed_error}\",\n )\n raise\n\n else:\n # Non-watsonx providers (OpenAI, Ollama) lack the watsonx SDK's\n # built-in rate-limit handling, so embed per chunk in parallel with\n # a generic rate-limit-aware tenacity retry.\n vectors: list[list[float]] = [None] * len(texts)\n from tenacity import (\n retry,\n retry_if_exception,\n stop_after_attempt,\n wait_exponential,\n )\n\n def is_rate_limit_error(exception: Exception) -> bool:\n \"\"\"Check if exception is a rate limit error (429).\"\"\"\n error_str = str(exception).lower()\n return \"429\" in error_str or \"rate_limit\" in error_str or \"rate limit\" in error_str\n\n def is_other_retryable_error(exception: Exception) -> bool:\n \"\"\"Check if exception is a transient network error worth retrying.\"\"\"\n if is_rate_limit_error(exception):\n return False\n return isinstance(exception, (ConnectionError, TimeoutError, OSError))\n\n # Retry decorator for rate limit errors (longer backoff)\n retry_on_rate_limit = retry(\n retry=retry_if_exception(is_rate_limit_error),\n stop=stop_after_attempt(5),\n 
wait=wait_exponential(multiplier=2, min=2, max=30),\n reraise=True,\n before_sleep=lambda retry_state: logger.warning(\n f\"Rate limit hit for chunk (attempt {retry_state.attempt_number}/5), \"\n f\"backing off for {retry_state.next_action.sleep:.1f}s\"\n ),\n )\n\n # Retry decorator for other errors (shorter backoff)\n retry_on_other_errors = retry(\n retry=retry_if_exception(is_other_retryable_error),\n stop=stop_after_attempt(3),\n wait=wait_exponential(multiplier=1, min=1, max=8),\n reraise=True,\n before_sleep=lambda retry_state: logger.warning(\n f\"Error embedding chunk (attempt {retry_state.attempt_number}/3), \"\n f\"retrying in {retry_state.next_action.sleep:.1f}s: {retry_state.outcome.exception()}\"\n ),\n )\n\n def embed_chunk_with_retry(chunk_text: str, chunk_idx: int) -> list[float]:\n \"\"\"Embed a single chunk with rate-limit-aware retry logic.\"\"\"\n\n @retry_on_rate_limit\n @retry_on_other_errors\n def _embed(text: str) -> list[float]:\n return selected_embedding.embed_documents([text])[0]\n\n try:\n return _embed(chunk_text)\n except Exception as e:\n logger.error(\n f\"Failed to embed chunk {chunk_idx} after all retries: {e}\",\n error=str(e),\n )\n raise\n\n max_workers = min(max(len(texts), 1), 8)\n logger.debug(f\"Using parallel processing with {max_workers} workers\")\n\n with ThreadPoolExecutor(max_workers=max_workers) as executor:\n futures = {\n executor.submit(embed_chunk_with_retry, chunk, idx): idx\n for idx, chunk in enumerate(texts)\n }\n for future in as_completed(futures):\n idx = futures[future]\n vectors[idx] = future.result()\n\n if not vectors:\n self.log(f\"No vectors generated from documents for model {embedding_model}.\")\n return\n\n # Get vector dimension for mapping\n dim = len(vectors[0]) if vectors else 768 # default fallback\n use_openrag_ingest_callback = self._openrag_ingest_callback_config() is not None\n\n is_aoss = False\n mapping: dict | None = None\n\n engine = getattr(self, \"engine\", \"jvector\")\n\n if 
use_openrag_ingest_callback:\n self.log(\"Using OpenRAG backend ingest callback; skipping direct OpenSearch writes.\")\n else:\n # Check for AOSS\n auth_kwargs = self._build_auth_kwargs()\n is_aoss = self._is_aoss_enabled(auth_kwargs.get(\"http_auth\"))\n\n # Validate engine with AOSS\n self._validate_aoss_with_engines(is_aoss=is_aoss, engine=engine)\n\n # Create mapping with proper KNN settings\n space_type = getattr(self, \"space_type\", \"l2\")\n ef_construction = getattr(self, \"ef_construction\", 512)\n m = getattr(self, \"m\", 16)\n\n mapping = self._default_text_mapping(\n dim=dim,\n engine=engine,\n space_type=space_type,\n ef_construction=ef_construction,\n m=m,\n vector_field=dynamic_field_name, # Use dynamic field name\n )\n\n # Ensure index exists with baseline mapping (index.knn: true is required for vector search)\n index_exists = True\n try:\n index_exists = bool(client.indices.exists(index=self.index_name))\n except OpenSearchException as exists_error:\n self._log_index_admin_skip(\"indices.exists\", exists_error)\n\n try:\n if not index_exists:\n self.log(f\"Creating index '{self.index_name}' with base mapping\")\n client.indices.create(index=self.index_name, body=mapping)\n except RequestError as creation_error:\n if creation_error.error == \"resource_already_exists_exception\":\n pass # Index was created concurrently\n else:\n error_msg = str(creation_error).lower()\n if \"invalid engine\" in error_msg or \"illegal_argument\" in error_msg:\n if \"jvector\" in error_msg:\n msg = (\n \"The 'jvector' engine is not available in your OpenSearch installation. \"\n \"Use 'nmslib' or 'faiss' for standard OpenSearch, or upgrade to 2.9+.\"\n )\n raise ValueError(msg) from creation_error\n if \"index.knn\" in error_msg:\n msg = (\n \"The index has index.knn: false. 
Delete the existing index and let the \"\n \"component recreate it, or create a new index with a different name.\"\n )\n raise ValueError(msg) from creation_error\n logger.warning(f\"Failed to create index '{self.index_name}': {creation_error}\")\n raise\n\n # Ensure the dynamic field exists in the index\n self._ensure_embedding_field_mapping(\n client=client,\n index_name=self.index_name,\n field_name=dynamic_field_name,\n dim=dim,\n engine=engine,\n space_type=space_type,\n ef_construction=ef_construction,\n m=m,\n )\n\n self.log(\n f\"Indexing {len(texts)} documents into '{self.index_name}' with model '{embedding_model}'...\"\n )\n logger.info(f\"Will store embeddings in field: {dynamic_field_name}\")\n logger.info(f\"Will tag documents with embedding_model: {embedding_model}\")\n\n # Use the bulk ingestion with model tracking\n return_ids = self._bulk_ingest_embeddings(\n client=client,\n index_name=self.index_name,\n embeddings=vectors,\n texts=texts,\n metadatas=metadatas,\n vector_field=dynamic_field_name, # Use dynamic field name\n text_field=\"text\",\n embedding_model=embedding_model, # Track the model\n mapping=mapping,\n is_aoss=is_aoss,\n )\n self.log(metadatas)\n\n logger.info(\n f\"Ingestion complete: Successfully indexed {len(return_ids)} documents with model '{embedding_model}'\"\n )\n self.log(f\"Successfully indexed {len(return_ids)} documents with model {embedding_model}.\")\n\n # ---------- helpers for filters ----------\n def _is_placeholder_term(self, term_obj: dict) -> bool:\n # term_obj like {\"filename\": \"__IMPOSSIBLE_VALUE__\"}\n return any(v == \"__IMPOSSIBLE_VALUE__\" for v in term_obj.values())\n\n def _coerce_filter_clauses(self, filter_obj: dict | None) -> list[dict]:\n \"\"\"Convert filter expressions into OpenSearch-compatible filter clauses.\n\n This method accepts two filter formats and converts them to standardized\n OpenSearch query clauses:\n\n Format A - Explicit filters:\n {\"filter\": [{\"term\": {\"field\": \"value\"}}, 
{\"terms\": {\"field\": [\"val1\", \"val2\"]}}],\n \"limit\": 10, \"score_threshold\": 1.5}\n\n Format B - Context-style mapping:\n {\"data_sources\": [\"file1.pdf\"], \"document_types\": [\"pdf\"], \"owners\": [\"user1\"]}\n\n Args:\n filter_obj: Filter configuration dictionary or None\n\n Returns:\n List of OpenSearch filter clauses (term/terms objects)\n Placeholder values with \"__IMPOSSIBLE_VALUE__\" are ignored\n \"\"\"\n if not filter_obj:\n return []\n\n # If it is a string, try to parse it once\n if isinstance(filter_obj, str):\n try:\n filter_obj = json.loads(filter_obj)\n except json.JSONDecodeError:\n # Not valid JSON - treat as no filters\n return []\n\n # Case A: already an explicit list/dict under \"filter\"\n if \"filter\" in filter_obj:\n raw = filter_obj[\"filter\"]\n if isinstance(raw, dict):\n raw = [raw]\n explicit_clauses: list[dict] = []\n for f in raw or []:\n if (\n \"term\" in f\n and isinstance(f[\"term\"], dict)\n and not self._is_placeholder_term(f[\"term\"])\n ):\n explicit_clauses.append(f)\n elif \"terms\" in f and isinstance(f[\"terms\"], dict):\n field, vals = next(iter(f[\"terms\"].items()))\n if isinstance(vals, list) and len(vals) > 0:\n explicit_clauses.append(f)\n return explicit_clauses\n\n # Case B: convert context-style maps into clauses\n field_mapping = {\n \"data_sources\": \"filename\",\n \"document_types\": \"mimetype\",\n \"owners\": \"owner\",\n }\n context_clauses: list[dict] = []\n for k, values in filter_obj.items():\n if not isinstance(values, list):\n continue\n field = field_mapping.get(k, k)\n if len(values) == 0:\n # Match-nothing placeholder (kept to mirror your tool semantics)\n context_clauses.append({\"term\": {field: \"__IMPOSSIBLE_VALUE__\"}})\n elif len(values) == 1:\n if values[0] != \"__IMPOSSIBLE_VALUE__\":\n context_clauses.append({\"term\": {field: values[0]}})\n else:\n context_clauses.append({\"terms\": {field: values}})\n return context_clauses\n\n\n def _parse_filter_expression(self) -> dict | 
None:\n \"\"\"Parse and validate optional filter_expression JSON.\n\n Returns:\n Parsed JSON object as a dict, or None when unset/blank.\n\n Raises:\n ValueError: If JSON is invalid or does not decode to an object.\n \"\"\"\n filter_expression = getattr(self, \"filter_expression\", \"\")\n if not isinstance(filter_expression, str) or not filter_expression.strip():\n return None\n try:\n filter_obj = json.loads(filter_expression)\n except json.JSONDecodeError as e:\n msg = f\"Invalid filter_expression JSON: {e}\"\n raise ValueError(msg) from e\n\n if not isinstance(filter_obj, dict):\n msg = \"Invalid filter_expression JSON type: expected a JSON object.\"\n raise TypeError(msg)\n return filter_obj\n\n def _resolve_limit(self, filter_obj: dict | None, default_limit: int | None) -> int | None:\n \"\"\"Resolve an integer result limit from filter settings.\"\"\"\n if not filter_obj:\n return default_limit\n raw_limit = filter_obj.get(\"limit\", default_limit)\n if raw_limit is None:\n return None\n if isinstance(raw_limit, bool):\n msg = \"Invalid filter_expression.limit: expected a positive integer.\"\n raise TypeError(msg)\n try:\n limit = int(raw_limit)\n except (TypeError, ValueError) as e:\n msg = \"Invalid filter_expression.limit: expected a positive integer.\"\n raise ValueError(msg) from e\n if limit <= 0:\n msg = \"Invalid filter_expression.limit: expected a positive integer.\"\n raise ValueError(msg)\n return limit\n\n def _resolve_score_threshold(self, filter_obj: dict | None) -> float | None:\n \"\"\"Resolve optional positive min score from filter settings.\"\"\"\n if not filter_obj:\n return None\n score_threshold = filter_obj.get(\"score_threshold\")\n if score_threshold is None:\n score_threshold = filter_obj.get(\"scoreThreshold\")\n if not isinstance(score_threshold, (int, float)) or score_threshold <= 0:\n return None\n return float(score_threshold)\n\n def _detect_available_models(self, client: OpenSearch, filter_clauses: list[dict] | None = None) -> 
list[str]:\n\n \"\"\"Detect which embedding models have documents in the index.\n\n Uses aggregation to find all unique embedding_model values, optionally\n filtered to only documents matching the user's filter criteria.\n\n Args:\n client: OpenSearch client instance\n filter_clauses: Optional filter clauses to scope model detection\n\n Returns:\n List of embedding model names found in the index\n \"\"\"\n try:\n agg_query = {\n \"size\": 0,\n \"aggs\": {\"embedding_models\": {\"terms\": {\"field\": \"embedding_model\", \"size\": 10}}},\n }\n\n # Apply filters to model detection if any exist\n if filter_clauses:\n agg_query[\"query\"] = {\"bool\": {\"filter\": filter_clauses}}\n\n logger.debug(f\"Model detection query: {agg_query}\")\n result = client.search(\n index=self.index_name,\n body=agg_query,\n params={\"terminate_after\": 0},\n )\n buckets = result.get(\"aggregations\", {}).get(\"embedding_models\", {}).get(\"buckets\", [])\n models = [b[\"key\"] for b in buckets if b[\"key\"]]\n\n # Log detailed bucket info for debugging\n logger.info(\n f\"Detected embedding models in corpus: {models}\"\n + (f\" (with {len(filter_clauses)} filters)\" if filter_clauses else \"\")\n )\n if not models:\n total_hits = result.get(\"hits\", {}).get(\"total\", {})\n total_count = (\n total_hits.get(\"value\", 0) if isinstance(total_hits, dict) else total_hits\n )\n logger.warning(\n f\"No embedding_model values found in index '{self.index_name}'. \"\n f\"Total docs in index: {total_count}. 
\"\n f\"This may indicate documents were indexed without the embedding_model field.\"\n )\n except (OpenSearchException, KeyError, ValueError) as e:\n logger.warning(f\"Failed to detect embedding models: {e}\")\n # Fallback to current model\n fallback_model = self._get_embedding_model_name()\n logger.info(f\"Using fallback model: {fallback_model}\")\n return [fallback_model]\n else:\n return models\n\n def _get_index_properties(self, client: OpenSearch) -> dict[str, Any] | None:\n \"\"\"Retrieve flattened mapping properties for the current index.\"\"\"\n try:\n mapping = client.indices.get_mapping(index=self.index_name)\n except OpenSearchException as e:\n logger.warning(\n f\"Failed to fetch mapping for index '{self.index_name}': {e}. Proceeding without mapping metadata.\"\n )\n return None\n\n properties: dict[str, Any] = {}\n for index_data in mapping.values():\n props = index_data.get(\"mappings\", {}).get(\"properties\", {})\n if isinstance(props, dict):\n properties.update(props)\n return properties\n\n def _is_knn_vector_field(self, properties: dict[str, Any] | None, field_name: str) -> bool:\n \"\"\"Check whether the field is mapped as a knn_vector.\"\"\"\n if not field_name:\n return False\n if properties is None:\n logger.warning(\n f\"Mapping metadata unavailable; assuming field '{field_name}' is usable.\"\n )\n return True\n field_def = properties.get(field_name)\n if not isinstance(field_def, dict):\n return False\n if field_def.get(\"type\") == \"knn_vector\":\n return True\n\n nested_props = field_def.get(\"properties\")\n return bool(isinstance(nested_props, dict) and nested_props.get(\"type\") == \"knn_vector\")\n\n def _get_field_dimension(\n self, properties: dict[str, Any] | None, field_name: str\n ) -> int | None:\n \"\"\"Get the dimension of a knn_vector field from the index mapping.\n\n Args:\n properties: Index properties from mapping\n field_name: Name of the vector field\n\n Returns:\n Dimension of the field, or None if not found\n 
\"\"\"\n if not field_name or properties is None:\n return None\n\n field_def = properties.get(field_name)\n if not isinstance(field_def, dict):\n return None\n\n # Check direct knn_vector field\n if field_def.get(\"type\") == \"knn_vector\":\n return field_def.get(\"dimension\")\n\n # Check nested properties\n nested_props = field_def.get(\"properties\")\n if isinstance(nested_props, dict) and nested_props.get(\"type\") == \"knn_vector\":\n return nested_props.get(\"dimension\")\n\n return None\n\n def _get_filename_agg_field(self, index_properties: dict[str, Any] | None) -> str:\n \"\"\"Choose the appropriate field for filename aggregations.\"\"\"\n if not index_properties:\n return \"filename.keyword\"\n\n filename_def = index_properties.get(\"filename\")\n if not isinstance(filename_def, dict):\n return \"filename.keyword\"\n\n field_type = filename_def.get(\"type\")\n fields_def = filename_def.get(\"fields\", {})\n\n # Top-level keyword with no subfields\n if field_type == \"keyword\" and not isinstance(fields_def, dict):\n return \"filename\"\n\n # Text field with keyword subfield\n if isinstance(fields_def, dict) and \"keyword\" in fields_def:\n return \"filename.keyword\"\n\n # Fallback: aggregate on filename directly\n return \"filename\"\n\n # ---------- search (multi-model hybrid) ----------\n def search(self, query: str | None = None) -> list[dict[str, Any]]:\n \"\"\"Perform multi-model hybrid search combining multiple vector similarities and keyword matching.\n\n This method executes a sophisticated search that:\n 1. Auto-detects all embedding models present in the index\n 2. Generates query embeddings for ALL detected models in parallel\n 3. Combines multiple KNN queries using dis_max (picks best match)\n 4. Adds keyword search with fuzzy matching (30% weight)\n 5. Applies optional filtering and score thresholds\n 6. 
Returns aggregations for faceted search\n\n Search weights:\n - Semantic search (dis_max across all models): 70%\n - Keyword search: 30%\n\n Args:\n query: Search query string (used for both vector embedding and keyword search)\n\n Returns:\n List of search results with page_content, metadata, and relevance scores\n\n Raises:\n ValueError: If embedding component is not provided or filter JSON is invalid\n \"\"\"\n logger.info(self.ingest_data)\n client = self.build_client()\n q = (query or \"\").strip()\n\n # Parse optional filter expression\n filter_obj = self._parse_filter_expression()\n\n if not self.embedding:\n msg = \"Embedding is required to run hybrid search (KNN + keyword).\"\n raise ValueError(msg)\n\n # Check if embedding is None (fail-safe mode)\n if self.embedding is None or (\n isinstance(self.embedding, list) and all(e is None for e in self.embedding)\n ):\n logger.error(\"Embedding returned None (fail-safe mode enabled). Cannot perform search.\")\n return []\n\n # Build filter clauses first so we can use them in model detection\n filter_clauses = self._coerce_filter_clauses(filter_obj)\n\n # Detect available embedding models in the index (scoped by filters)\n available_models = self._detect_available_models(client, filter_clauses)\n\n if not available_models:\n logger.warning(\"No embedding models found in index, using current model\")\n available_models = [self._get_embedding_model_name()]\n\n # Generate embeddings for ALL detected models\n query_embeddings = {}\n\n # Normalize embedding to list\n embeddings_list = self.embedding if isinstance(self.embedding, list) else [self.embedding]\n # Filter out None values (fail-safe mode)\n embeddings_list = [e for e in embeddings_list if e is not None]\n\n if not embeddings_list:\n logger.error(\n \"No valid embeddings available after filtering None values (fail-safe mode). 
Cannot perform search.\"\n )\n return []\n\n # Create a comprehensive map of model names to embedding objects\n # Check all possible identifiers (deployment, model, model_id, model_name)\n # Also leverage available_models list from EmbeddingsWithModels\n # Handle duplicate identifiers by creating combined keys\n embedding_by_model = {}\n identifier_conflicts = {} # Track which identifiers have conflicts\n\n for idx, emb_obj in enumerate(embeddings_list):\n # Get all possible identifiers for this embedding\n identifiers = []\n deployment = getattr(emb_obj, \"deployment\", None)\n model = getattr(emb_obj, \"model\", None)\n model_id = getattr(emb_obj, \"model_id\", None)\n model_name = getattr(emb_obj, \"model_name\", None)\n dimensions = getattr(emb_obj, \"dimensions\", None)\n available_models_attr = getattr(emb_obj, \"available_models\", None)\n\n logger.info(\n f\"Embedding object {idx}: deployment={deployment}, model={model}, \"\n f\"model_id={model_id}, model_name={model_name}, dimensions={dimensions}, \"\n f\"available_models={available_models_attr}\"\n )\n\n # If this embedding has available_models dict, map all models to their dedicated instances\n if available_models_attr and isinstance(available_models_attr, dict):\n logger.info(\n f\"Embedding object {idx} provides {len(available_models_attr)} models via available_models dict\"\n )\n for model_name_key, dedicated_embedding in available_models_attr.items():\n if model_name_key and str(model_name_key).strip():\n model_str = str(model_name_key).strip()\n if model_str not in embedding_by_model:\n # Use the dedicated embedding instance from the dict\n embedding_by_model[model_str] = dedicated_embedding\n logger.info(\n f\"Mapped available model '{model_str}' to dedicated embedding instance\"\n )\n else:\n # Conflict detected - track it\n if model_str not in identifier_conflicts:\n identifier_conflicts[model_str] = [embedding_by_model[model_str]]\n identifier_conflicts[model_str].append(dedicated_embedding)\n 
logger.warning(\n f\"Available model '{model_str}' has conflict - used by multiple embeddings\"\n )\n\n # Also map traditional identifiers (for backward compatibility)\n if deployment:\n identifiers.append(str(deployment))\n if model:\n identifiers.append(str(model))\n if model_id:\n identifiers.append(str(model_id))\n if model_name:\n identifiers.append(str(model_name))\n\n # Map all identifiers to this embedding object\n for identifier in identifiers:\n if identifier not in embedding_by_model:\n embedding_by_model[identifier] = emb_obj\n logger.info(f\"Mapped identifier '{identifier}' to embedding object {idx}\")\n else:\n # Conflict detected - track it\n if identifier not in identifier_conflicts:\n identifier_conflicts[identifier] = [embedding_by_model[identifier]]\n identifier_conflicts[identifier].append(emb_obj)\n logger.warning(\n f\"Identifier '{identifier}' has conflict - used by multiple embeddings\"\n )\n\n # For embeddings with model+deployment, create combined identifier\n # This helps when deployment is the same but model differs\n if deployment and model and deployment != model:\n combined_id = f\"{deployment}:{model}\"\n if combined_id not in embedding_by_model:\n embedding_by_model[combined_id] = emb_obj\n logger.info(\n f\"Created combined identifier '{combined_id}' for embedding object {idx}\"\n )\n\n # Log conflicts\n if identifier_conflicts:\n logger.warning(\n f\"Found {len(identifier_conflicts)} conflicting identifiers. 
\"\n f\"Consider using combined format 'deployment:model' or specifying unique model names.\"\n )\n for conflict_id, emb_list in identifier_conflicts.items():\n logger.warning(\n f\" Conflict on '{conflict_id}': {len(emb_list)} embeddings use this identifier\"\n )\n\n logger.info(f\"Generating embeddings for {len(available_models)} models in index\")\n logger.info(f\"Available embedding identifiers: {list(embedding_by_model.keys())}\")\n self.log(f\"[SEARCH] Models detected in index: {available_models}\")\n self.log(f\"[SEARCH] Available embedding identifiers: {list(embedding_by_model.keys())}\")\n\n # Track matching status for debugging\n matched_models = []\n unmatched_models = []\n\n for model_name in available_models:\n try:\n # Check if we have an embedding object for this model\n if model_name in embedding_by_model:\n # Use the matching embedding object directly\n emb_obj = embedding_by_model[model_name]\n emb_deployment = getattr(emb_obj, \"deployment\", None)\n emb_model = getattr(emb_obj, \"model\", None)\n emb_model_id = getattr(emb_obj, \"model_id\", None)\n emb_dimensions = getattr(emb_obj, \"dimensions\", None)\n emb_available_models = getattr(emb_obj, \"available_models\", None)\n\n logger.info(\n f\"Using embedding object for model '{model_name}': \"\n f\"deployment={emb_deployment}, model={emb_model}, model_id={emb_model_id}, \"\n f\"dimensions={emb_dimensions}\"\n )\n\n # Check if this is a dedicated instance from available_models dict\n if emb_available_models and isinstance(emb_available_models, dict):\n logger.info(\n f\"Model '{model_name}' using dedicated instance from available_models dict \"\n f\"(pre-configured with correct model and dimensions)\"\n )\n\n # Use the embedding instance directly - no model switching needed!\n vec = emb_obj.embed_query(q)\n query_embeddings[model_name] = vec\n matched_models.append(model_name)\n logger.info(\n f\"Generated embedding for model: {model_name} (actual dimensions: {len(vec)})\"\n )\n 
self.log(f\"[MATCH] Model '{model_name}' - generated {len(vec)}-dim embedding\")\n else:\n # No matching embedding found for this model\n unmatched_models.append(model_name)\n logger.warning(\n f\"No matching embedding found for model '{model_name}'. \"\n f\"This model will be skipped. Available identifiers: {list(embedding_by_model.keys())}\"\n )\n self.log(\n f\"[NO MATCH] Model '{model_name}' - available: {list(embedding_by_model.keys())}\"\n )\n except (\n RuntimeError,\n ValueError,\n ConnectionError,\n TimeoutError,\n AttributeError,\n KeyError,\n ) as e:\n logger.warning(f\"Failed to generate embedding for {model_name}: {e}\")\n self.log(f\"[ERROR] Embedding generation failed for '{model_name}': {e}\")\n\n # Log summary of model matching\n logger.info(\n f\"Model matching summary: {len(matched_models)} matched, {len(unmatched_models)} unmatched\"\n )\n self.log(\n f\"[SUMMARY] Model matching: {len(matched_models)} matched, {len(unmatched_models)} unmatched\"\n )\n if unmatched_models:\n self.log(f\"[WARN] Unmatched models in index: {unmatched_models}\")\n\n if not query_embeddings:\n msg = (\n f\"Failed to generate embeddings for any model. \"\n f\"Index has models: {available_models}, but no matching embedding objects found. 
\"\n f\"Available embedding identifiers: {list(embedding_by_model.keys())}\"\n )\n self.log(f\"[FAIL] Search failed: {msg}\")\n raise ValueError(msg)\n\n index_properties = self._get_index_properties(client)\n legacy_vector_field = getattr(self, \"vector_field\", \"chunk_embedding\")\n\n # Build KNN queries for each model\n embedding_fields: list[str] = []\n knn_queries_with_candidates = []\n knn_queries_without_candidates = []\n\n raw_num_candidates = getattr(self, \"num_candidates\", 1000)\n try:\n num_candidates = int(raw_num_candidates) if raw_num_candidates is not None else 0\n except (TypeError, ValueError):\n num_candidates = 0\n use_num_candidates = num_candidates > 0\n\n for model_name, embedding_vector in query_embeddings.items():\n field_name = get_embedding_field_name(model_name)\n selected_field = field_name\n vector_dim = len(embedding_vector)\n\n # Only use the expected dynamic field - no legacy fallback\n # This prevents dimension mismatches between models\n if not self._is_knn_vector_field(index_properties, selected_field):\n logger.warning(\n f\"Skipping model {model_name}: field '{field_name}' is not mapped as knn_vector. \"\n f\"Documents must be indexed with this embedding model before querying.\"\n )\n self.log(\n f\"[SKIP] Field '{selected_field}' not a knn_vector - skipping model '{model_name}'\"\n )\n continue\n\n # Validate vector dimensions match the field dimensions\n field_dim = self._get_field_dimension(index_properties, selected_field)\n if field_dim is not None and field_dim != vector_dim:\n logger.error(\n f\"Dimension mismatch for model '{model_name}': \"\n f\"Query vector has {vector_dim} dimensions but field '{selected_field}' expects {field_dim}. 
\"\n f\"Skipping this model to prevent search errors.\"\n )\n self.log(\n f\"[DIM MISMATCH] Model '{model_name}': query={vector_dim} vs field={field_dim} - skipping\"\n )\n continue\n\n logger.info(\n f\"Adding KNN query for model '{model_name}': field='{selected_field}', \"\n f\"query_dims={vector_dim}, field_dims={field_dim or 'unknown'}\"\n )\n embedding_fields.append(selected_field)\n\n base_query = {\n \"knn\": {\n selected_field: {\n \"vector\": embedding_vector,\n \"k\": 50,\n }\n }\n }\n\n if use_num_candidates:\n query_with_candidates = copy.deepcopy(base_query)\n query_with_candidates[\"knn\"][selected_field][\"num_candidates\"] = num_candidates\n else:\n query_with_candidates = base_query\n\n knn_queries_with_candidates.append(query_with_candidates)\n knn_queries_without_candidates.append(base_query)\n\n if not knn_queries_with_candidates:\n # No valid fields found - this can happen when:\n # 1. Index is empty (no documents yet)\n # 2. Embedding model has changed and field doesn't exist yet\n # Return empty results instead of failing\n logger.warning(\n \"No valid knn_vector fields found for embedding models. \"\n \"This may indicate an empty index or missing field mappings. \"\n \"Returning empty search results.\"\n )\n self.log(\n f\"[WARN] No valid KNN queries could be built. 
\"\n f\"Query embeddings generated: {list(query_embeddings.keys())}, \"\n f\"but no matching knn_vector fields found in index.\"\n )\n return []\n\n # Build exists filter - document must have at least one embedding field\n exists_any_embedding = {\n \"bool\": {\n \"should\": [{\"exists\": {\"field\": f}} for f in set(embedding_fields)],\n \"minimum_should_match\": 1,\n }\n }\n\n # Combine user filters with exists filter\n all_filters = [*filter_clauses, exists_any_embedding]\n\n # Get limit and score threshold\n limit = self._resolve_limit(filter_obj, default_limit=self.number_of_results)\n score_threshold = self._resolve_score_threshold(filter_obj)\n\n # Determine the best aggregation field for filename based on index mapping\n filename_agg_field = self._get_filename_agg_field(index_properties)\n\n # Build multi-model hybrid query\n body = {\n \"query\": {\n \"bool\": {\n \"should\": [\n {\n \"dis_max\": {\n \"tie_breaker\": 0.0, # Take only the best match, no blending\n \"boost\": 0.7, # 70% weight for semantic search\n \"queries\": knn_queries_with_candidates,\n }\n },\n {\n \"multi_match\": {\n \"query\": q,\n \"fields\": [\"text^2\", \"filename^1.5\"],\n \"type\": \"best_fields\",\n \"fuzziness\": \"AUTO\",\n \"boost\": 0.3, # 30% weight for keyword search\n }\n },\n ],\n \"minimum_should_match\": 1,\n \"filter\": all_filters,\n }\n },\n \"aggs\": {\n \"data_sources\": {\"terms\": {\"field\": filename_agg_field, \"size\": 20}},\n \"document_types\": {\"terms\": {\"field\": \"mimetype\", \"size\": 10}},\n \"owners\": {\"terms\": {\"field\": \"owner\", \"size\": 10}},\n \"embedding_models\": {\"terms\": {\"field\": \"embedding_model\", \"size\": 10}},\n },\n \"_source\": [\n \"filename\",\n \"mimetype\",\n \"page\",\n \"text\",\n \"source_url\",\n \"owner\",\n \"embedding_model\",\n \"allowed_users\",\n \"allowed_groups\",\n \"allowed_principals\",\n ],\n \"size\": limit,\n }\n\n if score_threshold is not None:\n body[\"min_score\"] = score_threshold\n\n 
logger.info(\n f\"Executing multi-model hybrid search with {len(knn_queries_with_candidates)} embedding models: \"\n f\"{list(query_embeddings.keys())}\"\n )\n self.log(\n f\"[EXEC] Executing search with {len(knn_queries_with_candidates)} KNN queries, limit={limit}\"\n )\n self.log(f\"[EXEC] Embedding models used: {list(query_embeddings.keys())}\")\n self.log(f\"[EXEC] KNN fields being queried: {embedding_fields}\")\n\n try:\n resp = client.search(index=self.index_name, body=body, params={\"terminate_after\": 0})\n except RequestError as e:\n error_message = str(e)\n lowered = error_message.lower()\n if use_num_candidates and \"num_candidates\" in lowered:\n logger.warning(\n \"Retrying search without num_candidates parameter due to cluster capabilities\",\n error=error_message,\n )\n fallback_body = copy.deepcopy(body)\n try:\n fallback_body[\"query\"][\"bool\"][\"should\"][0][\"dis_max\"][\"queries\"] = (\n knn_queries_without_candidates\n )\n except (KeyError, IndexError, TypeError) as inner_err:\n raise e from inner_err\n resp = client.search(\n index=self.index_name,\n body=fallback_body,\n params={\"terminate_after\": 0},\n )\n elif \"knn_vector\" in lowered or (\"field\" in lowered and \"knn\" in lowered):\n fallback_vector = next(iter(query_embeddings.values()), None)\n if fallback_vector is None:\n raise\n fallback_field = legacy_vector_field or \"chunk_embedding\"\n logger.warning(\n \"KNN search failed for dynamic fields; falling back to legacy field '%s'.\",\n fallback_field,\n )\n fallback_body = copy.deepcopy(body)\n fallback_body[\"query\"][\"bool\"][\"filter\"] = filter_clauses\n knn_fallback = {\n \"knn\": {\n fallback_field: {\n \"vector\": fallback_vector,\n \"k\": 50,\n }\n }\n }\n if use_num_candidates:\n knn_fallback[\"knn\"][fallback_field][\"num_candidates\"] = num_candidates\n fallback_body[\"query\"][\"bool\"][\"should\"][0][\"dis_max\"][\"queries\"] = [knn_fallback]\n resp = client.search(\n index=self.index_name,\n body=fallback_body,\n 
params={\"terminate_after\": 0},\n )\n else:\n raise\n hits = resp.get(\"hits\", {}).get(\"hits\", [])\n\n logger.info(f\"Found {len(hits)} results\")\n self.log(f\"[RESULT] Search complete: {len(hits)} results found\")\n\n if len(hits) == 0:\n self.log(\n f\"[EMPTY] Debug info: \"\n f\"models_in_index={available_models}, \"\n f\"matched_models={matched_models}, \"\n f\"knn_fields={embedding_fields}, \"\n f\"filters={len(filter_clauses)} clauses\"\n )\n\n return [\n {\n \"page_content\": hit[\"_source\"].get(\"text\", \"\"),\n \"metadata\": {k: v for k, v in hit[\"_source\"].items() if k != \"text\"},\n \"score\": hit.get(\"_score\"),\n }\n for hit in hits\n ]\n\n def search_documents(self) -> Table:\n\n \"\"\"Search documents and return results as a Table.\n\n This is the main interface method that performs the multi-model search using the\n configured search_query and returns results in Langflow's Table (DataFrame) format\n so downstream Parser components can consume them directly.\n\n Always builds the vector store (triggering ingestion if needed), then performs\n search only if a query is provided.\n\n Returns:\n Table containing search results with text and metadata\n\n Raises:\n Exception: If search operation fails\n \"\"\"\n\n try:\n # Always build/cache the vector store to ensure ingestion happens\n logger.info(f\"Search query: {self.search_query}\")\n if self._cached_vector_store is None:\n self.build_vector_store()\n\n # Only perform search if query is provided\n search_query = (self.search_query or \"\").strip()\n if not search_query:\n self.log(\"No search query provided - ingestion completed, returning empty results\")\n\n return Table(data=[])\n\n # Perform search with the provided query\n raw = self.search(search_query)\n raw_list = [Data(text=hit[\"page_content\"], **hit[\"metadata\"]) for hit in raw]\n return Table(data=raw_list)\n\n except Exception as e:\n self.log(f\"search_documents error: {e}\")\n raise\n\n # -------- dynamic UI handling (auth 
switch) --------\n async def update_build_config(\n self, build_config: dict, field_value: str, field_name: str | None = None\n ) -> dict:\n \"\"\"Dynamically update component configuration based on field changes.\n\n This method handles real-time UI updates, particularly for authentication\n mode changes that show/hide relevant input fields.\n\n Args:\n build_config: Current component configuration\n field_value: New value for the changed field\n field_name: Name of the field that changed\n\n Returns:\n Updated build configuration with appropriate field visibility\n \"\"\"\n try:\n if field_name == \"auth_mode\":\n mode = (field_value or \"basic\").strip().lower()\n is_basic = mode == \"basic\"\n is_jwt = mode == \"jwt\"\n is_openrag = mode == \"openrag\"\n\n build_config[\"username\"][\"show\"] = is_basic\n build_config[\"password\"][\"show\"] = is_basic\n\n build_config[\"jwt_token\"][\"show\"] = is_jwt\n build_config[\"jwt_header\"][\"show\"] = is_jwt\n build_config[\"bearer_prefix\"][\"show\"] = is_jwt\n\n build_config[\"username\"][\"required\"] = is_basic\n build_config[\"password\"][\"required\"] = is_basic\n\n # build_config[\"jwt_token\"][\"required\"] = is_jwt\n build_config[\"jwt_header\"][\"required\"] = is_jwt\n build_config[\"bearer_prefix\"][\"required\"] = False\n\n # In 'openrag' mode, expose the OPENRAG_* fields up front\n # since they are the only credentials required.\n for openrag_field in (\n \"openrag_ingest_url\",\n \"openrag_ingest_token\",\n \"openrag_ingest_run_id\",\n \"openrag_ingest_batch_size\",\n ):\n if openrag_field in build_config:\n build_config[openrag_field][\"advanced\"] = not is_openrag\n build_config[openrag_field][\"required\"] = (\n is_openrag and openrag_field != \"openrag_ingest_batch_size\"\n )\n\n if is_basic or is_openrag:\n build_config[\"jwt_token\"][\"value\"] = \"\"\n\n return build_config\n\n except (KeyError, ValueError) as e:\n self.log(f\"update_build_config error: {e}\")\n\n return build_config\n" + "value": 
"from __future__ import annotations\n\nimport copy\nimport json\nimport os\nimport uuid\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\nfrom typing import Any\n\nimport httpx\nfrom lfx.base.vectorstores.model import LCVectorStoreComponent, check_cached_vector_store\nfrom lfx.base.vectorstores.vector_store_connection_decorator import vector_store_connection\nfrom lfx.io import (\n BoolInput,\n DropdownInput,\n HandleInput,\n IntInput,\n MultilineInput,\n Output,\n SecretStrInput,\n StrInput,\n TableInput,\n)\nfrom lfx.log import logger\nfrom lfx.schema.data import Data\nfrom lfx.schema.dataframe import Table\nfrom opensearchpy import OpenSearch, helpers\nfrom opensearchpy.exceptions import OpenSearchException, RequestError\n\nREQUEST_TIMEOUT = 60\nMAX_RETRIES = 5\n\n\ndef _get_min_env_int(key: str, default: int, minimum: int) -> int:\n try:\n value = int(os.getenv(key, default))\n except (TypeError, ValueError):\n value = default\n return max(value, minimum)\n\n\nOPENSEARCH_NUMBER_OF_SHARDS = _get_min_env_int(\"OPENRAG_OPENSEARCH_NUMBER_OF_SHARDS\", 1, 1)\nOPENSEARCH_NUMBER_OF_REPLICAS = _get_min_env_int(\"OPENRAG_OPENSEARCH_NUMBER_OF_REPLICAS\", 0, 0)\n\n# watsonx.ai surfaces rate-limit state via these (mostly non-standard) response\n# headers. The IBM SDK acts on the x-requests-limit-* family directly; we log\n# them on a failed embedding call to aid plan/region tuning.\n_WATSONX_RATE_LIMIT_HEADERS = (\n \"x-requests-limit-rate\",\n \"x-requests-limit-remaining\",\n \"x-requests-limit-reset\",\n \"Retry-After\",\n)\n\n\ndef _log_watsonx_rate_limit_headers(error: Exception) -> None:\n \"\"\"Best-effort diagnostic: log watsonx rate-limit headers from a failed call.\n\n The watsonx SDK raises ``ApiRequestFailure``, which carries the originating\n httpx/requests ``Response`` as ``.response``. 
On a 429 exhaustion we surface\n the documented rate-limit headers so operators can tune throughput.\n \"\"\"\n try:\n response = getattr(error, \"response\", None)\n headers = getattr(response, \"headers\", None)\n if not headers:\n return\n status = getattr(response, \"status_code\", \"unknown\")\n observed = {\n h: headers.get(h) for h in _WATSONX_RATE_LIMIT_HEADERS if headers.get(h) is not None\n }\n if str(status) == \"429\" or observed:\n logger.warning(f\"watsonx rate-limit response (status={status}): {observed}\")\n except Exception as log_error: # never let diagnostics mask the real error\n logger.debug(f\"Could not extract watsonx rate-limit headers: {log_error}\")\n\n\ndef normalize_model_name(model_name: str) -> str:\n \"\"\"Normalize embedding model name for use as field suffix.\n\n Converts model names to valid OpenSearch field names by replacing\n special characters and ensuring alphanumeric format.\n\n Args:\n model_name: Original embedding model name (e.g., \"text-embedding-3-small\")\n\n Returns:\n Normalized field suffix (e.g., \"text_embedding_3_small\")\n \"\"\"\n normalized = model_name.lower()\n # Replace common separators with underscores\n normalized = normalized.replace(\"-\", \"_\").replace(\":\", \"_\").replace(\"/\", \"_\").replace(\".\", \"_\")\n # Remove any non-alphanumeric characters except underscores\n normalized = \"\".join(c if c.isalnum() or c == \"_\" else \"_\" for c in normalized)\n # Remove duplicate underscores\n while \"__\" in normalized:\n normalized = normalized.replace(\"__\", \"_\")\n return normalized.strip(\"_\")\n\n\ndef get_embedding_field_name(model_name: str) -> str:\n \"\"\"Get the dynamic embedding field name for a model.\n\n Args:\n model_name: Embedding model name\n\n Returns:\n Field name in format: chunk_embedding_{normalized_model_name}\n \"\"\"\n logger.info(f\"chunk_embedding_{normalize_model_name(model_name)}\")\n return 
f\"chunk_embedding_{normalize_model_name(model_name)}\"\n\n\n@vector_store_connection\nclass OpenSearchVectorStoreComponentMultimodalMultiEmbedding(LCVectorStoreComponent):\n \"\"\"OpenSearch Vector Store Component with Multi-Model Hybrid Search Capabilities.\n\n This component provides vector storage and retrieval using OpenSearch, combining semantic\n similarity search (KNN) with keyword-based search for optimal results. It supports:\n - Multiple embedding models per index with dynamic field names\n - Automatic detection and querying of all available embedding models\n - Parallel embedding generation for multi-model search\n - Document ingestion with model tracking\n - Advanced filtering and aggregations\n - Flexible authentication options\n\n Features:\n - Multi-model vector storage with dynamic fields (chunk_embedding_{model_name})\n - Hybrid search combining multiple KNN queries (dis_max) + keyword matching\n - Auto-detection of available models in the index\n - Parallel query embedding generation for all detected models\n - Vector storage with configurable engines (jvector, nmslib, faiss, lucene)\n - Flexible authentication (Basic auth, JWT tokens)\n\n Model Name Resolution:\n - Priority: deployment > model > model_name attributes\n - This ensures correct matching between embedding objects and index fields\n - When multiple embeddings are provided, specify embedding_model_name to select which one to use\n - During search, each detected model in the index is matched to its corresponding embedding object\n \"\"\"\n\n display_name: str = \"OpenSearch (Multi-Model Multi-Embedding)\"\n icon: str = \"OpenSearch\"\n description: str = (\n \"Store and search documents using OpenSearch with multi-model hybrid semantic and keyword search. \"\n \"To search use the tools search_documents and raw_search. 
\"\n \"Search documents takes a query for vector search, for example\\n\"\n ' {search_query: \"components in openrag\"}'\n )\n\n # Keys we consider baseline\n default_keys: list[str] = [\n \"opensearch_url\",\n \"index_name\",\n *[i.name for i in LCVectorStoreComponent.inputs], # search_query, add_documents, etc.\n \"embedding\",\n \"embedding_model_name\",\n \"vector_field\",\n \"number_of_results\",\n \"auth_mode\",\n \"username\",\n \"password\",\n \"jwt_token\",\n \"jwt_header\",\n \"bearer_prefix\",\n \"use_ssl\",\n \"verify_certs\",\n \"filter_expression\",\n \"engine\",\n \"space_type\",\n \"ef_construction\",\n \"m\",\n \"num_candidates\",\n \"docs_metadata\",\n \"request_timeout\",\n \"max_retries\",\n \"openrag_ingest_url\",\n \"openrag_ingest_token\",\n \"openrag_ingest_run_id\",\n \"openrag_ingest_batch_size\",\n ]\n _openrag_ingest_global_placeholders = {\n \"openrag_ingest_url\": \"OPENRAG_INGEST_URL\",\n \"openrag_ingest_token\": \"OPENRAG_INGEST_TOKEN\",\n \"openrag_ingest_run_id\": \"OPENRAG_INGEST_RUN_ID\",\n }\n\n inputs = [\n TableInput(\n name=\"docs_metadata\",\n display_name=\"Document Metadata\",\n info=(\n \"Additional metadata key-value pairs to be added to all ingested documents. 
\"\n \"Useful for tagging documents with source information, categories, or other custom attributes.\"\n ),\n table_schema=[\n {\n \"name\": \"key\",\n \"display_name\": \"Key\",\n \"type\": \"str\",\n \"description\": \"Key name\",\n },\n {\n \"name\": \"value\",\n \"display_name\": \"Value\",\n \"type\": \"str\",\n \"description\": \"Value of the metadata\",\n },\n ],\n value=[],\n input_types=[\"Data\", \"JSON\"],\n ),\n StrInput(\n name=\"openrag_ingest_token\",\n display_name=\"OpenRAG Ingest Token\",\n value=\"OPENRAG_INGEST_TOKEN\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n info=\"Short-lived token used only for OpenRAG ingest callbacks.\",\n ),\n StrInput(\n name=\"openrag_ingest_run_id\",\n display_name=\"OpenRAG Ingest Run ID\",\n value=\"OPENRAG_INGEST_RUN_ID\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n ),\n StrInput(\n name=\"opensearch_url\",\n display_name=\"OpenSearch URL\",\n value=\"http://localhost:9200\",\n info=(\n \"The connection URL for your OpenSearch cluster \"\n \"(e.g., http://localhost:9200 for local development or your cloud endpoint).\"\n ),\n ),\n StrInput(\n name=\"index_name\",\n display_name=\"Index Name\",\n value=\"langflow\",\n info=(\n \"The OpenSearch index name where documents will be stored and searched. \"\n \"Will be created automatically if it doesn't exist.\"\n ),\n ),\n DropdownInput(\n name=\"engine\",\n display_name=\"Vector Engine\",\n options=[\"nmslib\", \"faiss\", \"lucene\", \"jvector\"],\n value=\"jvector\",\n info=(\n \"Vector search engine for similarity calculations. 'nmslib' works with standard \"\n \"OpenSearch. 'jvector' requires OpenSearch 2.9+. 'lucene' requires index.knn: true. 
\"\n \"Amazon OpenSearch Serverless only supports 'nmslib' or 'faiss'.\"\n ),\n advanced=True,\n ),\n DropdownInput(\n name=\"space_type\",\n display_name=\"Distance Metric\",\n options=[\"l2\", \"l1\", \"cosinesimil\", \"linf\", \"innerproduct\"],\n value=\"l2\",\n info=(\n \"Distance metric for calculating vector similarity. 'l2' (Euclidean) is most common, \"\n \"'cosinesimil' for cosine similarity, 'innerproduct' for dot product.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"ef_construction\",\n display_name=\"EF Construction\",\n value=512,\n info=(\n \"Size of the dynamic candidate list during index construction. \"\n \"Higher values improve recall but increase indexing time and memory usage.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"m\",\n display_name=\"M Parameter\",\n value=16,\n info=(\n \"Number of bidirectional connections for each vector in the HNSW graph. \"\n \"Higher values improve search quality but increase memory usage and indexing time.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"num_candidates\",\n display_name=\"Candidate Pool Size\",\n value=1000,\n info=(\n \"Number of approximate neighbors to consider for each KNN query. \"\n \"Some OpenSearch deployments do not support this parameter; set to 0 to disable.\"\n ),\n advanced=True,\n ),\n *LCVectorStoreComponent.inputs, # includes search_query, add_documents, etc.\n HandleInput(\n name=\"embedding\", display_name=\"Embedding\", input_types=[\"Embeddings\"], is_list=True\n ),\n StrInput(\n name=\"embedding_model_name\",\n display_name=\"Embedding Model Name\",\n value=\"\",\n info=(\n \"Name of the embedding model to use for ingestion. This selects which embedding from the list \"\n \"will be used to embed documents. Matches on deployment, model, model_id, or model_name. \"\n \"For duplicate deployments, use combined format: 'deployment:model' \"\n \"(e.g., 'text-embedding-ada-002:text-embedding-3-large'). \"\n \"Leave empty to use the first embedding. 
Error message will show all available identifiers.\"\n ),\n advanced=False,\n ),\n StrInput(\n name=\"vector_field\",\n display_name=\"Legacy Vector Field Name\",\n value=\"chunk_embedding\",\n advanced=True,\n info=(\n \"Legacy field name for backward compatibility. New documents use dynamic fields \"\n \"(chunk_embedding_{model_name}) based on the embedding_model_name.\"\n ),\n ),\n IntInput(\n name=\"number_of_results\",\n display_name=\"Default Result Limit\",\n value=10,\n advanced=True,\n info=(\n \"Default maximum number of search results to return when no limit is \"\n \"specified in the filter expression.\"\n ),\n ),\n MultilineInput(\n name=\"filter_expression\",\n display_name=\"Search Filters (JSON)\",\n value=\"\",\n info=(\n \"Optional JSON configuration for search filtering, result limits, and score thresholds.\\n\\n\"\n \"Format 1 - Explicit filters:\\n\"\n '{\"filter\": [{\"term\": {\"filename\":\"doc.pdf\"}}, '\n '{\"terms\":{\"owner\":[\"user1\",\"user2\"]}}], \"limit\": 10, \"score_threshold\": 1.6}\\n\\n'\n \"Format 2 - Context-style mapping:\\n\"\n '{\"data_sources\":[\"file.pdf\"], \"document_types\":[\"application/pdf\"], \"owners\":[\"user123\"]}\\n\\n'\n \"Use __IMPOSSIBLE_VALUE__ as placeholder to ignore specific filters.\"\n ),\n ),\n # ----- Auth controls (dynamic) -----\n DropdownInput(\n name=\"auth_mode\",\n display_name=\"Authentication Mode\",\n value=\"openrag\",\n options=[\"basic\", \"jwt\", \"openrag\"],\n info=(\n \"Authentication method: 'basic' for username/password authentication, \"\n \"'jwt' for JSON Web Token (Bearer) authentication, or 'openrag' to \"\n \"delegate writes to the OpenRAG backend ingest callback (no direct \"\n \"OpenSearch credentials required — only OPENRAG_* fields).\"\n ),\n real_time_refresh=True,\n advanced=False,\n ),\n StrInput(\n name=\"username\",\n display_name=\"Username\",\n value=\"admin\",\n show=True,\n ),\n SecretStrInput(\n name=\"password\",\n display_name=\"OpenSearch Password\",\n 
value=\"admin\",\n show=True,\n ),\n SecretStrInput(\n name=\"jwt_token\",\n display_name=\"JWT Token\",\n value=\"JWT\",\n load_from_db=False,\n show=False,\n info=(\n \"Valid JSON Web Token for authentication. \"\n \"Will be sent in the Authorization header (with optional 'Bearer ' prefix).\"\n ),\n required=False,\n ),\n StrInput(\n name=\"jwt_header\",\n display_name=\"JWT Header Name\",\n value=\"Authorization\",\n show=False,\n advanced=True,\n ),\n BoolInput(\n name=\"bearer_prefix\",\n display_name=\"Prefix 'Bearer '\",\n value=False,\n show=False,\n advanced=True,\n ),\n # ----- TLS -----\n BoolInput(\n name=\"use_ssl\",\n display_name=\"Use SSL/TLS\",\n value=True,\n advanced=True,\n info=\"Enable SSL/TLS encryption for secure connections to OpenSearch.\",\n ),\n BoolInput(\n name=\"verify_certs\",\n display_name=\"Verify SSL Certificates\",\n value=False,\n advanced=True,\n info=(\n \"Verify SSL certificates when connecting. \"\n \"Disable for self-signed certificates in development environments.\"\n ),\n ),\n # ----- Timeout / Retry -----\n StrInput(\n name=\"request_timeout\",\n display_name=\"Request Timeout (seconds)\",\n value=\"60\",\n advanced=True,\n info=(\n \"Time in seconds to wait for a response from OpenSearch. 
\"\n \"Increase for large bulk ingestion or complex hybrid queries.\"\n ),\n ),\n StrInput(\n name=\"max_retries\",\n display_name=\"Max Retries\",\n value=\"3\",\n advanced=True,\n info=\"Number of retries for failed connections before raising an error.\",\n ),\n StrInput(\n name=\"openrag_ingest_url\",\n display_name=\"OpenRAG Ingest URL\",\n value=\"OPENRAG_INGEST_URL\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n info=\"Internal OpenRAG callback URL for backend-owned document indexing.\",\n ),\n StrInput(\n name=\"openrag_ingest_token\",\n display_name=\"OpenRAG Ingest Token\",\n value=\"OPENRAG_INGEST_TOKEN\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n info=\"Short-lived token used only for OpenRAG ingest callbacks.\",\n ),\n StrInput(\n name=\"openrag_ingest_run_id\",\n display_name=\"OpenRAG Ingest Run ID\",\n value=\"OPENRAG_INGEST_RUN_ID\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n ),\n IntInput(\n name=\"openrag_ingest_batch_size\",\n display_name=\"OpenRAG Ingest Batch Size\",\n value=100,\n advanced=True,\n ),\n ]\n outputs = [\n Output(\n display_name=\"Search Results\",\n name=\"search_results\",\n method=\"search_documents\",\n ),\n Output(display_name=\"Raw Search\", name=\"raw_search\", method=\"raw_search\"),\n ]\n\n def raw_search(self, query: str | dict | None = None) -> Data:\n \"\"\"Execute a raw OpenSearch query against the target index.\n\n Args:\n query (dict[str, Any]): The OpenSearch query DSL dictionary.\n\n Returns:\n Data: Search results as a Data object.\n\n Raises:\n ValueError: If 'query' is not a valid OpenSearch query (must be a non-empty dict).\n \"\"\"\n raw_query = query if query is not None else self.search_query\n\n if raw_query is None or (isinstance(raw_query, str) and not raw_query.strip()):\n self.log(\"No query provided for raw search - returning empty results\")\n return Data(data={})\n\n if isinstance(raw_query, 
dict):\n query_body = copy.deepcopy(raw_query)\n elif isinstance(raw_query, str):\n s = raw_query.strip()\n\n # First, optimistically try to parse as JSON DSL\n try:\n query_body = json.loads(s)\n except json.JSONDecodeError:\n # Fallback: treat as a basic text query over common fields\n query_body = {\n \"query\": {\n \"multi_match\": {\n \"query\": s,\n \"fields\": [\"text^2\", \"filename^1.5\"],\n \"type\": \"best_fields\",\n \"fuzziness\": \"AUTO\",\n }\n }\n }\n else:\n msg = f\"Unsupported raw_search query type: {type(raw_query)!r}\"\n raise TypeError(msg)\n\n filter_obj = self._parse_filter_expression()\n filter_clauses = self._coerce_filter_clauses(filter_obj)\n\n if filter_clauses:\n if \"query\" in query_body:\n original_query = query_body[\"query\"]\n query_body[\"query\"] = {\n \"bool\": {\n \"must\": [original_query],\n \"filter\": filter_clauses,\n }\n }\n else:\n query_body[\"query\"] = {\n \"bool\": {\n \"must\": [{\"match_all\": {}}],\n \"filter\": filter_clauses,\n }\n }\n\n if filter_obj:\n # Apply limit if not already set in the raw query\n if \"size\" not in query_body:\n limit = self._resolve_limit(filter_obj, default_limit=None)\n if limit is not None:\n query_body[\"size\"] = limit\n\n # Apply score_threshold / scoreThreshold as min_score if not already set\n if \"min_score\" not in query_body:\n score_threshold = self._resolve_score_threshold(filter_obj)\n if score_threshold is not None:\n query_body[\"min_score\"] = score_threshold\n\n client = self.build_client()\n logger.info(f\"query: {query_body}\")\n resp = client.search(\n index=self.index_name,\n body=query_body,\n params={\"terminate_after\": 0},\n )\n # Remove any _source keys whose value is a list of floats (embedding vectors)\n # Minimum length threshold to identify embedding vectors\n min_vector_length = 100\n\n def is_vector(val):\n # Accepts if it's a list of numbers (float or int) and has reasonable vector length\n return (\n isinstance(val, list)\n and len(val) > 
min_vector_length\n and all(isinstance(x, (float, int)) for x in val)\n )\n\n if \"hits\" in resp and \"hits\" in resp[\"hits\"]:\n for hit in resp[\"hits\"][\"hits\"]:\n source = hit.get(\"_source\")\n if isinstance(source, dict):\n keys_to_remove = [k for k, v in source.items() if is_vector(v)]\n for k in keys_to_remove:\n source.pop(k)\n logger.info(f\"Raw search response (all embedding vectors removed): {resp}\")\n return Data(**resp)\n\n def _get_embedding_model_name(self, embedding_obj=None) -> str:\n \"\"\"Get the embedding model name from component config or embedding object.\n\n Priority: deployment > model > model_id > model_name\n This ensures we use the actual model being deployed, not just the configured model.\n Supports multiple embedding providers (OpenAI, Watsonx, Cohere, etc.)\n\n Args:\n embedding_obj: Specific embedding object to get name from (optional)\n\n Returns:\n Embedding model name\n\n Raises:\n ValueError: If embedding model name cannot be determined\n \"\"\"\n # First try explicit embedding_model_name input\n if hasattr(self, \"embedding_model_name\") and self.embedding_model_name:\n return self.embedding_model_name.strip()\n\n # Try to get from provided embedding object\n if embedding_obj:\n # Priority: deployment > model > model_id > model_name\n if hasattr(embedding_obj, \"deployment\") and embedding_obj.deployment:\n return str(embedding_obj.deployment)\n if hasattr(embedding_obj, \"model\") and embedding_obj.model:\n return str(embedding_obj.model)\n if hasattr(embedding_obj, \"model_id\") and embedding_obj.model_id:\n return str(embedding_obj.model_id)\n if hasattr(embedding_obj, \"model_name\") and embedding_obj.model_name:\n return str(embedding_obj.model_name)\n\n # Try to get from embedding component (legacy single embedding)\n if hasattr(self, \"embedding\") and self.embedding:\n # Handle list of embeddings\n if isinstance(self.embedding, list) and len(self.embedding) > 0:\n first_emb = self.embedding[0]\n if 
hasattr(first_emb, \"deployment\") and first_emb.deployment:\n return str(first_emb.deployment)\n if hasattr(first_emb, \"model\") and first_emb.model:\n return str(first_emb.model)\n if hasattr(first_emb, \"model_id\") and first_emb.model_id:\n return str(first_emb.model_id)\n if hasattr(first_emb, \"model_name\") and first_emb.model_name:\n return str(first_emb.model_name)\n # Handle single embedding\n elif not isinstance(self.embedding, list):\n if hasattr(self.embedding, \"deployment\") and self.embedding.deployment:\n return str(self.embedding.deployment)\n if hasattr(self.embedding, \"model\") and self.embedding.model:\n return str(self.embedding.model)\n if hasattr(self.embedding, \"model_id\") and self.embedding.model_id:\n return str(self.embedding.model_id)\n if hasattr(self.embedding, \"model_name\") and self.embedding.model_name:\n return str(self.embedding.model_name)\n\n msg = (\n \"Could not determine embedding model name. \"\n \"Please set the 'embedding_model_name' field or ensure the embedding component \"\n \"has a 'deployment', 'model', 'model_id', or 'model_name' attribute.\"\n )\n raise ValueError(msg)\n\n # ---------- helper functions for index management ----------\n def _default_text_mapping(\n self,\n dim: int,\n engine: str = \"jvector\",\n space_type: str = \"l2\",\n ef_search: int = 512,\n ef_construction: int = 100,\n m: int = 16,\n vector_field: str = \"vector_field\",\n ) -> dict[str, Any]:\n \"\"\"Create the default OpenSearch index mapping for vector search.\n\n This method generates the index configuration with k-NN settings optimized\n for approximate nearest neighbor search using the specified vector engine.\n Includes the embedding_model keyword field for tracking which model was used.\n\n Args:\n dim: Dimensionality of the vector embeddings\n engine: Vector search engine (jvector, nmslib, faiss, lucene)\n space_type: Distance metric for similarity calculation\n ef_search: Size of dynamic list used during search\n 
ef_construction: Size of dynamic list used during index construction\n m: Number of bidirectional links for each vector\n vector_field: Name of the field storing vector embeddings\n\n Returns:\n Dictionary containing OpenSearch index mapping configuration\n \"\"\"\n return {\n \"settings\": {\n \"index\": {\"knn\": True, \"knn.algo_param.ef_search\": ef_search},\n \"number_of_shards\": OPENSEARCH_NUMBER_OF_SHARDS,\n \"number_of_replicas\": OPENSEARCH_NUMBER_OF_REPLICAS,\n },\n \"mappings\": {\n \"properties\": {\n vector_field: {\n \"type\": \"knn_vector\",\n \"dimension\": dim,\n \"method\": {\n \"name\": \"disk_ann\",\n \"space_type\": space_type,\n \"engine\": engine,\n \"parameters\": {\"ef_construction\": ef_construction, \"m\": m},\n },\n },\n \"embedding_model\": {\"type\": \"keyword\"}, # Track which model was used\n \"embedding_dimensions\": {\"type\": \"integer\"},\n }\n },\n }\n\n def _ensure_embedding_field_mapping(\n self,\n client: OpenSearch,\n index_name: str,\n field_name: str,\n dim: int,\n engine: str,\n space_type: str,\n ef_construction: int,\n m: int,\n ) -> None:\n \"\"\"Lazily add a dynamic embedding field to the index if it doesn't exist.\n\n This allows adding new embedding models without recreating the entire index.\n Also ensures the embedding_model tracking field exists.\n\n Note: Some OpenSearch versions/configurations have issues with dynamically adding\n knn_vector mappings (NullPointerException). 
This method checks if the field\n already exists before attempting to add it, and gracefully skips if the field\n is already properly configured.\n\n Args:\n client: OpenSearch client instance\n index_name: Target index name\n field_name: Dynamic field name for this embedding model\n dim: Vector dimensionality\n engine: Vector search engine\n space_type: Distance metric\n ef_construction: Construction parameter\n m: HNSW parameter\n \"\"\"\n # First, check if the field already exists and is properly mapped\n properties = self._get_index_properties(client)\n if self._is_knn_vector_field(properties, field_name):\n # Field already exists as knn_vector - verify dimensions match\n existing_dim = self._get_field_dimension(properties, field_name)\n if existing_dim is not None and existing_dim != dim:\n logger.warning(\n f\"Field '{field_name}' exists with dimension {existing_dim}, \"\n f\"but current embedding has dimension {dim}. Using existing mapping.\"\n )\n else:\n logger.info(\n f\"[OpenSearchMultimodel] Field '{field_name}' already exists\"\n f\"as knn_vector with matching dimensions - skipping mapping update\"\n )\n return\n\n # Field doesn't exist, try to add the mapping\n try:\n mapping = {\n \"properties\": {\n field_name: {\n \"type\": \"knn_vector\",\n \"dimension\": dim,\n \"method\": {\n \"name\": \"disk_ann\",\n \"space_type\": space_type,\n \"engine\": engine,\n \"parameters\": {\"ef_construction\": ef_construction, \"m\": m},\n },\n },\n # Also ensure the embedding_model tracking field exists as keyword\n \"embedding_model\": {\"type\": \"keyword\"},\n \"embedding_dimensions\": {\"type\": \"integer\"},\n }\n }\n client.indices.put_mapping(index=index_name, body=mapping)\n logger.info(f\"Added/updated embedding field mapping: {field_name}\")\n except RequestError as e:\n error_str = str(e).lower()\n if \"invalid engine\" in error_str and \"jvector\" in error_str:\n msg = (\n \"The 'jvector' engine is not available in your OpenSearch installation. 
\"\n \"Use 'nmslib' or 'faiss' for standard OpenSearch, or upgrade to OpenSearch 2.9+.\"\n )\n raise ValueError(msg) from e\n if \"index.knn\" in error_str:\n msg = (\n \"The index has index.knn: false. Delete the existing index and let the \"\n \"component recreate it, or create a new index with a different name.\"\n )\n raise ValueError(msg) from e\n raise\n except Exception as e:\n # Check if this is the known OpenSearch k-NN NullPointerException issue\n error_str = str(e).lower()\n if \"null\" in error_str or \"nullpointerexception\" in error_str:\n logger.warning(\n f\"[OpenSearchMultimodel] Could not add embedding field mapping for {field_name}\"\n f\"due to OpenSearch k-NN plugin issue: {e}. \"\n f\"This is a known issue with some OpenSearch versions. \"\n f\"[OpenSearchMultimodel] Skipping mapping update. \"\n f\"Please ensure the index has the correct mapping for KNN search to work.\"\n )\n # Skip and continue - ingestion will proceed, but KNN search may fail if mapping doesn't exist\n return\n logger.warning(\n f\"[OpenSearchMultimodel] Could not add embedding field mapping for {field_name}: {e}\"\n )\n raise\n\n # Verify the field was added correctly\n properties = self._get_index_properties(client)\n if not self._is_knn_vector_field(properties, field_name):\n msg = f\"Field '{field_name}' is not mapped as knn_vector. Current mapping: {properties.get(field_name)}\"\n logger.error(msg)\n raise ValueError(msg)\n\n def _validate_aoss_with_engines(self, *, is_aoss: bool, engine: str) -> None:\n \"\"\"Validate engine compatibility with Amazon OpenSearch Serverless (AOSS).\n\n Amazon OpenSearch Serverless has restrictions on which vector engines\n can be used. 
This method ensures the selected engine is compatible.\n\n Args:\n is_aoss: Whether the connection is to Amazon OpenSearch Serverless\n engine: The selected vector search engine\n\n Raises:\n ValueError: If AOSS is used with an incompatible engine\n \"\"\"\n if is_aoss and engine not in {\"nmslib\", \"faiss\"}:\n msg = \"Amazon OpenSearch Service Serverless only supports `nmslib` or `faiss` engines\"\n raise ValueError(msg)\n\n def _is_aoss_enabled(self, http_auth: Any) -> bool:\n \"\"\"Determine if Amazon OpenSearch Serverless (AOSS) is being used.\n\n Args:\n http_auth: The HTTP authentication object\n\n Returns:\n True if AOSS is enabled, False otherwise\n \"\"\"\n return (\n http_auth is not None and hasattr(http_auth, \"service\") and http_auth.service == \"aoss\"\n )\n\n @staticmethod\n def _openrag_input_to_str(value: Any) -> str:\n if value is None:\n return \"\"\n if hasattr(value, \"get_secret_value\"):\n value = value.get_secret_value()\n if hasattr(value, \"text\"):\n value = value.text\n return str(value or \"\").strip()\n\n def _openrag_callback_value(self, attr_name: str) -> str:\n value = self._openrag_input_to_str(getattr(self, attr_name, \"\"))\n if value == self._openrag_ingest_global_placeholders.get(attr_name):\n return \"\"\n return value\n\n def _openrag_ingest_callback_config(self) -> tuple[str, str, str] | None:\n url = self._openrag_callback_value(\"openrag_ingest_url\")\n token = self._openrag_callback_value(\"openrag_ingest_token\")\n ingest_run_id = self._openrag_callback_value(\"openrag_ingest_run_id\")\n\n masked_token = (\n f\"{token[:4]}...{token[-4:]}\" if len(token) >= 8 else (\"\" if token else \"\")\n )\n debug_payload = {\n \"openrag_ingest_url\": url,\n \"openrag_ingest_url_len\": len(url),\n \"openrag_ingest_token_masked\": masked_token,\n \"openrag_ingest_token_len\": len(token),\n \"openrag_ingest_run_id\": ingest_run_id,\n \"raw_url_type\": type(self.openrag_ingest_url).__name__,\n \"raw_token_type\": 
type(self.openrag_ingest_token).__name__,\n \"raw_run_id_type\": type(self.openrag_ingest_run_id).__name__,\n }\n logger.warning(f\"[OpenRAG callback config] {debug_payload}\")\n try:\n self.log(f\"[OpenRAG callback config] {debug_payload}\")\n except Exception:\n pass\n\n if not url and not token and not ingest_run_id:\n return None\n if not url or not token or not ingest_run_id:\n msg = \"OpenRAG ingest callback requires url, token, and ingest_run_id.\"\n raise ValueError(msg)\n return url, token, ingest_run_id\n\n def _post_openrag_ingest_batches(\n self,\n *,\n requests: list[dict],\n vector_field: str,\n text_field: str,\n ) -> None:\n callback_config = self._openrag_ingest_callback_config()\n if callback_config is None:\n return\n\n url, token, ingest_run_id = callback_config\n batch_size = max(self._parse_int_param(\"openrag_ingest_batch_size\", 100), 1)\n timeout = self._parse_int_param(\"request_timeout\", REQUEST_TIMEOUT)\n headers = {\"Authorization\": f\"Bearer {token}\"}\n\n masked_token = (\n f\"{token[:4]}...{token[-4:]}\" if len(token) >= 8 else (\"\" if token else \"\")\n )\n request_summary = {\n \"url\": url,\n \"ingest_run_id\": ingest_run_id,\n \"token_masked\": masked_token,\n \"total_chunks\": len(requests),\n \"batch_size\": batch_size,\n \"timeout_s\": timeout,\n }\n logger.warning(f\"[OpenRAG ingest POST] {request_summary}\")\n try:\n self.log(f\"[OpenRAG ingest POST] {request_summary}\")\n except Exception:\n pass\n\n with httpx.Client(timeout=timeout) as client:\n total_batches = (len(requests) + batch_size - 1) // batch_size\n for batch_number, start in enumerate(range(0, len(requests), batch_size), start=1):\n batch = requests[start : start + batch_size]\n final = batch_number == total_batches\n payload = {\n \"ingest_run_id\": ingest_run_id,\n \"batch_id\": batch_number,\n \"final\": final,\n \"chunks\": [\n self._openrag_chunk_payload(\n request,\n vector_field=vector_field,\n text_field=text_field,\n )\n for request in batch\n ],\n 
}\n logger.warning(\n f\"[OpenRAG ingest POST] -> batch={batch_number}/{total_batches} \"\n f\"url={url} chunks={len(payload['chunks'])} final={final}\"\n )\n response = client.post(url, json=payload, headers=headers)\n response_summary = {\n \"batch\": batch_number,\n \"url\": url,\n \"status\": response.status_code,\n \"final_url\": str(response.request.url),\n \"response_headers\": dict(response.headers),\n \"body_preview\": response.text[:500],\n }\n logger.warning(f\"[OpenRAG ingest POST resp] {response_summary}\")\n try:\n self.log(f\"[OpenRAG ingest POST resp] {response_summary}\")\n except Exception:\n pass\n if response.status_code >= 400:\n msg = (\n \"OpenRAG ingest callback failed \"\n f\"(batch={batch_number}, status={response.status_code}, \"\n f\"url={url}): {response.text[:1000]}\"\n )\n raise RuntimeError(msg)\n\n self.log(f\"Posted {len(requests)} chunks to OpenRAG backend ingest callback.\")\n\n @staticmethod\n def _openrag_chunk_payload(\n request: dict,\n *,\n vector_field: str,\n text_field: str,\n ) -> dict:\n metadata = {\n key: value\n for key, value in request.items()\n if key not in {\"_op_type\", \"_index\", \"_id\", \"id\", vector_field, text_field}\n }\n page = metadata.get(\"page\")\n if isinstance(page, str) and page.isdigit():\n page = int(page)\n return {\n \"id\": request.get(\"_id\") or request.get(\"id\"),\n \"text\": request.get(text_field, \"\"),\n \"vector\": request[vector_field],\n \"page\": page if isinstance(page, int) else None,\n \"metadata\": metadata,\n }\n\n def _bulk_ingest_embeddings(\n self,\n client: OpenSearch,\n index_name: str,\n embeddings: list[list[float]],\n texts: list[str],\n metadatas: list[dict] | None = None,\n ids: list[str] | None = None,\n vector_field: str = \"vector_field\",\n text_field: str = \"text\",\n embedding_model: str = \"unknown\",\n mapping: dict | None = None,\n max_chunk_bytes: int | None = 1 * 1024 * 1024,\n *,\n is_aoss: bool = False,\n ) -> list[str]:\n \"\"\"Efficiently ingest 
multiple documents with embeddings into OpenSearch.\n\n This method uses bulk operations to insert documents with their vector\n embeddings and metadata into the specified OpenSearch index. Each document\n is tagged with the embedding_model name for tracking.\n\n Args:\n client: OpenSearch client instance\n index_name: Target index for document storage\n embeddings: List of vector embeddings for each document\n texts: List of document texts\n metadatas: Optional metadata dictionaries for each document\n ids: Optional document IDs (UUIDs generated if not provided)\n vector_field: Field name for storing vector embeddings\n text_field: Field name for storing document text\n embedding_model: Name of the embedding model used\n mapping: Optional index mapping configuration\n max_chunk_bytes: Maximum size per bulk request chunk\n is_aoss: Whether using Amazon OpenSearch Serverless\n\n Returns:\n List of document IDs that were successfully ingested\n \"\"\"\n logger.debug(f\"[OpenSearchMultimodel] Bulk ingesting embeddings for {index_name}\")\n if not mapping:\n mapping = {}\n\n requests = []\n return_ids = []\n vector_dimensions = len(embeddings[0]) if embeddings else None\n\n for i, text in enumerate(texts):\n metadata = metadatas[i] if metadatas else {}\n if vector_dimensions is not None and \"embedding_dimensions\" not in metadata:\n metadata = {**metadata, \"embedding_dimensions\": vector_dimensions}\n\n # Normalize ACL fields that may arrive as JSON strings from flows\n for key in (\"allowed_users\", \"allowed_groups\", \"allowed_principals\"):\n value = metadata.get(key)\n if isinstance(value, str):\n try:\n parsed = json.loads(value)\n if isinstance(parsed, list):\n metadata[key] = parsed\n except (json.JSONDecodeError, TypeError):\n # Leave value as-is if it isn't valid JSON\n pass\n\n metadata_document_id = str(metadata.get(\"document_id\") or \"\").strip()\n if metadata_document_id and metadata_document_id.lower() != \"none\":\n generated_id = 
f\"{metadata_document_id}_{i}\"\n else:\n generated_id = str(uuid.uuid4())\n _id = ids[i] if ids else generated_id\n request = {\n \"_op_type\": \"index\",\n \"_index\": index_name,\n vector_field: embeddings[i],\n text_field: text,\n \"embedding_model\": embedding_model, # Track which model was used\n **metadata,\n }\n if is_aoss:\n request[\"id\"] = _id\n else:\n request[\"_id\"] = _id\n requests.append(request)\n return_ids.append(_id)\n if metadatas:\n self.log(f\"Sample metadata: {metadatas[0] if metadatas else {}}\")\n if self._openrag_ingest_callback_config() is not None:\n self._post_openrag_ingest_batches(\n requests=requests,\n vector_field=vector_field,\n text_field=text_field,\n )\n return return_ids\n try:\n helpers.bulk(client, requests, max_chunk_bytes=max_chunk_bytes)\n except Exception as bulk_error:\n if \"Unsupported request type for filter level DLS\" not in str(bulk_error):\n raise\n logger.warning(\n \"[OpenSearchMultimodel] Bulk ingest is blocked by filter-level DLS; \"\n \"falling back to per-document index requests.\"\n )\n self._index_embeddings_individually(client, requests)\n return return_ids\n\n def _index_embeddings_individually(\n self,\n client: OpenSearch,\n requests: list[dict],\n ) -> None:\n \"\"\"Index documents one at a time when OpenSearch DLS rejects bulk writes.\"\"\"\n for request in requests:\n document_id = request.get(\"_id\") or request.get(\"id\")\n body = {\n key: value\n for key, value in request.items()\n if key not in {\"_op_type\", \"_index\", \"_id\", \"id\"}\n }\n client.index(index=request[\"_index\"], id=document_id, body=body)\n\n def _log_index_admin_skip(self, operation: str, error: Exception) -> None:\n \"\"\"Log index-admin operations that may be blocked under filter-level DLS.\"\"\"\n logger.warning(\n f\"[OpenSearchMultimodel] Could not run index-admin operation '{operation}': {error}. 
\"\n \"Assuming the backend pre-created the required index/mapping and continuing.\"\n )\n\n # ---------- param helpers ----------\n def _parse_int_param(self, attr_name: str, default: int) -> int:\n \"\"\"Parse a string attribute to int, returning *default* on failure.\"\"\"\n raw = getattr(self, attr_name, None)\n if raw is None or str(raw).strip() == \"\":\n return default\n try:\n value = int(str(raw).strip())\n except ValueError:\n logger.warning(\n f\"Invalid integer value '{raw}' for {attr_name}, using default {default}\"\n )\n return default\n\n if value < 0:\n logger.warning(f\"Negative value '{raw}' for {attr_name}, using default {default}\")\n return default\n\n return value\n\n # ---------- auth / client ----------\n def _build_auth_kwargs(self) -> dict[str, Any]:\n \"\"\"Build authentication configuration for OpenSearch client.\n\n Constructs the appropriate authentication parameters based on the\n selected auth mode (basic username/password or JWT token).\n\n Returns:\n Dictionary containing authentication configuration\n\n Raises:\n ValueError: If required authentication parameters are missing\n \"\"\"\n mode = (self.auth_mode or \"basic\").strip().lower()\n if mode == \"jwt\":\n token = (self.jwt_token or \"\").strip()\n if not token:\n msg = \"Auth Mode is 'jwt' but no jwt_token was provided.\"\n raise ValueError(msg)\n header_name = (self.jwt_header or \"Authorization\").strip()\n header_value = f\"Bearer {token}\" if self.bearer_prefix else token\n return {\"headers\": {header_name: header_value}}\n if mode == \"openrag\":\n # Writes are delegated to the OpenRAG backend ingest callback,\n # so no direct OpenSearch credentials are needed. 
Only the\n # OPENRAG_* fields are required for ingestion to function.\n missing = [\n name\n for name, value in (\n (\"openrag_ingest_url\", self.openrag_ingest_url),\n (\"openrag_ingest_token\", self.openrag_ingest_token),\n (\"openrag_ingest_run_id\", self.openrag_ingest_run_id),\n )\n if not (value or \"\").strip()\n ]\n if missing:\n msg = (\n \"Auth Mode is 'openrag' but required OPENRAG_* fields are \"\n f\"missing: {', '.join(missing)}.\"\n )\n raise ValueError(msg)\n return {}\n user = (self.username or \"\").strip()\n pwd = (self.password or \"\").strip()\n if not user or not pwd:\n msg = \"Auth Mode is 'basic' but username/password are missing.\"\n raise ValueError(msg)\n return {\"http_auth\": (user, pwd)}\n\n def build_client(self) -> OpenSearch:\n \"\"\"Create and configure an OpenSearch client instance.\n\n Returns:\n Configured OpenSearch client ready for operations\n \"\"\"\n logger.debug(\"[OpenSearchMultimodel] Building OpenSearch client\")\n auth_kwargs = self._build_auth_kwargs()\n return OpenSearch(\n hosts=[self.opensearch_url],\n use_ssl=self.use_ssl,\n verify_certs=self.verify_certs,\n ssl_assert_hostname=False,\n ssl_show_warn=False,\n timeout=self._parse_int_param(\"request_timeout\", REQUEST_TIMEOUT),\n max_retries=self._parse_int_param(\"max_retries\", MAX_RETRIES),\n retry_on_timeout=True,\n **auth_kwargs,\n )\n\n @check_cached_vector_store\n def build_vector_store(self) -> OpenSearch:\n # Return raw OpenSearch client as our \"vector store.\"\n client = self.build_client()\n\n # Check if we're in ingestion-only mode (no search query)\n has_search_query = bool((self.search_query or \"\").strip())\n if not has_search_query:\n logger.debug(\n \"[OpenSearchMultimodel] Ingestion-only mode activated: search operations will be skipped\"\n )\n logger.debug(\"[OpenSearchMultimodel] Starting ingestion mode...\")\n\n logger.debug(f\"[OpenSearchMultimodel] Embedding: {self.embedding}\")\n self._add_documents_to_vector_store(client=client)\n return 
client\n\n # ---------- ingest ----------\n def _add_documents_to_vector_store(self, client: OpenSearch) -> None:\n \"\"\"Process and ingest documents into the OpenSearch vector store.\n\n This method handles the complete document ingestion pipeline:\n - Prepares document data and metadata\n - Generates vector embeddings using the selected model\n - Creates appropriate index mappings with dynamic field names\n - Bulk inserts documents with vectors and model tracking\n\n Args:\n client: OpenSearch client for performing operations\n \"\"\"\n logger.debug(\"[OpenSearchMultimodel][INGESTION] _add_documents_to_vector_store called\")\n # Convert DataFrame to Data if needed using parent's method\n self.ingest_data = self._prepare_ingest_data()\n\n logger.debug(\n f\"[OpenSearchMultimodel][INGESTION] ingest_data type: \"\n f\"{type(self.ingest_data)}, length: {len(self.ingest_data) if self.ingest_data else 0}\"\n )\n logger.debug(\n f\"[OpenSearchMultimodel][INGESTION] ingest_data content: \"\n f\"{self.ingest_data[:2] if self.ingest_data and len(self.ingest_data) > 0 else 'empty'}\"\n )\n\n docs = self.ingest_data or []\n if not docs:\n logger.debug(\"Ingestion complete: No documents provided\")\n return\n\n if not self.embedding:\n msg = \"Embedding handle is required to embed documents.\"\n raise ValueError(msg)\n\n # Normalize embedding to list first\n embeddings_list = self.embedding if isinstance(self.embedding, list) else [self.embedding]\n\n # Filter out None values (fail-safe mode) - do this BEFORE checking if empty\n embeddings_list = [e for e in embeddings_list if e is not None]\n\n # NOW check if we have any valid embeddings left after filtering\n if not embeddings_list:\n logger.warning(\n \"All embeddings returned None (fail-safe mode enabled). Skipping document ingestion.\"\n )\n self.log(\n \"Embedding returned None (fail-safe mode enabled). 
Skipping document ingestion.\"\n )\n return\n\n logger.debug(\n f\"[OpenSearchMultimodel][INGESTION] Valid embeddings after filtering: {len(embeddings_list)}\"\n )\n self.log(\n f\"[OpenSearchMultimodel][INGESTION] Available embedding models: {len(embeddings_list)}\"\n )\n\n # Select the embedding to use for ingestion\n selected_embedding = None\n embedding_model = None\n\n # If embedding_model_name is specified, find matching embedding\n if (\n hasattr(self, \"embedding_model_name\")\n and self.embedding_model_name\n and self.embedding_model_name.strip()\n ):\n target_model_name = self.embedding_model_name.strip()\n self.log(f\"Looking for embedding model: {target_model_name}\")\n\n for emb_obj in embeddings_list:\n # Check all possible model identifiers (deployment, model, model_id, model_name)\n # Also check available_models list from EmbeddingsWithModels\n possible_names = []\n deployment = getattr(emb_obj, \"deployment\", None)\n model = getattr(emb_obj, \"model\", None)\n model_id = getattr(emb_obj, \"model_id\", None)\n model_name = getattr(emb_obj, \"model_name\", None)\n available_models_attr = getattr(emb_obj, \"available_models\", None)\n\n if deployment:\n possible_names.append(str(deployment))\n if model:\n possible_names.append(str(model))\n if model_id:\n possible_names.append(str(model_id))\n if model_name:\n possible_names.append(str(model_name))\n\n # Also add combined identifier\n if deployment and model and deployment != model:\n possible_names.append(f\"{deployment}:{model}\")\n\n # Add all models from available_models dict\n if available_models_attr and isinstance(available_models_attr, dict):\n possible_names.extend(\n str(model_key).strip()\n for model_key in available_models_attr\n if model_key and str(model_key).strip()\n )\n\n # Match if target matches any of the possible names\n if target_model_name in possible_names:\n # Check if target is in available_models dict - use dedicated instance\n if (\n available_models_attr\n and 
isinstance(available_models_attr, dict)\n and target_model_name in available_models_attr\n ):\n # Use the dedicated embedding instance from the dict\n selected_embedding = available_models_attr[target_model_name]\n embedding_model = target_model_name\n self.log(\n f\"Found dedicated embedding instance for '{embedding_model}' in available_models dict\"\n )\n else:\n # Traditional identifier match\n selected_embedding = emb_obj\n embedding_model = self._get_embedding_model_name(emb_obj)\n self.log(\n f\"Found matching embedding model: {embedding_model} (matched on: {target_model_name})\"\n )\n break\n\n if not selected_embedding:\n # Build detailed list of available embeddings with all their identifiers\n available_info = []\n for idx, emb in enumerate(embeddings_list):\n emb_type = type(emb).__name__\n identifiers = []\n deployment = getattr(emb, \"deployment\", None)\n model = getattr(emb, \"model\", None)\n model_id = getattr(emb, \"model_id\", None)\n model_name = getattr(emb, \"model_name\", None)\n available_models_attr = getattr(emb, \"available_models\", None)\n\n if deployment:\n identifiers.append(f\"deployment='{deployment}'\")\n if model:\n identifiers.append(f\"model='{model}'\")\n if model_id:\n identifiers.append(f\"model_id='{model_id}'\")\n if model_name:\n identifiers.append(f\"model_name='{model_name}'\")\n\n # Add combined identifier as an option\n if deployment and model and deployment != model:\n identifiers.append(f\"combined='{deployment}:{model}'\")\n\n # Add available_models dict if present\n if available_models_attr and isinstance(available_models_attr, dict):\n identifiers.append(f\"available_models={list(available_models_attr.keys())}\")\n\n available_info.append(\n f\" [{idx}] {emb_type}: {', '.join(identifiers) if identifiers else 'No identifiers'}\"\n )\n\n msg = (\n f\"Embedding model '{target_model_name}' not found in available embeddings.\\n\\n\"\n f\"Available embeddings:\\n\" + \"\\n\".join(available_info) + \"\\n\\n\"\n \"Please 
set 'embedding_model_name' to one of the identifier values shown above \"\n \"(use the value after the '=' sign, without quotes).\\n\"\n \"For duplicate deployments, use the 'combined' format.\\n\"\n \"Or leave it empty to use the first embedding.\"\n )\n raise ValueError(msg)\n else:\n # Use first embedding if no model name specified\n selected_embedding = embeddings_list[0]\n embedding_model = self._get_embedding_model_name(selected_embedding)\n self.log(f\"No embedding_model_name specified, using first embedding: {embedding_model}\")\n\n dynamic_field_name = get_embedding_field_name(embedding_model)\n\n logger.info(f\"Selected embedding model for ingestion: '{embedding_model}'\")\n self.log(f\"Using embedding model for ingestion: {embedding_model}\")\n self.log(f\"Dynamic vector field: {dynamic_field_name}\")\n\n # Log embedding details for debugging\n if hasattr(selected_embedding, \"deployment\"):\n logger.info(f\"Embedding deployment: {selected_embedding.deployment}\")\n if hasattr(selected_embedding, \"model\"):\n logger.info(f\"Embedding model: {selected_embedding.model}\")\n if hasattr(selected_embedding, \"model_id\"):\n logger.info(f\"Embedding model_id: {selected_embedding.model_id}\")\n if hasattr(selected_embedding, \"dimensions\"):\n logger.info(f\"Embedding dimensions: {selected_embedding.dimensions}\")\n if hasattr(selected_embedding, \"available_models\"):\n logger.info(f\"Embedding available_models: {selected_embedding.available_models}\")\n\n # No model switching needed - each model in available_models has its own dedicated instance\n # The selected_embedding is already configured correctly for the target model\n logger.info(\n f\"Using embedding instance for '{embedding_model}' - pre-configured and ready to use\"\n )\n\n # Extract texts and metadata from documents\n texts = []\n metadatas = []\n # Process docs_metadata table input into a dict\n additional_metadata = {}\n logger.debug(f\"[LF] Docs metadata {self.docs_metadata}\")\n if 
hasattr(self, \"docs_metadata\") and self.docs_metadata:\n logger.info(f\"[LF] Docs metadata {self.docs_metadata}\")\n if isinstance(self.docs_metadata[-1], Data):\n logger.info(f\"[LF] Docs metadata is a Data object {self.docs_metadata}\")\n self.docs_metadata = self.docs_metadata[-1].data\n logger.info(f\"[LF] Docs metadata is a Data object {self.docs_metadata}\")\n additional_metadata.update(self.docs_metadata)\n else:\n for item in self.docs_metadata:\n if isinstance(item, dict) and \"key\" in item and \"value\" in item:\n additional_metadata[item[\"key\"]] = item[\"value\"]\n # Replace string \"None\" values with actual None\n for key, value in additional_metadata.items():\n if value == \"None\":\n additional_metadata[key] = None\n logger.info(f\"[LF] Additional metadata {additional_metadata}\")\n for doc_obj in docs:\n data_copy = json.loads(doc_obj.model_dump_json())\n text = data_copy.pop(doc_obj.text_key, doc_obj.default_value)\n texts.append(text)\n\n # Merge additional metadata from table input\n data_copy.update(additional_metadata)\n\n metadatas.append(data_copy)\n self.log(metadatas)\n\n # Determine whether the selected embedding is watsonx/IBM. The watsonx\n # SDK ships its own rate-limit machinery (input batching, proactive\n # x-requests-limit-* TokenBucket throttling, and jittered exponential\n # backoff on 429), so we lean on it instead of retrying on top of it.\n # The type-name check also covers watsonx-hosted, non-\"ibm/\" models\n # (e.g. 
intfloat/multilingual-e5-large).\n is_ibm = (embedding_model and \"ibm\" in str(embedding_model).lower()) or (\n selected_embedding and \"watsonx\" in type(selected_embedding).__name__.lower()\n )\n logger.debug(f\"Is IBM/watsonx embedding: {is_ibm}\")\n\n if is_ibm:\n # Hand the full batch to the SDK and let it batch/throttle/retry.\n # Retry attempts and base backoff are tunable via the SDK's own\n # WATSONX_MAX_RETRIES / WATSONX_DELAY_TIME environment variables.\n logger.info(\n f\"Embedding {len(texts)} chunks via watsonx SDK batch (SDK-managed throttle + 429 retry)\"\n )\n try:\n vectors: list[list[float]] = selected_embedding.embed_documents(texts)\n logger.info(f\"Successfully embedded {len(vectors)} chunks via watsonx SDK\")\n except Exception as embed_error:\n _log_watsonx_rate_limit_headers(embed_error)\n logger.error(\n f\"Failed to embed {len(texts)} chunks via watsonx SDK. Error: {embed_error}\",\n )\n raise\n\n else:\n # Non-watsonx providers (OpenAI, Ollama) lack the watsonx SDK's\n # built-in rate-limit handling, so embed per chunk in parallel with\n # a generic rate-limit-aware tenacity retry.\n vectors: list[list[float]] = [None] * len(texts)\n from tenacity import (\n retry,\n retry_if_exception,\n stop_after_attempt,\n wait_exponential,\n )\n\n def is_rate_limit_error(exception: Exception) -> bool:\n \"\"\"Check if exception is a rate limit error (429).\"\"\"\n error_str = str(exception).lower()\n return \"429\" in error_str or \"rate_limit\" in error_str or \"rate limit\" in error_str\n\n def is_other_retryable_error(exception: Exception) -> bool:\n \"\"\"Check if exception is a transient network error worth retrying.\"\"\"\n if is_rate_limit_error(exception):\n return False\n return isinstance(exception, (ConnectionError, TimeoutError, OSError))\n\n # Retry decorator for rate limit errors (longer backoff)\n retry_on_rate_limit = retry(\n retry=retry_if_exception(is_rate_limit_error),\n stop=stop_after_attempt(5),\n 
wait=wait_exponential(multiplier=2, min=2, max=30),\n reraise=True,\n before_sleep=lambda retry_state: logger.warning(\n f\"Rate limit hit for chunk (attempt {retry_state.attempt_number}/5), \"\n f\"backing off for {retry_state.next_action.sleep:.1f}s\"\n ),\n )\n\n # Retry decorator for other errors (shorter backoff)\n retry_on_other_errors = retry(\n retry=retry_if_exception(is_other_retryable_error),\n stop=stop_after_attempt(3),\n wait=wait_exponential(multiplier=1, min=1, max=8),\n reraise=True,\n before_sleep=lambda retry_state: logger.warning(\n f\"Error embedding chunk (attempt {retry_state.attempt_number}/3), \"\n f\"retrying in {retry_state.next_action.sleep:.1f}s: {retry_state.outcome.exception()}\"\n ),\n )\n\n def embed_chunk_with_retry(chunk_text: str, chunk_idx: int) -> list[float]:\n \"\"\"Embed a single chunk with rate-limit-aware retry logic.\"\"\"\n\n @retry_on_rate_limit\n @retry_on_other_errors\n def _embed(text: str) -> list[float]:\n return selected_embedding.embed_documents([text])[0]\n\n try:\n return _embed(chunk_text)\n except Exception as e:\n logger.error(\n f\"Failed to embed chunk {chunk_idx} after all retries: {e}\",\n error=str(e),\n )\n raise\n\n max_workers = min(max(len(texts), 1), 8)\n logger.debug(f\"Using parallel processing with {max_workers} workers\")\n\n with ThreadPoolExecutor(max_workers=max_workers) as executor:\n futures = {\n executor.submit(embed_chunk_with_retry, chunk, idx): idx\n for idx, chunk in enumerate(texts)\n }\n for future in as_completed(futures):\n idx = futures[future]\n vectors[idx] = future.result()\n\n if not vectors:\n self.log(f\"No vectors generated from documents for model {embedding_model}.\")\n return\n\n # Get vector dimension for mapping\n dim = len(vectors[0]) if vectors else 768 # default fallback\n use_openrag_ingest_callback = self._openrag_ingest_callback_config() is not None\n\n is_aoss = False\n mapping: dict | None = None\n\n engine = getattr(self, \"engine\", \"jvector\")\n\n if 
use_openrag_ingest_callback:\n self.log(\"Using OpenRAG backend ingest callback; skipping direct OpenSearch writes.\")\n else:\n # Check for AOSS\n auth_kwargs = self._build_auth_kwargs()\n is_aoss = self._is_aoss_enabled(auth_kwargs.get(\"http_auth\"))\n\n # Validate engine with AOSS\n self._validate_aoss_with_engines(is_aoss=is_aoss, engine=engine)\n\n # Create mapping with proper KNN settings\n space_type = getattr(self, \"space_type\", \"l2\")\n ef_construction = getattr(self, \"ef_construction\", 512)\n m = getattr(self, \"m\", 16)\n\n mapping = self._default_text_mapping(\n dim=dim,\n engine=engine,\n space_type=space_type,\n ef_construction=ef_construction,\n m=m,\n vector_field=dynamic_field_name, # Use dynamic field name\n )\n\n # Ensure index exists with baseline mapping (index.knn: true is required for vector search)\n index_exists = True\n try:\n index_exists = bool(client.indices.exists(index=self.index_name))\n except OpenSearchException as exists_error:\n self._log_index_admin_skip(\"indices.exists\", exists_error)\n\n try:\n if not index_exists:\n self.log(f\"Creating index '{self.index_name}' with base mapping\")\n client.indices.create(index=self.index_name, body=mapping)\n except RequestError as creation_error:\n if creation_error.error == \"resource_already_exists_exception\":\n pass # Index was created concurrently\n else:\n error_msg = str(creation_error).lower()\n if \"invalid engine\" in error_msg or \"illegal_argument\" in error_msg:\n if \"jvector\" in error_msg:\n msg = (\n \"The 'jvector' engine is not available in your OpenSearch installation. \"\n \"Use 'nmslib' or 'faiss' for standard OpenSearch, or upgrade to 2.9+.\"\n )\n raise ValueError(msg) from creation_error\n if \"index.knn\" in error_msg:\n msg = (\n \"The index has index.knn: false. 
Delete the existing index and let the \"\n \"component recreate it, or create a new index with a different name.\"\n )\n raise ValueError(msg) from creation_error\n logger.warning(f\"Failed to create index '{self.index_name}': {creation_error}\")\n raise\n\n # Ensure the dynamic field exists in the index\n self._ensure_embedding_field_mapping(\n client=client,\n index_name=self.index_name,\n field_name=dynamic_field_name,\n dim=dim,\n engine=engine,\n space_type=space_type,\n ef_construction=ef_construction,\n m=m,\n )\n\n self.log(\n f\"Indexing {len(texts)} documents into '{self.index_name}' with model '{embedding_model}'...\"\n )\n logger.info(f\"Will store embeddings in field: {dynamic_field_name}\")\n logger.info(f\"Will tag documents with embedding_model: {embedding_model}\")\n\n # Use the bulk ingestion with model tracking\n return_ids = self._bulk_ingest_embeddings(\n client=client,\n index_name=self.index_name,\n embeddings=vectors,\n texts=texts,\n metadatas=metadatas,\n vector_field=dynamic_field_name, # Use dynamic field name\n text_field=\"text\",\n embedding_model=embedding_model, # Track the model\n mapping=mapping,\n is_aoss=is_aoss,\n )\n self.log(metadatas)\n\n logger.info(\n f\"Ingestion complete: Successfully indexed {len(return_ids)} documents with model '{embedding_model}'\"\n )\n self.log(f\"Successfully indexed {len(return_ids)} documents with model {embedding_model}.\")\n\n # ---------- helpers for filters ----------\n def _is_placeholder_term(self, term_obj: dict) -> bool:\n # term_obj like {\"filename\": \"__IMPOSSIBLE_VALUE__\"}\n return any(v == \"__IMPOSSIBLE_VALUE__\" for v in term_obj.values())\n\n def _coerce_filter_clauses(self, filter_obj: dict | None) -> list[dict]:\n \"\"\"Convert filter expressions into OpenSearch-compatible filter clauses.\n\n This method accepts two filter formats and converts them to standardized\n OpenSearch query clauses:\n\n Format A - Explicit filters:\n {\"filter\": [{\"term\": {\"field\": \"value\"}}, 
{\"terms\": {\"field\": [\"val1\", \"val2\"]}}],\n \"limit\": 10, \"score_threshold\": 1.5}\n\n Format B - Context-style mapping:\n {\"data_sources\": [\"file1.pdf\"], \"document_types\": [\"pdf\"], \"owners\": [\"user1\"]}\n\n Args:\n filter_obj: Filter configuration dictionary or None\n\n Returns:\n List of OpenSearch filter clauses (term/terms objects)\n Placeholder values with \"__IMPOSSIBLE_VALUE__\" are ignored\n \"\"\"\n if not filter_obj:\n return []\n\n # If it is a string, try to parse it once\n if isinstance(filter_obj, str):\n try:\n filter_obj = json.loads(filter_obj)\n except json.JSONDecodeError:\n # Not valid JSON - treat as no filters\n return []\n\n # Case A: already an explicit list/dict under \"filter\"\n if \"filter\" in filter_obj:\n raw = filter_obj[\"filter\"]\n if isinstance(raw, dict):\n raw = [raw]\n explicit_clauses: list[dict] = []\n for f in raw or []:\n if (\n \"term\" in f\n and isinstance(f[\"term\"], dict)\n and not self._is_placeholder_term(f[\"term\"])\n ):\n explicit_clauses.append(f)\n elif \"terms\" in f and isinstance(f[\"terms\"], dict):\n field, vals = next(iter(f[\"terms\"].items()))\n if isinstance(vals, list) and len(vals) > 0:\n explicit_clauses.append(f)\n return explicit_clauses\n\n # Case B: convert context-style maps into clauses\n field_mapping = {\n \"data_sources\": \"filename\",\n \"document_types\": \"mimetype\",\n \"owners\": \"owner\",\n }\n context_clauses: list[dict] = []\n for k, values in filter_obj.items():\n if not isinstance(values, list):\n continue\n field = field_mapping.get(k, k)\n if len(values) == 0:\n # Match-nothing placeholder (kept to mirror your tool semantics)\n context_clauses.append({\"term\": {field: \"__IMPOSSIBLE_VALUE__\"}})\n elif len(values) == 1:\n if values[0] != \"__IMPOSSIBLE_VALUE__\":\n context_clauses.append({\"term\": {field: values[0]}})\n else:\n context_clauses.append({\"terms\": {field: values}})\n return context_clauses\n\n def _parse_filter_expression(self) -> dict | 
None:\n \"\"\"Parse and validate optional filter_expression JSON.\n\n Returns:\n Parsed JSON object as a dict, or None when unset/blank.\n\n Raises:\n ValueError: If JSON is invalid or does not decode to an object.\n \"\"\"\n filter_expression = getattr(self, \"filter_expression\", \"\")\n if not isinstance(filter_expression, str) or not filter_expression.strip():\n return None\n try:\n filter_obj = json.loads(filter_expression)\n except json.JSONDecodeError as e:\n msg = f\"Invalid filter_expression JSON: {e}\"\n raise ValueError(msg) from e\n\n if not isinstance(filter_obj, dict):\n msg = \"Invalid filter_expression JSON type: expected a JSON object.\"\n raise TypeError(msg)\n return filter_obj\n\n def _resolve_limit(self, filter_obj: dict | None, default_limit: int | None) -> int | None:\n \"\"\"Resolve an integer result limit from filter settings.\"\"\"\n if not filter_obj:\n return default_limit\n raw_limit = filter_obj.get(\"limit\", default_limit)\n if raw_limit is None:\n return None\n if isinstance(raw_limit, bool):\n msg = \"Invalid filter_expression.limit: expected a positive integer.\"\n raise TypeError(msg)\n try:\n limit = int(raw_limit)\n except (TypeError, ValueError) as e:\n msg = \"Invalid filter_expression.limit: expected a positive integer.\"\n raise ValueError(msg) from e\n if limit <= 0:\n msg = \"Invalid filter_expression.limit: expected a positive integer.\"\n raise ValueError(msg)\n return limit\n\n def _resolve_score_threshold(self, filter_obj: dict | None) -> float | None:\n \"\"\"Resolve optional positive min score from filter settings.\"\"\"\n if not filter_obj:\n return None\n score_threshold = filter_obj.get(\"score_threshold\")\n if score_threshold is None:\n score_threshold = filter_obj.get(\"scoreThreshold\")\n if not isinstance(score_threshold, (int, float)) or score_threshold <= 0:\n return None\n return float(score_threshold)\n\n def _detect_available_models(\n self, client: OpenSearch, filter_clauses: list[dict] | None = None\n 
) -> list[str]:\n \"\"\"Detect which embedding models have documents in the index.\n\n Uses aggregation to find all unique embedding_model values, optionally\n filtered to only documents matching the user's filter criteria.\n\n Args:\n client: OpenSearch client instance\n filter_clauses: Optional filter clauses to scope model detection\n\n Returns:\n List of embedding model names found in the index\n \"\"\"\n try:\n agg_query = {\n \"size\": 0,\n \"aggs\": {\"embedding_models\": {\"terms\": {\"field\": \"embedding_model\", \"size\": 10}}},\n }\n\n # Apply filters to model detection if any exist\n if filter_clauses:\n agg_query[\"query\"] = {\"bool\": {\"filter\": filter_clauses}}\n\n logger.debug(f\"Model detection query: {agg_query}\")\n result = client.search(\n index=self.index_name,\n body=agg_query,\n params={\"terminate_after\": 0},\n )\n buckets = result.get(\"aggregations\", {}).get(\"embedding_models\", {}).get(\"buckets\", [])\n models = [b[\"key\"] for b in buckets if b[\"key\"]]\n\n # Log detailed bucket info for debugging\n logger.info(\n f\"Detected embedding models in corpus: {models}\"\n + (f\" (with {len(filter_clauses)} filters)\" if filter_clauses else \"\")\n )\n if not models:\n total_hits = result.get(\"hits\", {}).get(\"total\", {})\n total_count = (\n total_hits.get(\"value\", 0) if isinstance(total_hits, dict) else total_hits\n )\n logger.warning(\n f\"No embedding_model values found in index '{self.index_name}'. \"\n f\"Total docs in index: {total_count}. 
\"\n f\"This may indicate documents were indexed without the embedding_model field.\"\n )\n except (OpenSearchException, KeyError, ValueError) as e:\n logger.warning(f\"Failed to detect embedding models: {e}\")\n # Fallback to current model\n fallback_model = self._get_embedding_model_name()\n logger.info(f\"Using fallback model: {fallback_model}\")\n return [fallback_model]\n else:\n return models\n\n def _get_index_properties(self, client: OpenSearch) -> dict[str, Any] | None:\n \"\"\"Retrieve flattened mapping properties for the current index.\"\"\"\n try:\n mapping = client.indices.get_mapping(index=self.index_name)\n except OpenSearchException as e:\n logger.warning(\n f\"Failed to fetch mapping for index '{self.index_name}': {e}. Proceeding without mapping metadata.\"\n )\n return None\n\n properties: dict[str, Any] = {}\n for index_data in mapping.values():\n props = index_data.get(\"mappings\", {}).get(\"properties\", {})\n if isinstance(props, dict):\n properties.update(props)\n return properties\n\n def _is_knn_vector_field(self, properties: dict[str, Any] | None, field_name: str) -> bool:\n \"\"\"Check whether the field is mapped as a knn_vector.\"\"\"\n if not field_name:\n return False\n if properties is None:\n logger.warning(\n f\"Mapping metadata unavailable; assuming field '{field_name}' is usable.\"\n )\n return True\n field_def = properties.get(field_name)\n if not isinstance(field_def, dict):\n return False\n if field_def.get(\"type\") == \"knn_vector\":\n return True\n\n nested_props = field_def.get(\"properties\")\n return bool(isinstance(nested_props, dict) and nested_props.get(\"type\") == \"knn_vector\")\n\n def _get_field_dimension(\n self, properties: dict[str, Any] | None, field_name: str\n ) -> int | None:\n \"\"\"Get the dimension of a knn_vector field from the index mapping.\n\n Args:\n properties: Index properties from mapping\n field_name: Name of the vector field\n\n Returns:\n Dimension of the field, or None if not found\n 
\"\"\"\n if not field_name or properties is None:\n return None\n\n field_def = properties.get(field_name)\n if not isinstance(field_def, dict):\n return None\n\n # Check direct knn_vector field\n if field_def.get(\"type\") == \"knn_vector\":\n return field_def.get(\"dimension\")\n\n # Check nested properties\n nested_props = field_def.get(\"properties\")\n if isinstance(nested_props, dict) and nested_props.get(\"type\") == \"knn_vector\":\n return nested_props.get(\"dimension\")\n\n return None\n\n def _get_filename_agg_field(self, index_properties: dict[str, Any] | None) -> str:\n \"\"\"Choose the appropriate field for filename aggregations.\"\"\"\n if not index_properties:\n return \"filename.keyword\"\n\n filename_def = index_properties.get(\"filename\")\n if not isinstance(filename_def, dict):\n return \"filename.keyword\"\n\n field_type = filename_def.get(\"type\")\n fields_def = filename_def.get(\"fields\", {})\n\n # Top-level keyword with no subfields\n if field_type == \"keyword\" and not isinstance(fields_def, dict):\n return \"filename\"\n\n # Text field with keyword subfield\n if isinstance(fields_def, dict) and \"keyword\" in fields_def:\n return \"filename.keyword\"\n\n # Fallback: aggregate on filename directly\n return \"filename\"\n\n # ---------- search (multi-model hybrid) ----------\n def search(self, query: str | None = None) -> list[dict[str, Any]]:\n \"\"\"Perform multi-model hybrid search combining multiple vector similarities and keyword matching.\n\n This method executes a sophisticated search that:\n 1. Auto-detects all embedding models present in the index\n 2. Generates query embeddings for ALL detected models in parallel\n 3. Combines multiple KNN queries using dis_max (picks best match)\n 4. Adds keyword search with fuzzy matching (30% weight)\n 5. Applies optional filtering and score thresholds\n 6. 
Returns aggregations for faceted search\n\n Search weights:\n - Semantic search (dis_max across all models): 70%\n - Keyword search: 30%\n\n Args:\n query: Search query string (used for both vector embedding and keyword search)\n\n Returns:\n List of search results with page_content, metadata, and relevance scores\n\n Raises:\n ValueError: If embedding component is not provided or filter JSON is invalid\n \"\"\"\n logger.info(self.ingest_data)\n client = self.build_client()\n q = (query or \"\").strip()\n\n # Parse optional filter expression\n filter_obj = self._parse_filter_expression()\n\n if not self.embedding:\n msg = \"Embedding is required to run hybrid search (KNN + keyword).\"\n raise ValueError(msg)\n\n # Check if embedding is None (fail-safe mode)\n if self.embedding is None or (\n isinstance(self.embedding, list) and all(e is None for e in self.embedding)\n ):\n logger.error(\"Embedding returned None (fail-safe mode enabled). Cannot perform search.\")\n return []\n\n # Build filter clauses first so we can use them in model detection\n filter_clauses = self._coerce_filter_clauses(filter_obj)\n\n # Detect available embedding models in the index (scoped by filters)\n available_models = self._detect_available_models(client, filter_clauses)\n\n if not available_models:\n logger.warning(\"No embedding models found in index, using current model\")\n available_models = [self._get_embedding_model_name()]\n\n # Generate embeddings for ALL detected models\n query_embeddings = {}\n\n # Normalize embedding to list\n embeddings_list = self.embedding if isinstance(self.embedding, list) else [self.embedding]\n # Filter out None values (fail-safe mode)\n embeddings_list = [e for e in embeddings_list if e is not None]\n\n if not embeddings_list:\n logger.error(\n \"No valid embeddings available after filtering None values (fail-safe mode). 
Cannot perform search.\"\n )\n return []\n\n # Create a comprehensive map of model names to embedding objects\n # Check all possible identifiers (deployment, model, model_id, model_name)\n # Also leverage available_models list from EmbeddingsWithModels\n # Handle duplicate identifiers by creating combined keys\n embedding_by_model = {}\n identifier_conflicts = {} # Track which identifiers have conflicts\n\n for idx, emb_obj in enumerate(embeddings_list):\n # Get all possible identifiers for this embedding\n identifiers = []\n deployment = getattr(emb_obj, \"deployment\", None)\n model = getattr(emb_obj, \"model\", None)\n model_id = getattr(emb_obj, \"model_id\", None)\n model_name = getattr(emb_obj, \"model_name\", None)\n dimensions = getattr(emb_obj, \"dimensions\", None)\n available_models_attr = getattr(emb_obj, \"available_models\", None)\n\n logger.info(\n f\"Embedding object {idx}: deployment={deployment}, model={model}, \"\n f\"model_id={model_id}, model_name={model_name}, dimensions={dimensions}, \"\n f\"available_models={available_models_attr}\"\n )\n\n # If this embedding has available_models dict, map all models to their dedicated instances\n if available_models_attr and isinstance(available_models_attr, dict):\n logger.info(\n f\"Embedding object {idx} provides {len(available_models_attr)} models via available_models dict\"\n )\n for model_name_key, dedicated_embedding in available_models_attr.items():\n if model_name_key and str(model_name_key).strip():\n model_str = str(model_name_key).strip()\n if model_str not in embedding_by_model:\n # Use the dedicated embedding instance from the dict\n embedding_by_model[model_str] = dedicated_embedding\n logger.info(\n f\"Mapped available model '{model_str}' to dedicated embedding instance\"\n )\n else:\n # Conflict detected - track it\n if model_str not in identifier_conflicts:\n identifier_conflicts[model_str] = [embedding_by_model[model_str]]\n identifier_conflicts[model_str].append(dedicated_embedding)\n 
logger.warning(\n f\"Available model '{model_str}' has conflict - used by multiple embeddings\"\n )\n\n # Also map traditional identifiers (for backward compatibility)\n if deployment:\n identifiers.append(str(deployment))\n if model:\n identifiers.append(str(model))\n if model_id:\n identifiers.append(str(model_id))\n if model_name:\n identifiers.append(str(model_name))\n\n # Map all identifiers to this embedding object\n for identifier in identifiers:\n if identifier not in embedding_by_model:\n embedding_by_model[identifier] = emb_obj\n logger.info(f\"Mapped identifier '{identifier}' to embedding object {idx}\")\n else:\n # Conflict detected - track it\n if identifier not in identifier_conflicts:\n identifier_conflicts[identifier] = [embedding_by_model[identifier]]\n identifier_conflicts[identifier].append(emb_obj)\n logger.warning(\n f\"Identifier '{identifier}' has conflict - used by multiple embeddings\"\n )\n\n # For embeddings with model+deployment, create combined identifier\n # This helps when deployment is the same but model differs\n if deployment and model and deployment != model:\n combined_id = f\"{deployment}:{model}\"\n if combined_id not in embedding_by_model:\n embedding_by_model[combined_id] = emb_obj\n logger.info(\n f\"Created combined identifier '{combined_id}' for embedding object {idx}\"\n )\n\n # Log conflicts\n if identifier_conflicts:\n logger.warning(\n f\"Found {len(identifier_conflicts)} conflicting identifiers. 
\"\n f\"Consider using combined format 'deployment:model' or specifying unique model names.\"\n )\n for conflict_id, emb_list in identifier_conflicts.items():\n logger.warning(\n f\" Conflict on '{conflict_id}': {len(emb_list)} embeddings use this identifier\"\n )\n\n logger.info(f\"Generating embeddings for {len(available_models)} models in index\")\n logger.info(f\"Available embedding identifiers: {list(embedding_by_model.keys())}\")\n self.log(f\"[SEARCH] Models detected in index: {available_models}\")\n self.log(f\"[SEARCH] Available embedding identifiers: {list(embedding_by_model.keys())}\")\n\n # Track matching status for debugging\n matched_models = []\n unmatched_models = []\n\n for model_name in available_models:\n try:\n # Check if we have an embedding object for this model\n if model_name in embedding_by_model:\n # Use the matching embedding object directly\n emb_obj = embedding_by_model[model_name]\n emb_deployment = getattr(emb_obj, \"deployment\", None)\n emb_model = getattr(emb_obj, \"model\", None)\n emb_model_id = getattr(emb_obj, \"model_id\", None)\n emb_dimensions = getattr(emb_obj, \"dimensions\", None)\n emb_available_models = getattr(emb_obj, \"available_models\", None)\n\n logger.info(\n f\"Using embedding object for model '{model_name}': \"\n f\"deployment={emb_deployment}, model={emb_model}, model_id={emb_model_id}, \"\n f\"dimensions={emb_dimensions}\"\n )\n\n # Check if this is a dedicated instance from available_models dict\n if emb_available_models and isinstance(emb_available_models, dict):\n logger.info(\n f\"Model '{model_name}' using dedicated instance from available_models dict \"\n f\"(pre-configured with correct model and dimensions)\"\n )\n\n # Use the embedding instance directly - no model switching needed!\n vec = emb_obj.embed_query(q)\n query_embeddings[model_name] = vec\n matched_models.append(model_name)\n logger.info(\n f\"Generated embedding for model: {model_name} (actual dimensions: {len(vec)})\"\n )\n 
self.log(f\"[MATCH] Model '{model_name}' - generated {len(vec)}-dim embedding\")\n else:\n # No matching embedding found for this model\n unmatched_models.append(model_name)\n logger.warning(\n f\"No matching embedding found for model '{model_name}'. \"\n f\"This model will be skipped. Available identifiers: {list(embedding_by_model.keys())}\"\n )\n self.log(\n f\"[NO MATCH] Model '{model_name}' - available: {list(embedding_by_model.keys())}\"\n )\n except (\n RuntimeError,\n ValueError,\n ConnectionError,\n TimeoutError,\n AttributeError,\n KeyError,\n ) as e:\n logger.warning(f\"Failed to generate embedding for {model_name}: {e}\")\n self.log(f\"[ERROR] Embedding generation failed for '{model_name}': {e}\")\n\n # Log summary of model matching\n logger.info(\n f\"Model matching summary: {len(matched_models)} matched, {len(unmatched_models)} unmatched\"\n )\n self.log(\n f\"[SUMMARY] Model matching: {len(matched_models)} matched, {len(unmatched_models)} unmatched\"\n )\n if unmatched_models:\n self.log(f\"[WARN] Unmatched models in index: {unmatched_models}\")\n\n if not query_embeddings:\n msg = (\n f\"Failed to generate embeddings for any model. \"\n f\"Index has models: {available_models}, but no matching embedding objects found. 
\"\n f\"Available embedding identifiers: {list(embedding_by_model.keys())}\"\n )\n self.log(f\"[FAIL] Search failed: {msg}\")\n raise ValueError(msg)\n\n index_properties = self._get_index_properties(client)\n legacy_vector_field = getattr(self, \"vector_field\", \"chunk_embedding\")\n\n # Build KNN queries for each model\n embedding_fields: list[str] = []\n knn_queries_with_candidates = []\n knn_queries_without_candidates = []\n\n raw_num_candidates = getattr(self, \"num_candidates\", 1000)\n try:\n num_candidates = int(raw_num_candidates) if raw_num_candidates is not None else 0\n except (TypeError, ValueError):\n num_candidates = 0\n use_num_candidates = num_candidates > 0\n\n for model_name, embedding_vector in query_embeddings.items():\n field_name = get_embedding_field_name(model_name)\n selected_field = field_name\n vector_dim = len(embedding_vector)\n\n # Only use the expected dynamic field - no legacy fallback\n # This prevents dimension mismatches between models\n if not self._is_knn_vector_field(index_properties, selected_field):\n logger.warning(\n f\"Skipping model {model_name}: field '{field_name}' is not mapped as knn_vector. \"\n f\"Documents must be indexed with this embedding model before querying.\"\n )\n self.log(\n f\"[SKIP] Field '{selected_field}' not a knn_vector - skipping model '{model_name}'\"\n )\n continue\n\n # Validate vector dimensions match the field dimensions\n field_dim = self._get_field_dimension(index_properties, selected_field)\n if field_dim is not None and field_dim != vector_dim:\n logger.error(\n f\"Dimension mismatch for model '{model_name}': \"\n f\"Query vector has {vector_dim} dimensions but field '{selected_field}' expects {field_dim}. 
\"\n f\"Skipping this model to prevent search errors.\"\n )\n self.log(\n f\"[DIM MISMATCH] Model '{model_name}': query={vector_dim} vs field={field_dim} - skipping\"\n )\n continue\n\n logger.info(\n f\"Adding KNN query for model '{model_name}': field='{selected_field}', \"\n f\"query_dims={vector_dim}, field_dims={field_dim or 'unknown'}\"\n )\n embedding_fields.append(selected_field)\n\n base_query = {\n \"knn\": {\n selected_field: {\n \"vector\": embedding_vector,\n \"k\": 50,\n }\n }\n }\n\n if use_num_candidates:\n query_with_candidates = copy.deepcopy(base_query)\n query_with_candidates[\"knn\"][selected_field][\"num_candidates\"] = num_candidates\n else:\n query_with_candidates = base_query\n\n knn_queries_with_candidates.append(query_with_candidates)\n knn_queries_without_candidates.append(base_query)\n\n if not knn_queries_with_candidates:\n # No valid fields found - this can happen when:\n # 1. Index is empty (no documents yet)\n # 2. Embedding model has changed and field doesn't exist yet\n # Return empty results instead of failing\n logger.warning(\n \"No valid knn_vector fields found for embedding models. \"\n \"This may indicate an empty index or missing field mappings. \"\n \"Returning empty search results.\"\n )\n self.log(\n f\"[WARN] No valid KNN queries could be built. 
\"\n f\"Query embeddings generated: {list(query_embeddings.keys())}, \"\n f\"but no matching knn_vector fields found in index.\"\n )\n return []\n\n # Build exists filter - document must have at least one embedding field\n exists_any_embedding = {\n \"bool\": {\n \"should\": [{\"exists\": {\"field\": f}} for f in set(embedding_fields)],\n \"minimum_should_match\": 1,\n }\n }\n\n # Combine user filters with exists filter\n all_filters = [*filter_clauses, exists_any_embedding]\n\n # Get limit and score threshold\n limit = self._resolve_limit(filter_obj, default_limit=self.number_of_results)\n score_threshold = self._resolve_score_threshold(filter_obj)\n\n # Determine the best aggregation field for filename based on index mapping\n filename_agg_field = self._get_filename_agg_field(index_properties)\n\n # Build multi-model hybrid query\n body = {\n \"query\": {\n \"bool\": {\n \"should\": [\n {\n \"dis_max\": {\n \"tie_breaker\": 0.0, # Take only the best match, no blending\n \"boost\": 0.7, # 70% weight for semantic search\n \"queries\": knn_queries_with_candidates,\n }\n },\n {\n \"multi_match\": {\n \"query\": q,\n \"fields\": [\"text^2\", \"filename^1.5\"],\n \"type\": \"best_fields\",\n \"fuzziness\": \"AUTO\",\n \"boost\": 0.3, # 30% weight for keyword search\n }\n },\n ],\n \"minimum_should_match\": 1,\n \"filter\": all_filters,\n }\n },\n \"aggs\": {\n \"data_sources\": {\"terms\": {\"field\": filename_agg_field, \"size\": 20}},\n \"document_types\": {\"terms\": {\"field\": \"mimetype\", \"size\": 10}},\n \"owners\": {\"terms\": {\"field\": \"owner\", \"size\": 10}},\n \"embedding_models\": {\"terms\": {\"field\": \"embedding_model\", \"size\": 10}},\n },\n \"_source\": [\n \"filename\",\n \"mimetype\",\n \"page\",\n \"text\",\n \"source_url\",\n \"owner\",\n \"embedding_model\",\n \"allowed_users\",\n \"allowed_groups\",\n \"allowed_principals\",\n ],\n \"size\": limit,\n }\n\n if score_threshold is not None:\n body[\"min_score\"] = score_threshold\n\n 
logger.info(\n f\"Executing multi-model hybrid search with {len(knn_queries_with_candidates)} embedding models: \"\n f\"{list(query_embeddings.keys())}\"\n )\n self.log(\n f\"[EXEC] Executing search with {len(knn_queries_with_candidates)} KNN queries, limit={limit}\"\n )\n self.log(f\"[EXEC] Embedding models used: {list(query_embeddings.keys())}\")\n self.log(f\"[EXEC] KNN fields being queried: {embedding_fields}\")\n\n try:\n resp = client.search(index=self.index_name, body=body, params={\"terminate_after\": 0})\n except RequestError as e:\n error_message = str(e)\n lowered = error_message.lower()\n if use_num_candidates and \"num_candidates\" in lowered:\n logger.warning(\n \"Retrying search without num_candidates parameter due to cluster capabilities\",\n error=error_message,\n )\n fallback_body = copy.deepcopy(body)\n try:\n fallback_body[\"query\"][\"bool\"][\"should\"][0][\"dis_max\"][\"queries\"] = (\n knn_queries_without_candidates\n )\n except (KeyError, IndexError, TypeError) as inner_err:\n raise e from inner_err\n resp = client.search(\n index=self.index_name,\n body=fallback_body,\n params={\"terminate_after\": 0},\n )\n elif \"knn_vector\" in lowered or (\"field\" in lowered and \"knn\" in lowered):\n fallback_vector = next(iter(query_embeddings.values()), None)\n if fallback_vector is None:\n raise\n fallback_field = legacy_vector_field or \"chunk_embedding\"\n logger.warning(\n \"KNN search failed for dynamic fields; falling back to legacy field '%s'.\",\n fallback_field,\n )\n fallback_body = copy.deepcopy(body)\n fallback_body[\"query\"][\"bool\"][\"filter\"] = filter_clauses\n knn_fallback = {\n \"knn\": {\n fallback_field: {\n \"vector\": fallback_vector,\n \"k\": 50,\n }\n }\n }\n if use_num_candidates:\n knn_fallback[\"knn\"][fallback_field][\"num_candidates\"] = num_candidates\n fallback_body[\"query\"][\"bool\"][\"should\"][0][\"dis_max\"][\"queries\"] = [knn_fallback]\n resp = client.search(\n index=self.index_name,\n body=fallback_body,\n 
params={\"terminate_after\": 0},\n )\n else:\n raise\n hits = resp.get(\"hits\", {}).get(\"hits\", [])\n\n logger.info(f\"Found {len(hits)} results\")\n self.log(f\"[RESULT] Search complete: {len(hits)} results found\")\n\n if len(hits) == 0:\n self.log(\n f\"[EMPTY] Debug info: \"\n f\"models_in_index={available_models}, \"\n f\"matched_models={matched_models}, \"\n f\"knn_fields={embedding_fields}, \"\n f\"filters={len(filter_clauses)} clauses\"\n )\n\n return [\n {\n \"page_content\": hit[\"_source\"].get(\"text\", \"\"),\n \"metadata\": {k: v for k, v in hit[\"_source\"].items() if k != \"text\"},\n \"score\": hit.get(\"_score\"),\n }\n for hit in hits\n ]\n\n def search_documents(self) -> Table:\n \"\"\"Search documents and return results as a Table.\n\n This is the main interface method that performs the multi-model search using the\n configured search_query and returns results in Langflow's Table (DataFrame) format\n so downstream Parser components can consume them directly.\n\n Always builds the vector store (triggering ingestion if needed), then performs\n search only if a query is provided.\n\n Returns:\n Table containing search results with text and metadata\n\n Raises:\n Exception: If search operation fails\n \"\"\"\n\n try:\n # Always build/cache the vector store to ensure ingestion happens\n logger.info(f\"Search query: {self.search_query}\")\n if self._cached_vector_store is None:\n self.build_vector_store()\n\n # Only perform search if query is provided\n search_query = (self.search_query or \"\").strip()\n if not search_query:\n self.log(\"No search query provided - ingestion completed, returning empty results\")\n\n return Table(data=[])\n\n # Perform search with the provided query\n raw = self.search(search_query)\n raw_list = [Data(text=hit[\"page_content\"], **hit[\"metadata\"]) for hit in raw]\n return Table(data=raw_list)\n\n except Exception as e:\n self.log(f\"search_documents error: {e}\")\n raise\n\n # -------- dynamic UI handling (auth 
switch) --------\n async def update_build_config(\n self, build_config: dict, field_value: str, field_name: str | None = None\n ) -> dict:\n \"\"\"Dynamically update component configuration based on field changes.\n\n This method handles real-time UI updates, particularly for authentication\n mode changes that show/hide relevant input fields.\n\n Args:\n build_config: Current component configuration\n field_value: New value for the changed field\n field_name: Name of the field that changed\n\n Returns:\n Updated build configuration with appropriate field visibility\n \"\"\"\n try:\n if field_name == \"auth_mode\":\n mode = (field_value or \"basic\").strip().lower()\n is_basic = mode == \"basic\"\n is_jwt = mode == \"jwt\"\n is_openrag = mode == \"openrag\"\n\n build_config[\"username\"][\"show\"] = is_basic\n build_config[\"password\"][\"show\"] = is_basic\n\n build_config[\"jwt_token\"][\"show\"] = is_jwt\n build_config[\"jwt_header\"][\"show\"] = is_jwt\n build_config[\"bearer_prefix\"][\"show\"] = is_jwt\n\n build_config[\"username\"][\"required\"] = is_basic\n build_config[\"password\"][\"required\"] = is_basic\n\n # build_config[\"jwt_token\"][\"required\"] = is_jwt\n build_config[\"jwt_header\"][\"required\"] = is_jwt\n build_config[\"bearer_prefix\"][\"required\"] = False\n\n # In 'openrag' mode, expose the OPENRAG_* fields up front\n # since they are the only credentials required.\n for openrag_field in (\n \"openrag_ingest_url\",\n \"openrag_ingest_token\",\n \"openrag_ingest_run_id\",\n \"openrag_ingest_batch_size\",\n ):\n if openrag_field in build_config:\n build_config[openrag_field][\"advanced\"] = not is_openrag\n build_config[openrag_field][\"required\"] = (\n is_openrag and openrag_field != \"openrag_ingest_batch_size\"\n )\n\n if is_basic or is_openrag:\n build_config[\"jwt_token\"][\"value\"] = \"\"\n\n return build_config\n\n except (KeyError, ValueError) as e:\n self.log(f\"update_build_config error: {e}\")\n\n return build_config\n" }, 
"docs_metadata": { "_input_type": "TableInput", diff --git a/flows/openrag_nudges.json b/flows/openrag_nudges.json index 764bd39d5..37de26b4a 100644 --- a/flows/openrag_nudges.json +++ b/flows/openrag_nudges.json @@ -2489,7 +2489,7 @@ "show": true, "title_case": false, "type": "code", - "value": "from __future__ import annotations\n\nimport copy\nimport json\nimport uuid\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\nfrom typing import Any\n\nimport httpx\nfrom lfx.base.vectorstores.model import LCVectorStoreComponent, check_cached_vector_store\nfrom lfx.base.vectorstores.vector_store_connection_decorator import vector_store_connection\nfrom lfx.io import (\n BoolInput,\n DropdownInput,\n HandleInput,\n IntInput,\n MultilineInput,\n Output,\n SecretStrInput,\n StrInput,\n TableInput,\n)\nfrom lfx.log import logger\nfrom lfx.schema.data import Data\nfrom lfx.schema.dataframe import Table\nfrom opensearchpy import OpenSearch, helpers\nfrom opensearchpy.exceptions import OpenSearchException, RequestError\n\nREQUEST_TIMEOUT = 60\nMAX_RETRIES = 5\n\n# watsonx.ai surfaces rate-limit state via these (mostly non-standard) response\n# headers. The IBM SDK acts on the x-requests-limit-* family directly; we log\n# them on a failed embedding call to aid plan/region tuning.\n_WATSONX_RATE_LIMIT_HEADERS = (\n \"x-requests-limit-rate\",\n \"x-requests-limit-remaining\",\n \"x-requests-limit-reset\",\n \"Retry-After\",\n)\n\n\ndef _log_watsonx_rate_limit_headers(error: Exception) -> None:\n \"\"\"Best-effort diagnostic: log watsonx rate-limit headers from a failed call.\n\n The watsonx SDK raises ``ApiRequestFailure``, which carries the originating\n httpx/requests ``Response`` as ``.response``. 
On a 429 exhaustion we surface\n the documented rate-limit headers so operators can tune throughput.\n \"\"\"\n try:\n response = getattr(error, \"response\", None)\n headers = getattr(response, \"headers\", None)\n if not headers:\n return\n status = getattr(response, \"status_code\", \"unknown\")\n observed = {h: headers.get(h) for h in _WATSONX_RATE_LIMIT_HEADERS if headers.get(h) is not None}\n if str(status) == \"429\" or observed:\n logger.warning(f\"watsonx rate-limit response (status={status}): {observed}\")\n except Exception as log_error: # never let diagnostics mask the real error\n logger.debug(f\"Could not extract watsonx rate-limit headers: {log_error}\")\n\n\ndef normalize_model_name(model_name: str) -> str:\n \"\"\"Normalize embedding model name for use as field suffix.\n\n Converts model names to valid OpenSearch field names by replacing\n special characters and ensuring alphanumeric format.\n\n Args:\n model_name: Original embedding model name (e.g., \"text-embedding-3-small\")\n\n Returns:\n Normalized field suffix (e.g., \"text_embedding_3_small\")\n \"\"\"\n normalized = model_name.lower()\n # Replace common separators with underscores\n normalized = normalized.replace(\"-\", \"_\").replace(\":\", \"_\").replace(\"/\", \"_\").replace(\".\", \"_\")\n # Remove any non-alphanumeric characters except underscores\n normalized = \"\".join(c if c.isalnum() or c == \"_\" else \"_\" for c in normalized)\n # Remove duplicate underscores\n while \"__\" in normalized:\n normalized = normalized.replace(\"__\", \"_\")\n return normalized.strip(\"_\")\n\n\ndef get_embedding_field_name(model_name: str) -> str:\n \"\"\"Get the dynamic embedding field name for a model.\n\n Args:\n model_name: Embedding model name\n\n Returns:\n Field name in format: chunk_embedding_{normalized_model_name}\n \"\"\"\n logger.info(f\"chunk_embedding_{normalize_model_name(model_name)}\")\n return 
f\"chunk_embedding_{normalize_model_name(model_name)}\"\n\n\n@vector_store_connection\nclass OpenSearchVectorStoreComponentMultimodalMultiEmbedding(LCVectorStoreComponent):\n \"\"\"OpenSearch Vector Store Component with Multi-Model Hybrid Search Capabilities.\n\n This component provides vector storage and retrieval using OpenSearch, combining semantic\n similarity search (KNN) with keyword-based search for optimal results. It supports:\n - Multiple embedding models per index with dynamic field names\n - Automatic detection and querying of all available embedding models\n - Parallel embedding generation for multi-model search\n - Document ingestion with model tracking\n - Advanced filtering and aggregations\n - Flexible authentication options\n\n Features:\n - Multi-model vector storage with dynamic fields (chunk_embedding_{model_name})\n - Hybrid search combining multiple KNN queries (dis_max) + keyword matching\n - Auto-detection of available models in the index\n - Parallel query embedding generation for all detected models\n - Vector storage with configurable engines (jvector, nmslib, faiss, lucene)\n - Flexible authentication (Basic auth, JWT tokens)\n\n Model Name Resolution:\n - Priority: deployment > model > model_name attributes\n - This ensures correct matching between embedding objects and index fields\n - When multiple embeddings are provided, specify embedding_model_name to select which one to use\n - During search, each detected model in the index is matched to its corresponding embedding object\n \"\"\"\n\n display_name: str = \"OpenSearch (Multi-Model Multi-Embedding)\"\n icon: str = \"OpenSearch\"\n description: str = (\n \"Store and search documents using OpenSearch with multi-model hybrid semantic and keyword search. \"\n \"To search use the tools search_documents and raw_search. 
\"\n \"Search documents takes a query for vector search, for example\\n\"\n ' {search_query: \"components in openrag\"}'\n )\n\n # Keys we consider baseline\n default_keys: list[str] = [\n \"opensearch_url\",\n \"index_name\",\n *[i.name for i in LCVectorStoreComponent.inputs], # search_query, add_documents, etc.\n \"embedding\",\n \"embedding_model_name\",\n \"vector_field\",\n \"number_of_results\",\n \"auth_mode\",\n \"username\",\n \"password\",\n \"jwt_token\",\n \"jwt_header\",\n \"bearer_prefix\",\n \"use_ssl\",\n \"verify_certs\",\n \"filter_expression\",\n \"engine\",\n \"space_type\",\n \"ef_construction\",\n \"m\",\n \"num_candidates\",\n \"docs_metadata\",\n \"request_timeout\",\n \"max_retries\",\n \"openrag_ingest_url\",\n \"openrag_ingest_token\",\n \"openrag_ingest_run_id\",\n \"openrag_ingest_batch_size\",\n ]\n _openrag_ingest_global_placeholders = {\n \"openrag_ingest_url\": \"OPENRAG_INGEST_URL\",\n \"openrag_ingest_token\": \"OPENRAG_INGEST_TOKEN\",\n \"openrag_ingest_run_id\": \"OPENRAG_INGEST_RUN_ID\",\n }\n\n inputs = [\n TableInput(\n name=\"docs_metadata\",\n display_name=\"Document Metadata\",\n info=(\n \"Additional metadata key-value pairs to be added to all ingested documents. 
\"\n \"Useful for tagging documents with source information, categories, or other custom attributes.\"\n ),\n table_schema=[\n {\n \"name\": \"key\",\n \"display_name\": \"Key\",\n \"type\": \"str\",\n \"description\": \"Key name\",\n },\n {\n \"name\": \"value\",\n \"display_name\": \"Value\",\n \"type\": \"str\",\n \"description\": \"Value of the metadata\",\n },\n ],\n value=[],\n input_types=[\"Data\", \"JSON\"],\n ),\n StrInput(\n name=\"openrag_ingest_token\",\n display_name=\"OpenRAG Ingest Token\",\n value=\"OPENRAG_INGEST_TOKEN\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n info=\"Short-lived token used only for OpenRAG ingest callbacks.\",\n ),\n StrInput(\n name=\"openrag_ingest_run_id\",\n display_name=\"OpenRAG Ingest Run ID\",\n value=\"OPENRAG_INGEST_RUN_ID\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n ),\n StrInput(\n name=\"opensearch_url\",\n display_name=\"OpenSearch URL\",\n value=\"http://localhost:9200\",\n info=(\n \"The connection URL for your OpenSearch cluster \"\n \"(e.g., http://localhost:9200 for local development or your cloud endpoint).\"\n ),\n ),\n StrInput(\n name=\"index_name\",\n display_name=\"Index Name\",\n value=\"langflow\",\n info=(\n \"The OpenSearch index name where documents will be stored and searched. \"\n \"Will be created automatically if it doesn't exist.\"\n ),\n ),\n DropdownInput(\n name=\"engine\",\n display_name=\"Vector Engine\",\n options=[\"nmslib\", \"faiss\", \"lucene\", \"jvector\"],\n value=\"jvector\",\n info=(\n \"Vector search engine for similarity calculations. 'nmslib' works with standard \"\n \"OpenSearch. 'jvector' requires OpenSearch 2.9+. 'lucene' requires index.knn: true. 
\"\n \"Amazon OpenSearch Serverless only supports 'nmslib' or 'faiss'.\"\n ),\n advanced=True,\n ),\n DropdownInput(\n name=\"space_type\",\n display_name=\"Distance Metric\",\n options=[\"l2\", \"l1\", \"cosinesimil\", \"linf\", \"innerproduct\"],\n value=\"l2\",\n info=(\n \"Distance metric for calculating vector similarity. 'l2' (Euclidean) is most common, \"\n \"'cosinesimil' for cosine similarity, 'innerproduct' for dot product.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"ef_construction\",\n display_name=\"EF Construction\",\n value=512,\n info=(\n \"Size of the dynamic candidate list during index construction. \"\n \"Higher values improve recall but increase indexing time and memory usage.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"m\",\n display_name=\"M Parameter\",\n value=16,\n info=(\n \"Number of bidirectional connections for each vector in the HNSW graph. \"\n \"Higher values improve search quality but increase memory usage and indexing time.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"num_candidates\",\n display_name=\"Candidate Pool Size\",\n value=1000,\n info=(\n \"Number of approximate neighbors to consider for each KNN query. \"\n \"Some OpenSearch deployments do not support this parameter; set to 0 to disable.\"\n ),\n advanced=True,\n ),\n *LCVectorStoreComponent.inputs, # includes search_query, add_documents, etc.\n HandleInput(\n name=\"embedding\", display_name=\"Embedding\", input_types=[\"Embeddings\"], is_list=True\n ),\n StrInput(\n name=\"embedding_model_name\",\n display_name=\"Embedding Model Name\",\n value=\"\",\n info=(\n \"Name of the embedding model to use for ingestion. This selects which embedding from the list \"\n \"will be used to embed documents. Matches on deployment, model, model_id, or model_name. \"\n \"For duplicate deployments, use combined format: 'deployment:model' \"\n \"(e.g., 'text-embedding-ada-002:text-embedding-3-large'). \"\n \"Leave empty to use the first embedding. 
Error message will show all available identifiers.\"\n ),\n advanced=False,\n ),\n StrInput(\n name=\"vector_field\",\n display_name=\"Legacy Vector Field Name\",\n value=\"chunk_embedding\",\n advanced=True,\n info=(\n \"Legacy field name for backward compatibility. New documents use dynamic fields \"\n \"(chunk_embedding_{model_name}) based on the embedding_model_name.\"\n ),\n ),\n IntInput(\n name=\"number_of_results\",\n display_name=\"Default Result Limit\",\n value=10,\n advanced=True,\n info=(\n \"Default maximum number of search results to return when no limit is \"\n \"specified in the filter expression.\"\n ),\n ),\n MultilineInput(\n name=\"filter_expression\",\n display_name=\"Search Filters (JSON)\",\n value=\"\",\n info=(\n \"Optional JSON configuration for search filtering, result limits, and score thresholds.\\n\\n\"\n \"Format 1 - Explicit filters:\\n\"\n '{\"filter\": [{\"term\": {\"filename\":\"doc.pdf\"}}, '\n '{\"terms\":{\"owner\":[\"user1\",\"user2\"]}}], \"limit\": 10, \"score_threshold\": 1.6}\\n\\n'\n \"Format 2 - Context-style mapping:\\n\"\n '{\"data_sources\":[\"file.pdf\"], \"document_types\":[\"application/pdf\"], \"owners\":[\"user123\"]}\\n\\n'\n \"Use __IMPOSSIBLE_VALUE__ as placeholder to ignore specific filters.\"\n ),\n ),\n # ----- Auth controls (dynamic) -----\n DropdownInput(\n name=\"auth_mode\",\n display_name=\"Authentication Mode\",\n value=\"openrag\",\n options=[\"basic\", \"jwt\", \"openrag\"],\n info=(\n \"Authentication method: 'basic' for username/password authentication, \"\n \"'jwt' for JSON Web Token (Bearer) authentication, or 'openrag' to \"\n \"delegate writes to the OpenRAG backend ingest callback (no direct \"\n \"OpenSearch credentials required — only OPENRAG_* fields).\"\n ),\n real_time_refresh=True,\n advanced=False,\n ),\n StrInput(\n name=\"username\",\n display_name=\"Username\",\n value=\"admin\",\n show=True,\n ),\n SecretStrInput(\n name=\"password\",\n display_name=\"OpenSearch Password\",\n 
value=\"admin\",\n show=True,\n ),\n SecretStrInput(\n name=\"jwt_token\",\n display_name=\"JWT Token\",\n value=\"JWT\",\n load_from_db=False,\n show=False,\n info=(\n \"Valid JSON Web Token for authentication. \"\n \"Will be sent in the Authorization header (with optional 'Bearer ' prefix).\"\n ),\n required=False\n ),\n StrInput(\n name=\"jwt_header\",\n display_name=\"JWT Header Name\",\n value=\"Authorization\",\n show=False,\n advanced=True,\n ),\n BoolInput(\n name=\"bearer_prefix\",\n display_name=\"Prefix 'Bearer '\",\n value=False,\n show=False,\n advanced=True,\n ),\n # ----- TLS -----\n BoolInput(\n name=\"use_ssl\",\n display_name=\"Use SSL/TLS\",\n value=True,\n advanced=True,\n info=\"Enable SSL/TLS encryption for secure connections to OpenSearch.\",\n ),\n BoolInput(\n name=\"verify_certs\",\n display_name=\"Verify SSL Certificates\",\n value=False,\n advanced=True,\n info=(\n \"Verify SSL certificates when connecting. \"\n \"Disable for self-signed certificates in development environments.\"\n ),\n ),\n # ----- Timeout / Retry -----\n StrInput(\n name=\"request_timeout\",\n display_name=\"Request Timeout (seconds)\",\n value=\"60\",\n advanced=True,\n info=(\n \"Time in seconds to wait for a response from OpenSearch. 
\"\n \"Increase for large bulk ingestion or complex hybrid queries.\"\n ),\n ),\n StrInput(\n name=\"max_retries\",\n display_name=\"Max Retries\",\n value=\"3\",\n advanced=True,\n info=\"Number of retries for failed connections before raising an error.\",\n ),\n StrInput(\n name=\"openrag_ingest_url\",\n display_name=\"OpenRAG Ingest URL\",\n value=\"OPENRAG_INGEST_URL\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n info=\"Internal OpenRAG callback URL for backend-owned document indexing.\",\n ),\n StrInput(\n name=\"openrag_ingest_token\",\n display_name=\"OpenRAG Ingest Token\",\n value=\"OPENRAG_INGEST_TOKEN\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n info=\"Short-lived token used only for OpenRAG ingest callbacks.\",\n ),\n StrInput(\n name=\"openrag_ingest_run_id\",\n display_name=\"OpenRAG Ingest Run ID\",\n value=\"OPENRAG_INGEST_RUN_ID\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n ),\n IntInput(\n name=\"openrag_ingest_batch_size\",\n display_name=\"OpenRAG Ingest Batch Size\",\n value=100,\n advanced=True,\n ),\n ]\n outputs = [\n Output(\n display_name=\"Search Results\",\n name=\"search_results\",\n method=\"search_documents\",\n ),\n Output(display_name=\"Raw Search\", name=\"raw_search\", method=\"raw_search\"),\n ]\n\n def raw_search(self, query: str | dict | None = None) -> Data:\n \"\"\"Execute a raw OpenSearch query against the target index.\n\n Args:\n query (dict[str, Any]): The OpenSearch query DSL dictionary.\n\n Returns:\n Data: Search results as a Data object.\n\n Raises:\n ValueError: If 'query' is not a valid OpenSearch query (must be a non-empty dict).\n \"\"\"\n raw_query = query if query is not None else self.search_query\n\n if raw_query is None or (isinstance(raw_query, str) and not raw_query.strip()):\n self.log(\"No query provided for raw search - returning empty results\")\n return Data(data={})\n\n if isinstance(raw_query, 
dict):\n query_body = copy.deepcopy(raw_query)\n elif isinstance(raw_query, str):\n s = raw_query.strip()\n\n # First, optimistically try to parse as JSON DSL\n try:\n query_body = json.loads(s)\n except json.JSONDecodeError:\n # Fallback: treat as a basic text query over common fields\n query_body = {\n \"query\": {\n \"multi_match\": {\n \"query\": s,\n \"fields\": [\"text^2\", \"filename^1.5\"],\n \"type\": \"best_fields\",\n \"fuzziness\": \"AUTO\",\n }\n }\n }\n else:\n msg = f\"Unsupported raw_search query type: {type(raw_query)!r}\"\n raise TypeError(msg)\n\n filter_obj = self._parse_filter_expression()\n filter_clauses = self._coerce_filter_clauses(filter_obj)\n\n if filter_clauses:\n if \"query\" in query_body:\n original_query = query_body[\"query\"]\n query_body[\"query\"] = {\n \"bool\": {\n \"must\": [original_query],\n \"filter\": filter_clauses,\n }\n }\n else:\n query_body[\"query\"] = {\n \"bool\": {\n \"must\": [{\"match_all\": {}}],\n \"filter\": filter_clauses,\n }\n }\n\n if filter_obj:\n # Apply limit if not already set in the raw query\n if \"size\" not in query_body:\n limit = self._resolve_limit(filter_obj, default_limit=None)\n if limit is not None:\n query_body[\"size\"] = limit\n\n # Apply score_threshold / scoreThreshold as min_score if not already set\n if \"min_score\" not in query_body:\n\n score_threshold = self._resolve_score_threshold(filter_obj)\n if score_threshold is not None:\n\n query_body[\"min_score\"] = score_threshold\n\n client = self.build_client()\n logger.info(f\"query: {query_body}\")\n resp = client.search(\n index=self.index_name,\n body=query_body,\n params={\"terminate_after\": 0},\n )\n # Remove any _source keys whose value is a list of floats (embedding vectors)\n # Minimum length threshold to identify embedding vectors\n min_vector_length = 100\n\n def is_vector(val):\n # Accepts if it's a list of numbers (float or int) and has reasonable vector length\n return (\n isinstance(val, list)\n and len(val) > 
min_vector_length\n and all(isinstance(x, (float, int)) for x in val)\n )\n\n if \"hits\" in resp and \"hits\" in resp[\"hits\"]:\n for hit in resp[\"hits\"][\"hits\"]:\n source = hit.get(\"_source\")\n if isinstance(source, dict):\n keys_to_remove = [k for k, v in source.items() if is_vector(v)]\n for k in keys_to_remove:\n source.pop(k)\n logger.info(f\"Raw search response (all embedding vectors removed): {resp}\")\n return Data(**resp)\n\n def _get_embedding_model_name(self, embedding_obj=None) -> str:\n \"\"\"Get the embedding model name from component config or embedding object.\n\n Priority: deployment > model > model_id > model_name\n This ensures we use the actual model being deployed, not just the configured model.\n Supports multiple embedding providers (OpenAI, Watsonx, Cohere, etc.)\n\n Args:\n embedding_obj: Specific embedding object to get name from (optional)\n\n Returns:\n Embedding model name\n\n Raises:\n ValueError: If embedding model name cannot be determined\n \"\"\"\n # First try explicit embedding_model_name input\n if hasattr(self, \"embedding_model_name\") and self.embedding_model_name:\n return self.embedding_model_name.strip()\n\n # Try to get from provided embedding object\n if embedding_obj:\n # Priority: deployment > model > model_id > model_name\n if hasattr(embedding_obj, \"deployment\") and embedding_obj.deployment:\n return str(embedding_obj.deployment)\n if hasattr(embedding_obj, \"model\") and embedding_obj.model:\n return str(embedding_obj.model)\n if hasattr(embedding_obj, \"model_id\") and embedding_obj.model_id:\n return str(embedding_obj.model_id)\n if hasattr(embedding_obj, \"model_name\") and embedding_obj.model_name:\n return str(embedding_obj.model_name)\n\n # Try to get from embedding component (legacy single embedding)\n if hasattr(self, \"embedding\") and self.embedding:\n # Handle list of embeddings\n if isinstance(self.embedding, list) and len(self.embedding) > 0:\n first_emb = self.embedding[0]\n if 
hasattr(first_emb, \"deployment\") and first_emb.deployment:\n return str(first_emb.deployment)\n if hasattr(first_emb, \"model\") and first_emb.model:\n return str(first_emb.model)\n if hasattr(first_emb, \"model_id\") and first_emb.model_id:\n return str(first_emb.model_id)\n if hasattr(first_emb, \"model_name\") and first_emb.model_name:\n return str(first_emb.model_name)\n # Handle single embedding\n elif not isinstance(self.embedding, list):\n if hasattr(self.embedding, \"deployment\") and self.embedding.deployment:\n return str(self.embedding.deployment)\n if hasattr(self.embedding, \"model\") and self.embedding.model:\n return str(self.embedding.model)\n if hasattr(self.embedding, \"model_id\") and self.embedding.model_id:\n return str(self.embedding.model_id)\n if hasattr(self.embedding, \"model_name\") and self.embedding.model_name:\n return str(self.embedding.model_name)\n\n msg = (\n \"Could not determine embedding model name. \"\n \"Please set the 'embedding_model_name' field or ensure the embedding component \"\n \"has a 'deployment', 'model', 'model_id', or 'model_name' attribute.\"\n )\n raise ValueError(msg)\n\n # ---------- helper functions for index management ----------\n def _default_text_mapping(\n self,\n dim: int,\n engine: str = \"jvector\",\n space_type: str = \"l2\",\n ef_search: int = 512,\n ef_construction: int = 100,\n m: int = 16,\n vector_field: str = \"vector_field\",\n ) -> dict[str, Any]:\n \"\"\"Create the default OpenSearch index mapping for vector search.\n\n This method generates the index configuration with k-NN settings optimized\n for approximate nearest neighbor search using the specified vector engine.\n Includes the embedding_model keyword field for tracking which model was used.\n\n Args:\n dim: Dimensionality of the vector embeddings\n engine: Vector search engine (jvector, nmslib, faiss, lucene)\n space_type: Distance metric for similarity calculation\n ef_search: Size of dynamic list used during search\n 
ef_construction: Size of dynamic list used during index construction\n m: Number of bidirectional links for each vector\n vector_field: Name of the field storing vector embeddings\n\n Returns:\n Dictionary containing OpenSearch index mapping configuration\n \"\"\"\n return {\n \"settings\": {\"index\": {\"knn\": True, \"knn.algo_param.ef_search\": ef_search}},\n \"mappings\": {\n \"properties\": {\n vector_field: {\n \"type\": \"knn_vector\",\n \"dimension\": dim,\n \"method\": {\n \"name\": \"disk_ann\",\n \"space_type\": space_type,\n \"engine\": engine,\n \"parameters\": {\"ef_construction\": ef_construction, \"m\": m},\n },\n },\n \"embedding_model\": {\"type\": \"keyword\"}, # Track which model was used\n \"embedding_dimensions\": {\"type\": \"integer\"},\n }\n },\n }\n\n def _ensure_embedding_field_mapping(\n self,\n client: OpenSearch,\n index_name: str,\n field_name: str,\n dim: int,\n engine: str,\n space_type: str,\n ef_construction: int,\n m: int,\n ) -> None:\n \"\"\"Lazily add a dynamic embedding field to the index if it doesn't exist.\n\n This allows adding new embedding models without recreating the entire index.\n Also ensures the embedding_model tracking field exists.\n\n Note: Some OpenSearch versions/configurations have issues with dynamically adding\n knn_vector mappings (NullPointerException). 
This method checks if the field\n already exists before attempting to add it, and gracefully skips if the field\n is already properly configured.\n\n Args:\n client: OpenSearch client instance\n index_name: Target index name\n field_name: Dynamic field name for this embedding model\n dim: Vector dimensionality\n engine: Vector search engine\n space_type: Distance metric\n ef_construction: Construction parameter\n m: HNSW parameter\n \"\"\"\n # First, check if the field already exists and is properly mapped\n properties = self._get_index_properties(client)\n if self._is_knn_vector_field(properties, field_name):\n # Field already exists as knn_vector - verify dimensions match\n existing_dim = self._get_field_dimension(properties, field_name)\n if existing_dim is not None and existing_dim != dim:\n logger.warning(\n f\"Field '{field_name}' exists with dimension {existing_dim}, \"\n f\"but current embedding has dimension {dim}. Using existing mapping.\"\n )\n else:\n logger.info(\n f\"[OpenSearchMultimodel] Field '{field_name}' already exists\"\n f\"as knn_vector with matching dimensions - skipping mapping update\"\n )\n return\n\n # Field doesn't exist, try to add the mapping\n try:\n mapping = {\n \"properties\": {\n field_name: {\n \"type\": \"knn_vector\",\n \"dimension\": dim,\n \"method\": {\n \"name\": \"disk_ann\",\n \"space_type\": space_type,\n \"engine\": engine,\n \"parameters\": {\"ef_construction\": ef_construction, \"m\": m},\n },\n },\n # Also ensure the embedding_model tracking field exists as keyword\n \"embedding_model\": {\"type\": \"keyword\"},\n \"embedding_dimensions\": {\"type\": \"integer\"},\n }\n }\n client.indices.put_mapping(index=index_name, body=mapping)\n logger.info(f\"Added/updated embedding field mapping: {field_name}\")\n except RequestError as e:\n error_str = str(e).lower()\n if \"invalid engine\" in error_str and \"jvector\" in error_str:\n msg = (\n \"The 'jvector' engine is not available in your OpenSearch installation. 
\"\n \"Use 'nmslib' or 'faiss' for standard OpenSearch, or upgrade to OpenSearch 2.9+.\"\n )\n raise ValueError(msg) from e\n if \"index.knn\" in error_str:\n msg = (\n \"The index has index.knn: false. Delete the existing index and let the \"\n \"component recreate it, or create a new index with a different name.\"\n )\n raise ValueError(msg) from e\n raise\n except Exception as e:\n # Check if this is the known OpenSearch k-NN NullPointerException issue\n error_str = str(e).lower()\n if \"null\" in error_str or \"nullpointerexception\" in error_str:\n logger.warning(\n f\"[OpenSearchMultimodel] Could not add embedding field mapping for {field_name}\"\n f\"due to OpenSearch k-NN plugin issue: {e}. \"\n f\"This is a known issue with some OpenSearch versions. \"\n f\"[OpenSearchMultimodel] Skipping mapping update. \"\n f\"Please ensure the index has the correct mapping for KNN search to work.\"\n )\n # Skip and continue - ingestion will proceed, but KNN search may fail if mapping doesn't exist\n return\n logger.warning(\n f\"[OpenSearchMultimodel] Could not add embedding field mapping for {field_name}: {e}\"\n )\n raise\n\n # Verify the field was added correctly\n properties = self._get_index_properties(client)\n if not self._is_knn_vector_field(properties, field_name):\n msg = f\"Field '{field_name}' is not mapped as knn_vector. Current mapping: {properties.get(field_name)}\"\n logger.error(msg)\n raise ValueError(msg)\n\n def _validate_aoss_with_engines(self, *, is_aoss: bool, engine: str) -> None:\n \"\"\"Validate engine compatibility with Amazon OpenSearch Serverless (AOSS).\n\n Amazon OpenSearch Serverless has restrictions on which vector engines\n can be used. 
This method ensures the selected engine is compatible.\n\n Args:\n is_aoss: Whether the connection is to Amazon OpenSearch Serverless\n engine: The selected vector search engine\n\n Raises:\n ValueError: If AOSS is used with an incompatible engine\n \"\"\"\n if is_aoss and engine not in {\"nmslib\", \"faiss\"}:\n msg = \"Amazon OpenSearch Service Serverless only supports `nmslib` or `faiss` engines\"\n raise ValueError(msg)\n\n def _is_aoss_enabled(self, http_auth: Any) -> bool:\n \"\"\"Determine if Amazon OpenSearch Serverless (AOSS) is being used.\n\n Args:\n http_auth: The HTTP authentication object\n\n Returns:\n True if AOSS is enabled, False otherwise\n \"\"\"\n return (\n http_auth is not None and hasattr(http_auth, \"service\") and http_auth.service == \"aoss\"\n )\n\n @staticmethod\n def _openrag_input_to_str(value: Any) -> str:\n if value is None:\n return \"\"\n if hasattr(value, \"get_secret_value\"):\n value = value.get_secret_value()\n if hasattr(value, \"text\"):\n value = value.text\n return str(value or \"\").strip()\n\n def _openrag_callback_value(self, attr_name: str) -> str:\n value = self._openrag_input_to_str(getattr(self, attr_name, \"\"))\n if value == self._openrag_ingest_global_placeholders.get(attr_name):\n return \"\"\n return value\n\n def _openrag_ingest_callback_config(self) -> tuple[str, str, str] | None:\n url = self._openrag_callback_value(\"openrag_ingest_url\")\n token = self._openrag_callback_value(\"openrag_ingest_token\")\n ingest_run_id = self._openrag_callback_value(\"openrag_ingest_run_id\")\n\n masked_token = (\n f\"{token[:4]}...{token[-4:]}\" if len(token) >= 8 else (\"\" if token else \"\")\n )\n debug_payload = {\n \"openrag_ingest_url\": url,\n \"openrag_ingest_url_len\": len(url),\n \"openrag_ingest_token_masked\": masked_token,\n \"openrag_ingest_token_len\": len(token),\n \"openrag_ingest_run_id\": ingest_run_id,\n \"raw_url_type\": type(self.openrag_ingest_url).__name__,\n \"raw_token_type\": 
type(self.openrag_ingest_token).__name__,\n \"raw_run_id_type\": type(self.openrag_ingest_run_id).__name__,\n }\n logger.warning(f\"[OpenRAG callback config] {debug_payload}\")\n try:\n self.log(f\"[OpenRAG callback config] {debug_payload}\")\n except Exception:\n pass\n\n if not url and not token and not ingest_run_id:\n return None\n if not url or not token or not ingest_run_id:\n msg = \"OpenRAG ingest callback requires url, token, and ingest_run_id.\"\n raise ValueError(msg)\n return url, token, ingest_run_id\n\n def _post_openrag_ingest_batches(\n self,\n *,\n requests: list[dict],\n vector_field: str,\n text_field: str,\n ) -> None:\n callback_config = self._openrag_ingest_callback_config()\n if callback_config is None:\n return\n\n url, token, ingest_run_id = callback_config\n batch_size = max(self._parse_int_param(\"openrag_ingest_batch_size\", 100), 1)\n timeout = self._parse_int_param(\"request_timeout\", REQUEST_TIMEOUT)\n headers = {\"Authorization\": f\"Bearer {token}\"}\n\n masked_token = (\n f\"{token[:4]}...{token[-4:]}\" if len(token) >= 8 else (\"\" if token else \"\")\n )\n request_summary = {\n \"url\": url,\n \"ingest_run_id\": ingest_run_id,\n \"token_masked\": masked_token,\n \"total_chunks\": len(requests),\n \"batch_size\": batch_size,\n \"timeout_s\": timeout,\n }\n logger.warning(f\"[OpenRAG ingest POST] {request_summary}\")\n try:\n self.log(f\"[OpenRAG ingest POST] {request_summary}\")\n except Exception:\n pass\n\n with httpx.Client(timeout=timeout) as client:\n total_batches = (len(requests) + batch_size - 1) // batch_size\n for batch_number, start in enumerate(range(0, len(requests), batch_size), start=1):\n batch = requests[start : start + batch_size]\n final = batch_number == total_batches\n payload = {\n \"ingest_run_id\": ingest_run_id,\n \"batch_id\": batch_number,\n \"final\": final,\n \"chunks\": [\n self._openrag_chunk_payload(\n request,\n vector_field=vector_field,\n text_field=text_field,\n )\n for request in batch\n ],\n 
}\n logger.warning(\n f\"[OpenRAG ingest POST] -> batch={batch_number}/{total_batches} \"\n f\"url={url} chunks={len(payload['chunks'])} final={final}\"\n )\n response = client.post(url, json=payload, headers=headers)\n response_summary = {\n \"batch\": batch_number,\n \"url\": url,\n \"status\": response.status_code,\n \"final_url\": str(response.request.url),\n \"response_headers\": dict(response.headers),\n \"body_preview\": response.text[:500],\n }\n logger.warning(f\"[OpenRAG ingest POST resp] {response_summary}\")\n try:\n self.log(f\"[OpenRAG ingest POST resp] {response_summary}\")\n except Exception:\n pass\n if response.status_code >= 400:\n msg = (\n \"OpenRAG ingest callback failed \"\n f\"(batch={batch_number}, status={response.status_code}, \"\n f\"url={url}): {response.text[:1000]}\"\n )\n raise RuntimeError(msg)\n\n self.log(f\"Posted {len(requests)} chunks to OpenRAG backend ingest callback.\")\n\n @staticmethod\n def _openrag_chunk_payload(\n request: dict,\n *,\n vector_field: str,\n text_field: str,\n ) -> dict:\n metadata = {\n key: value\n for key, value in request.items()\n if key not in {\"_op_type\", \"_index\", \"_id\", \"id\", vector_field, text_field}\n }\n page = metadata.get(\"page\")\n if isinstance(page, str) and page.isdigit():\n page = int(page)\n return {\n \"id\": request.get(\"_id\") or request.get(\"id\"),\n \"text\": request.get(text_field, \"\"),\n \"vector\": request[vector_field],\n \"page\": page if isinstance(page, int) else None,\n \"metadata\": metadata,\n }\n\n def _bulk_ingest_embeddings(\n self,\n client: OpenSearch,\n index_name: str,\n embeddings: list[list[float]],\n texts: list[str],\n metadatas: list[dict] | None = None,\n ids: list[str] | None = None,\n vector_field: str = \"vector_field\",\n text_field: str = \"text\",\n embedding_model: str = \"unknown\",\n mapping: dict | None = None,\n max_chunk_bytes: int | None = 1 * 1024 * 1024,\n *,\n is_aoss: bool = False,\n ) -> list[str]:\n \"\"\"Efficiently ingest 
multiple documents with embeddings into OpenSearch.\n\n This method uses bulk operations to insert documents with their vector\n embeddings and metadata into the specified OpenSearch index. Each document\n is tagged with the embedding_model name for tracking.\n\n Args:\n client: OpenSearch client instance\n index_name: Target index for document storage\n embeddings: List of vector embeddings for each document\n texts: List of document texts\n metadatas: Optional metadata dictionaries for each document\n ids: Optional document IDs (UUIDs generated if not provided)\n vector_field: Field name for storing vector embeddings\n text_field: Field name for storing document text\n embedding_model: Name of the embedding model used\n mapping: Optional index mapping configuration\n max_chunk_bytes: Maximum size per bulk request chunk\n is_aoss: Whether using Amazon OpenSearch Serverless\n\n Returns:\n List of document IDs that were successfully ingested\n \"\"\"\n logger.debug(f\"[OpenSearchMultimodel] Bulk ingesting embeddings for {index_name}\")\n if not mapping:\n mapping = {}\n\n requests = []\n return_ids = []\n vector_dimensions = len(embeddings[0]) if embeddings else None\n\n for i, text in enumerate(texts):\n metadata = metadatas[i] if metadatas else {}\n if vector_dimensions is not None and \"embedding_dimensions\" not in metadata:\n metadata = {**metadata, \"embedding_dimensions\": vector_dimensions}\n\n # Normalize ACL fields that may arrive as JSON strings from flows\n for key in (\"allowed_users\", \"allowed_groups\", \"allowed_principals\"):\n value = metadata.get(key)\n if isinstance(value, str):\n try:\n parsed = json.loads(value)\n if isinstance(parsed, list):\n metadata[key] = parsed\n except (json.JSONDecodeError, TypeError):\n # Leave value as-is if it isn't valid JSON\n pass\n\n metadata_document_id = str(metadata.get(\"document_id\") or \"\").strip()\n if metadata_document_id and metadata_document_id.lower() != \"none\":\n generated_id = 
f\"{metadata_document_id}_{i}\"\n else:\n generated_id = str(uuid.uuid4())\n _id = ids[i] if ids else generated_id\n request = {\n \"_op_type\": \"index\",\n \"_index\": index_name,\n vector_field: embeddings[i],\n text_field: text,\n \"embedding_model\": embedding_model, # Track which model was used\n **metadata,\n }\n if is_aoss:\n request[\"id\"] = _id\n else:\n request[\"_id\"] = _id\n requests.append(request)\n return_ids.append(_id)\n if metadatas:\n self.log(f\"Sample metadata: {metadatas[0] if metadatas else {}}\")\n if self._openrag_ingest_callback_config() is not None:\n self._post_openrag_ingest_batches(\n requests=requests,\n vector_field=vector_field,\n text_field=text_field,\n )\n return return_ids\n try:\n helpers.bulk(client, requests, max_chunk_bytes=max_chunk_bytes)\n except Exception as bulk_error:\n if \"Unsupported request type for filter level DLS\" not in str(bulk_error):\n raise\n logger.warning(\n \"[OpenSearchMultimodel] Bulk ingest is blocked by filter-level DLS; \"\n \"falling back to per-document index requests.\"\n )\n self._index_embeddings_individually(client, requests)\n return return_ids\n\n def _index_embeddings_individually(\n self,\n client: OpenSearch,\n requests: list[dict],\n ) -> None:\n \"\"\"Index documents one at a time when OpenSearch DLS rejects bulk writes.\"\"\"\n for request in requests:\n document_id = request.get(\"_id\") or request.get(\"id\")\n body = {\n key: value\n for key, value in request.items()\n if key not in {\"_op_type\", \"_index\", \"_id\", \"id\"}\n }\n client.index(index=request[\"_index\"], id=document_id, body=body)\n\n def _log_index_admin_skip(self, operation: str, error: Exception) -> None:\n \"\"\"Log index-admin operations that may be blocked under filter-level DLS.\"\"\"\n logger.warning(\n f\"[OpenSearchMultimodel] Could not run index-admin operation '{operation}': {error}. 
\"\n \"Assuming the backend pre-created the required index/mapping and continuing.\"\n )\n\n # ---------- param helpers ----------\n def _parse_int_param(self, attr_name: str, default: int) -> int:\n \"\"\"Parse a string attribute to int, returning *default* on failure.\"\"\"\n raw = getattr(self, attr_name, None)\n if raw is None or str(raw).strip() == \"\":\n return default\n try:\n value = int(str(raw).strip())\n except ValueError:\n logger.warning(\n f\"Invalid integer value '{raw}' for {attr_name}, using default {default}\"\n )\n return default\n\n if value < 0:\n logger.warning(f\"Negative value '{raw}' for {attr_name}, using default {default}\")\n return default\n\n return value\n\n # ---------- auth / client ----------\n def _build_auth_kwargs(self) -> dict[str, Any]:\n \"\"\"Build authentication configuration for OpenSearch client.\n\n Constructs the appropriate authentication parameters based on the\n selected auth mode (basic username/password or JWT token).\n\n Returns:\n Dictionary containing authentication configuration\n\n Raises:\n ValueError: If required authentication parameters are missing\n \"\"\"\n mode = (self.auth_mode or \"basic\").strip().lower()\n if mode == \"jwt\":\n token = (self.jwt_token or \"\").strip()\n if not token:\n msg = \"Auth Mode is 'jwt' but no jwt_token was provided.\"\n raise ValueError(msg)\n header_name = (self.jwt_header or \"Authorization\").strip()\n header_value = f\"Bearer {token}\" if self.bearer_prefix else token\n return {\"headers\": {header_name: header_value}}\n if mode == \"openrag\":\n # Writes are delegated to the OpenRAG backend ingest callback,\n # so no direct OpenSearch credentials are needed. 
def build_client(self) -> OpenSearch:
    """Instantiate an OpenSearch client from this component's settings.

    Authentication keyword arguments come from ``_build_auth_kwargs``; the
    timeout and retry count are read through ``_parse_int_param`` with the
    module-level ``REQUEST_TIMEOUT`` and ``MAX_RETRIES`` as fallbacks.

    Returns:
        The constructed OpenSearch client.
    """
    logger.debug("[OpenSearchMultimodel] Building OpenSearch client")

    # Resolve auth first so a misconfigured auth mode fails before any
    # connection options are read.
    auth_kwargs = self._build_auth_kwargs()

    connection_options = {
        "hosts": [self.opensearch_url],
        "use_ssl": self.use_ssl,
        "verify_certs": self.verify_certs,
        "ssl_assert_hostname": False,
        "ssl_show_warn": False,
        "timeout": self._parse_int_param("request_timeout", REQUEST_TIMEOUT),
        "max_retries": self._parse_int_param("max_retries", MAX_RETRIES),
        "retry_on_timeout": True,
    }

    # Two separate ** expansions (rather than a merged dict) keep the
    # "duplicate keyword" TypeError if auth ever collides with an option.
    return OpenSearch(**connection_options, **auth_kwargs)
client\n\n # ---------- ingest ----------\n def _add_documents_to_vector_store(self, client: OpenSearch) -> None:\n \"\"\"Process and ingest documents into the OpenSearch vector store.\n\n This method handles the complete document ingestion pipeline:\n - Prepares document data and metadata\n - Generates vector embeddings using the selected model\n - Creates appropriate index mappings with dynamic field names\n - Bulk inserts documents with vectors and model tracking\n\n Args:\n client: OpenSearch client for performing operations\n \"\"\"\n logger.debug(\"[OpenSearchMultimodel][INGESTION] _add_documents_to_vector_store called\")\n # Convert DataFrame to Data if needed using parent's method\n self.ingest_data = self._prepare_ingest_data()\n\n logger.debug(\n f\"[OpenSearchMultimodel][INGESTION] ingest_data type: \"\n f\"{type(self.ingest_data)}, length: {len(self.ingest_data) if self.ingest_data else 0}\"\n )\n logger.debug(\n f\"[OpenSearchMultimodel][INGESTION] ingest_data content: \"\n f\"{self.ingest_data[:2] if self.ingest_data and len(self.ingest_data) > 0 else 'empty'}\"\n )\n\n docs = self.ingest_data or []\n if not docs:\n logger.debug(\"Ingestion complete: No documents provided\")\n return\n\n if not self.embedding:\n msg = \"Embedding handle is required to embed documents.\"\n raise ValueError(msg)\n\n # Normalize embedding to list first\n embeddings_list = self.embedding if isinstance(self.embedding, list) else [self.embedding]\n\n # Filter out None values (fail-safe mode) - do this BEFORE checking if empty\n embeddings_list = [e for e in embeddings_list if e is not None]\n\n # NOW check if we have any valid embeddings left after filtering\n if not embeddings_list:\n logger.warning(\n \"All embeddings returned None (fail-safe mode enabled). Skipping document ingestion.\"\n )\n self.log(\n \"Embedding returned None (fail-safe mode enabled). 
Skipping document ingestion.\"\n )\n return\n\n logger.debug(\n f\"[OpenSearchMultimodel][INGESTION] Valid embeddings after filtering: {len(embeddings_list)}\"\n )\n self.log(\n f\"[OpenSearchMultimodel][INGESTION] Available embedding models: {len(embeddings_list)}\"\n )\n\n # Select the embedding to use for ingestion\n selected_embedding = None\n embedding_model = None\n\n # If embedding_model_name is specified, find matching embedding\n if (\n hasattr(self, \"embedding_model_name\")\n and self.embedding_model_name\n and self.embedding_model_name.strip()\n ):\n target_model_name = self.embedding_model_name.strip()\n self.log(f\"Looking for embedding model: {target_model_name}\")\n\n for emb_obj in embeddings_list:\n # Check all possible model identifiers (deployment, model, model_id, model_name)\n # Also check available_models list from EmbeddingsWithModels\n possible_names = []\n deployment = getattr(emb_obj, \"deployment\", None)\n model = getattr(emb_obj, \"model\", None)\n model_id = getattr(emb_obj, \"model_id\", None)\n model_name = getattr(emb_obj, \"model_name\", None)\n available_models_attr = getattr(emb_obj, \"available_models\", None)\n\n if deployment:\n possible_names.append(str(deployment))\n if model:\n possible_names.append(str(model))\n if model_id:\n possible_names.append(str(model_id))\n if model_name:\n possible_names.append(str(model_name))\n\n # Also add combined identifier\n if deployment and model and deployment != model:\n possible_names.append(f\"{deployment}:{model}\")\n\n # Add all models from available_models dict\n if available_models_attr and isinstance(available_models_attr, dict):\n possible_names.extend(\n str(model_key).strip()\n for model_key in available_models_attr\n if model_key and str(model_key).strip()\n )\n\n # Match if target matches any of the possible names\n if target_model_name in possible_names:\n # Check if target is in available_models dict - use dedicated instance\n if (\n available_models_attr\n and 
isinstance(available_models_attr, dict)\n and target_model_name in available_models_attr\n ):\n # Use the dedicated embedding instance from the dict\n selected_embedding = available_models_attr[target_model_name]\n embedding_model = target_model_name\n self.log(\n f\"Found dedicated embedding instance for '{embedding_model}' in available_models dict\"\n )\n else:\n # Traditional identifier match\n selected_embedding = emb_obj\n embedding_model = self._get_embedding_model_name(emb_obj)\n self.log(\n f\"Found matching embedding model: {embedding_model} (matched on: {target_model_name})\"\n )\n break\n\n if not selected_embedding:\n # Build detailed list of available embeddings with all their identifiers\n available_info = []\n for idx, emb in enumerate(embeddings_list):\n emb_type = type(emb).__name__\n identifiers = []\n deployment = getattr(emb, \"deployment\", None)\n model = getattr(emb, \"model\", None)\n model_id = getattr(emb, \"model_id\", None)\n model_name = getattr(emb, \"model_name\", None)\n available_models_attr = getattr(emb, \"available_models\", None)\n\n if deployment:\n identifiers.append(f\"deployment='{deployment}'\")\n if model:\n identifiers.append(f\"model='{model}'\")\n if model_id:\n identifiers.append(f\"model_id='{model_id}'\")\n if model_name:\n identifiers.append(f\"model_name='{model_name}'\")\n\n # Add combined identifier as an option\n if deployment and model and deployment != model:\n identifiers.append(f\"combined='{deployment}:{model}'\")\n\n # Add available_models dict if present\n if available_models_attr and isinstance(available_models_attr, dict):\n identifiers.append(f\"available_models={list(available_models_attr.keys())}\")\n\n available_info.append(\n f\" [{idx}] {emb_type}: {', '.join(identifiers) if identifiers else 'No identifiers'}\"\n )\n\n msg = (\n f\"Embedding model '{target_model_name}' not found in available embeddings.\\n\\n\"\n f\"Available embeddings:\\n\" + \"\\n\".join(available_info) + \"\\n\\n\"\n \"Please 
set 'embedding_model_name' to one of the identifier values shown above \"\n \"(use the value after the '=' sign, without quotes).\\n\"\n \"For duplicate deployments, use the 'combined' format.\\n\"\n \"Or leave it empty to use the first embedding.\"\n )\n raise ValueError(msg)\n else:\n # Use first embedding if no model name specified\n selected_embedding = embeddings_list[0]\n embedding_model = self._get_embedding_model_name(selected_embedding)\n self.log(f\"No embedding_model_name specified, using first embedding: {embedding_model}\")\n\n dynamic_field_name = get_embedding_field_name(embedding_model)\n\n logger.info(f\"Selected embedding model for ingestion: '{embedding_model}'\")\n self.log(f\"Using embedding model for ingestion: {embedding_model}\")\n self.log(f\"Dynamic vector field: {dynamic_field_name}\")\n\n # Log embedding details for debugging\n if hasattr(selected_embedding, \"deployment\"):\n logger.info(f\"Embedding deployment: {selected_embedding.deployment}\")\n if hasattr(selected_embedding, \"model\"):\n logger.info(f\"Embedding model: {selected_embedding.model}\")\n if hasattr(selected_embedding, \"model_id\"):\n logger.info(f\"Embedding model_id: {selected_embedding.model_id}\")\n if hasattr(selected_embedding, \"dimensions\"):\n logger.info(f\"Embedding dimensions: {selected_embedding.dimensions}\")\n if hasattr(selected_embedding, \"available_models\"):\n logger.info(f\"Embedding available_models: {selected_embedding.available_models}\")\n\n # No model switching needed - each model in available_models has its own dedicated instance\n # The selected_embedding is already configured correctly for the target model\n logger.info(\n f\"Using embedding instance for '{embedding_model}' - pre-configured and ready to use\"\n )\n\n # Extract texts and metadata from documents\n texts = []\n metadatas = []\n # Process docs_metadata table input into a dict\n additional_metadata = {}\n logger.debug(f\"[LF] Docs metadata {self.docs_metadata}\")\n if 
hasattr(self, \"docs_metadata\") and self.docs_metadata:\n logger.info(f\"[LF] Docs metadata {self.docs_metadata}\")\n if isinstance(self.docs_metadata[-1], Data):\n logger.info(f\"[LF] Docs metadata is a Data object {self.docs_metadata}\")\n self.docs_metadata = self.docs_metadata[-1].data\n logger.info(f\"[LF] Docs metadata is a Data object {self.docs_metadata}\")\n additional_metadata.update(self.docs_metadata)\n else:\n for item in self.docs_metadata:\n if isinstance(item, dict) and \"key\" in item and \"value\" in item:\n additional_metadata[item[\"key\"]] = item[\"value\"]\n # Replace string \"None\" values with actual None\n for key, value in additional_metadata.items():\n if value == \"None\":\n additional_metadata[key] = None\n logger.info(f\"[LF] Additional metadata {additional_metadata}\")\n for doc_obj in docs:\n data_copy = json.loads(doc_obj.model_dump_json())\n text = data_copy.pop(doc_obj.text_key, doc_obj.default_value)\n texts.append(text)\n\n # Merge additional metadata from table input\n data_copy.update(additional_metadata)\n\n metadatas.append(data_copy)\n self.log(metadatas)\n\n # Determine whether the selected embedding is watsonx/IBM. The watsonx\n # SDK ships its own rate-limit machinery (input batching, proactive\n # x-requests-limit-* TokenBucket throttling, and jittered exponential\n # backoff on 429), so we lean on it instead of retrying on top of it.\n # The type-name check also covers watsonx-hosted, non-\"ibm/\" models\n # (e.g. 
intfloat/multilingual-e5-large).\n is_ibm = (embedding_model and \"ibm\" in str(embedding_model).lower()) or (\n selected_embedding and \"watsonx\" in type(selected_embedding).__name__.lower()\n )\n logger.debug(f\"Is IBM/watsonx embedding: {is_ibm}\")\n\n if is_ibm:\n\n # Hand the full batch to the SDK and let it batch/throttle/retry.\n # Retry attempts and base backoff are tunable via the SDK's own\n # WATSONX_MAX_RETRIES / WATSONX_DELAY_TIME environment variables.\n logger.info(\n f\"Embedding {len(texts)} chunks via watsonx SDK batch (SDK-managed throttle + 429 retry)\"\n )\n try:\n vectors: list[list[float]] = selected_embedding.embed_documents(texts)\n logger.info(f\"Successfully embedded {len(vectors)} chunks via watsonx SDK\")\n except Exception as embed_error:\n _log_watsonx_rate_limit_headers(embed_error)\n logger.error(\n f\"Failed to embed {len(texts)} chunks via watsonx SDK. Error: {embed_error}\",\n )\n raise\n\n else:\n # Non-watsonx providers (OpenAI, Ollama) lack the watsonx SDK's\n # built-in rate-limit handling, so embed per chunk in parallel with\n # a generic rate-limit-aware tenacity retry.\n vectors: list[list[float]] = [None] * len(texts)\n from tenacity import (\n retry,\n retry_if_exception,\n stop_after_attempt,\n wait_exponential,\n )\n\n def is_rate_limit_error(exception: Exception) -> bool:\n \"\"\"Check if exception is a rate limit error (429).\"\"\"\n error_str = str(exception).lower()\n return \"429\" in error_str or \"rate_limit\" in error_str or \"rate limit\" in error_str\n\n def is_other_retryable_error(exception: Exception) -> bool:\n \"\"\"Check if exception is a transient network error worth retrying.\"\"\"\n if is_rate_limit_error(exception):\n return False\n return isinstance(exception, (ConnectionError, TimeoutError, OSError))\n\n # Retry decorator for rate limit errors (longer backoff)\n retry_on_rate_limit = retry(\n retry=retry_if_exception(is_rate_limit_error),\n stop=stop_after_attempt(5),\n 
wait=wait_exponential(multiplier=2, min=2, max=30),\n reraise=True,\n before_sleep=lambda retry_state: logger.warning(\n f\"Rate limit hit for chunk (attempt {retry_state.attempt_number}/5), \"\n f\"backing off for {retry_state.next_action.sleep:.1f}s\"\n ),\n )\n\n # Retry decorator for other errors (shorter backoff)\n retry_on_other_errors = retry(\n retry=retry_if_exception(is_other_retryable_error),\n stop=stop_after_attempt(3),\n wait=wait_exponential(multiplier=1, min=1, max=8),\n reraise=True,\n before_sleep=lambda retry_state: logger.warning(\n f\"Error embedding chunk (attempt {retry_state.attempt_number}/3), \"\n f\"retrying in {retry_state.next_action.sleep:.1f}s: {retry_state.outcome.exception()}\"\n ),\n )\n\n def embed_chunk_with_retry(chunk_text: str, chunk_idx: int) -> list[float]:\n \"\"\"Embed a single chunk with rate-limit-aware retry logic.\"\"\"\n\n @retry_on_rate_limit\n @retry_on_other_errors\n def _embed(text: str) -> list[float]:\n return selected_embedding.embed_documents([text])[0]\n\n try:\n return _embed(chunk_text)\n except Exception as e:\n logger.error(\n f\"Failed to embed chunk {chunk_idx} after all retries: {e}\",\n error=str(e),\n )\n raise\n\n max_workers = min(max(len(texts), 1), 8)\n logger.debug(f\"Using parallel processing with {max_workers} workers\")\n\n with ThreadPoolExecutor(max_workers=max_workers) as executor:\n futures = {\n executor.submit(embed_chunk_with_retry, chunk, idx): idx\n for idx, chunk in enumerate(texts)\n }\n for future in as_completed(futures):\n idx = futures[future]\n vectors[idx] = future.result()\n\n if not vectors:\n self.log(f\"No vectors generated from documents for model {embedding_model}.\")\n return\n\n # Get vector dimension for mapping\n dim = len(vectors[0]) if vectors else 768 # default fallback\n use_openrag_ingest_callback = self._openrag_ingest_callback_config() is not None\n\n is_aoss = False\n mapping: dict | None = None\n\n engine = getattr(self, \"engine\", \"jvector\")\n\n if 
use_openrag_ingest_callback:\n self.log(\"Using OpenRAG backend ingest callback; skipping direct OpenSearch writes.\")\n else:\n # Check for AOSS\n auth_kwargs = self._build_auth_kwargs()\n is_aoss = self._is_aoss_enabled(auth_kwargs.get(\"http_auth\"))\n\n # Validate engine with AOSS\n self._validate_aoss_with_engines(is_aoss=is_aoss, engine=engine)\n\n # Create mapping with proper KNN settings\n space_type = getattr(self, \"space_type\", \"l2\")\n ef_construction = getattr(self, \"ef_construction\", 512)\n m = getattr(self, \"m\", 16)\n\n mapping = self._default_text_mapping(\n dim=dim,\n engine=engine,\n space_type=space_type,\n ef_construction=ef_construction,\n m=m,\n vector_field=dynamic_field_name, # Use dynamic field name\n )\n\n # Ensure index exists with baseline mapping (index.knn: true is required for vector search)\n index_exists = True\n try:\n index_exists = bool(client.indices.exists(index=self.index_name))\n except OpenSearchException as exists_error:\n self._log_index_admin_skip(\"indices.exists\", exists_error)\n\n try:\n if not index_exists:\n self.log(f\"Creating index '{self.index_name}' with base mapping\")\n client.indices.create(index=self.index_name, body=mapping)\n except RequestError as creation_error:\n if creation_error.error == \"resource_already_exists_exception\":\n pass # Index was created concurrently\n else:\n error_msg = str(creation_error).lower()\n if \"invalid engine\" in error_msg or \"illegal_argument\" in error_msg:\n if \"jvector\" in error_msg:\n msg = (\n \"The 'jvector' engine is not available in your OpenSearch installation. \"\n \"Use 'nmslib' or 'faiss' for standard OpenSearch, or upgrade to 2.9+.\"\n )\n raise ValueError(msg) from creation_error\n if \"index.knn\" in error_msg:\n msg = (\n \"The index has index.knn: false. 
Delete the existing index and let the \"\n \"component recreate it, or create a new index with a different name.\"\n )\n raise ValueError(msg) from creation_error\n logger.warning(f\"Failed to create index '{self.index_name}': {creation_error}\")\n raise\n\n # Ensure the dynamic field exists in the index\n self._ensure_embedding_field_mapping(\n client=client,\n index_name=self.index_name,\n field_name=dynamic_field_name,\n dim=dim,\n engine=engine,\n space_type=space_type,\n ef_construction=ef_construction,\n m=m,\n )\n\n self.log(\n f\"Indexing {len(texts)} documents into '{self.index_name}' with model '{embedding_model}'...\"\n )\n logger.info(f\"Will store embeddings in field: {dynamic_field_name}\")\n logger.info(f\"Will tag documents with embedding_model: {embedding_model}\")\n\n # Use the bulk ingestion with model tracking\n return_ids = self._bulk_ingest_embeddings(\n client=client,\n index_name=self.index_name,\n embeddings=vectors,\n texts=texts,\n metadatas=metadatas,\n vector_field=dynamic_field_name, # Use dynamic field name\n text_field=\"text\",\n embedding_model=embedding_model, # Track the model\n mapping=mapping,\n is_aoss=is_aoss,\n )\n self.log(metadatas)\n\n logger.info(\n f\"Ingestion complete: Successfully indexed {len(return_ids)} documents with model '{embedding_model}'\"\n )\n self.log(f\"Successfully indexed {len(return_ids)} documents with model {embedding_model}.\")\n\n # ---------- helpers for filters ----------\n def _is_placeholder_term(self, term_obj: dict) -> bool:\n # term_obj like {\"filename\": \"__IMPOSSIBLE_VALUE__\"}\n return any(v == \"__IMPOSSIBLE_VALUE__\" for v in term_obj.values())\n\n def _coerce_filter_clauses(self, filter_obj: dict | None) -> list[dict]:\n \"\"\"Convert filter expressions into OpenSearch-compatible filter clauses.\n\n This method accepts two filter formats and converts them to standardized\n OpenSearch query clauses:\n\n Format A - Explicit filters:\n {\"filter\": [{\"term\": {\"field\": \"value\"}}, 
{\"terms\": {\"field\": [\"val1\", \"val2\"]}}],\n \"limit\": 10, \"score_threshold\": 1.5}\n\n Format B - Context-style mapping:\n {\"data_sources\": [\"file1.pdf\"], \"document_types\": [\"pdf\"], \"owners\": [\"user1\"]}\n\n Args:\n filter_obj: Filter configuration dictionary or None\n\n Returns:\n List of OpenSearch filter clauses (term/terms objects)\n Placeholder values with \"__IMPOSSIBLE_VALUE__\" are ignored\n \"\"\"\n if not filter_obj:\n return []\n\n # If it is a string, try to parse it once\n if isinstance(filter_obj, str):\n try:\n filter_obj = json.loads(filter_obj)\n except json.JSONDecodeError:\n # Not valid JSON - treat as no filters\n return []\n\n # Case A: already an explicit list/dict under \"filter\"\n if \"filter\" in filter_obj:\n raw = filter_obj[\"filter\"]\n if isinstance(raw, dict):\n raw = [raw]\n explicit_clauses: list[dict] = []\n for f in raw or []:\n if (\n \"term\" in f\n and isinstance(f[\"term\"], dict)\n and not self._is_placeholder_term(f[\"term\"])\n ):\n explicit_clauses.append(f)\n elif \"terms\" in f and isinstance(f[\"terms\"], dict):\n field, vals = next(iter(f[\"terms\"].items()))\n if isinstance(vals, list) and len(vals) > 0:\n explicit_clauses.append(f)\n return explicit_clauses\n\n # Case B: convert context-style maps into clauses\n field_mapping = {\n \"data_sources\": \"filename\",\n \"document_types\": \"mimetype\",\n \"owners\": \"owner\",\n }\n context_clauses: list[dict] = []\n for k, values in filter_obj.items():\n if not isinstance(values, list):\n continue\n field = field_mapping.get(k, k)\n if len(values) == 0:\n # Match-nothing placeholder (kept to mirror your tool semantics)\n context_clauses.append({\"term\": {field: \"__IMPOSSIBLE_VALUE__\"}})\n elif len(values) == 1:\n if values[0] != \"__IMPOSSIBLE_VALUE__\":\n context_clauses.append({\"term\": {field: values[0]}})\n else:\n context_clauses.append({\"terms\": {field: values}})\n return context_clauses\n\n\n def _parse_filter_expression(self) -> dict | 
None:\n \"\"\"Parse and validate optional filter_expression JSON.\n\n Returns:\n Parsed JSON object as a dict, or None when unset/blank.\n\n Raises:\n ValueError: If JSON is invalid or does not decode to an object.\n \"\"\"\n filter_expression = getattr(self, \"filter_expression\", \"\")\n if not isinstance(filter_expression, str) or not filter_expression.strip():\n return None\n try:\n filter_obj = json.loads(filter_expression)\n except json.JSONDecodeError as e:\n msg = f\"Invalid filter_expression JSON: {e}\"\n raise ValueError(msg) from e\n\n if not isinstance(filter_obj, dict):\n msg = \"Invalid filter_expression JSON type: expected a JSON object.\"\n raise TypeError(msg)\n return filter_obj\n\n def _resolve_limit(self, filter_obj: dict | None, default_limit: int | None) -> int | None:\n \"\"\"Resolve an integer result limit from filter settings.\"\"\"\n if not filter_obj:\n return default_limit\n raw_limit = filter_obj.get(\"limit\", default_limit)\n if raw_limit is None:\n return None\n if isinstance(raw_limit, bool):\n msg = \"Invalid filter_expression.limit: expected a positive integer.\"\n raise TypeError(msg)\n try:\n limit = int(raw_limit)\n except (TypeError, ValueError) as e:\n msg = \"Invalid filter_expression.limit: expected a positive integer.\"\n raise ValueError(msg) from e\n if limit <= 0:\n msg = \"Invalid filter_expression.limit: expected a positive integer.\"\n raise ValueError(msg)\n return limit\n\n def _resolve_score_threshold(self, filter_obj: dict | None) -> float | None:\n \"\"\"Resolve optional positive min score from filter settings.\"\"\"\n if not filter_obj:\n return None\n score_threshold = filter_obj.get(\"score_threshold\")\n if score_threshold is None:\n score_threshold = filter_obj.get(\"scoreThreshold\")\n if not isinstance(score_threshold, (int, float)) or score_threshold <= 0:\n return None\n return float(score_threshold)\n\n def _detect_available_models(self, client: OpenSearch, filter_clauses: list[dict] | None = None) -> 
list[str]:\n\n \"\"\"Detect which embedding models have documents in the index.\n\n Uses aggregation to find all unique embedding_model values, optionally\n filtered to only documents matching the user's filter criteria.\n\n Args:\n client: OpenSearch client instance\n filter_clauses: Optional filter clauses to scope model detection\n\n Returns:\n List of embedding model names found in the index\n \"\"\"\n try:\n agg_query = {\n \"size\": 0,\n \"aggs\": {\"embedding_models\": {\"terms\": {\"field\": \"embedding_model\", \"size\": 10}}},\n }\n\n # Apply filters to model detection if any exist\n if filter_clauses:\n agg_query[\"query\"] = {\"bool\": {\"filter\": filter_clauses}}\n\n logger.debug(f\"Model detection query: {agg_query}\")\n result = client.search(\n index=self.index_name,\n body=agg_query,\n params={\"terminate_after\": 0},\n )\n buckets = result.get(\"aggregations\", {}).get(\"embedding_models\", {}).get(\"buckets\", [])\n models = [b[\"key\"] for b in buckets if b[\"key\"]]\n\n # Log detailed bucket info for debugging\n logger.info(\n f\"Detected embedding models in corpus: {models}\"\n + (f\" (with {len(filter_clauses)} filters)\" if filter_clauses else \"\")\n )\n if not models:\n total_hits = result.get(\"hits\", {}).get(\"total\", {})\n total_count = (\n total_hits.get(\"value\", 0) if isinstance(total_hits, dict) else total_hits\n )\n logger.warning(\n f\"No embedding_model values found in index '{self.index_name}'. \"\n f\"Total docs in index: {total_count}. 
\"\n f\"This may indicate documents were indexed without the embedding_model field.\"\n )\n except (OpenSearchException, KeyError, ValueError) as e:\n logger.warning(f\"Failed to detect embedding models: {e}\")\n # Fallback to current model\n fallback_model = self._get_embedding_model_name()\n logger.info(f\"Using fallback model: {fallback_model}\")\n return [fallback_model]\n else:\n return models\n\n def _get_index_properties(self, client: OpenSearch) -> dict[str, Any] | None:\n \"\"\"Retrieve flattened mapping properties for the current index.\"\"\"\n try:\n mapping = client.indices.get_mapping(index=self.index_name)\n except OpenSearchException as e:\n logger.warning(\n f\"Failed to fetch mapping for index '{self.index_name}': {e}. Proceeding without mapping metadata.\"\n )\n return None\n\n properties: dict[str, Any] = {}\n for index_data in mapping.values():\n props = index_data.get(\"mappings\", {}).get(\"properties\", {})\n if isinstance(props, dict):\n properties.update(props)\n return properties\n\n def _is_knn_vector_field(self, properties: dict[str, Any] | None, field_name: str) -> bool:\n \"\"\"Check whether the field is mapped as a knn_vector.\"\"\"\n if not field_name:\n return False\n if properties is None:\n logger.warning(\n f\"Mapping metadata unavailable; assuming field '{field_name}' is usable.\"\n )\n return True\n field_def = properties.get(field_name)\n if not isinstance(field_def, dict):\n return False\n if field_def.get(\"type\") == \"knn_vector\":\n return True\n\n nested_props = field_def.get(\"properties\")\n return bool(isinstance(nested_props, dict) and nested_props.get(\"type\") == \"knn_vector\")\n\n def _get_field_dimension(\n self, properties: dict[str, Any] | None, field_name: str\n ) -> int | None:\n \"\"\"Get the dimension of a knn_vector field from the index mapping.\n\n Args:\n properties: Index properties from mapping\n field_name: Name of the vector field\n\n Returns:\n Dimension of the field, or None if not found\n 
\"\"\"\n if not field_name or properties is None:\n return None\n\n field_def = properties.get(field_name)\n if not isinstance(field_def, dict):\n return None\n\n # Check direct knn_vector field\n if field_def.get(\"type\") == \"knn_vector\":\n return field_def.get(\"dimension\")\n\n # Check nested properties\n nested_props = field_def.get(\"properties\")\n if isinstance(nested_props, dict) and nested_props.get(\"type\") == \"knn_vector\":\n return nested_props.get(\"dimension\")\n\n return None\n\n def _get_filename_agg_field(self, index_properties: dict[str, Any] | None) -> str:\n \"\"\"Choose the appropriate field for filename aggregations.\"\"\"\n if not index_properties:\n return \"filename.keyword\"\n\n filename_def = index_properties.get(\"filename\")\n if not isinstance(filename_def, dict):\n return \"filename.keyword\"\n\n field_type = filename_def.get(\"type\")\n fields_def = filename_def.get(\"fields\", {})\n\n # Top-level keyword with no subfields\n if field_type == \"keyword\" and not isinstance(fields_def, dict):\n return \"filename\"\n\n # Text field with keyword subfield\n if isinstance(fields_def, dict) and \"keyword\" in fields_def:\n return \"filename.keyword\"\n\n # Fallback: aggregate on filename directly\n return \"filename\"\n\n # ---------- search (multi-model hybrid) ----------\n def search(self, query: str | None = None) -> list[dict[str, Any]]:\n \"\"\"Perform multi-model hybrid search combining multiple vector similarities and keyword matching.\n\n This method executes a sophisticated search that:\n 1. Auto-detects all embedding models present in the index\n 2. Generates query embeddings for ALL detected models in parallel\n 3. Combines multiple KNN queries using dis_max (picks best match)\n 4. Adds keyword search with fuzzy matching (30% weight)\n 5. Applies optional filtering and score thresholds\n 6. 
Returns aggregations for faceted search\n\n Search weights:\n - Semantic search (dis_max across all models): 70%\n - Keyword search: 30%\n\n Args:\n query: Search query string (used for both vector embedding and keyword search)\n\n Returns:\n List of search results with page_content, metadata, and relevance scores\n\n Raises:\n ValueError: If embedding component is not provided or filter JSON is invalid\n \"\"\"\n logger.info(self.ingest_data)\n client = self.build_client()\n q = (query or \"\").strip()\n\n # Parse optional filter expression\n filter_obj = self._parse_filter_expression()\n\n if not self.embedding:\n msg = \"Embedding is required to run hybrid search (KNN + keyword).\"\n raise ValueError(msg)\n\n # Check if embedding is None (fail-safe mode)\n if self.embedding is None or (\n isinstance(self.embedding, list) and all(e is None for e in self.embedding)\n ):\n logger.error(\"Embedding returned None (fail-safe mode enabled). Cannot perform search.\")\n return []\n\n # Build filter clauses first so we can use them in model detection\n filter_clauses = self._coerce_filter_clauses(filter_obj)\n\n # Detect available embedding models in the index (scoped by filters)\n available_models = self._detect_available_models(client, filter_clauses)\n\n if not available_models:\n logger.warning(\"No embedding models found in index, using current model\")\n available_models = [self._get_embedding_model_name()]\n\n # Generate embeddings for ALL detected models\n query_embeddings = {}\n\n # Normalize embedding to list\n embeddings_list = self.embedding if isinstance(self.embedding, list) else [self.embedding]\n # Filter out None values (fail-safe mode)\n embeddings_list = [e for e in embeddings_list if e is not None]\n\n if not embeddings_list:\n logger.error(\n \"No valid embeddings available after filtering None values (fail-safe mode). 
Cannot perform search.\"\n )\n return []\n\n # Create a comprehensive map of model names to embedding objects\n # Check all possible identifiers (deployment, model, model_id, model_name)\n # Also leverage available_models list from EmbeddingsWithModels\n # Handle duplicate identifiers by creating combined keys\n embedding_by_model = {}\n identifier_conflicts = {} # Track which identifiers have conflicts\n\n for idx, emb_obj in enumerate(embeddings_list):\n # Get all possible identifiers for this embedding\n identifiers = []\n deployment = getattr(emb_obj, \"deployment\", None)\n model = getattr(emb_obj, \"model\", None)\n model_id = getattr(emb_obj, \"model_id\", None)\n model_name = getattr(emb_obj, \"model_name\", None)\n dimensions = getattr(emb_obj, \"dimensions\", None)\n available_models_attr = getattr(emb_obj, \"available_models\", None)\n\n logger.info(\n f\"Embedding object {idx}: deployment={deployment}, model={model}, \"\n f\"model_id={model_id}, model_name={model_name}, dimensions={dimensions}, \"\n f\"available_models={available_models_attr}\"\n )\n\n # If this embedding has available_models dict, map all models to their dedicated instances\n if available_models_attr and isinstance(available_models_attr, dict):\n logger.info(\n f\"Embedding object {idx} provides {len(available_models_attr)} models via available_models dict\"\n )\n for model_name_key, dedicated_embedding in available_models_attr.items():\n if model_name_key and str(model_name_key).strip():\n model_str = str(model_name_key).strip()\n if model_str not in embedding_by_model:\n # Use the dedicated embedding instance from the dict\n embedding_by_model[model_str] = dedicated_embedding\n logger.info(\n f\"Mapped available model '{model_str}' to dedicated embedding instance\"\n )\n else:\n # Conflict detected - track it\n if model_str not in identifier_conflicts:\n identifier_conflicts[model_str] = [embedding_by_model[model_str]]\n identifier_conflicts[model_str].append(dedicated_embedding)\n 
logger.warning(\n f\"Available model '{model_str}' has conflict - used by multiple embeddings\"\n )\n\n # Also map traditional identifiers (for backward compatibility)\n if deployment:\n identifiers.append(str(deployment))\n if model:\n identifiers.append(str(model))\n if model_id:\n identifiers.append(str(model_id))\n if model_name:\n identifiers.append(str(model_name))\n\n # Map all identifiers to this embedding object\n for identifier in identifiers:\n if identifier not in embedding_by_model:\n embedding_by_model[identifier] = emb_obj\n logger.info(f\"Mapped identifier '{identifier}' to embedding object {idx}\")\n else:\n # Conflict detected - track it\n if identifier not in identifier_conflicts:\n identifier_conflicts[identifier] = [embedding_by_model[identifier]]\n identifier_conflicts[identifier].append(emb_obj)\n logger.warning(\n f\"Identifier '{identifier}' has conflict - used by multiple embeddings\"\n )\n\n # For embeddings with model+deployment, create combined identifier\n # This helps when deployment is the same but model differs\n if deployment and model and deployment != model:\n combined_id = f\"{deployment}:{model}\"\n if combined_id not in embedding_by_model:\n embedding_by_model[combined_id] = emb_obj\n logger.info(\n f\"Created combined identifier '{combined_id}' for embedding object {idx}\"\n )\n\n # Log conflicts\n if identifier_conflicts:\n logger.warning(\n f\"Found {len(identifier_conflicts)} conflicting identifiers. 
\"\n f\"Consider using combined format 'deployment:model' or specifying unique model names.\"\n )\n for conflict_id, emb_list in identifier_conflicts.items():\n logger.warning(\n f\" Conflict on '{conflict_id}': {len(emb_list)} embeddings use this identifier\"\n )\n\n logger.info(f\"Generating embeddings for {len(available_models)} models in index\")\n logger.info(f\"Available embedding identifiers: {list(embedding_by_model.keys())}\")\n self.log(f\"[SEARCH] Models detected in index: {available_models}\")\n self.log(f\"[SEARCH] Available embedding identifiers: {list(embedding_by_model.keys())}\")\n\n # Track matching status for debugging\n matched_models = []\n unmatched_models = []\n\n for model_name in available_models:\n try:\n # Check if we have an embedding object for this model\n if model_name in embedding_by_model:\n # Use the matching embedding object directly\n emb_obj = embedding_by_model[model_name]\n emb_deployment = getattr(emb_obj, \"deployment\", None)\n emb_model = getattr(emb_obj, \"model\", None)\n emb_model_id = getattr(emb_obj, \"model_id\", None)\n emb_dimensions = getattr(emb_obj, \"dimensions\", None)\n emb_available_models = getattr(emb_obj, \"available_models\", None)\n\n logger.info(\n f\"Using embedding object for model '{model_name}': \"\n f\"deployment={emb_deployment}, model={emb_model}, model_id={emb_model_id}, \"\n f\"dimensions={emb_dimensions}\"\n )\n\n # Check if this is a dedicated instance from available_models dict\n if emb_available_models and isinstance(emb_available_models, dict):\n logger.info(\n f\"Model '{model_name}' using dedicated instance from available_models dict \"\n f\"(pre-configured with correct model and dimensions)\"\n )\n\n # Use the embedding instance directly - no model switching needed!\n vec = emb_obj.embed_query(q)\n query_embeddings[model_name] = vec\n matched_models.append(model_name)\n logger.info(\n f\"Generated embedding for model: {model_name} (actual dimensions: {len(vec)})\"\n )\n 
self.log(f\"[MATCH] Model '{model_name}' - generated {len(vec)}-dim embedding\")\n else:\n # No matching embedding found for this model\n unmatched_models.append(model_name)\n logger.warning(\n f\"No matching embedding found for model '{model_name}'. \"\n f\"This model will be skipped. Available identifiers: {list(embedding_by_model.keys())}\"\n )\n self.log(\n f\"[NO MATCH] Model '{model_name}' - available: {list(embedding_by_model.keys())}\"\n )\n except (\n RuntimeError,\n ValueError,\n ConnectionError,\n TimeoutError,\n AttributeError,\n KeyError,\n ) as e:\n logger.warning(f\"Failed to generate embedding for {model_name}: {e}\")\n self.log(f\"[ERROR] Embedding generation failed for '{model_name}': {e}\")\n\n # Log summary of model matching\n logger.info(\n f\"Model matching summary: {len(matched_models)} matched, {len(unmatched_models)} unmatched\"\n )\n self.log(\n f\"[SUMMARY] Model matching: {len(matched_models)} matched, {len(unmatched_models)} unmatched\"\n )\n if unmatched_models:\n self.log(f\"[WARN] Unmatched models in index: {unmatched_models}\")\n\n if not query_embeddings:\n msg = (\n f\"Failed to generate embeddings for any model. \"\n f\"Index has models: {available_models}, but no matching embedding objects found. 
\"\n f\"Available embedding identifiers: {list(embedding_by_model.keys())}\"\n )\n self.log(f\"[FAIL] Search failed: {msg}\")\n raise ValueError(msg)\n\n index_properties = self._get_index_properties(client)\n legacy_vector_field = getattr(self, \"vector_field\", \"chunk_embedding\")\n\n # Build KNN queries for each model\n embedding_fields: list[str] = []\n knn_queries_with_candidates = []\n knn_queries_without_candidates = []\n\n raw_num_candidates = getattr(self, \"num_candidates\", 1000)\n try:\n num_candidates = int(raw_num_candidates) if raw_num_candidates is not None else 0\n except (TypeError, ValueError):\n num_candidates = 0\n use_num_candidates = num_candidates > 0\n\n for model_name, embedding_vector in query_embeddings.items():\n field_name = get_embedding_field_name(model_name)\n selected_field = field_name\n vector_dim = len(embedding_vector)\n\n # Only use the expected dynamic field - no legacy fallback\n # This prevents dimension mismatches between models\n if not self._is_knn_vector_field(index_properties, selected_field):\n logger.warning(\n f\"Skipping model {model_name}: field '{field_name}' is not mapped as knn_vector. \"\n f\"Documents must be indexed with this embedding model before querying.\"\n )\n self.log(\n f\"[SKIP] Field '{selected_field}' not a knn_vector - skipping model '{model_name}'\"\n )\n continue\n\n # Validate vector dimensions match the field dimensions\n field_dim = self._get_field_dimension(index_properties, selected_field)\n if field_dim is not None and field_dim != vector_dim:\n logger.error(\n f\"Dimension mismatch for model '{model_name}': \"\n f\"Query vector has {vector_dim} dimensions but field '{selected_field}' expects {field_dim}. 
\"\n f\"Skipping this model to prevent search errors.\"\n )\n self.log(\n f\"[DIM MISMATCH] Model '{model_name}': query={vector_dim} vs field={field_dim} - skipping\"\n )\n continue\n\n logger.info(\n f\"Adding KNN query for model '{model_name}': field='{selected_field}', \"\n f\"query_dims={vector_dim}, field_dims={field_dim or 'unknown'}\"\n )\n embedding_fields.append(selected_field)\n\n base_query = {\n \"knn\": {\n selected_field: {\n \"vector\": embedding_vector,\n \"k\": 50,\n }\n }\n }\n\n if use_num_candidates:\n query_with_candidates = copy.deepcopy(base_query)\n query_with_candidates[\"knn\"][selected_field][\"num_candidates\"] = num_candidates\n else:\n query_with_candidates = base_query\n\n knn_queries_with_candidates.append(query_with_candidates)\n knn_queries_without_candidates.append(base_query)\n\n if not knn_queries_with_candidates:\n # No valid fields found - this can happen when:\n # 1. Index is empty (no documents yet)\n # 2. Embedding model has changed and field doesn't exist yet\n # Return empty results instead of failing\n logger.warning(\n \"No valid knn_vector fields found for embedding models. \"\n \"This may indicate an empty index or missing field mappings. \"\n \"Returning empty search results.\"\n )\n self.log(\n f\"[WARN] No valid KNN queries could be built. 
\"\n f\"Query embeddings generated: {list(query_embeddings.keys())}, \"\n f\"but no matching knn_vector fields found in index.\"\n )\n return []\n\n # Build exists filter - document must have at least one embedding field\n exists_any_embedding = {\n \"bool\": {\n \"should\": [{\"exists\": {\"field\": f}} for f in set(embedding_fields)],\n \"minimum_should_match\": 1,\n }\n }\n\n # Combine user filters with exists filter\n all_filters = [*filter_clauses, exists_any_embedding]\n\n # Get limit and score threshold\n limit = self._resolve_limit(filter_obj, default_limit=self.number_of_results)\n score_threshold = self._resolve_score_threshold(filter_obj)\n\n # Determine the best aggregation field for filename based on index mapping\n filename_agg_field = self._get_filename_agg_field(index_properties)\n\n # Build multi-model hybrid query\n body = {\n \"query\": {\n \"bool\": {\n \"should\": [\n {\n \"dis_max\": {\n \"tie_breaker\": 0.0, # Take only the best match, no blending\n \"boost\": 0.7, # 70% weight for semantic search\n \"queries\": knn_queries_with_candidates,\n }\n },\n {\n \"multi_match\": {\n \"query\": q,\n \"fields\": [\"text^2\", \"filename^1.5\"],\n \"type\": \"best_fields\",\n \"fuzziness\": \"AUTO\",\n \"boost\": 0.3, # 30% weight for keyword search\n }\n },\n ],\n \"minimum_should_match\": 1,\n \"filter\": all_filters,\n }\n },\n \"aggs\": {\n \"data_sources\": {\"terms\": {\"field\": filename_agg_field, \"size\": 20}},\n \"document_types\": {\"terms\": {\"field\": \"mimetype\", \"size\": 10}},\n \"owners\": {\"terms\": {\"field\": \"owner\", \"size\": 10}},\n \"embedding_models\": {\"terms\": {\"field\": \"embedding_model\", \"size\": 10}},\n },\n \"_source\": [\n \"filename\",\n \"mimetype\",\n \"page\",\n \"text\",\n \"source_url\",\n \"owner\",\n \"embedding_model\",\n \"allowed_users\",\n \"allowed_groups\",\n \"allowed_principals\",\n ],\n \"size\": limit,\n }\n\n if score_threshold is not None:\n body[\"min_score\"] = score_threshold\n\n 
logger.info(\n f\"Executing multi-model hybrid search with {len(knn_queries_with_candidates)} embedding models: \"\n f\"{list(query_embeddings.keys())}\"\n )\n self.log(\n f\"[EXEC] Executing search with {len(knn_queries_with_candidates)} KNN queries, limit={limit}\"\n )\n self.log(f\"[EXEC] Embedding models used: {list(query_embeddings.keys())}\")\n self.log(f\"[EXEC] KNN fields being queried: {embedding_fields}\")\n\n try:\n resp = client.search(index=self.index_name, body=body, params={\"terminate_after\": 0})\n except RequestError as e:\n error_message = str(e)\n lowered = error_message.lower()\n if use_num_candidates and \"num_candidates\" in lowered:\n logger.warning(\n \"Retrying search without num_candidates parameter due to cluster capabilities\",\n error=error_message,\n )\n fallback_body = copy.deepcopy(body)\n try:\n fallback_body[\"query\"][\"bool\"][\"should\"][0][\"dis_max\"][\"queries\"] = (\n knn_queries_without_candidates\n )\n except (KeyError, IndexError, TypeError) as inner_err:\n raise e from inner_err\n resp = client.search(\n index=self.index_name,\n body=fallback_body,\n params={\"terminate_after\": 0},\n )\n elif \"knn_vector\" in lowered or (\"field\" in lowered and \"knn\" in lowered):\n fallback_vector = next(iter(query_embeddings.values()), None)\n if fallback_vector is None:\n raise\n fallback_field = legacy_vector_field or \"chunk_embedding\"\n logger.warning(\n \"KNN search failed for dynamic fields; falling back to legacy field '%s'.\",\n fallback_field,\n )\n fallback_body = copy.deepcopy(body)\n fallback_body[\"query\"][\"bool\"][\"filter\"] = filter_clauses\n knn_fallback = {\n \"knn\": {\n fallback_field: {\n \"vector\": fallback_vector,\n \"k\": 50,\n }\n }\n }\n if use_num_candidates:\n knn_fallback[\"knn\"][fallback_field][\"num_candidates\"] = num_candidates\n fallback_body[\"query\"][\"bool\"][\"should\"][0][\"dis_max\"][\"queries\"] = [knn_fallback]\n resp = client.search(\n index=self.index_name,\n body=fallback_body,\n 
params={\"terminate_after\": 0},\n )\n else:\n raise\n hits = resp.get(\"hits\", {}).get(\"hits\", [])\n\n logger.info(f\"Found {len(hits)} results\")\n self.log(f\"[RESULT] Search complete: {len(hits)} results found\")\n\n if len(hits) == 0:\n self.log(\n f\"[EMPTY] Debug info: \"\n f\"models_in_index={available_models}, \"\n f\"matched_models={matched_models}, \"\n f\"knn_fields={embedding_fields}, \"\n f\"filters={len(filter_clauses)} clauses\"\n )\n\n return [\n {\n \"page_content\": hit[\"_source\"].get(\"text\", \"\"),\n \"metadata\": {k: v for k, v in hit[\"_source\"].items() if k != \"text\"},\n \"score\": hit.get(\"_score\"),\n }\n for hit in hits\n ]\n\n def search_documents(self) -> Table:\n\n \"\"\"Search documents and return results as a Table.\n\n This is the main interface method that performs the multi-model search using the\n configured search_query and returns results in Langflow's Table (DataFrame) format\n so downstream Parser components can consume them directly.\n\n Always builds the vector store (triggering ingestion if needed), then performs\n search only if a query is provided.\n\n Returns:\n Table containing search results with text and metadata\n\n Raises:\n Exception: If search operation fails\n \"\"\"\n\n try:\n # Always build/cache the vector store to ensure ingestion happens\n logger.info(f\"Search query: {self.search_query}\")\n if self._cached_vector_store is None:\n self.build_vector_store()\n\n # Only perform search if query is provided\n search_query = (self.search_query or \"\").strip()\n if not search_query:\n self.log(\"No search query provided - ingestion completed, returning empty results\")\n\n return Table(data=[])\n\n # Perform search with the provided query\n raw = self.search(search_query)\n raw_list = [Data(text=hit[\"page_content\"], **hit[\"metadata\"]) for hit in raw]\n return Table(data=raw_list)\n\n except Exception as e:\n self.log(f\"search_documents error: {e}\")\n raise\n\n # -------- dynamic UI handling (auth 
switch) --------\n async def update_build_config(\n self, build_config: dict, field_value: str, field_name: str | None = None\n ) -> dict:\n \"\"\"Dynamically update component configuration based on field changes.\n\n This method handles real-time UI updates, particularly for authentication\n mode changes that show/hide relevant input fields.\n\n Args:\n build_config: Current component configuration\n field_value: New value for the changed field\n field_name: Name of the field that changed\n\n Returns:\n Updated build configuration with appropriate field visibility\n \"\"\"\n try:\n if field_name == \"auth_mode\":\n mode = (field_value or \"basic\").strip().lower()\n is_basic = mode == \"basic\"\n is_jwt = mode == \"jwt\"\n is_openrag = mode == \"openrag\"\n\n build_config[\"username\"][\"show\"] = is_basic\n build_config[\"password\"][\"show\"] = is_basic\n\n build_config[\"jwt_token\"][\"show\"] = is_jwt\n build_config[\"jwt_header\"][\"show\"] = is_jwt\n build_config[\"bearer_prefix\"][\"show\"] = is_jwt\n\n build_config[\"username\"][\"required\"] = is_basic\n build_config[\"password\"][\"required\"] = is_basic\n\n # build_config[\"jwt_token\"][\"required\"] = is_jwt\n build_config[\"jwt_header\"][\"required\"] = is_jwt\n build_config[\"bearer_prefix\"][\"required\"] = False\n\n # In 'openrag' mode, expose the OPENRAG_* fields up front\n # since they are the only credentials required.\n for openrag_field in (\n \"openrag_ingest_url\",\n \"openrag_ingest_token\",\n \"openrag_ingest_run_id\",\n \"openrag_ingest_batch_size\",\n ):\n if openrag_field in build_config:\n build_config[openrag_field][\"advanced\"] = not is_openrag\n build_config[openrag_field][\"required\"] = (\n is_openrag and openrag_field != \"openrag_ingest_batch_size\"\n )\n\n if is_basic or is_openrag:\n build_config[\"jwt_token\"][\"value\"] = \"\"\n\n return build_config\n\n except (KeyError, ValueError) as e:\n self.log(f\"update_build_config error: {e}\")\n\n return build_config\n" + "value": 
"from __future__ import annotations\n\nimport copy\nimport json\nimport os\nimport uuid\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\nfrom typing import Any\n\nimport httpx\nfrom lfx.base.vectorstores.model import LCVectorStoreComponent, check_cached_vector_store\nfrom lfx.base.vectorstores.vector_store_connection_decorator import vector_store_connection\nfrom lfx.io import (\n BoolInput,\n DropdownInput,\n HandleInput,\n IntInput,\n MultilineInput,\n Output,\n SecretStrInput,\n StrInput,\n TableInput,\n)\nfrom lfx.log import logger\nfrom lfx.schema.data import Data\nfrom lfx.schema.dataframe import Table\nfrom opensearchpy import OpenSearch, helpers\nfrom opensearchpy.exceptions import OpenSearchException, RequestError\n\nREQUEST_TIMEOUT = 60\nMAX_RETRIES = 5\n\n\ndef _get_min_env_int(key: str, default: int, minimum: int) -> int:\n try:\n value = int(os.getenv(key, default))\n except (TypeError, ValueError):\n value = default\n return max(value, minimum)\n\n\nOPENSEARCH_NUMBER_OF_SHARDS = _get_min_env_int(\"OPENRAG_OPENSEARCH_NUMBER_OF_SHARDS\", 1, 1)\nOPENSEARCH_NUMBER_OF_REPLICAS = _get_min_env_int(\"OPENRAG_OPENSEARCH_NUMBER_OF_REPLICAS\", 0, 0)\n\n# watsonx.ai surfaces rate-limit state via these (mostly non-standard) response\n# headers. The IBM SDK acts on the x-requests-limit-* family directly; we log\n# them on a failed embedding call to aid plan/region tuning.\n_WATSONX_RATE_LIMIT_HEADERS = (\n \"x-requests-limit-rate\",\n \"x-requests-limit-remaining\",\n \"x-requests-limit-reset\",\n \"Retry-After\",\n)\n\n\ndef _log_watsonx_rate_limit_headers(error: Exception) -> None:\n \"\"\"Best-effort diagnostic: log watsonx rate-limit headers from a failed call.\n\n The watsonx SDK raises ``ApiRequestFailure``, which carries the originating\n httpx/requests ``Response`` as ``.response``. 
On a 429 exhaustion we surface\n the documented rate-limit headers so operators can tune throughput.\n \"\"\"\n try:\n response = getattr(error, \"response\", None)\n headers = getattr(response, \"headers\", None)\n if not headers:\n return\n status = getattr(response, \"status_code\", \"unknown\")\n observed = {\n h: headers.get(h) for h in _WATSONX_RATE_LIMIT_HEADERS if headers.get(h) is not None\n }\n if str(status) == \"429\" or observed:\n logger.warning(f\"watsonx rate-limit response (status={status}): {observed}\")\n except Exception as log_error: # never let diagnostics mask the real error\n logger.debug(f\"Could not extract watsonx rate-limit headers: {log_error}\")\n\n\ndef normalize_model_name(model_name: str) -> str:\n \"\"\"Normalize embedding model name for use as field suffix.\n\n Converts model names to valid OpenSearch field names by replacing\n special characters and ensuring alphanumeric format.\n\n Args:\n model_name: Original embedding model name (e.g., \"text-embedding-3-small\")\n\n Returns:\n Normalized field suffix (e.g., \"text_embedding_3_small\")\n \"\"\"\n normalized = model_name.lower()\n # Replace common separators with underscores\n normalized = normalized.replace(\"-\", \"_\").replace(\":\", \"_\").replace(\"/\", \"_\").replace(\".\", \"_\")\n # Remove any non-alphanumeric characters except underscores\n normalized = \"\".join(c if c.isalnum() or c == \"_\" else \"_\" for c in normalized)\n # Remove duplicate underscores\n while \"__\" in normalized:\n normalized = normalized.replace(\"__\", \"_\")\n return normalized.strip(\"_\")\n\n\ndef get_embedding_field_name(model_name: str) -> str:\n \"\"\"Get the dynamic embedding field name for a model.\n\n Args:\n model_name: Embedding model name\n\n Returns:\n Field name in format: chunk_embedding_{normalized_model_name}\n \"\"\"\n logger.info(f\"chunk_embedding_{normalize_model_name(model_name)}\")\n return 
f\"chunk_embedding_{normalize_model_name(model_name)}\"\n\n\n@vector_store_connection\nclass OpenSearchVectorStoreComponentMultimodalMultiEmbedding(LCVectorStoreComponent):\n \"\"\"OpenSearch Vector Store Component with Multi-Model Hybrid Search Capabilities.\n\n This component provides vector storage and retrieval using OpenSearch, combining semantic\n similarity search (KNN) with keyword-based search for optimal results. It supports:\n - Multiple embedding models per index with dynamic field names\n - Automatic detection and querying of all available embedding models\n - Parallel embedding generation for multi-model search\n - Document ingestion with model tracking\n - Advanced filtering and aggregations\n - Flexible authentication options\n\n Features:\n - Multi-model vector storage with dynamic fields (chunk_embedding_{model_name})\n - Hybrid search combining multiple KNN queries (dis_max) + keyword matching\n - Auto-detection of available models in the index\n - Parallel query embedding generation for all detected models\n - Vector storage with configurable engines (jvector, nmslib, faiss, lucene)\n - Flexible authentication (Basic auth, JWT tokens)\n\n Model Name Resolution:\n - Priority: deployment > model > model_name attributes\n - This ensures correct matching between embedding objects and index fields\n - When multiple embeddings are provided, specify embedding_model_name to select which one to use\n - During search, each detected model in the index is matched to its corresponding embedding object\n \"\"\"\n\n display_name: str = \"OpenSearch (Multi-Model Multi-Embedding)\"\n icon: str = \"OpenSearch\"\n description: str = (\n \"Store and search documents using OpenSearch with multi-model hybrid semantic and keyword search. \"\n \"To search use the tools search_documents and raw_search. 
\"\n \"Search documents takes a query for vector search, for example\\n\"\n ' {search_query: \"components in openrag\"}'\n )\n\n # Keys we consider baseline\n default_keys: list[str] = [\n \"opensearch_url\",\n \"index_name\",\n *[i.name for i in LCVectorStoreComponent.inputs], # search_query, add_documents, etc.\n \"embedding\",\n \"embedding_model_name\",\n \"vector_field\",\n \"number_of_results\",\n \"auth_mode\",\n \"username\",\n \"password\",\n \"jwt_token\",\n \"jwt_header\",\n \"bearer_prefix\",\n \"use_ssl\",\n \"verify_certs\",\n \"filter_expression\",\n \"engine\",\n \"space_type\",\n \"ef_construction\",\n \"m\",\n \"num_candidates\",\n \"docs_metadata\",\n \"request_timeout\",\n \"max_retries\",\n \"openrag_ingest_url\",\n \"openrag_ingest_token\",\n \"openrag_ingest_run_id\",\n \"openrag_ingest_batch_size\",\n ]\n _openrag_ingest_global_placeholders = {\n \"openrag_ingest_url\": \"OPENRAG_INGEST_URL\",\n \"openrag_ingest_token\": \"OPENRAG_INGEST_TOKEN\",\n \"openrag_ingest_run_id\": \"OPENRAG_INGEST_RUN_ID\",\n }\n\n inputs = [\n TableInput(\n name=\"docs_metadata\",\n display_name=\"Document Metadata\",\n info=(\n \"Additional metadata key-value pairs to be added to all ingested documents. 
\"\n \"Useful for tagging documents with source information, categories, or other custom attributes.\"\n ),\n table_schema=[\n {\n \"name\": \"key\",\n \"display_name\": \"Key\",\n \"type\": \"str\",\n \"description\": \"Key name\",\n },\n {\n \"name\": \"value\",\n \"display_name\": \"Value\",\n \"type\": \"str\",\n \"description\": \"Value of the metadata\",\n },\n ],\n value=[],\n input_types=[\"Data\", \"JSON\"],\n ),\n StrInput(\n name=\"openrag_ingest_token\",\n display_name=\"OpenRAG Ingest Token\",\n value=\"OPENRAG_INGEST_TOKEN\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n info=\"Short-lived token used only for OpenRAG ingest callbacks.\",\n ),\n StrInput(\n name=\"openrag_ingest_run_id\",\n display_name=\"OpenRAG Ingest Run ID\",\n value=\"OPENRAG_INGEST_RUN_ID\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n ),\n StrInput(\n name=\"opensearch_url\",\n display_name=\"OpenSearch URL\",\n value=\"http://localhost:9200\",\n info=(\n \"The connection URL for your OpenSearch cluster \"\n \"(e.g., http://localhost:9200 for local development or your cloud endpoint).\"\n ),\n ),\n StrInput(\n name=\"index_name\",\n display_name=\"Index Name\",\n value=\"langflow\",\n info=(\n \"The OpenSearch index name where documents will be stored and searched. \"\n \"Will be created automatically if it doesn't exist.\"\n ),\n ),\n DropdownInput(\n name=\"engine\",\n display_name=\"Vector Engine\",\n options=[\"nmslib\", \"faiss\", \"lucene\", \"jvector\"],\n value=\"jvector\",\n info=(\n \"Vector search engine for similarity calculations. 'nmslib' works with standard \"\n \"OpenSearch. 'jvector' requires OpenSearch 2.9+. 'lucene' requires index.knn: true. 
\"\n \"Amazon OpenSearch Serverless only supports 'nmslib' or 'faiss'.\"\n ),\n advanced=True,\n ),\n DropdownInput(\n name=\"space_type\",\n display_name=\"Distance Metric\",\n options=[\"l2\", \"l1\", \"cosinesimil\", \"linf\", \"innerproduct\"],\n value=\"l2\",\n info=(\n \"Distance metric for calculating vector similarity. 'l2' (Euclidean) is most common, \"\n \"'cosinesimil' for cosine similarity, 'innerproduct' for dot product.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"ef_construction\",\n display_name=\"EF Construction\",\n value=512,\n info=(\n \"Size of the dynamic candidate list during index construction. \"\n \"Higher values improve recall but increase indexing time and memory usage.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"m\",\n display_name=\"M Parameter\",\n value=16,\n info=(\n \"Number of bidirectional connections for each vector in the HNSW graph. \"\n \"Higher values improve search quality but increase memory usage and indexing time.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"num_candidates\",\n display_name=\"Candidate Pool Size\",\n value=1000,\n info=(\n \"Number of approximate neighbors to consider for each KNN query. \"\n \"Some OpenSearch deployments do not support this parameter; set to 0 to disable.\"\n ),\n advanced=True,\n ),\n *LCVectorStoreComponent.inputs, # includes search_query, add_documents, etc.\n HandleInput(\n name=\"embedding\", display_name=\"Embedding\", input_types=[\"Embeddings\"], is_list=True\n ),\n StrInput(\n name=\"embedding_model_name\",\n display_name=\"Embedding Model Name\",\n value=\"\",\n info=(\n \"Name of the embedding model to use for ingestion. This selects which embedding from the list \"\n \"will be used to embed documents. Matches on deployment, model, model_id, or model_name. \"\n \"For duplicate deployments, use combined format: 'deployment:model' \"\n \"(e.g., 'text-embedding-ada-002:text-embedding-3-large'). \"\n \"Leave empty to use the first embedding. 
Error message will show all available identifiers.\"\n ),\n advanced=False,\n ),\n StrInput(\n name=\"vector_field\",\n display_name=\"Legacy Vector Field Name\",\n value=\"chunk_embedding\",\n advanced=True,\n info=(\n \"Legacy field name for backward compatibility. New documents use dynamic fields \"\n \"(chunk_embedding_{model_name}) based on the embedding_model_name.\"\n ),\n ),\n IntInput(\n name=\"number_of_results\",\n display_name=\"Default Result Limit\",\n value=10,\n advanced=True,\n info=(\n \"Default maximum number of search results to return when no limit is \"\n \"specified in the filter expression.\"\n ),\n ),\n MultilineInput(\n name=\"filter_expression\",\n display_name=\"Search Filters (JSON)\",\n value=\"\",\n info=(\n \"Optional JSON configuration for search filtering, result limits, and score thresholds.\\n\\n\"\n \"Format 1 - Explicit filters:\\n\"\n '{\"filter\": [{\"term\": {\"filename\":\"doc.pdf\"}}, '\n '{\"terms\":{\"owner\":[\"user1\",\"user2\"]}}], \"limit\": 10, \"score_threshold\": 1.6}\\n\\n'\n \"Format 2 - Context-style mapping:\\n\"\n '{\"data_sources\":[\"file.pdf\"], \"document_types\":[\"application/pdf\"], \"owners\":[\"user123\"]}\\n\\n'\n \"Use __IMPOSSIBLE_VALUE__ as placeholder to ignore specific filters.\"\n ),\n ),\n # ----- Auth controls (dynamic) -----\n DropdownInput(\n name=\"auth_mode\",\n display_name=\"Authentication Mode\",\n value=\"openrag\",\n options=[\"basic\", \"jwt\", \"openrag\"],\n info=(\n \"Authentication method: 'basic' for username/password authentication, \"\n \"'jwt' for JSON Web Token (Bearer) authentication, or 'openrag' to \"\n \"delegate writes to the OpenRAG backend ingest callback (no direct \"\n \"OpenSearch credentials required — only OPENRAG_* fields).\"\n ),\n real_time_refresh=True,\n advanced=False,\n ),\n StrInput(\n name=\"username\",\n display_name=\"Username\",\n value=\"admin\",\n show=True,\n ),\n SecretStrInput(\n name=\"password\",\n display_name=\"OpenSearch Password\",\n 
value=\"admin\",\n show=True,\n ),\n SecretStrInput(\n name=\"jwt_token\",\n display_name=\"JWT Token\",\n value=\"JWT\",\n load_from_db=False,\n show=False,\n info=(\n \"Valid JSON Web Token for authentication. \"\n \"Will be sent in the Authorization header (with optional 'Bearer ' prefix).\"\n ),\n required=False,\n ),\n StrInput(\n name=\"jwt_header\",\n display_name=\"JWT Header Name\",\n value=\"Authorization\",\n show=False,\n advanced=True,\n ),\n BoolInput(\n name=\"bearer_prefix\",\n display_name=\"Prefix 'Bearer '\",\n value=False,\n show=False,\n advanced=True,\n ),\n # ----- TLS -----\n BoolInput(\n name=\"use_ssl\",\n display_name=\"Use SSL/TLS\",\n value=True,\n advanced=True,\n info=\"Enable SSL/TLS encryption for secure connections to OpenSearch.\",\n ),\n BoolInput(\n name=\"verify_certs\",\n display_name=\"Verify SSL Certificates\",\n value=False,\n advanced=True,\n info=(\n \"Verify SSL certificates when connecting. \"\n \"Disable for self-signed certificates in development environments.\"\n ),\n ),\n # ----- Timeout / Retry -----\n StrInput(\n name=\"request_timeout\",\n display_name=\"Request Timeout (seconds)\",\n value=\"60\",\n advanced=True,\n info=(\n \"Time in seconds to wait for a response from OpenSearch. 
\"\n \"Increase for large bulk ingestion or complex hybrid queries.\"\n ),\n ),\n StrInput(\n name=\"max_retries\",\n display_name=\"Max Retries\",\n value=\"3\",\n advanced=True,\n info=\"Number of retries for failed connections before raising an error.\",\n ),\n StrInput(\n name=\"openrag_ingest_url\",\n display_name=\"OpenRAG Ingest URL\",\n value=\"OPENRAG_INGEST_URL\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n info=\"Internal OpenRAG callback URL for backend-owned document indexing.\",\n ),\n StrInput(\n name=\"openrag_ingest_token\",\n display_name=\"OpenRAG Ingest Token\",\n value=\"OPENRAG_INGEST_TOKEN\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n info=\"Short-lived token used only for OpenRAG ingest callbacks.\",\n ),\n StrInput(\n name=\"openrag_ingest_run_id\",\n display_name=\"OpenRAG Ingest Run ID\",\n value=\"OPENRAG_INGEST_RUN_ID\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n ),\n IntInput(\n name=\"openrag_ingest_batch_size\",\n display_name=\"OpenRAG Ingest Batch Size\",\n value=100,\n advanced=True,\n ),\n ]\n outputs = [\n Output(\n display_name=\"Search Results\",\n name=\"search_results\",\n method=\"search_documents\",\n ),\n Output(display_name=\"Raw Search\", name=\"raw_search\", method=\"raw_search\"),\n ]\n\n def raw_search(self, query: str | dict | None = None) -> Data:\n \"\"\"Execute a raw OpenSearch query against the target index.\n\n Args:\n query (dict[str, Any]): The OpenSearch query DSL dictionary.\n\n Returns:\n Data: Search results as a Data object.\n\n Raises:\n ValueError: If 'query' is not a valid OpenSearch query (must be a non-empty dict).\n \"\"\"\n raw_query = query if query is not None else self.search_query\n\n if raw_query is None or (isinstance(raw_query, str) and not raw_query.strip()):\n self.log(\"No query provided for raw search - returning empty results\")\n return Data(data={})\n\n if isinstance(raw_query, 
dict):\n query_body = copy.deepcopy(raw_query)\n elif isinstance(raw_query, str):\n s = raw_query.strip()\n\n # First, optimistically try to parse as JSON DSL\n try:\n query_body = json.loads(s)\n except json.JSONDecodeError:\n # Fallback: treat as a basic text query over common fields\n query_body = {\n \"query\": {\n \"multi_match\": {\n \"query\": s,\n \"fields\": [\"text^2\", \"filename^1.5\"],\n \"type\": \"best_fields\",\n \"fuzziness\": \"AUTO\",\n }\n }\n }\n else:\n msg = f\"Unsupported raw_search query type: {type(raw_query)!r}\"\n raise TypeError(msg)\n\n filter_obj = self._parse_filter_expression()\n filter_clauses = self._coerce_filter_clauses(filter_obj)\n\n if filter_clauses:\n if \"query\" in query_body:\n original_query = query_body[\"query\"]\n query_body[\"query\"] = {\n \"bool\": {\n \"must\": [original_query],\n \"filter\": filter_clauses,\n }\n }\n else:\n query_body[\"query\"] = {\n \"bool\": {\n \"must\": [{\"match_all\": {}}],\n \"filter\": filter_clauses,\n }\n }\n\n if filter_obj:\n # Apply limit if not already set in the raw query\n if \"size\" not in query_body:\n limit = self._resolve_limit(filter_obj, default_limit=None)\n if limit is not None:\n query_body[\"size\"] = limit\n\n # Apply score_threshold / scoreThreshold as min_score if not already set\n if \"min_score\" not in query_body:\n score_threshold = self._resolve_score_threshold(filter_obj)\n if score_threshold is not None:\n query_body[\"min_score\"] = score_threshold\n\n client = self.build_client()\n logger.info(f\"query: {query_body}\")\n resp = client.search(\n index=self.index_name,\n body=query_body,\n params={\"terminate_after\": 0},\n )\n # Remove any _source keys whose value is a list of floats (embedding vectors)\n # Minimum length threshold to identify embedding vectors\n min_vector_length = 100\n\n def is_vector(val):\n # Accepts if it's a list of numbers (float or int) and has reasonable vector length\n return (\n isinstance(val, list)\n and len(val) > 
min_vector_length\n and all(isinstance(x, (float, int)) for x in val)\n )\n\n if \"hits\" in resp and \"hits\" in resp[\"hits\"]:\n for hit in resp[\"hits\"][\"hits\"]:\n source = hit.get(\"_source\")\n if isinstance(source, dict):\n keys_to_remove = [k for k, v in source.items() if is_vector(v)]\n for k in keys_to_remove:\n source.pop(k)\n logger.info(f\"Raw search response (all embedding vectors removed): {resp}\")\n return Data(**resp)\n\n def _get_embedding_model_name(self, embedding_obj=None) -> str:\n \"\"\"Get the embedding model name from component config or embedding object.\n\n Priority: deployment > model > model_id > model_name\n This ensures we use the actual model being deployed, not just the configured model.\n Supports multiple embedding providers (OpenAI, Watsonx, Cohere, etc.)\n\n Args:\n embedding_obj: Specific embedding object to get name from (optional)\n\n Returns:\n Embedding model name\n\n Raises:\n ValueError: If embedding model name cannot be determined\n \"\"\"\n # First try explicit embedding_model_name input\n if hasattr(self, \"embedding_model_name\") and self.embedding_model_name:\n return self.embedding_model_name.strip()\n\n # Try to get from provided embedding object\n if embedding_obj:\n # Priority: deployment > model > model_id > model_name\n if hasattr(embedding_obj, \"deployment\") and embedding_obj.deployment:\n return str(embedding_obj.deployment)\n if hasattr(embedding_obj, \"model\") and embedding_obj.model:\n return str(embedding_obj.model)\n if hasattr(embedding_obj, \"model_id\") and embedding_obj.model_id:\n return str(embedding_obj.model_id)\n if hasattr(embedding_obj, \"model_name\") and embedding_obj.model_name:\n return str(embedding_obj.model_name)\n\n # Try to get from embedding component (legacy single embedding)\n if hasattr(self, \"embedding\") and self.embedding:\n # Handle list of embeddings\n if isinstance(self.embedding, list) and len(self.embedding) > 0:\n first_emb = self.embedding[0]\n if 
hasattr(first_emb, \"deployment\") and first_emb.deployment:\n return str(first_emb.deployment)\n if hasattr(first_emb, \"model\") and first_emb.model:\n return str(first_emb.model)\n if hasattr(first_emb, \"model_id\") and first_emb.model_id:\n return str(first_emb.model_id)\n if hasattr(first_emb, \"model_name\") and first_emb.model_name:\n return str(first_emb.model_name)\n # Handle single embedding\n elif not isinstance(self.embedding, list):\n if hasattr(self.embedding, \"deployment\") and self.embedding.deployment:\n return str(self.embedding.deployment)\n if hasattr(self.embedding, \"model\") and self.embedding.model:\n return str(self.embedding.model)\n if hasattr(self.embedding, \"model_id\") and self.embedding.model_id:\n return str(self.embedding.model_id)\n if hasattr(self.embedding, \"model_name\") and self.embedding.model_name:\n return str(self.embedding.model_name)\n\n msg = (\n \"Could not determine embedding model name. \"\n \"Please set the 'embedding_model_name' field or ensure the embedding component \"\n \"has a 'deployment', 'model', 'model_id', or 'model_name' attribute.\"\n )\n raise ValueError(msg)\n\n # ---------- helper functions for index management ----------\n def _default_text_mapping(\n self,\n dim: int,\n engine: str = \"jvector\",\n space_type: str = \"l2\",\n ef_search: int = 512,\n ef_construction: int = 100,\n m: int = 16,\n vector_field: str = \"vector_field\",\n ) -> dict[str, Any]:\n \"\"\"Create the default OpenSearch index mapping for vector search.\n\n This method generates the index configuration with k-NN settings optimized\n for approximate nearest neighbor search using the specified vector engine.\n Includes the embedding_model keyword field for tracking which model was used.\n\n Args:\n dim: Dimensionality of the vector embeddings\n engine: Vector search engine (jvector, nmslib, faiss, lucene)\n space_type: Distance metric for similarity calculation\n ef_search: Size of dynamic list used during search\n 
ef_construction: Size of dynamic list used during index construction\n m: Number of bidirectional links for each vector\n vector_field: Name of the field storing vector embeddings\n\n Returns:\n Dictionary containing OpenSearch index mapping configuration\n \"\"\"\n return {\n \"settings\": {\n \"index\": {\"knn\": True, \"knn.algo_param.ef_search\": ef_search},\n \"number_of_shards\": OPENSEARCH_NUMBER_OF_SHARDS,\n \"number_of_replicas\": OPENSEARCH_NUMBER_OF_REPLICAS,\n },\n \"mappings\": {\n \"properties\": {\n vector_field: {\n \"type\": \"knn_vector\",\n \"dimension\": dim,\n \"method\": {\n \"name\": \"disk_ann\",\n \"space_type\": space_type,\n \"engine\": engine,\n \"parameters\": {\"ef_construction\": ef_construction, \"m\": m},\n },\n },\n \"embedding_model\": {\"type\": \"keyword\"}, # Track which model was used\n \"embedding_dimensions\": {\"type\": \"integer\"},\n }\n },\n }\n\n def _ensure_embedding_field_mapping(\n self,\n client: OpenSearch,\n index_name: str,\n field_name: str,\n dim: int,\n engine: str,\n space_type: str,\n ef_construction: int,\n m: int,\n ) -> None:\n \"\"\"Lazily add a dynamic embedding field to the index if it doesn't exist.\n\n This allows adding new embedding models without recreating the entire index.\n Also ensures the embedding_model tracking field exists.\n\n Note: Some OpenSearch versions/configurations have issues with dynamically adding\n knn_vector mappings (NullPointerException). 
This method checks if the field\n already exists before attempting to add it, and gracefully skips if the field\n is already properly configured.\n\n Args:\n client: OpenSearch client instance\n index_name: Target index name\n field_name: Dynamic field name for this embedding model\n dim: Vector dimensionality\n engine: Vector search engine\n space_type: Distance metric\n ef_construction: Construction parameter\n m: HNSW parameter\n \"\"\"\n # First, check if the field already exists and is properly mapped\n properties = self._get_index_properties(client)\n if self._is_knn_vector_field(properties, field_name):\n # Field already exists as knn_vector - verify dimensions match\n existing_dim = self._get_field_dimension(properties, field_name)\n if existing_dim is not None and existing_dim != dim:\n logger.warning(\n f\"Field '{field_name}' exists with dimension {existing_dim}, \"\n f\"but current embedding has dimension {dim}. Using existing mapping.\"\n )\n else:\n logger.info(\n f\"[OpenSearchMultimodel] Field '{field_name}' already exists\"\n f\"as knn_vector with matching dimensions - skipping mapping update\"\n )\n return\n\n # Field doesn't exist, try to add the mapping\n try:\n mapping = {\n \"properties\": {\n field_name: {\n \"type\": \"knn_vector\",\n \"dimension\": dim,\n \"method\": {\n \"name\": \"disk_ann\",\n \"space_type\": space_type,\n \"engine\": engine,\n \"parameters\": {\"ef_construction\": ef_construction, \"m\": m},\n },\n },\n # Also ensure the embedding_model tracking field exists as keyword\n \"embedding_model\": {\"type\": \"keyword\"},\n \"embedding_dimensions\": {\"type\": \"integer\"},\n }\n }\n client.indices.put_mapping(index=index_name, body=mapping)\n logger.info(f\"Added/updated embedding field mapping: {field_name}\")\n except RequestError as e:\n error_str = str(e).lower()\n if \"invalid engine\" in error_str and \"jvector\" in error_str:\n msg = (\n \"The 'jvector' engine is not available in your OpenSearch installation. 
\"\n \"Use 'nmslib' or 'faiss' for standard OpenSearch, or upgrade to OpenSearch 2.9+.\"\n )\n raise ValueError(msg) from e\n if \"index.knn\" in error_str:\n msg = (\n \"The index has index.knn: false. Delete the existing index and let the \"\n \"component recreate it, or create a new index with a different name.\"\n )\n raise ValueError(msg) from e\n raise\n except Exception as e:\n # Check if this is the known OpenSearch k-NN NullPointerException issue\n error_str = str(e).lower()\n if \"null\" in error_str or \"nullpointerexception\" in error_str:\n logger.warning(\n f\"[OpenSearchMultimodel] Could not add embedding field mapping for {field_name}\"\n f\"due to OpenSearch k-NN plugin issue: {e}. \"\n f\"This is a known issue with some OpenSearch versions. \"\n f\"[OpenSearchMultimodel] Skipping mapping update. \"\n f\"Please ensure the index has the correct mapping for KNN search to work.\"\n )\n # Skip and continue - ingestion will proceed, but KNN search may fail if mapping doesn't exist\n return\n logger.warning(\n f\"[OpenSearchMultimodel] Could not add embedding field mapping for {field_name}: {e}\"\n )\n raise\n\n # Verify the field was added correctly\n properties = self._get_index_properties(client)\n if not self._is_knn_vector_field(properties, field_name):\n msg = f\"Field '{field_name}' is not mapped as knn_vector. Current mapping: {properties.get(field_name)}\"\n logger.error(msg)\n raise ValueError(msg)\n\n def _validate_aoss_with_engines(self, *, is_aoss: bool, engine: str) -> None:\n \"\"\"Validate engine compatibility with Amazon OpenSearch Serverless (AOSS).\n\n Amazon OpenSearch Serverless has restrictions on which vector engines\n can be used. 
This method ensures the selected engine is compatible.\n\n Args:\n is_aoss: Whether the connection is to Amazon OpenSearch Serverless\n engine: The selected vector search engine\n\n Raises:\n ValueError: If AOSS is used with an incompatible engine\n \"\"\"\n if is_aoss and engine not in {\"nmslib\", \"faiss\"}:\n msg = \"Amazon OpenSearch Service Serverless only supports `nmslib` or `faiss` engines\"\n raise ValueError(msg)\n\n def _is_aoss_enabled(self, http_auth: Any) -> bool:\n \"\"\"Determine if Amazon OpenSearch Serverless (AOSS) is being used.\n\n Args:\n http_auth: The HTTP authentication object\n\n Returns:\n True if AOSS is enabled, False otherwise\n \"\"\"\n return (\n http_auth is not None and hasattr(http_auth, \"service\") and http_auth.service == \"aoss\"\n )\n\n @staticmethod\n def _openrag_input_to_str(value: Any) -> str:\n if value is None:\n return \"\"\n if hasattr(value, \"get_secret_value\"):\n value = value.get_secret_value()\n if hasattr(value, \"text\"):\n value = value.text\n return str(value or \"\").strip()\n\n def _openrag_callback_value(self, attr_name: str) -> str:\n value = self._openrag_input_to_str(getattr(self, attr_name, \"\"))\n if value == self._openrag_ingest_global_placeholders.get(attr_name):\n return \"\"\n return value\n\n def _openrag_ingest_callback_config(self) -> tuple[str, str, str] | None:\n url = self._openrag_callback_value(\"openrag_ingest_url\")\n token = self._openrag_callback_value(\"openrag_ingest_token\")\n ingest_run_id = self._openrag_callback_value(\"openrag_ingest_run_id\")\n\n masked_token = (\n f\"{token[:4]}...{token[-4:]}\" if len(token) >= 8 else (\"\" if token else \"\")\n )\n debug_payload = {\n \"openrag_ingest_url\": url,\n \"openrag_ingest_url_len\": len(url),\n \"openrag_ingest_token_masked\": masked_token,\n \"openrag_ingest_token_len\": len(token),\n \"openrag_ingest_run_id\": ingest_run_id,\n \"raw_url_type\": type(self.openrag_ingest_url).__name__,\n \"raw_token_type\": 
type(self.openrag_ingest_token).__name__,\n \"raw_run_id_type\": type(self.openrag_ingest_run_id).__name__,\n }\n logger.warning(f\"[OpenRAG callback config] {debug_payload}\")\n try:\n self.log(f\"[OpenRAG callback config] {debug_payload}\")\n except Exception:\n pass\n\n if not url and not token and not ingest_run_id:\n return None\n if not url or not token or not ingest_run_id:\n msg = \"OpenRAG ingest callback requires url, token, and ingest_run_id.\"\n raise ValueError(msg)\n return url, token, ingest_run_id\n\n def _post_openrag_ingest_batches(\n self,\n *,\n requests: list[dict],\n vector_field: str,\n text_field: str,\n ) -> None:\n callback_config = self._openrag_ingest_callback_config()\n if callback_config is None:\n return\n\n url, token, ingest_run_id = callback_config\n batch_size = max(self._parse_int_param(\"openrag_ingest_batch_size\", 100), 1)\n timeout = self._parse_int_param(\"request_timeout\", REQUEST_TIMEOUT)\n headers = {\"Authorization\": f\"Bearer {token}\"}\n\n masked_token = (\n f\"{token[:4]}...{token[-4:]}\" if len(token) >= 8 else (\"\" if token else \"\")\n )\n request_summary = {\n \"url\": url,\n \"ingest_run_id\": ingest_run_id,\n \"token_masked\": masked_token,\n \"total_chunks\": len(requests),\n \"batch_size\": batch_size,\n \"timeout_s\": timeout,\n }\n logger.warning(f\"[OpenRAG ingest POST] {request_summary}\")\n try:\n self.log(f\"[OpenRAG ingest POST] {request_summary}\")\n except Exception:\n pass\n\n with httpx.Client(timeout=timeout) as client:\n total_batches = (len(requests) + batch_size - 1) // batch_size\n for batch_number, start in enumerate(range(0, len(requests), batch_size), start=1):\n batch = requests[start : start + batch_size]\n final = batch_number == total_batches\n payload = {\n \"ingest_run_id\": ingest_run_id,\n \"batch_id\": batch_number,\n \"final\": final,\n \"chunks\": [\n self._openrag_chunk_payload(\n request,\n vector_field=vector_field,\n text_field=text_field,\n )\n for request in batch\n ],\n 
}\n logger.warning(\n f\"[OpenRAG ingest POST] -> batch={batch_number}/{total_batches} \"\n f\"url={url} chunks={len(payload['chunks'])} final={final}\"\n )\n response = client.post(url, json=payload, headers=headers)\n response_summary = {\n \"batch\": batch_number,\n \"url\": url,\n \"status\": response.status_code,\n \"final_url\": str(response.request.url),\n \"response_headers\": dict(response.headers),\n \"body_preview\": response.text[:500],\n }\n logger.warning(f\"[OpenRAG ingest POST resp] {response_summary}\")\n try:\n self.log(f\"[OpenRAG ingest POST resp] {response_summary}\")\n except Exception:\n pass\n if response.status_code >= 400:\n msg = (\n \"OpenRAG ingest callback failed \"\n f\"(batch={batch_number}, status={response.status_code}, \"\n f\"url={url}): {response.text[:1000]}\"\n )\n raise RuntimeError(msg)\n\n self.log(f\"Posted {len(requests)} chunks to OpenRAG backend ingest callback.\")\n\n @staticmethod\n def _openrag_chunk_payload(\n request: dict,\n *,\n vector_field: str,\n text_field: str,\n ) -> dict:\n metadata = {\n key: value\n for key, value in request.items()\n if key not in {\"_op_type\", \"_index\", \"_id\", \"id\", vector_field, text_field}\n }\n page = metadata.get(\"page\")\n if isinstance(page, str) and page.isdigit():\n page = int(page)\n return {\n \"id\": request.get(\"_id\") or request.get(\"id\"),\n \"text\": request.get(text_field, \"\"),\n \"vector\": request[vector_field],\n \"page\": page if isinstance(page, int) else None,\n \"metadata\": metadata,\n }\n\n def _bulk_ingest_embeddings(\n self,\n client: OpenSearch,\n index_name: str,\n embeddings: list[list[float]],\n texts: list[str],\n metadatas: list[dict] | None = None,\n ids: list[str] | None = None,\n vector_field: str = \"vector_field\",\n text_field: str = \"text\",\n embedding_model: str = \"unknown\",\n mapping: dict | None = None,\n max_chunk_bytes: int | None = 1 * 1024 * 1024,\n *,\n is_aoss: bool = False,\n ) -> list[str]:\n \"\"\"Efficiently ingest 
multiple documents with embeddings into OpenSearch.\n\n This method uses bulk operations to insert documents with their vector\n embeddings and metadata into the specified OpenSearch index. Each document\n is tagged with the embedding_model name for tracking.\n\n Args:\n client: OpenSearch client instance\n index_name: Target index for document storage\n embeddings: List of vector embeddings for each document\n texts: List of document texts\n metadatas: Optional metadata dictionaries for each document\n ids: Optional document IDs (UUIDs generated if not provided)\n vector_field: Field name for storing vector embeddings\n text_field: Field name for storing document text\n embedding_model: Name of the embedding model used\n mapping: Optional index mapping configuration\n max_chunk_bytes: Maximum size per bulk request chunk\n is_aoss: Whether using Amazon OpenSearch Serverless\n\n Returns:\n List of document IDs that were successfully ingested\n \"\"\"\n logger.debug(f\"[OpenSearchMultimodel] Bulk ingesting embeddings for {index_name}\")\n if not mapping:\n mapping = {}\n\n requests = []\n return_ids = []\n vector_dimensions = len(embeddings[0]) if embeddings else None\n\n for i, text in enumerate(texts):\n metadata = metadatas[i] if metadatas else {}\n if vector_dimensions is not None and \"embedding_dimensions\" not in metadata:\n metadata = {**metadata, \"embedding_dimensions\": vector_dimensions}\n\n # Normalize ACL fields that may arrive as JSON strings from flows\n for key in (\"allowed_users\", \"allowed_groups\", \"allowed_principals\"):\n value = metadata.get(key)\n if isinstance(value, str):\n try:\n parsed = json.loads(value)\n if isinstance(parsed, list):\n metadata[key] = parsed\n except (json.JSONDecodeError, TypeError):\n # Leave value as-is if it isn't valid JSON\n pass\n\n metadata_document_id = str(metadata.get(\"document_id\") or \"\").strip()\n if metadata_document_id and metadata_document_id.lower() != \"none\":\n generated_id = 
f\"{metadata_document_id}_{i}\"\n else:\n generated_id = str(uuid.uuid4())\n _id = ids[i] if ids else generated_id\n request = {\n \"_op_type\": \"index\",\n \"_index\": index_name,\n vector_field: embeddings[i],\n text_field: text,\n \"embedding_model\": embedding_model, # Track which model was used\n **metadata,\n }\n if is_aoss:\n request[\"id\"] = _id\n else:\n request[\"_id\"] = _id\n requests.append(request)\n return_ids.append(_id)\n if metadatas:\n self.log(f\"Sample metadata: {metadatas[0] if metadatas else {}}\")\n if self._openrag_ingest_callback_config() is not None:\n self._post_openrag_ingest_batches(\n requests=requests,\n vector_field=vector_field,\n text_field=text_field,\n )\n return return_ids\n try:\n helpers.bulk(client, requests, max_chunk_bytes=max_chunk_bytes)\n except Exception as bulk_error:\n if \"Unsupported request type for filter level DLS\" not in str(bulk_error):\n raise\n logger.warning(\n \"[OpenSearchMultimodel] Bulk ingest is blocked by filter-level DLS; \"\n \"falling back to per-document index requests.\"\n )\n self._index_embeddings_individually(client, requests)\n return return_ids\n\n def _index_embeddings_individually(\n self,\n client: OpenSearch,\n requests: list[dict],\n ) -> None:\n \"\"\"Index documents one at a time when OpenSearch DLS rejects bulk writes.\"\"\"\n for request in requests:\n document_id = request.get(\"_id\") or request.get(\"id\")\n body = {\n key: value\n for key, value in request.items()\n if key not in {\"_op_type\", \"_index\", \"_id\", \"id\"}\n }\n client.index(index=request[\"_index\"], id=document_id, body=body)\n\n def _log_index_admin_skip(self, operation: str, error: Exception) -> None:\n \"\"\"Log index-admin operations that may be blocked under filter-level DLS.\"\"\"\n logger.warning(\n f\"[OpenSearchMultimodel] Could not run index-admin operation '{operation}': {error}. 
\"\n \"Assuming the backend pre-created the required index/mapping and continuing.\"\n )\n\n # ---------- param helpers ----------\n def _parse_int_param(self, attr_name: str, default: int) -> int:\n \"\"\"Parse a string attribute to int, returning *default* on failure.\"\"\"\n raw = getattr(self, attr_name, None)\n if raw is None or str(raw).strip() == \"\":\n return default\n try:\n value = int(str(raw).strip())\n except ValueError:\n logger.warning(\n f\"Invalid integer value '{raw}' for {attr_name}, using default {default}\"\n )\n return default\n\n if value < 0:\n logger.warning(f\"Negative value '{raw}' for {attr_name}, using default {default}\")\n return default\n\n return value\n\n # ---------- auth / client ----------\n def _build_auth_kwargs(self) -> dict[str, Any]:\n \"\"\"Build authentication configuration for OpenSearch client.\n\n Constructs the appropriate authentication parameters based on the\n selected auth mode (basic username/password or JWT token).\n\n Returns:\n Dictionary containing authentication configuration\n\n Raises:\n ValueError: If required authentication parameters are missing\n \"\"\"\n mode = (self.auth_mode or \"basic\").strip().lower()\n if mode == \"jwt\":\n token = (self.jwt_token or \"\").strip()\n if not token:\n msg = \"Auth Mode is 'jwt' but no jwt_token was provided.\"\n raise ValueError(msg)\n header_name = (self.jwt_header or \"Authorization\").strip()\n header_value = f\"Bearer {token}\" if self.bearer_prefix else token\n return {\"headers\": {header_name: header_value}}\n if mode == \"openrag\":\n # Writes are delegated to the OpenRAG backend ingest callback,\n # so no direct OpenSearch credentials are needed. 
Only the\n # OPENRAG_* fields are required for ingestion to function.\n missing = [\n name\n for name, value in (\n (\"openrag_ingest_url\", self.openrag_ingest_url),\n (\"openrag_ingest_token\", self.openrag_ingest_token),\n (\"openrag_ingest_run_id\", self.openrag_ingest_run_id),\n )\n if not (value or \"\").strip()\n ]\n if missing:\n msg = (\n \"Auth Mode is 'openrag' but required OPENRAG_* fields are \"\n f\"missing: {', '.join(missing)}.\"\n )\n raise ValueError(msg)\n return {}\n user = (self.username or \"\").strip()\n pwd = (self.password or \"\").strip()\n if not user or not pwd:\n msg = \"Auth Mode is 'basic' but username/password are missing.\"\n raise ValueError(msg)\n return {\"http_auth\": (user, pwd)}\n\n def build_client(self) -> OpenSearch:\n \"\"\"Create and configure an OpenSearch client instance.\n\n Returns:\n Configured OpenSearch client ready for operations\n \"\"\"\n logger.debug(\"[OpenSearchMultimodel] Building OpenSearch client\")\n auth_kwargs = self._build_auth_kwargs()\n return OpenSearch(\n hosts=[self.opensearch_url],\n use_ssl=self.use_ssl,\n verify_certs=self.verify_certs,\n ssl_assert_hostname=False,\n ssl_show_warn=False,\n timeout=self._parse_int_param(\"request_timeout\", REQUEST_TIMEOUT),\n max_retries=self._parse_int_param(\"max_retries\", MAX_RETRIES),\n retry_on_timeout=True,\n **auth_kwargs,\n )\n\n @check_cached_vector_store\n def build_vector_store(self) -> OpenSearch:\n # Return raw OpenSearch client as our \"vector store.\"\n client = self.build_client()\n\n # Check if we're in ingestion-only mode (no search query)\n has_search_query = bool((self.search_query or \"\").strip())\n if not has_search_query:\n logger.debug(\n \"[OpenSearchMultimodel] Ingestion-only mode activated: search operations will be skipped\"\n )\n logger.debug(\"[OpenSearchMultimodel] Starting ingestion mode...\")\n\n logger.debug(f\"[OpenSearchMultimodel] Embedding: {self.embedding}\")\n self._add_documents_to_vector_store(client=client)\n return 
client\n\n # ---------- ingest ----------\n def _add_documents_to_vector_store(self, client: OpenSearch) -> None:\n \"\"\"Process and ingest documents into the OpenSearch vector store.\n\n This method handles the complete document ingestion pipeline:\n - Prepares document data and metadata\n - Generates vector embeddings using the selected model\n - Creates appropriate index mappings with dynamic field names\n - Bulk inserts documents with vectors and model tracking\n\n Args:\n client: OpenSearch client for performing operations\n \"\"\"\n logger.debug(\"[OpenSearchMultimodel][INGESTION] _add_documents_to_vector_store called\")\n # Convert DataFrame to Data if needed using parent's method\n self.ingest_data = self._prepare_ingest_data()\n\n logger.debug(\n f\"[OpenSearchMultimodel][INGESTION] ingest_data type: \"\n f\"{type(self.ingest_data)}, length: {len(self.ingest_data) if self.ingest_data else 0}\"\n )\n logger.debug(\n f\"[OpenSearchMultimodel][INGESTION] ingest_data content: \"\n f\"{self.ingest_data[:2] if self.ingest_data and len(self.ingest_data) > 0 else 'empty'}\"\n )\n\n docs = self.ingest_data or []\n if not docs:\n logger.debug(\"Ingestion complete: No documents provided\")\n return\n\n if not self.embedding:\n msg = \"Embedding handle is required to embed documents.\"\n raise ValueError(msg)\n\n # Normalize embedding to list first\n embeddings_list = self.embedding if isinstance(self.embedding, list) else [self.embedding]\n\n # Filter out None values (fail-safe mode) - do this BEFORE checking if empty\n embeddings_list = [e for e in embeddings_list if e is not None]\n\n # NOW check if we have any valid embeddings left after filtering\n if not embeddings_list:\n logger.warning(\n \"All embeddings returned None (fail-safe mode enabled). Skipping document ingestion.\"\n )\n self.log(\n \"Embedding returned None (fail-safe mode enabled). 
Skipping document ingestion.\"\n )\n return\n\n logger.debug(\n f\"[OpenSearchMultimodel][INGESTION] Valid embeddings after filtering: {len(embeddings_list)}\"\n )\n self.log(\n f\"[OpenSearchMultimodel][INGESTION] Available embedding models: {len(embeddings_list)}\"\n )\n\n # Select the embedding to use for ingestion\n selected_embedding = None\n embedding_model = None\n\n # If embedding_model_name is specified, find matching embedding\n if (\n hasattr(self, \"embedding_model_name\")\n and self.embedding_model_name\n and self.embedding_model_name.strip()\n ):\n target_model_name = self.embedding_model_name.strip()\n self.log(f\"Looking for embedding model: {target_model_name}\")\n\n for emb_obj in embeddings_list:\n # Check all possible model identifiers (deployment, model, model_id, model_name)\n # Also check available_models list from EmbeddingsWithModels\n possible_names = []\n deployment = getattr(emb_obj, \"deployment\", None)\n model = getattr(emb_obj, \"model\", None)\n model_id = getattr(emb_obj, \"model_id\", None)\n model_name = getattr(emb_obj, \"model_name\", None)\n available_models_attr = getattr(emb_obj, \"available_models\", None)\n\n if deployment:\n possible_names.append(str(deployment))\n if model:\n possible_names.append(str(model))\n if model_id:\n possible_names.append(str(model_id))\n if model_name:\n possible_names.append(str(model_name))\n\n # Also add combined identifier\n if deployment and model and deployment != model:\n possible_names.append(f\"{deployment}:{model}\")\n\n # Add all models from available_models dict\n if available_models_attr and isinstance(available_models_attr, dict):\n possible_names.extend(\n str(model_key).strip()\n for model_key in available_models_attr\n if model_key and str(model_key).strip()\n )\n\n # Match if target matches any of the possible names\n if target_model_name in possible_names:\n # Check if target is in available_models dict - use dedicated instance\n if (\n available_models_attr\n and 
isinstance(available_models_attr, dict)\n and target_model_name in available_models_attr\n ):\n # Use the dedicated embedding instance from the dict\n selected_embedding = available_models_attr[target_model_name]\n embedding_model = target_model_name\n self.log(\n f\"Found dedicated embedding instance for '{embedding_model}' in available_models dict\"\n )\n else:\n # Traditional identifier match\n selected_embedding = emb_obj\n embedding_model = self._get_embedding_model_name(emb_obj)\n self.log(\n f\"Found matching embedding model: {embedding_model} (matched on: {target_model_name})\"\n )\n break\n\n if not selected_embedding:\n # Build detailed list of available embeddings with all their identifiers\n available_info = []\n for idx, emb in enumerate(embeddings_list):\n emb_type = type(emb).__name__\n identifiers = []\n deployment = getattr(emb, \"deployment\", None)\n model = getattr(emb, \"model\", None)\n model_id = getattr(emb, \"model_id\", None)\n model_name = getattr(emb, \"model_name\", None)\n available_models_attr = getattr(emb, \"available_models\", None)\n\n if deployment:\n identifiers.append(f\"deployment='{deployment}'\")\n if model:\n identifiers.append(f\"model='{model}'\")\n if model_id:\n identifiers.append(f\"model_id='{model_id}'\")\n if model_name:\n identifiers.append(f\"model_name='{model_name}'\")\n\n # Add combined identifier as an option\n if deployment and model and deployment != model:\n identifiers.append(f\"combined='{deployment}:{model}'\")\n\n # Add available_models dict if present\n if available_models_attr and isinstance(available_models_attr, dict):\n identifiers.append(f\"available_models={list(available_models_attr.keys())}\")\n\n available_info.append(\n f\" [{idx}] {emb_type}: {', '.join(identifiers) if identifiers else 'No identifiers'}\"\n )\n\n msg = (\n f\"Embedding model '{target_model_name}' not found in available embeddings.\\n\\n\"\n f\"Available embeddings:\\n\" + \"\\n\".join(available_info) + \"\\n\\n\"\n \"Please 
set 'embedding_model_name' to one of the identifier values shown above \"\n \"(use the value after the '=' sign, without quotes).\\n\"\n \"For duplicate deployments, use the 'combined' format.\\n\"\n \"Or leave it empty to use the first embedding.\"\n )\n raise ValueError(msg)\n else:\n # Use first embedding if no model name specified\n selected_embedding = embeddings_list[0]\n embedding_model = self._get_embedding_model_name(selected_embedding)\n self.log(f\"No embedding_model_name specified, using first embedding: {embedding_model}\")\n\n dynamic_field_name = get_embedding_field_name(embedding_model)\n\n logger.info(f\"Selected embedding model for ingestion: '{embedding_model}'\")\n self.log(f\"Using embedding model for ingestion: {embedding_model}\")\n self.log(f\"Dynamic vector field: {dynamic_field_name}\")\n\n # Log embedding details for debugging\n if hasattr(selected_embedding, \"deployment\"):\n logger.info(f\"Embedding deployment: {selected_embedding.deployment}\")\n if hasattr(selected_embedding, \"model\"):\n logger.info(f\"Embedding model: {selected_embedding.model}\")\n if hasattr(selected_embedding, \"model_id\"):\n logger.info(f\"Embedding model_id: {selected_embedding.model_id}\")\n if hasattr(selected_embedding, \"dimensions\"):\n logger.info(f\"Embedding dimensions: {selected_embedding.dimensions}\")\n if hasattr(selected_embedding, \"available_models\"):\n logger.info(f\"Embedding available_models: {selected_embedding.available_models}\")\n\n # No model switching needed - each model in available_models has its own dedicated instance\n # The selected_embedding is already configured correctly for the target model\n logger.info(\n f\"Using embedding instance for '{embedding_model}' - pre-configured and ready to use\"\n )\n\n # Extract texts and metadata from documents\n texts = []\n metadatas = []\n # Process docs_metadata table input into a dict\n additional_metadata = {}\n logger.debug(f\"[LF] Docs metadata {self.docs_metadata}\")\n if 
hasattr(self, \"docs_metadata\") and self.docs_metadata:\n logger.info(f\"[LF] Docs metadata {self.docs_metadata}\")\n if isinstance(self.docs_metadata[-1], Data):\n logger.info(f\"[LF] Docs metadata is a Data object {self.docs_metadata}\")\n self.docs_metadata = self.docs_metadata[-1].data\n logger.info(f\"[LF] Docs metadata is a Data object {self.docs_metadata}\")\n additional_metadata.update(self.docs_metadata)\n else:\n for item in self.docs_metadata:\n if isinstance(item, dict) and \"key\" in item and \"value\" in item:\n additional_metadata[item[\"key\"]] = item[\"value\"]\n # Replace string \"None\" values with actual None\n for key, value in additional_metadata.items():\n if value == \"None\":\n additional_metadata[key] = None\n logger.info(f\"[LF] Additional metadata {additional_metadata}\")\n for doc_obj in docs:\n data_copy = json.loads(doc_obj.model_dump_json())\n text = data_copy.pop(doc_obj.text_key, doc_obj.default_value)\n texts.append(text)\n\n # Merge additional metadata from table input\n data_copy.update(additional_metadata)\n\n metadatas.append(data_copy)\n self.log(metadatas)\n\n # Determine whether the selected embedding is watsonx/IBM. The watsonx\n # SDK ships its own rate-limit machinery (input batching, proactive\n # x-requests-limit-* TokenBucket throttling, and jittered exponential\n # backoff on 429), so we lean on it instead of retrying on top of it.\n # The type-name check also covers watsonx-hosted, non-\"ibm/\" models\n # (e.g. 
intfloat/multilingual-e5-large).\n is_ibm = (embedding_model and \"ibm\" in str(embedding_model).lower()) or (\n selected_embedding and \"watsonx\" in type(selected_embedding).__name__.lower()\n )\n logger.debug(f\"Is IBM/watsonx embedding: {is_ibm}\")\n\n if is_ibm:\n # Hand the full batch to the SDK and let it batch/throttle/retry.\n # Retry attempts and base backoff are tunable via the SDK's own\n # WATSONX_MAX_RETRIES / WATSONX_DELAY_TIME environment variables.\n logger.info(\n f\"Embedding {len(texts)} chunks via watsonx SDK batch (SDK-managed throttle + 429 retry)\"\n )\n try:\n vectors: list[list[float]] = selected_embedding.embed_documents(texts)\n logger.info(f\"Successfully embedded {len(vectors)} chunks via watsonx SDK\")\n except Exception as embed_error:\n _log_watsonx_rate_limit_headers(embed_error)\n logger.error(\n f\"Failed to embed {len(texts)} chunks via watsonx SDK. Error: {embed_error}\",\n )\n raise\n\n else:\n # Non-watsonx providers (OpenAI, Ollama) lack the watsonx SDK's\n # built-in rate-limit handling, so embed per chunk in parallel with\n # a generic rate-limit-aware tenacity retry.\n vectors: list[list[float]] = [None] * len(texts)\n from tenacity import (\n retry,\n retry_if_exception,\n stop_after_attempt,\n wait_exponential,\n )\n\n def is_rate_limit_error(exception: Exception) -> bool:\n \"\"\"Check if exception is a rate limit error (429).\"\"\"\n error_str = str(exception).lower()\n return \"429\" in error_str or \"rate_limit\" in error_str or \"rate limit\" in error_str\n\n def is_other_retryable_error(exception: Exception) -> bool:\n \"\"\"Check if exception is a transient network error worth retrying.\"\"\"\n if is_rate_limit_error(exception):\n return False\n return isinstance(exception, (ConnectionError, TimeoutError, OSError))\n\n # Retry decorator for rate limit errors (longer backoff)\n retry_on_rate_limit = retry(\n retry=retry_if_exception(is_rate_limit_error),\n stop=stop_after_attempt(5),\n 
wait=wait_exponential(multiplier=2, min=2, max=30),\n reraise=True,\n before_sleep=lambda retry_state: logger.warning(\n f\"Rate limit hit for chunk (attempt {retry_state.attempt_number}/5), \"\n f\"backing off for {retry_state.next_action.sleep:.1f}s\"\n ),\n )\n\n # Retry decorator for other errors (shorter backoff)\n retry_on_other_errors = retry(\n retry=retry_if_exception(is_other_retryable_error),\n stop=stop_after_attempt(3),\n wait=wait_exponential(multiplier=1, min=1, max=8),\n reraise=True,\n before_sleep=lambda retry_state: logger.warning(\n f\"Error embedding chunk (attempt {retry_state.attempt_number}/3), \"\n f\"retrying in {retry_state.next_action.sleep:.1f}s: {retry_state.outcome.exception()}\"\n ),\n )\n\n def embed_chunk_with_retry(chunk_text: str, chunk_idx: int) -> list[float]:\n \"\"\"Embed a single chunk with rate-limit-aware retry logic.\"\"\"\n\n @retry_on_rate_limit\n @retry_on_other_errors\n def _embed(text: str) -> list[float]:\n return selected_embedding.embed_documents([text])[0]\n\n try:\n return _embed(chunk_text)\n except Exception as e:\n logger.error(\n f\"Failed to embed chunk {chunk_idx} after all retries: {e}\",\n error=str(e),\n )\n raise\n\n max_workers = min(max(len(texts), 1), 8)\n logger.debug(f\"Using parallel processing with {max_workers} workers\")\n\n with ThreadPoolExecutor(max_workers=max_workers) as executor:\n futures = {\n executor.submit(embed_chunk_with_retry, chunk, idx): idx\n for idx, chunk in enumerate(texts)\n }\n for future in as_completed(futures):\n idx = futures[future]\n vectors[idx] = future.result()\n\n if not vectors:\n self.log(f\"No vectors generated from documents for model {embedding_model}.\")\n return\n\n # Get vector dimension for mapping\n dim = len(vectors[0]) if vectors else 768 # default fallback\n use_openrag_ingest_callback = self._openrag_ingest_callback_config() is not None\n\n is_aoss = False\n mapping: dict | None = None\n\n engine = getattr(self, \"engine\", \"jvector\")\n\n if 
use_openrag_ingest_callback:\n self.log(\"Using OpenRAG backend ingest callback; skipping direct OpenSearch writes.\")\n else:\n # Check for AOSS\n auth_kwargs = self._build_auth_kwargs()\n is_aoss = self._is_aoss_enabled(auth_kwargs.get(\"http_auth\"))\n\n # Validate engine with AOSS\n self._validate_aoss_with_engines(is_aoss=is_aoss, engine=engine)\n\n # Create mapping with proper KNN settings\n space_type = getattr(self, \"space_type\", \"l2\")\n ef_construction = getattr(self, \"ef_construction\", 512)\n m = getattr(self, \"m\", 16)\n\n mapping = self._default_text_mapping(\n dim=dim,\n engine=engine,\n space_type=space_type,\n ef_construction=ef_construction,\n m=m,\n vector_field=dynamic_field_name, # Use dynamic field name\n )\n\n # Ensure index exists with baseline mapping (index.knn: true is required for vector search)\n index_exists = True\n try:\n index_exists = bool(client.indices.exists(index=self.index_name))\n except OpenSearchException as exists_error:\n self._log_index_admin_skip(\"indices.exists\", exists_error)\n\n try:\n if not index_exists:\n self.log(f\"Creating index '{self.index_name}' with base mapping\")\n client.indices.create(index=self.index_name, body=mapping)\n except RequestError as creation_error:\n if creation_error.error == \"resource_already_exists_exception\":\n pass # Index was created concurrently\n else:\n error_msg = str(creation_error).lower()\n if \"invalid engine\" in error_msg or \"illegal_argument\" in error_msg:\n if \"jvector\" in error_msg:\n msg = (\n \"The 'jvector' engine is not available in your OpenSearch installation. \"\n \"Use 'nmslib' or 'faiss' for standard OpenSearch, or upgrade to 2.9+.\"\n )\n raise ValueError(msg) from creation_error\n if \"index.knn\" in error_msg:\n msg = (\n \"The index has index.knn: false. 
Delete the existing index and let the \"\n \"component recreate it, or create a new index with a different name.\"\n )\n raise ValueError(msg) from creation_error\n logger.warning(f\"Failed to create index '{self.index_name}': {creation_error}\")\n raise\n\n # Ensure the dynamic field exists in the index\n self._ensure_embedding_field_mapping(\n client=client,\n index_name=self.index_name,\n field_name=dynamic_field_name,\n dim=dim,\n engine=engine,\n space_type=space_type,\n ef_construction=ef_construction,\n m=m,\n )\n\n self.log(\n f\"Indexing {len(texts)} documents into '{self.index_name}' with model '{embedding_model}'...\"\n )\n logger.info(f\"Will store embeddings in field: {dynamic_field_name}\")\n logger.info(f\"Will tag documents with embedding_model: {embedding_model}\")\n\n # Use the bulk ingestion with model tracking\n return_ids = self._bulk_ingest_embeddings(\n client=client,\n index_name=self.index_name,\n embeddings=vectors,\n texts=texts,\n metadatas=metadatas,\n vector_field=dynamic_field_name, # Use dynamic field name\n text_field=\"text\",\n embedding_model=embedding_model, # Track the model\n mapping=mapping,\n is_aoss=is_aoss,\n )\n self.log(metadatas)\n\n logger.info(\n f\"Ingestion complete: Successfully indexed {len(return_ids)} documents with model '{embedding_model}'\"\n )\n self.log(f\"Successfully indexed {len(return_ids)} documents with model {embedding_model}.\")\n\n # ---------- helpers for filters ----------\n def _is_placeholder_term(self, term_obj: dict) -> bool:\n # term_obj like {\"filename\": \"__IMPOSSIBLE_VALUE__\"}\n return any(v == \"__IMPOSSIBLE_VALUE__\" for v in term_obj.values())\n\n def _coerce_filter_clauses(self, filter_obj: dict | None) -> list[dict]:\n \"\"\"Convert filter expressions into OpenSearch-compatible filter clauses.\n\n This method accepts two filter formats and converts them to standardized\n OpenSearch query clauses:\n\n Format A - Explicit filters:\n {\"filter\": [{\"term\": {\"field\": \"value\"}}, 
{\"terms\": {\"field\": [\"val1\", \"val2\"]}}],\n \"limit\": 10, \"score_threshold\": 1.5}\n\n Format B - Context-style mapping:\n {\"data_sources\": [\"file1.pdf\"], \"document_types\": [\"pdf\"], \"owners\": [\"user1\"]}\n\n Args:\n filter_obj: Filter configuration dictionary or None\n\n Returns:\n List of OpenSearch filter clauses (term/terms objects)\n Placeholder values with \"__IMPOSSIBLE_VALUE__\" are ignored\n \"\"\"\n if not filter_obj:\n return []\n\n # If it is a string, try to parse it once\n if isinstance(filter_obj, str):\n try:\n filter_obj = json.loads(filter_obj)\n except json.JSONDecodeError:\n # Not valid JSON - treat as no filters\n return []\n\n # Case A: already an explicit list/dict under \"filter\"\n if \"filter\" in filter_obj:\n raw = filter_obj[\"filter\"]\n if isinstance(raw, dict):\n raw = [raw]\n explicit_clauses: list[dict] = []\n for f in raw or []:\n if (\n \"term\" in f\n and isinstance(f[\"term\"], dict)\n and not self._is_placeholder_term(f[\"term\"])\n ):\n explicit_clauses.append(f)\n elif \"terms\" in f and isinstance(f[\"terms\"], dict):\n field, vals = next(iter(f[\"terms\"].items()))\n if isinstance(vals, list) and len(vals) > 0:\n explicit_clauses.append(f)\n return explicit_clauses\n\n # Case B: convert context-style maps into clauses\n field_mapping = {\n \"data_sources\": \"filename\",\n \"document_types\": \"mimetype\",\n \"owners\": \"owner\",\n }\n context_clauses: list[dict] = []\n for k, values in filter_obj.items():\n if not isinstance(values, list):\n continue\n field = field_mapping.get(k, k)\n if len(values) == 0:\n # Match-nothing placeholder (kept to mirror your tool semantics)\n context_clauses.append({\"term\": {field: \"__IMPOSSIBLE_VALUE__\"}})\n elif len(values) == 1:\n if values[0] != \"__IMPOSSIBLE_VALUE__\":\n context_clauses.append({\"term\": {field: values[0]}})\n else:\n context_clauses.append({\"terms\": {field: values}})\n return context_clauses\n\n def _parse_filter_expression(self) -> dict | 
None:\n \"\"\"Parse and validate optional filter_expression JSON.\n\n Returns:\n Parsed JSON object as a dict, or None when unset/blank.\n\n Raises:\n ValueError: If JSON is invalid or does not decode to an object.\n \"\"\"\n filter_expression = getattr(self, \"filter_expression\", \"\")\n if not isinstance(filter_expression, str) or not filter_expression.strip():\n return None\n try:\n filter_obj = json.loads(filter_expression)\n except json.JSONDecodeError as e:\n msg = f\"Invalid filter_expression JSON: {e}\"\n raise ValueError(msg) from e\n\n if not isinstance(filter_obj, dict):\n msg = \"Invalid filter_expression JSON type: expected a JSON object.\"\n raise TypeError(msg)\n return filter_obj\n\n def _resolve_limit(self, filter_obj: dict | None, default_limit: int | None) -> int | None:\n \"\"\"Resolve an integer result limit from filter settings.\"\"\"\n if not filter_obj:\n return default_limit\n raw_limit = filter_obj.get(\"limit\", default_limit)\n if raw_limit is None:\n return None\n if isinstance(raw_limit, bool):\n msg = \"Invalid filter_expression.limit: expected a positive integer.\"\n raise TypeError(msg)\n try:\n limit = int(raw_limit)\n except (TypeError, ValueError) as e:\n msg = \"Invalid filter_expression.limit: expected a positive integer.\"\n raise ValueError(msg) from e\n if limit <= 0:\n msg = \"Invalid filter_expression.limit: expected a positive integer.\"\n raise ValueError(msg)\n return limit\n\n def _resolve_score_threshold(self, filter_obj: dict | None) -> float | None:\n \"\"\"Resolve optional positive min score from filter settings.\"\"\"\n if not filter_obj:\n return None\n score_threshold = filter_obj.get(\"score_threshold\")\n if score_threshold is None:\n score_threshold = filter_obj.get(\"scoreThreshold\")\n if not isinstance(score_threshold, (int, float)) or score_threshold <= 0:\n return None\n return float(score_threshold)\n\n def _detect_available_models(\n self, client: OpenSearch, filter_clauses: list[dict] | None = None\n 
) -> list[str]:\n \"\"\"Detect which embedding models have documents in the index.\n\n Uses aggregation to find all unique embedding_model values, optionally\n filtered to only documents matching the user's filter criteria.\n\n Args:\n client: OpenSearch client instance\n filter_clauses: Optional filter clauses to scope model detection\n\n Returns:\n List of embedding model names found in the index\n \"\"\"\n try:\n agg_query = {\n \"size\": 0,\n \"aggs\": {\"embedding_models\": {\"terms\": {\"field\": \"embedding_model\", \"size\": 10}}},\n }\n\n # Apply filters to model detection if any exist\n if filter_clauses:\n agg_query[\"query\"] = {\"bool\": {\"filter\": filter_clauses}}\n\n logger.debug(f\"Model detection query: {agg_query}\")\n result = client.search(\n index=self.index_name,\n body=agg_query,\n params={\"terminate_after\": 0},\n )\n buckets = result.get(\"aggregations\", {}).get(\"embedding_models\", {}).get(\"buckets\", [])\n models = [b[\"key\"] for b in buckets if b[\"key\"]]\n\n # Log detailed bucket info for debugging\n logger.info(\n f\"Detected embedding models in corpus: {models}\"\n + (f\" (with {len(filter_clauses)} filters)\" if filter_clauses else \"\")\n )\n if not models:\n total_hits = result.get(\"hits\", {}).get(\"total\", {})\n total_count = (\n total_hits.get(\"value\", 0) if isinstance(total_hits, dict) else total_hits\n )\n logger.warning(\n f\"No embedding_model values found in index '{self.index_name}'. \"\n f\"Total docs in index: {total_count}. 
\"\n f\"This may indicate documents were indexed without the embedding_model field.\"\n )\n except (OpenSearchException, KeyError, ValueError) as e:\n logger.warning(f\"Failed to detect embedding models: {e}\")\n # Fallback to current model\n fallback_model = self._get_embedding_model_name()\n logger.info(f\"Using fallback model: {fallback_model}\")\n return [fallback_model]\n else:\n return models\n\n def _get_index_properties(self, client: OpenSearch) -> dict[str, Any] | None:\n \"\"\"Retrieve flattened mapping properties for the current index.\"\"\"\n try:\n mapping = client.indices.get_mapping(index=self.index_name)\n except OpenSearchException as e:\n logger.warning(\n f\"Failed to fetch mapping for index '{self.index_name}': {e}. Proceeding without mapping metadata.\"\n )\n return None\n\n properties: dict[str, Any] = {}\n for index_data in mapping.values():\n props = index_data.get(\"mappings\", {}).get(\"properties\", {})\n if isinstance(props, dict):\n properties.update(props)\n return properties\n\n def _is_knn_vector_field(self, properties: dict[str, Any] | None, field_name: str) -> bool:\n \"\"\"Check whether the field is mapped as a knn_vector.\"\"\"\n if not field_name:\n return False\n if properties is None:\n logger.warning(\n f\"Mapping metadata unavailable; assuming field '{field_name}' is usable.\"\n )\n return True\n field_def = properties.get(field_name)\n if not isinstance(field_def, dict):\n return False\n if field_def.get(\"type\") == \"knn_vector\":\n return True\n\n nested_props = field_def.get(\"properties\")\n return bool(isinstance(nested_props, dict) and nested_props.get(\"type\") == \"knn_vector\")\n\n def _get_field_dimension(\n self, properties: dict[str, Any] | None, field_name: str\n ) -> int | None:\n \"\"\"Get the dimension of a knn_vector field from the index mapping.\n\n Args:\n properties: Index properties from mapping\n field_name: Name of the vector field\n\n Returns:\n Dimension of the field, or None if not found\n 
\"\"\"\n if not field_name or properties is None:\n return None\n\n field_def = properties.get(field_name)\n if not isinstance(field_def, dict):\n return None\n\n # Check direct knn_vector field\n if field_def.get(\"type\") == \"knn_vector\":\n return field_def.get(\"dimension\")\n\n # Check nested properties\n nested_props = field_def.get(\"properties\")\n if isinstance(nested_props, dict) and nested_props.get(\"type\") == \"knn_vector\":\n return nested_props.get(\"dimension\")\n\n return None\n\n def _get_filename_agg_field(self, index_properties: dict[str, Any] | None) -> str:\n \"\"\"Choose the appropriate field for filename aggregations.\"\"\"\n if not index_properties:\n return \"filename.keyword\"\n\n filename_def = index_properties.get(\"filename\")\n if not isinstance(filename_def, dict):\n return \"filename.keyword\"\n\n field_type = filename_def.get(\"type\")\n fields_def = filename_def.get(\"fields\", {})\n\n # Top-level keyword with no subfields\n if field_type == \"keyword\" and not isinstance(fields_def, dict):\n return \"filename\"\n\n # Text field with keyword subfield\n if isinstance(fields_def, dict) and \"keyword\" in fields_def:\n return \"filename.keyword\"\n\n # Fallback: aggregate on filename directly\n return \"filename\"\n\n # ---------- search (multi-model hybrid) ----------\n def search(self, query: str | None = None) -> list[dict[str, Any]]:\n \"\"\"Perform multi-model hybrid search combining multiple vector similarities and keyword matching.\n\n This method executes a sophisticated search that:\n 1. Auto-detects all embedding models present in the index\n 2. Generates query embeddings for ALL detected models in parallel\n 3. Combines multiple KNN queries using dis_max (picks best match)\n 4. Adds keyword search with fuzzy matching (30% weight)\n 5. Applies optional filtering and score thresholds\n 6. 
Returns aggregations for faceted search\n\n Search weights:\n - Semantic search (dis_max across all models): 70%\n - Keyword search: 30%\n\n Args:\n query: Search query string (used for both vector embedding and keyword search)\n\n Returns:\n List of search results with page_content, metadata, and relevance scores\n\n Raises:\n ValueError: If embedding component is not provided or filter JSON is invalid\n \"\"\"\n logger.info(self.ingest_data)\n client = self.build_client()\n q = (query or \"\").strip()\n\n # Parse optional filter expression\n filter_obj = self._parse_filter_expression()\n\n if not self.embedding:\n msg = \"Embedding is required to run hybrid search (KNN + keyword).\"\n raise ValueError(msg)\n\n # Check if embedding is None (fail-safe mode)\n if self.embedding is None or (\n isinstance(self.embedding, list) and all(e is None for e in self.embedding)\n ):\n logger.error(\"Embedding returned None (fail-safe mode enabled). Cannot perform search.\")\n return []\n\n # Build filter clauses first so we can use them in model detection\n filter_clauses = self._coerce_filter_clauses(filter_obj)\n\n # Detect available embedding models in the index (scoped by filters)\n available_models = self._detect_available_models(client, filter_clauses)\n\n if not available_models:\n logger.warning(\"No embedding models found in index, using current model\")\n available_models = [self._get_embedding_model_name()]\n\n # Generate embeddings for ALL detected models\n query_embeddings = {}\n\n # Normalize embedding to list\n embeddings_list = self.embedding if isinstance(self.embedding, list) else [self.embedding]\n # Filter out None values (fail-safe mode)\n embeddings_list = [e for e in embeddings_list if e is not None]\n\n if not embeddings_list:\n logger.error(\n \"No valid embeddings available after filtering None values (fail-safe mode). 
Cannot perform search.\"\n )\n return []\n\n # Create a comprehensive map of model names to embedding objects\n # Check all possible identifiers (deployment, model, model_id, model_name)\n # Also leverage available_models list from EmbeddingsWithModels\n # Handle duplicate identifiers by creating combined keys\n embedding_by_model = {}\n identifier_conflicts = {} # Track which identifiers have conflicts\n\n for idx, emb_obj in enumerate(embeddings_list):\n # Get all possible identifiers for this embedding\n identifiers = []\n deployment = getattr(emb_obj, \"deployment\", None)\n model = getattr(emb_obj, \"model\", None)\n model_id = getattr(emb_obj, \"model_id\", None)\n model_name = getattr(emb_obj, \"model_name\", None)\n dimensions = getattr(emb_obj, \"dimensions\", None)\n available_models_attr = getattr(emb_obj, \"available_models\", None)\n\n logger.info(\n f\"Embedding object {idx}: deployment={deployment}, model={model}, \"\n f\"model_id={model_id}, model_name={model_name}, dimensions={dimensions}, \"\n f\"available_models={available_models_attr}\"\n )\n\n # If this embedding has available_models dict, map all models to their dedicated instances\n if available_models_attr and isinstance(available_models_attr, dict):\n logger.info(\n f\"Embedding object {idx} provides {len(available_models_attr)} models via available_models dict\"\n )\n for model_name_key, dedicated_embedding in available_models_attr.items():\n if model_name_key and str(model_name_key).strip():\n model_str = str(model_name_key).strip()\n if model_str not in embedding_by_model:\n # Use the dedicated embedding instance from the dict\n embedding_by_model[model_str] = dedicated_embedding\n logger.info(\n f\"Mapped available model '{model_str}' to dedicated embedding instance\"\n )\n else:\n # Conflict detected - track it\n if model_str not in identifier_conflicts:\n identifier_conflicts[model_str] = [embedding_by_model[model_str]]\n identifier_conflicts[model_str].append(dedicated_embedding)\n 
logger.warning(\n f\"Available model '{model_str}' has conflict - used by multiple embeddings\"\n )\n\n # Also map traditional identifiers (for backward compatibility)\n if deployment:\n identifiers.append(str(deployment))\n if model:\n identifiers.append(str(model))\n if model_id:\n identifiers.append(str(model_id))\n if model_name:\n identifiers.append(str(model_name))\n\n # Map all identifiers to this embedding object\n for identifier in identifiers:\n if identifier not in embedding_by_model:\n embedding_by_model[identifier] = emb_obj\n logger.info(f\"Mapped identifier '{identifier}' to embedding object {idx}\")\n else:\n # Conflict detected - track it\n if identifier not in identifier_conflicts:\n identifier_conflicts[identifier] = [embedding_by_model[identifier]]\n identifier_conflicts[identifier].append(emb_obj)\n logger.warning(\n f\"Identifier '{identifier}' has conflict - used by multiple embeddings\"\n )\n\n # For embeddings with model+deployment, create combined identifier\n # This helps when deployment is the same but model differs\n if deployment and model and deployment != model:\n combined_id = f\"{deployment}:{model}\"\n if combined_id not in embedding_by_model:\n embedding_by_model[combined_id] = emb_obj\n logger.info(\n f\"Created combined identifier '{combined_id}' for embedding object {idx}\"\n )\n\n # Log conflicts\n if identifier_conflicts:\n logger.warning(\n f\"Found {len(identifier_conflicts)} conflicting identifiers. 
\"\n f\"Consider using combined format 'deployment:model' or specifying unique model names.\"\n )\n for conflict_id, emb_list in identifier_conflicts.items():\n logger.warning(\n f\" Conflict on '{conflict_id}': {len(emb_list)} embeddings use this identifier\"\n )\n\n logger.info(f\"Generating embeddings for {len(available_models)} models in index\")\n logger.info(f\"Available embedding identifiers: {list(embedding_by_model.keys())}\")\n self.log(f\"[SEARCH] Models detected in index: {available_models}\")\n self.log(f\"[SEARCH] Available embedding identifiers: {list(embedding_by_model.keys())}\")\n\n # Track matching status for debugging\n matched_models = []\n unmatched_models = []\n\n for model_name in available_models:\n try:\n # Check if we have an embedding object for this model\n if model_name in embedding_by_model:\n # Use the matching embedding object directly\n emb_obj = embedding_by_model[model_name]\n emb_deployment = getattr(emb_obj, \"deployment\", None)\n emb_model = getattr(emb_obj, \"model\", None)\n emb_model_id = getattr(emb_obj, \"model_id\", None)\n emb_dimensions = getattr(emb_obj, \"dimensions\", None)\n emb_available_models = getattr(emb_obj, \"available_models\", None)\n\n logger.info(\n f\"Using embedding object for model '{model_name}': \"\n f\"deployment={emb_deployment}, model={emb_model}, model_id={emb_model_id}, \"\n f\"dimensions={emb_dimensions}\"\n )\n\n # Check if this is a dedicated instance from available_models dict\n if emb_available_models and isinstance(emb_available_models, dict):\n logger.info(\n f\"Model '{model_name}' using dedicated instance from available_models dict \"\n f\"(pre-configured with correct model and dimensions)\"\n )\n\n # Use the embedding instance directly - no model switching needed!\n vec = emb_obj.embed_query(q)\n query_embeddings[model_name] = vec\n matched_models.append(model_name)\n logger.info(\n f\"Generated embedding for model: {model_name} (actual dimensions: {len(vec)})\"\n )\n 
self.log(f\"[MATCH] Model '{model_name}' - generated {len(vec)}-dim embedding\")\n else:\n # No matching embedding found for this model\n unmatched_models.append(model_name)\n logger.warning(\n f\"No matching embedding found for model '{model_name}'. \"\n f\"This model will be skipped. Available identifiers: {list(embedding_by_model.keys())}\"\n )\n self.log(\n f\"[NO MATCH] Model '{model_name}' - available: {list(embedding_by_model.keys())}\"\n )\n except (\n RuntimeError,\n ValueError,\n ConnectionError,\n TimeoutError,\n AttributeError,\n KeyError,\n ) as e:\n logger.warning(f\"Failed to generate embedding for {model_name}: {e}\")\n self.log(f\"[ERROR] Embedding generation failed for '{model_name}': {e}\")\n\n # Log summary of model matching\n logger.info(\n f\"Model matching summary: {len(matched_models)} matched, {len(unmatched_models)} unmatched\"\n )\n self.log(\n f\"[SUMMARY] Model matching: {len(matched_models)} matched, {len(unmatched_models)} unmatched\"\n )\n if unmatched_models:\n self.log(f\"[WARN] Unmatched models in index: {unmatched_models}\")\n\n if not query_embeddings:\n msg = (\n f\"Failed to generate embeddings for any model. \"\n f\"Index has models: {available_models}, but no matching embedding objects found. 
\"\n f\"Available embedding identifiers: {list(embedding_by_model.keys())}\"\n )\n self.log(f\"[FAIL] Search failed: {msg}\")\n raise ValueError(msg)\n\n index_properties = self._get_index_properties(client)\n legacy_vector_field = getattr(self, \"vector_field\", \"chunk_embedding\")\n\n # Build KNN queries for each model\n embedding_fields: list[str] = []\n knn_queries_with_candidates = []\n knn_queries_without_candidates = []\n\n raw_num_candidates = getattr(self, \"num_candidates\", 1000)\n try:\n num_candidates = int(raw_num_candidates) if raw_num_candidates is not None else 0\n except (TypeError, ValueError):\n num_candidates = 0\n use_num_candidates = num_candidates > 0\n\n for model_name, embedding_vector in query_embeddings.items():\n field_name = get_embedding_field_name(model_name)\n selected_field = field_name\n vector_dim = len(embedding_vector)\n\n # Only use the expected dynamic field - no legacy fallback\n # This prevents dimension mismatches between models\n if not self._is_knn_vector_field(index_properties, selected_field):\n logger.warning(\n f\"Skipping model {model_name}: field '{field_name}' is not mapped as knn_vector. \"\n f\"Documents must be indexed with this embedding model before querying.\"\n )\n self.log(\n f\"[SKIP] Field '{selected_field}' not a knn_vector - skipping model '{model_name}'\"\n )\n continue\n\n # Validate vector dimensions match the field dimensions\n field_dim = self._get_field_dimension(index_properties, selected_field)\n if field_dim is not None and field_dim != vector_dim:\n logger.error(\n f\"Dimension mismatch for model '{model_name}': \"\n f\"Query vector has {vector_dim} dimensions but field '{selected_field}' expects {field_dim}. 
\"\n f\"Skipping this model to prevent search errors.\"\n )\n self.log(\n f\"[DIM MISMATCH] Model '{model_name}': query={vector_dim} vs field={field_dim} - skipping\"\n )\n continue\n\n logger.info(\n f\"Adding KNN query for model '{model_name}': field='{selected_field}', \"\n f\"query_dims={vector_dim}, field_dims={field_dim or 'unknown'}\"\n )\n embedding_fields.append(selected_field)\n\n base_query = {\n \"knn\": {\n selected_field: {\n \"vector\": embedding_vector,\n \"k\": 50,\n }\n }\n }\n\n if use_num_candidates:\n query_with_candidates = copy.deepcopy(base_query)\n query_with_candidates[\"knn\"][selected_field][\"num_candidates\"] = num_candidates\n else:\n query_with_candidates = base_query\n\n knn_queries_with_candidates.append(query_with_candidates)\n knn_queries_without_candidates.append(base_query)\n\n if not knn_queries_with_candidates:\n # No valid fields found - this can happen when:\n # 1. Index is empty (no documents yet)\n # 2. Embedding model has changed and field doesn't exist yet\n # Return empty results instead of failing\n logger.warning(\n \"No valid knn_vector fields found for embedding models. \"\n \"This may indicate an empty index or missing field mappings. \"\n \"Returning empty search results.\"\n )\n self.log(\n f\"[WARN] No valid KNN queries could be built. 
\"\n f\"Query embeddings generated: {list(query_embeddings.keys())}, \"\n f\"but no matching knn_vector fields found in index.\"\n )\n return []\n\n # Build exists filter - document must have at least one embedding field\n exists_any_embedding = {\n \"bool\": {\n \"should\": [{\"exists\": {\"field\": f}} for f in set(embedding_fields)],\n \"minimum_should_match\": 1,\n }\n }\n\n # Combine user filters with exists filter\n all_filters = [*filter_clauses, exists_any_embedding]\n\n # Get limit and score threshold\n limit = self._resolve_limit(filter_obj, default_limit=self.number_of_results)\n score_threshold = self._resolve_score_threshold(filter_obj)\n\n # Determine the best aggregation field for filename based on index mapping\n filename_agg_field = self._get_filename_agg_field(index_properties)\n\n # Build multi-model hybrid query\n body = {\n \"query\": {\n \"bool\": {\n \"should\": [\n {\n \"dis_max\": {\n \"tie_breaker\": 0.0, # Take only the best match, no blending\n \"boost\": 0.7, # 70% weight for semantic search\n \"queries\": knn_queries_with_candidates,\n }\n },\n {\n \"multi_match\": {\n \"query\": q,\n \"fields\": [\"text^2\", \"filename^1.5\"],\n \"type\": \"best_fields\",\n \"fuzziness\": \"AUTO\",\n \"boost\": 0.3, # 30% weight for keyword search\n }\n },\n ],\n \"minimum_should_match\": 1,\n \"filter\": all_filters,\n }\n },\n \"aggs\": {\n \"data_sources\": {\"terms\": {\"field\": filename_agg_field, \"size\": 20}},\n \"document_types\": {\"terms\": {\"field\": \"mimetype\", \"size\": 10}},\n \"owners\": {\"terms\": {\"field\": \"owner\", \"size\": 10}},\n \"embedding_models\": {\"terms\": {\"field\": \"embedding_model\", \"size\": 10}},\n },\n \"_source\": [\n \"filename\",\n \"mimetype\",\n \"page\",\n \"text\",\n \"source_url\",\n \"owner\",\n \"embedding_model\",\n \"allowed_users\",\n \"allowed_groups\",\n \"allowed_principals\",\n ],\n \"size\": limit,\n }\n\n if score_threshold is not None:\n body[\"min_score\"] = score_threshold\n\n 
logger.info(\n f\"Executing multi-model hybrid search with {len(knn_queries_with_candidates)} embedding models: \"\n f\"{list(query_embeddings.keys())}\"\n )\n self.log(\n f\"[EXEC] Executing search with {len(knn_queries_with_candidates)} KNN queries, limit={limit}\"\n )\n self.log(f\"[EXEC] Embedding models used: {list(query_embeddings.keys())}\")\n self.log(f\"[EXEC] KNN fields being queried: {embedding_fields}\")\n\n try:\n resp = client.search(index=self.index_name, body=body, params={\"terminate_after\": 0})\n except RequestError as e:\n error_message = str(e)\n lowered = error_message.lower()\n if use_num_candidates and \"num_candidates\" in lowered:\n logger.warning(\n \"Retrying search without num_candidates parameter due to cluster capabilities\",\n error=error_message,\n )\n fallback_body = copy.deepcopy(body)\n try:\n fallback_body[\"query\"][\"bool\"][\"should\"][0][\"dis_max\"][\"queries\"] = (\n knn_queries_without_candidates\n )\n except (KeyError, IndexError, TypeError) as inner_err:\n raise e from inner_err\n resp = client.search(\n index=self.index_name,\n body=fallback_body,\n params={\"terminate_after\": 0},\n )\n elif \"knn_vector\" in lowered or (\"field\" in lowered and \"knn\" in lowered):\n fallback_vector = next(iter(query_embeddings.values()), None)\n if fallback_vector is None:\n raise\n fallback_field = legacy_vector_field or \"chunk_embedding\"\n logger.warning(\n \"KNN search failed for dynamic fields; falling back to legacy field '%s'.\",\n fallback_field,\n )\n fallback_body = copy.deepcopy(body)\n fallback_body[\"query\"][\"bool\"][\"filter\"] = filter_clauses\n knn_fallback = {\n \"knn\": {\n fallback_field: {\n \"vector\": fallback_vector,\n \"k\": 50,\n }\n }\n }\n if use_num_candidates:\n knn_fallback[\"knn\"][fallback_field][\"num_candidates\"] = num_candidates\n fallback_body[\"query\"][\"bool\"][\"should\"][0][\"dis_max\"][\"queries\"] = [knn_fallback]\n resp = client.search(\n index=self.index_name,\n body=fallback_body,\n 
params={\"terminate_after\": 0},\n )\n else:\n raise\n hits = resp.get(\"hits\", {}).get(\"hits\", [])\n\n logger.info(f\"Found {len(hits)} results\")\n self.log(f\"[RESULT] Search complete: {len(hits)} results found\")\n\n if len(hits) == 0:\n self.log(\n f\"[EMPTY] Debug info: \"\n f\"models_in_index={available_models}, \"\n f\"matched_models={matched_models}, \"\n f\"knn_fields={embedding_fields}, \"\n f\"filters={len(filter_clauses)} clauses\"\n )\n\n return [\n {\n \"page_content\": hit[\"_source\"].get(\"text\", \"\"),\n \"metadata\": {k: v for k, v in hit[\"_source\"].items() if k != \"text\"},\n \"score\": hit.get(\"_score\"),\n }\n for hit in hits\n ]\n\n def search_documents(self) -> Table:\n \"\"\"Search documents and return results as a Table.\n\n This is the main interface method that performs the multi-model search using the\n configured search_query and returns results in Langflow's Table (DataFrame) format\n so downstream Parser components can consume them directly.\n\n Always builds the vector store (triggering ingestion if needed), then performs\n search only if a query is provided.\n\n Returns:\n Table containing search results with text and metadata\n\n Raises:\n Exception: If search operation fails\n \"\"\"\n\n try:\n # Always build/cache the vector store to ensure ingestion happens\n logger.info(f\"Search query: {self.search_query}\")\n if self._cached_vector_store is None:\n self.build_vector_store()\n\n # Only perform search if query is provided\n search_query = (self.search_query or \"\").strip()\n if not search_query:\n self.log(\"No search query provided - ingestion completed, returning empty results\")\n\n return Table(data=[])\n\n # Perform search with the provided query\n raw = self.search(search_query)\n raw_list = [Data(text=hit[\"page_content\"], **hit[\"metadata\"]) for hit in raw]\n return Table(data=raw_list)\n\n except Exception as e:\n self.log(f\"search_documents error: {e}\")\n raise\n\n # -------- dynamic UI handling (auth 
switch) --------\n async def update_build_config(\n self, build_config: dict, field_value: str, field_name: str | None = None\n ) -> dict:\n \"\"\"Dynamically update component configuration based on field changes.\n\n This method handles real-time UI updates, particularly for authentication\n mode changes that show/hide relevant input fields.\n\n Args:\n build_config: Current component configuration\n field_value: New value for the changed field\n field_name: Name of the field that changed\n\n Returns:\n Updated build configuration with appropriate field visibility\n \"\"\"\n try:\n if field_name == \"auth_mode\":\n mode = (field_value or \"basic\").strip().lower()\n is_basic = mode == \"basic\"\n is_jwt = mode == \"jwt\"\n is_openrag = mode == \"openrag\"\n\n build_config[\"username\"][\"show\"] = is_basic\n build_config[\"password\"][\"show\"] = is_basic\n\n build_config[\"jwt_token\"][\"show\"] = is_jwt\n build_config[\"jwt_header\"][\"show\"] = is_jwt\n build_config[\"bearer_prefix\"][\"show\"] = is_jwt\n\n build_config[\"username\"][\"required\"] = is_basic\n build_config[\"password\"][\"required\"] = is_basic\n\n # build_config[\"jwt_token\"][\"required\"] = is_jwt\n build_config[\"jwt_header\"][\"required\"] = is_jwt\n build_config[\"bearer_prefix\"][\"required\"] = False\n\n # In 'openrag' mode, expose the OPENRAG_* fields up front\n # since they are the only credentials required.\n for openrag_field in (\n \"openrag_ingest_url\",\n \"openrag_ingest_token\",\n \"openrag_ingest_run_id\",\n \"openrag_ingest_batch_size\",\n ):\n if openrag_field in build_config:\n build_config[openrag_field][\"advanced\"] = not is_openrag\n build_config[openrag_field][\"required\"] = (\n is_openrag and openrag_field != \"openrag_ingest_batch_size\"\n )\n\n if is_basic or is_openrag:\n build_config[\"jwt_token\"][\"value\"] = \"\"\n\n return build_config\n\n except (KeyError, ValueError) as e:\n self.log(f\"update_build_config error: {e}\")\n\n return build_config\n" }, 
"docs_metadata": { "_input_type": "TableInput", diff --git a/flows/openrag_url_mcp.json b/flows/openrag_url_mcp.json index ec34cd3aa..847d211d2 100644 --- a/flows/openrag_url_mcp.json +++ b/flows/openrag_url_mcp.json @@ -3849,7 +3849,7 @@ "show": true, "title_case": false, "type": "code", - "value": "from __future__ import annotations\n\nimport copy\nimport json\nimport uuid\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\nfrom typing import Any\n\nimport httpx\nfrom lfx.base.vectorstores.model import LCVectorStoreComponent, check_cached_vector_store\nfrom lfx.base.vectorstores.vector_store_connection_decorator import vector_store_connection\nfrom lfx.io import (\n BoolInput,\n DropdownInput,\n HandleInput,\n IntInput,\n MultilineInput,\n Output,\n SecretStrInput,\n StrInput,\n TableInput,\n)\nfrom lfx.log import logger\nfrom lfx.schema.data import Data\nfrom lfx.schema.dataframe import Table\nfrom opensearchpy import OpenSearch, helpers\nfrom opensearchpy.exceptions import OpenSearchException, RequestError\n\nREQUEST_TIMEOUT = 60\nMAX_RETRIES = 5\n\n# watsonx.ai surfaces rate-limit state via these (mostly non-standard) response\n# headers. The IBM SDK acts on the x-requests-limit-* family directly; we log\n# them on a failed embedding call to aid plan/region tuning.\n_WATSONX_RATE_LIMIT_HEADERS = (\n \"x-requests-limit-rate\",\n \"x-requests-limit-remaining\",\n \"x-requests-limit-reset\",\n \"Retry-After\",\n)\n\n\ndef _log_watsonx_rate_limit_headers(error: Exception) -> None:\n \"\"\"Best-effort diagnostic: log watsonx rate-limit headers from a failed call.\n\n The watsonx SDK raises ``ApiRequestFailure``, which carries the originating\n httpx/requests ``Response`` as ``.response``. 
On a 429 exhaustion we surface\n the documented rate-limit headers so operators can tune throughput.\n \"\"\"\n try:\n response = getattr(error, \"response\", None)\n headers = getattr(response, \"headers\", None)\n if not headers:\n return\n status = getattr(response, \"status_code\", \"unknown\")\n observed = {h: headers.get(h) for h in _WATSONX_RATE_LIMIT_HEADERS if headers.get(h) is not None}\n if str(status) == \"429\" or observed:\n logger.warning(f\"watsonx rate-limit response (status={status}): {observed}\")\n except Exception as log_error: # never let diagnostics mask the real error\n logger.debug(f\"Could not extract watsonx rate-limit headers: {log_error}\")\n\n\ndef normalize_model_name(model_name: str) -> str:\n \"\"\"Normalize embedding model name for use as field suffix.\n\n Converts model names to valid OpenSearch field names by replacing\n special characters and ensuring alphanumeric format.\n\n Args:\n model_name: Original embedding model name (e.g., \"text-embedding-3-small\")\n\n Returns:\n Normalized field suffix (e.g., \"text_embedding_3_small\")\n \"\"\"\n normalized = model_name.lower()\n # Replace common separators with underscores\n normalized = normalized.replace(\"-\", \"_\").replace(\":\", \"_\").replace(\"/\", \"_\").replace(\".\", \"_\")\n # Remove any non-alphanumeric characters except underscores\n normalized = \"\".join(c if c.isalnum() or c == \"_\" else \"_\" for c in normalized)\n # Remove duplicate underscores\n while \"__\" in normalized:\n normalized = normalized.replace(\"__\", \"_\")\n return normalized.strip(\"_\")\n\n\ndef get_embedding_field_name(model_name: str) -> str:\n \"\"\"Get the dynamic embedding field name for a model.\n\n Args:\n model_name: Embedding model name\n\n Returns:\n Field name in format: chunk_embedding_{normalized_model_name}\n \"\"\"\n logger.info(f\"chunk_embedding_{normalize_model_name(model_name)}\")\n return 
f\"chunk_embedding_{normalize_model_name(model_name)}\"\n\n\n@vector_store_connection\nclass OpenSearchVectorStoreComponentMultimodalMultiEmbedding(LCVectorStoreComponent):\n \"\"\"OpenSearch Vector Store Component with Multi-Model Hybrid Search Capabilities.\n\n This component provides vector storage and retrieval using OpenSearch, combining semantic\n similarity search (KNN) with keyword-based search for optimal results. It supports:\n - Multiple embedding models per index with dynamic field names\n - Automatic detection and querying of all available embedding models\n - Parallel embedding generation for multi-model search\n - Document ingestion with model tracking\n - Advanced filtering and aggregations\n - Flexible authentication options\n\n Features:\n - Multi-model vector storage with dynamic fields (chunk_embedding_{model_name})\n - Hybrid search combining multiple KNN queries (dis_max) + keyword matching\n - Auto-detection of available models in the index\n - Parallel query embedding generation for all detected models\n - Vector storage with configurable engines (jvector, nmslib, faiss, lucene)\n - Flexible authentication (Basic auth, JWT tokens)\n\n Model Name Resolution:\n - Priority: deployment > model > model_name attributes\n - This ensures correct matching between embedding objects and index fields\n - When multiple embeddings are provided, specify embedding_model_name to select which one to use\n - During search, each detected model in the index is matched to its corresponding embedding object\n \"\"\"\n\n display_name: str = \"OpenSearch (Multi-Model Multi-Embedding)\"\n icon: str = \"OpenSearch\"\n description: str = (\n \"Store and search documents using OpenSearch with multi-model hybrid semantic and keyword search. \"\n \"To search use the tools search_documents and raw_search. 
\"\n \"Search documents takes a query for vector search, for example\\n\"\n ' {search_query: \"components in openrag\"}'\n )\n\n # Keys we consider baseline\n default_keys: list[str] = [\n \"opensearch_url\",\n \"index_name\",\n *[i.name for i in LCVectorStoreComponent.inputs], # search_query, add_documents, etc.\n \"embedding\",\n \"embedding_model_name\",\n \"vector_field\",\n \"number_of_results\",\n \"auth_mode\",\n \"username\",\n \"password\",\n \"jwt_token\",\n \"jwt_header\",\n \"bearer_prefix\",\n \"use_ssl\",\n \"verify_certs\",\n \"filter_expression\",\n \"engine\",\n \"space_type\",\n \"ef_construction\",\n \"m\",\n \"num_candidates\",\n \"docs_metadata\",\n \"request_timeout\",\n \"max_retries\",\n \"openrag_ingest_url\",\n \"openrag_ingest_token\",\n \"openrag_ingest_run_id\",\n \"openrag_ingest_batch_size\",\n ]\n _openrag_ingest_global_placeholders = {\n \"openrag_ingest_url\": \"OPENRAG_INGEST_URL\",\n \"openrag_ingest_token\": \"OPENRAG_INGEST_TOKEN\",\n \"openrag_ingest_run_id\": \"OPENRAG_INGEST_RUN_ID\",\n }\n\n inputs = [\n TableInput(\n name=\"docs_metadata\",\n display_name=\"Document Metadata\",\n info=(\n \"Additional metadata key-value pairs to be added to all ingested documents. 
\"\n \"Useful for tagging documents with source information, categories, or other custom attributes.\"\n ),\n table_schema=[\n {\n \"name\": \"key\",\n \"display_name\": \"Key\",\n \"type\": \"str\",\n \"description\": \"Key name\",\n },\n {\n \"name\": \"value\",\n \"display_name\": \"Value\",\n \"type\": \"str\",\n \"description\": \"Value of the metadata\",\n },\n ],\n value=[],\n input_types=[\"Data\", \"JSON\"],\n ),\n StrInput(\n name=\"openrag_ingest_token\",\n display_name=\"OpenRAG Ingest Token\",\n value=\"OPENRAG_INGEST_TOKEN\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n info=\"Short-lived token used only for OpenRAG ingest callbacks.\",\n ),\n StrInput(\n name=\"openrag_ingest_run_id\",\n display_name=\"OpenRAG Ingest Run ID\",\n value=\"OPENRAG_INGEST_RUN_ID\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n ),\n StrInput(\n name=\"opensearch_url\",\n display_name=\"OpenSearch URL\",\n value=\"http://localhost:9200\",\n info=(\n \"The connection URL for your OpenSearch cluster \"\n \"(e.g., http://localhost:9200 for local development or your cloud endpoint).\"\n ),\n ),\n StrInput(\n name=\"index_name\",\n display_name=\"Index Name\",\n value=\"langflow\",\n info=(\n \"The OpenSearch index name where documents will be stored and searched. \"\n \"Will be created automatically if it doesn't exist.\"\n ),\n ),\n DropdownInput(\n name=\"engine\",\n display_name=\"Vector Engine\",\n options=[\"nmslib\", \"faiss\", \"lucene\", \"jvector\"],\n value=\"jvector\",\n info=(\n \"Vector search engine for similarity calculations. 'nmslib' works with standard \"\n \"OpenSearch. 'jvector' requires OpenSearch 2.9+. 'lucene' requires index.knn: true. 
\"\n \"Amazon OpenSearch Serverless only supports 'nmslib' or 'faiss'.\"\n ),\n advanced=True,\n ),\n DropdownInput(\n name=\"space_type\",\n display_name=\"Distance Metric\",\n options=[\"l2\", \"l1\", \"cosinesimil\", \"linf\", \"innerproduct\"],\n value=\"l2\",\n info=(\n \"Distance metric for calculating vector similarity. 'l2' (Euclidean) is most common, \"\n \"'cosinesimil' for cosine similarity, 'innerproduct' for dot product.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"ef_construction\",\n display_name=\"EF Construction\",\n value=512,\n info=(\n \"Size of the dynamic candidate list during index construction. \"\n \"Higher values improve recall but increase indexing time and memory usage.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"m\",\n display_name=\"M Parameter\",\n value=16,\n info=(\n \"Number of bidirectional connections for each vector in the HNSW graph. \"\n \"Higher values improve search quality but increase memory usage and indexing time.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"num_candidates\",\n display_name=\"Candidate Pool Size\",\n value=1000,\n info=(\n \"Number of approximate neighbors to consider for each KNN query. \"\n \"Some OpenSearch deployments do not support this parameter; set to 0 to disable.\"\n ),\n advanced=True,\n ),\n *LCVectorStoreComponent.inputs, # includes search_query, add_documents, etc.\n HandleInput(\n name=\"embedding\", display_name=\"Embedding\", input_types=[\"Embeddings\"], is_list=True\n ),\n StrInput(\n name=\"embedding_model_name\",\n display_name=\"Embedding Model Name\",\n value=\"\",\n info=(\n \"Name of the embedding model to use for ingestion. This selects which embedding from the list \"\n \"will be used to embed documents. Matches on deployment, model, model_id, or model_name. \"\n \"For duplicate deployments, use combined format: 'deployment:model' \"\n \"(e.g., 'text-embedding-ada-002:text-embedding-3-large'). \"\n \"Leave empty to use the first embedding. 
Error message will show all available identifiers.\"\n ),\n advanced=False,\n ),\n StrInput(\n name=\"vector_field\",\n display_name=\"Legacy Vector Field Name\",\n value=\"chunk_embedding\",\n advanced=True,\n info=(\n \"Legacy field name for backward compatibility. New documents use dynamic fields \"\n \"(chunk_embedding_{model_name}) based on the embedding_model_name.\"\n ),\n ),\n IntInput(\n name=\"number_of_results\",\n display_name=\"Default Result Limit\",\n value=10,\n advanced=True,\n info=(\n \"Default maximum number of search results to return when no limit is \"\n \"specified in the filter expression.\"\n ),\n ),\n MultilineInput(\n name=\"filter_expression\",\n display_name=\"Search Filters (JSON)\",\n value=\"\",\n info=(\n \"Optional JSON configuration for search filtering, result limits, and score thresholds.\\n\\n\"\n \"Format 1 - Explicit filters:\\n\"\n '{\"filter\": [{\"term\": {\"filename\":\"doc.pdf\"}}, '\n '{\"terms\":{\"owner\":[\"user1\",\"user2\"]}}], \"limit\": 10, \"score_threshold\": 1.6}\\n\\n'\n \"Format 2 - Context-style mapping:\\n\"\n '{\"data_sources\":[\"file.pdf\"], \"document_types\":[\"application/pdf\"], \"owners\":[\"user123\"]}\\n\\n'\n \"Use __IMPOSSIBLE_VALUE__ as placeholder to ignore specific filters.\"\n ),\n ),\n # ----- Auth controls (dynamic) -----\n DropdownInput(\n name=\"auth_mode\",\n display_name=\"Authentication Mode\",\n value=\"openrag\",\n options=[\"basic\", \"jwt\", \"openrag\"],\n info=(\n \"Authentication method: 'basic' for username/password authentication, \"\n \"'jwt' for JSON Web Token (Bearer) authentication, or 'openrag' to \"\n \"delegate writes to the OpenRAG backend ingest callback (no direct \"\n \"OpenSearch credentials required — only OPENRAG_* fields).\"\n ),\n real_time_refresh=True,\n advanced=False,\n ),\n StrInput(\n name=\"username\",\n display_name=\"Username\",\n value=\"admin\",\n show=True,\n ),\n SecretStrInput(\n name=\"password\",\n display_name=\"OpenSearch Password\",\n 
value=\"admin\",\n show=True,\n ),\n SecretStrInput(\n name=\"jwt_token\",\n display_name=\"JWT Token\",\n value=\"JWT\",\n load_from_db=False,\n show=False,\n info=(\n \"Valid JSON Web Token for authentication. \"\n \"Will be sent in the Authorization header (with optional 'Bearer ' prefix).\"\n ),\n required=False\n ),\n StrInput(\n name=\"jwt_header\",\n display_name=\"JWT Header Name\",\n value=\"Authorization\",\n show=False,\n advanced=True,\n ),\n BoolInput(\n name=\"bearer_prefix\",\n display_name=\"Prefix 'Bearer '\",\n value=False,\n show=False,\n advanced=True,\n ),\n # ----- TLS -----\n BoolInput(\n name=\"use_ssl\",\n display_name=\"Use SSL/TLS\",\n value=True,\n advanced=True,\n info=\"Enable SSL/TLS encryption for secure connections to OpenSearch.\",\n ),\n BoolInput(\n name=\"verify_certs\",\n display_name=\"Verify SSL Certificates\",\n value=False,\n advanced=True,\n info=(\n \"Verify SSL certificates when connecting. \"\n \"Disable for self-signed certificates in development environments.\"\n ),\n ),\n # ----- Timeout / Retry -----\n StrInput(\n name=\"request_timeout\",\n display_name=\"Request Timeout (seconds)\",\n value=\"60\",\n advanced=True,\n info=(\n \"Time in seconds to wait for a response from OpenSearch. 
\"\n \"Increase for large bulk ingestion or complex hybrid queries.\"\n ),\n ),\n StrInput(\n name=\"max_retries\",\n display_name=\"Max Retries\",\n value=\"3\",\n advanced=True,\n info=\"Number of retries for failed connections before raising an error.\",\n ),\n StrInput(\n name=\"openrag_ingest_url\",\n display_name=\"OpenRAG Ingest URL\",\n value=\"OPENRAG_INGEST_URL\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n info=\"Internal OpenRAG callback URL for backend-owned document indexing.\",\n ),\n StrInput(\n name=\"openrag_ingest_token\",\n display_name=\"OpenRAG Ingest Token\",\n value=\"OPENRAG_INGEST_TOKEN\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n info=\"Short-lived token used only for OpenRAG ingest callbacks.\",\n ),\n StrInput(\n name=\"openrag_ingest_run_id\",\n display_name=\"OpenRAG Ingest Run ID\",\n value=\"OPENRAG_INGEST_RUN_ID\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n ),\n IntInput(\n name=\"openrag_ingest_batch_size\",\n display_name=\"OpenRAG Ingest Batch Size\",\n value=100,\n advanced=True,\n ),\n ]\n outputs = [\n Output(\n display_name=\"Search Results\",\n name=\"search_results\",\n method=\"search_documents\",\n ),\n Output(display_name=\"Raw Search\", name=\"raw_search\", method=\"raw_search\"),\n ]\n\n def raw_search(self, query: str | dict | None = None) -> Data:\n \"\"\"Execute a raw OpenSearch query against the target index.\n\n Args:\n query (dict[str, Any]): The OpenSearch query DSL dictionary.\n\n Returns:\n Data: Search results as a Data object.\n\n Raises:\n ValueError: If 'query' is not a valid OpenSearch query (must be a non-empty dict).\n \"\"\"\n raw_query = query if query is not None else self.search_query\n\n if raw_query is None or (isinstance(raw_query, str) and not raw_query.strip()):\n self.log(\"No query provided for raw search - returning empty results\")\n return Data(data={})\n\n if isinstance(raw_query, 
dict):\n query_body = copy.deepcopy(raw_query)\n elif isinstance(raw_query, str):\n s = raw_query.strip()\n\n # First, optimistically try to parse as JSON DSL\n try:\n query_body = json.loads(s)\n except json.JSONDecodeError:\n # Fallback: treat as a basic text query over common fields\n query_body = {\n \"query\": {\n \"multi_match\": {\n \"query\": s,\n \"fields\": [\"text^2\", \"filename^1.5\"],\n \"type\": \"best_fields\",\n \"fuzziness\": \"AUTO\",\n }\n }\n }\n else:\n msg = f\"Unsupported raw_search query type: {type(raw_query)!r}\"\n raise TypeError(msg)\n\n filter_obj = self._parse_filter_expression()\n filter_clauses = self._coerce_filter_clauses(filter_obj)\n\n if filter_clauses:\n if \"query\" in query_body:\n original_query = query_body[\"query\"]\n query_body[\"query\"] = {\n \"bool\": {\n \"must\": [original_query],\n \"filter\": filter_clauses,\n }\n }\n else:\n query_body[\"query\"] = {\n \"bool\": {\n \"must\": [{\"match_all\": {}}],\n \"filter\": filter_clauses,\n }\n }\n\n if filter_obj:\n # Apply limit if not already set in the raw query\n if \"size\" not in query_body:\n limit = self._resolve_limit(filter_obj, default_limit=None)\n if limit is not None:\n query_body[\"size\"] = limit\n\n # Apply score_threshold / scoreThreshold as min_score if not already set\n if \"min_score\" not in query_body:\n\n score_threshold = self._resolve_score_threshold(filter_obj)\n if score_threshold is not None:\n\n query_body[\"min_score\"] = score_threshold\n\n client = self.build_client()\n logger.info(f\"query: {query_body}\")\n resp = client.search(\n index=self.index_name,\n body=query_body,\n params={\"terminate_after\": 0},\n )\n # Remove any _source keys whose value is a list of floats (embedding vectors)\n # Minimum length threshold to identify embedding vectors\n min_vector_length = 100\n\n def is_vector(val):\n # Accepts if it's a list of numbers (float or int) and has reasonable vector length\n return (\n isinstance(val, list)\n and len(val) > 
min_vector_length\n and all(isinstance(x, (float, int)) for x in val)\n )\n\n if \"hits\" in resp and \"hits\" in resp[\"hits\"]:\n for hit in resp[\"hits\"][\"hits\"]:\n source = hit.get(\"_source\")\n if isinstance(source, dict):\n keys_to_remove = [k for k, v in source.items() if is_vector(v)]\n for k in keys_to_remove:\n source.pop(k)\n logger.info(f\"Raw search response (all embedding vectors removed): {resp}\")\n return Data(**resp)\n\n def _get_embedding_model_name(self, embedding_obj=None) -> str:\n \"\"\"Get the embedding model name from component config or embedding object.\n\n Priority: deployment > model > model_id > model_name\n This ensures we use the actual model being deployed, not just the configured model.\n Supports multiple embedding providers (OpenAI, Watsonx, Cohere, etc.)\n\n Args:\n embedding_obj: Specific embedding object to get name from (optional)\n\n Returns:\n Embedding model name\n\n Raises:\n ValueError: If embedding model name cannot be determined\n \"\"\"\n # First try explicit embedding_model_name input\n if hasattr(self, \"embedding_model_name\") and self.embedding_model_name:\n return self.embedding_model_name.strip()\n\n # Try to get from provided embedding object\n if embedding_obj:\n # Priority: deployment > model > model_id > model_name\n if hasattr(embedding_obj, \"deployment\") and embedding_obj.deployment:\n return str(embedding_obj.deployment)\n if hasattr(embedding_obj, \"model\") and embedding_obj.model:\n return str(embedding_obj.model)\n if hasattr(embedding_obj, \"model_id\") and embedding_obj.model_id:\n return str(embedding_obj.model_id)\n if hasattr(embedding_obj, \"model_name\") and embedding_obj.model_name:\n return str(embedding_obj.model_name)\n\n # Try to get from embedding component (legacy single embedding)\n if hasattr(self, \"embedding\") and self.embedding:\n # Handle list of embeddings\n if isinstance(self.embedding, list) and len(self.embedding) > 0:\n first_emb = self.embedding[0]\n if 
hasattr(first_emb, \"deployment\") and first_emb.deployment:\n return str(first_emb.deployment)\n if hasattr(first_emb, \"model\") and first_emb.model:\n return str(first_emb.model)\n if hasattr(first_emb, \"model_id\") and first_emb.model_id:\n return str(first_emb.model_id)\n if hasattr(first_emb, \"model_name\") and first_emb.model_name:\n return str(first_emb.model_name)\n # Handle single embedding\n elif not isinstance(self.embedding, list):\n if hasattr(self.embedding, \"deployment\") and self.embedding.deployment:\n return str(self.embedding.deployment)\n if hasattr(self.embedding, \"model\") and self.embedding.model:\n return str(self.embedding.model)\n if hasattr(self.embedding, \"model_id\") and self.embedding.model_id:\n return str(self.embedding.model_id)\n if hasattr(self.embedding, \"model_name\") and self.embedding.model_name:\n return str(self.embedding.model_name)\n\n msg = (\n \"Could not determine embedding model name. \"\n \"Please set the 'embedding_model_name' field or ensure the embedding component \"\n \"has a 'deployment', 'model', 'model_id', or 'model_name' attribute.\"\n )\n raise ValueError(msg)\n\n # ---------- helper functions for index management ----------\n def _default_text_mapping(\n self,\n dim: int,\n engine: str = \"jvector\",\n space_type: str = \"l2\",\n ef_search: int = 512,\n ef_construction: int = 100,\n m: int = 16,\n vector_field: str = \"vector_field\",\n ) -> dict[str, Any]:\n \"\"\"Create the default OpenSearch index mapping for vector search.\n\n This method generates the index configuration with k-NN settings optimized\n for approximate nearest neighbor search using the specified vector engine.\n Includes the embedding_model keyword field for tracking which model was used.\n\n Args:\n dim: Dimensionality of the vector embeddings\n engine: Vector search engine (jvector, nmslib, faiss, lucene)\n space_type: Distance metric for similarity calculation\n ef_search: Size of dynamic list used during search\n 
ef_construction: Size of dynamic list used during index construction\n m: Number of bidirectional links for each vector\n vector_field: Name of the field storing vector embeddings\n\n Returns:\n Dictionary containing OpenSearch index mapping configuration\n \"\"\"\n return {\n \"settings\": {\"index\": {\"knn\": True, \"knn.algo_param.ef_search\": ef_search}},\n \"mappings\": {\n \"properties\": {\n vector_field: {\n \"type\": \"knn_vector\",\n \"dimension\": dim,\n \"method\": {\n \"name\": \"disk_ann\",\n \"space_type\": space_type,\n \"engine\": engine,\n \"parameters\": {\"ef_construction\": ef_construction, \"m\": m},\n },\n },\n \"embedding_model\": {\"type\": \"keyword\"}, # Track which model was used\n \"embedding_dimensions\": {\"type\": \"integer\"},\n }\n },\n }\n\n def _ensure_embedding_field_mapping(\n self,\n client: OpenSearch,\n index_name: str,\n field_name: str,\n dim: int,\n engine: str,\n space_type: str,\n ef_construction: int,\n m: int,\n ) -> None:\n \"\"\"Lazily add a dynamic embedding field to the index if it doesn't exist.\n\n This allows adding new embedding models without recreating the entire index.\n Also ensures the embedding_model tracking field exists.\n\n Note: Some OpenSearch versions/configurations have issues with dynamically adding\n knn_vector mappings (NullPointerException). 
This method checks if the field\n already exists before attempting to add it, and gracefully skips if the field\n is already properly configured.\n\n Args:\n client: OpenSearch client instance\n index_name: Target index name\n field_name: Dynamic field name for this embedding model\n dim: Vector dimensionality\n engine: Vector search engine\n space_type: Distance metric\n ef_construction: Construction parameter\n m: HNSW parameter\n \"\"\"\n # First, check if the field already exists and is properly mapped\n properties = self._get_index_properties(client)\n if self._is_knn_vector_field(properties, field_name):\n # Field already exists as knn_vector - verify dimensions match\n existing_dim = self._get_field_dimension(properties, field_name)\n if existing_dim is not None and existing_dim != dim:\n logger.warning(\n f\"Field '{field_name}' exists with dimension {existing_dim}, \"\n f\"but current embedding has dimension {dim}. Using existing mapping.\"\n )\n else:\n logger.info(\n f\"[OpenSearchMultimodel] Field '{field_name}' already exists\"\n f\"as knn_vector with matching dimensions - skipping mapping update\"\n )\n return\n\n # Field doesn't exist, try to add the mapping\n try:\n mapping = {\n \"properties\": {\n field_name: {\n \"type\": \"knn_vector\",\n \"dimension\": dim,\n \"method\": {\n \"name\": \"disk_ann\",\n \"space_type\": space_type,\n \"engine\": engine,\n \"parameters\": {\"ef_construction\": ef_construction, \"m\": m},\n },\n },\n # Also ensure the embedding_model tracking field exists as keyword\n \"embedding_model\": {\"type\": \"keyword\"},\n \"embedding_dimensions\": {\"type\": \"integer\"},\n }\n }\n client.indices.put_mapping(index=index_name, body=mapping)\n logger.info(f\"Added/updated embedding field mapping: {field_name}\")\n except RequestError as e:\n error_str = str(e).lower()\n if \"invalid engine\" in error_str and \"jvector\" in error_str:\n msg = (\n \"The 'jvector' engine is not available in your OpenSearch installation. 
\"\n \"Use 'nmslib' or 'faiss' for standard OpenSearch, or upgrade to OpenSearch 2.9+.\"\n )\n raise ValueError(msg) from e\n if \"index.knn\" in error_str:\n msg = (\n \"The index has index.knn: false. Delete the existing index and let the \"\n \"component recreate it, or create a new index with a different name.\"\n )\n raise ValueError(msg) from e\n raise\n except Exception as e:\n # Check if this is the known OpenSearch k-NN NullPointerException issue\n error_str = str(e).lower()\n if \"null\" in error_str or \"nullpointerexception\" in error_str:\n logger.warning(\n f\"[OpenSearchMultimodel] Could not add embedding field mapping for {field_name}\"\n f\"due to OpenSearch k-NN plugin issue: {e}. \"\n f\"This is a known issue with some OpenSearch versions. \"\n f\"[OpenSearchMultimodel] Skipping mapping update. \"\n f\"Please ensure the index has the correct mapping for KNN search to work.\"\n )\n # Skip and continue - ingestion will proceed, but KNN search may fail if mapping doesn't exist\n return\n logger.warning(\n f\"[OpenSearchMultimodel] Could not add embedding field mapping for {field_name}: {e}\"\n )\n raise\n\n # Verify the field was added correctly\n properties = self._get_index_properties(client)\n if not self._is_knn_vector_field(properties, field_name):\n msg = f\"Field '{field_name}' is not mapped as knn_vector. Current mapping: {properties.get(field_name)}\"\n logger.error(msg)\n raise ValueError(msg)\n\n def _validate_aoss_with_engines(self, *, is_aoss: bool, engine: str) -> None:\n \"\"\"Validate engine compatibility with Amazon OpenSearch Serverless (AOSS).\n\n Amazon OpenSearch Serverless has restrictions on which vector engines\n can be used. 
This method ensures the selected engine is compatible.\n\n Args:\n is_aoss: Whether the connection is to Amazon OpenSearch Serverless\n engine: The selected vector search engine\n\n Raises:\n ValueError: If AOSS is used with an incompatible engine\n \"\"\"\n if is_aoss and engine not in {\"nmslib\", \"faiss\"}:\n msg = \"Amazon OpenSearch Service Serverless only supports `nmslib` or `faiss` engines\"\n raise ValueError(msg)\n\n def _is_aoss_enabled(self, http_auth: Any) -> bool:\n \"\"\"Determine if Amazon OpenSearch Serverless (AOSS) is being used.\n\n Args:\n http_auth: The HTTP authentication object\n\n Returns:\n True if AOSS is enabled, False otherwise\n \"\"\"\n return (\n http_auth is not None and hasattr(http_auth, \"service\") and http_auth.service == \"aoss\"\n )\n\n @staticmethod\n def _openrag_input_to_str(value: Any) -> str:\n if value is None:\n return \"\"\n if hasattr(value, \"get_secret_value\"):\n value = value.get_secret_value()\n if hasattr(value, \"text\"):\n value = value.text\n return str(value or \"\").strip()\n\n def _openrag_callback_value(self, attr_name: str) -> str:\n value = self._openrag_input_to_str(getattr(self, attr_name, \"\"))\n if value == self._openrag_ingest_global_placeholders.get(attr_name):\n return \"\"\n return value\n\n def _openrag_ingest_callback_config(self) -> tuple[str, str, str] | None:\n url = self._openrag_callback_value(\"openrag_ingest_url\")\n token = self._openrag_callback_value(\"openrag_ingest_token\")\n ingest_run_id = self._openrag_callback_value(\"openrag_ingest_run_id\")\n\n masked_token = (\n f\"{token[:4]}...{token[-4:]}\" if len(token) >= 8 else (\"\" if token else \"\")\n )\n debug_payload = {\n \"openrag_ingest_url\": url,\n \"openrag_ingest_url_len\": len(url),\n \"openrag_ingest_token_masked\": masked_token,\n \"openrag_ingest_token_len\": len(token),\n \"openrag_ingest_run_id\": ingest_run_id,\n \"raw_url_type\": type(self.openrag_ingest_url).__name__,\n \"raw_token_type\": 
type(self.openrag_ingest_token).__name__,\n \"raw_run_id_type\": type(self.openrag_ingest_run_id).__name__,\n }\n logger.warning(f\"[OpenRAG callback config] {debug_payload}\")\n try:\n self.log(f\"[OpenRAG callback config] {debug_payload}\")\n except Exception:\n pass\n\n if not url and not token and not ingest_run_id:\n return None\n if not url or not token or not ingest_run_id:\n msg = \"OpenRAG ingest callback requires url, token, and ingest_run_id.\"\n raise ValueError(msg)\n return url, token, ingest_run_id\n\n def _post_openrag_ingest_batches(\n self,\n *,\n requests: list[dict],\n vector_field: str,\n text_field: str,\n ) -> None:\n callback_config = self._openrag_ingest_callback_config()\n if callback_config is None:\n return\n\n url, token, ingest_run_id = callback_config\n batch_size = max(self._parse_int_param(\"openrag_ingest_batch_size\", 100), 1)\n timeout = self._parse_int_param(\"request_timeout\", REQUEST_TIMEOUT)\n headers = {\"Authorization\": f\"Bearer {token}\"}\n\n masked_token = (\n f\"{token[:4]}...{token[-4:]}\" if len(token) >= 8 else (\"\" if token else \"\")\n )\n request_summary = {\n \"url\": url,\n \"ingest_run_id\": ingest_run_id,\n \"token_masked\": masked_token,\n \"total_chunks\": len(requests),\n \"batch_size\": batch_size,\n \"timeout_s\": timeout,\n }\n logger.warning(f\"[OpenRAG ingest POST] {request_summary}\")\n try:\n self.log(f\"[OpenRAG ingest POST] {request_summary}\")\n except Exception:\n pass\n\n with httpx.Client(timeout=timeout) as client:\n total_batches = (len(requests) + batch_size - 1) // batch_size\n for batch_number, start in enumerate(range(0, len(requests), batch_size), start=1):\n batch = requests[start : start + batch_size]\n final = batch_number == total_batches\n payload = {\n \"ingest_run_id\": ingest_run_id,\n \"batch_id\": batch_number,\n \"final\": final,\n \"chunks\": [\n self._openrag_chunk_payload(\n request,\n vector_field=vector_field,\n text_field=text_field,\n )\n for request in batch\n ],\n 
}\n logger.warning(\n f\"[OpenRAG ingest POST] -> batch={batch_number}/{total_batches} \"\n f\"url={url} chunks={len(payload['chunks'])} final={final}\"\n )\n response = client.post(url, json=payload, headers=headers)\n response_summary = {\n \"batch\": batch_number,\n \"url\": url,\n \"status\": response.status_code,\n \"final_url\": str(response.request.url),\n \"response_headers\": dict(response.headers),\n \"body_preview\": response.text[:500],\n }\n logger.warning(f\"[OpenRAG ingest POST resp] {response_summary}\")\n try:\n self.log(f\"[OpenRAG ingest POST resp] {response_summary}\")\n except Exception:\n pass\n if response.status_code >= 400:\n msg = (\n \"OpenRAG ingest callback failed \"\n f\"(batch={batch_number}, status={response.status_code}, \"\n f\"url={url}): {response.text[:1000]}\"\n )\n raise RuntimeError(msg)\n\n self.log(f\"Posted {len(requests)} chunks to OpenRAG backend ingest callback.\")\n\n @staticmethod\n def _openrag_chunk_payload(\n request: dict,\n *,\n vector_field: str,\n text_field: str,\n ) -> dict:\n metadata = {\n key: value\n for key, value in request.items()\n if key not in {\"_op_type\", \"_index\", \"_id\", \"id\", vector_field, text_field}\n }\n page = metadata.get(\"page\")\n if isinstance(page, str) and page.isdigit():\n page = int(page)\n return {\n \"id\": request.get(\"_id\") or request.get(\"id\"),\n \"text\": request.get(text_field, \"\"),\n \"vector\": request[vector_field],\n \"page\": page if isinstance(page, int) else None,\n \"metadata\": metadata,\n }\n\n def _bulk_ingest_embeddings(\n self,\n client: OpenSearch,\n index_name: str,\n embeddings: list[list[float]],\n texts: list[str],\n metadatas: list[dict] | None = None,\n ids: list[str] | None = None,\n vector_field: str = \"vector_field\",\n text_field: str = \"text\",\n embedding_model: str = \"unknown\",\n mapping: dict | None = None,\n max_chunk_bytes: int | None = 1 * 1024 * 1024,\n *,\n is_aoss: bool = False,\n ) -> list[str]:\n \"\"\"Efficiently ingest 
multiple documents with embeddings into OpenSearch.\n\n This method uses bulk operations to insert documents with their vector\n embeddings and metadata into the specified OpenSearch index. Each document\n is tagged with the embedding_model name for tracking.\n\n Args:\n client: OpenSearch client instance\n index_name: Target index for document storage\n embeddings: List of vector embeddings for each document\n texts: List of document texts\n metadatas: Optional metadata dictionaries for each document\n ids: Optional document IDs (UUIDs generated if not provided)\n vector_field: Field name for storing vector embeddings\n text_field: Field name for storing document text\n embedding_model: Name of the embedding model used\n mapping: Optional index mapping configuration\n max_chunk_bytes: Maximum size per bulk request chunk\n is_aoss: Whether using Amazon OpenSearch Serverless\n\n Returns:\n List of document IDs that were successfully ingested\n \"\"\"\n logger.debug(f\"[OpenSearchMultimodel] Bulk ingesting embeddings for {index_name}\")\n if not mapping:\n mapping = {}\n\n requests = []\n return_ids = []\n vector_dimensions = len(embeddings[0]) if embeddings else None\n\n for i, text in enumerate(texts):\n metadata = metadatas[i] if metadatas else {}\n if vector_dimensions is not None and \"embedding_dimensions\" not in metadata:\n metadata = {**metadata, \"embedding_dimensions\": vector_dimensions}\n\n # Normalize ACL fields that may arrive as JSON strings from flows\n for key in (\"allowed_users\", \"allowed_groups\", \"allowed_principals\"):\n value = metadata.get(key)\n if isinstance(value, str):\n try:\n parsed = json.loads(value)\n if isinstance(parsed, list):\n metadata[key] = parsed\n except (json.JSONDecodeError, TypeError):\n # Leave value as-is if it isn't valid JSON\n pass\n\n metadata_document_id = str(metadata.get(\"document_id\") or \"\").strip()\n if metadata_document_id and metadata_document_id.lower() != \"none\":\n generated_id = 
f\"{metadata_document_id}_{i}\"\n else:\n generated_id = str(uuid.uuid4())\n _id = ids[i] if ids else generated_id\n request = {\n \"_op_type\": \"index\",\n \"_index\": index_name,\n vector_field: embeddings[i],\n text_field: text,\n \"embedding_model\": embedding_model, # Track which model was used\n **metadata,\n }\n if is_aoss:\n request[\"id\"] = _id\n else:\n request[\"_id\"] = _id\n requests.append(request)\n return_ids.append(_id)\n if metadatas:\n self.log(f\"Sample metadata: {metadatas[0] if metadatas else {}}\")\n if self._openrag_ingest_callback_config() is not None:\n self._post_openrag_ingest_batches(\n requests=requests,\n vector_field=vector_field,\n text_field=text_field,\n )\n return return_ids\n try:\n helpers.bulk(client, requests, max_chunk_bytes=max_chunk_bytes)\n except Exception as bulk_error:\n if \"Unsupported request type for filter level DLS\" not in str(bulk_error):\n raise\n logger.warning(\n \"[OpenSearchMultimodel] Bulk ingest is blocked by filter-level DLS; \"\n \"falling back to per-document index requests.\"\n )\n self._index_embeddings_individually(client, requests)\n return return_ids\n\n def _index_embeddings_individually(\n self,\n client: OpenSearch,\n requests: list[dict],\n ) -> None:\n \"\"\"Index documents one at a time when OpenSearch DLS rejects bulk writes.\"\"\"\n for request in requests:\n document_id = request.get(\"_id\") or request.get(\"id\")\n body = {\n key: value\n for key, value in request.items()\n if key not in {\"_op_type\", \"_index\", \"_id\", \"id\"}\n }\n client.index(index=request[\"_index\"], id=document_id, body=body)\n\n def _log_index_admin_skip(self, operation: str, error: Exception) -> None:\n \"\"\"Log index-admin operations that may be blocked under filter-level DLS.\"\"\"\n logger.warning(\n f\"[OpenSearchMultimodel] Could not run index-admin operation '{operation}': {error}. 
\"\n \"Assuming the backend pre-created the required index/mapping and continuing.\"\n )\n\n # ---------- param helpers ----------\n def _parse_int_param(self, attr_name: str, default: int) -> int:\n \"\"\"Parse a string attribute to int, returning *default* on failure.\"\"\"\n raw = getattr(self, attr_name, None)\n if raw is None or str(raw).strip() == \"\":\n return default\n try:\n value = int(str(raw).strip())\n except ValueError:\n logger.warning(\n f\"Invalid integer value '{raw}' for {attr_name}, using default {default}\"\n )\n return default\n\n if value < 0:\n logger.warning(f\"Negative value '{raw}' for {attr_name}, using default {default}\")\n return default\n\n return value\n\n # ---------- auth / client ----------\n def _build_auth_kwargs(self) -> dict[str, Any]:\n \"\"\"Build authentication configuration for OpenSearch client.\n\n Constructs the appropriate authentication parameters based on the\n selected auth mode (basic username/password or JWT token).\n\n Returns:\n Dictionary containing authentication configuration\n\n Raises:\n ValueError: If required authentication parameters are missing\n \"\"\"\n mode = (self.auth_mode or \"basic\").strip().lower()\n if mode == \"jwt\":\n token = (self.jwt_token or \"\").strip()\n if not token:\n msg = \"Auth Mode is 'jwt' but no jwt_token was provided.\"\n raise ValueError(msg)\n header_name = (self.jwt_header or \"Authorization\").strip()\n header_value = f\"Bearer {token}\" if self.bearer_prefix else token\n return {\"headers\": {header_name: header_value}}\n if mode == \"openrag\":\n # Writes are delegated to the OpenRAG backend ingest callback,\n # so no direct OpenSearch credentials are needed. 
Only the\n # OPENRAG_* fields are required for ingestion to function.\n missing = [\n name\n for name, value in (\n (\"openrag_ingest_url\", self.openrag_ingest_url),\n (\"openrag_ingest_token\", self.openrag_ingest_token),\n (\"openrag_ingest_run_id\", self.openrag_ingest_run_id),\n )\n if not (value or \"\").strip()\n ]\n if missing:\n msg = (\n \"Auth Mode is 'openrag' but required OPENRAG_* fields are \"\n f\"missing: {', '.join(missing)}.\"\n )\n raise ValueError(msg)\n return {}\n user = (self.username or \"\").strip()\n pwd = (self.password or \"\").strip()\n if not user or not pwd:\n msg = \"Auth Mode is 'basic' but username/password are missing.\"\n raise ValueError(msg)\n return {\"http_auth\": (user, pwd)}\n\n def build_client(self) -> OpenSearch:\n \"\"\"Create and configure an OpenSearch client instance.\n\n Returns:\n Configured OpenSearch client ready for operations\n \"\"\"\n logger.debug(\"[OpenSearchMultimodel] Building OpenSearch client\")\n auth_kwargs = self._build_auth_kwargs()\n return OpenSearch(\n hosts=[self.opensearch_url],\n use_ssl=self.use_ssl,\n verify_certs=self.verify_certs,\n ssl_assert_hostname=False,\n ssl_show_warn=False,\n timeout=self._parse_int_param(\"request_timeout\", REQUEST_TIMEOUT),\n max_retries=self._parse_int_param(\"max_retries\", MAX_RETRIES),\n retry_on_timeout=True,\n **auth_kwargs,\n )\n\n @check_cached_vector_store\n def build_vector_store(self) -> OpenSearch:\n # Return raw OpenSearch client as our \"vector store.\"\n client = self.build_client()\n\n # Check if we're in ingestion-only mode (no search query)\n has_search_query = bool((self.search_query or \"\").strip())\n if not has_search_query:\n logger.debug(\n \"[OpenSearchMultimodel] Ingestion-only mode activated: search operations will be skipped\"\n )\n logger.debug(\"[OpenSearchMultimodel] Starting ingestion mode...\")\n\n logger.debug(f\"[OpenSearchMultimodel] Embedding: {self.embedding}\")\n self._add_documents_to_vector_store(client=client)\n return 
client\n\n # ---------- ingest ----------\n def _add_documents_to_vector_store(self, client: OpenSearch) -> None:\n \"\"\"Process and ingest documents into the OpenSearch vector store.\n\n This method handles the complete document ingestion pipeline:\n - Prepares document data and metadata\n - Generates vector embeddings using the selected model\n - Creates appropriate index mappings with dynamic field names\n - Bulk inserts documents with vectors and model tracking\n\n Args:\n client: OpenSearch client for performing operations\n \"\"\"\n logger.debug(\"[OpenSearchMultimodel][INGESTION] _add_documents_to_vector_store called\")\n # Convert DataFrame to Data if needed using parent's method\n self.ingest_data = self._prepare_ingest_data()\n\n logger.debug(\n f\"[OpenSearchMultimodel][INGESTION] ingest_data type: \"\n f\"{type(self.ingest_data)}, length: {len(self.ingest_data) if self.ingest_data else 0}\"\n )\n logger.debug(\n f\"[OpenSearchMultimodel][INGESTION] ingest_data content: \"\n f\"{self.ingest_data[:2] if self.ingest_data and len(self.ingest_data) > 0 else 'empty'}\"\n )\n\n docs = self.ingest_data or []\n if not docs:\n logger.debug(\"Ingestion complete: No documents provided\")\n return\n\n if not self.embedding:\n msg = \"Embedding handle is required to embed documents.\"\n raise ValueError(msg)\n\n # Normalize embedding to list first\n embeddings_list = self.embedding if isinstance(self.embedding, list) else [self.embedding]\n\n # Filter out None values (fail-safe mode) - do this BEFORE checking if empty\n embeddings_list = [e for e in embeddings_list if e is not None]\n\n # NOW check if we have any valid embeddings left after filtering\n if not embeddings_list:\n logger.warning(\n \"All embeddings returned None (fail-safe mode enabled). Skipping document ingestion.\"\n )\n self.log(\n \"Embedding returned None (fail-safe mode enabled). 
Skipping document ingestion.\"\n )\n return\n\n logger.debug(\n f\"[OpenSearchMultimodel][INGESTION] Valid embeddings after filtering: {len(embeddings_list)}\"\n )\n self.log(\n f\"[OpenSearchMultimodel][INGESTION] Available embedding models: {len(embeddings_list)}\"\n )\n\n # Select the embedding to use for ingestion\n selected_embedding = None\n embedding_model = None\n\n # If embedding_model_name is specified, find matching embedding\n if (\n hasattr(self, \"embedding_model_name\")\n and self.embedding_model_name\n and self.embedding_model_name.strip()\n ):\n target_model_name = self.embedding_model_name.strip()\n self.log(f\"Looking for embedding model: {target_model_name}\")\n\n for emb_obj in embeddings_list:\n # Check all possible model identifiers (deployment, model, model_id, model_name)\n # Also check available_models list from EmbeddingsWithModels\n possible_names = []\n deployment = getattr(emb_obj, \"deployment\", None)\n model = getattr(emb_obj, \"model\", None)\n model_id = getattr(emb_obj, \"model_id\", None)\n model_name = getattr(emb_obj, \"model_name\", None)\n available_models_attr = getattr(emb_obj, \"available_models\", None)\n\n if deployment:\n possible_names.append(str(deployment))\n if model:\n possible_names.append(str(model))\n if model_id:\n possible_names.append(str(model_id))\n if model_name:\n possible_names.append(str(model_name))\n\n # Also add combined identifier\n if deployment and model and deployment != model:\n possible_names.append(f\"{deployment}:{model}\")\n\n # Add all models from available_models dict\n if available_models_attr and isinstance(available_models_attr, dict):\n possible_names.extend(\n str(model_key).strip()\n for model_key in available_models_attr\n if model_key and str(model_key).strip()\n )\n\n # Match if target matches any of the possible names\n if target_model_name in possible_names:\n # Check if target is in available_models dict - use dedicated instance\n if (\n available_models_attr\n and 
isinstance(available_models_attr, dict)\n and target_model_name in available_models_attr\n ):\n # Use the dedicated embedding instance from the dict\n selected_embedding = available_models_attr[target_model_name]\n embedding_model = target_model_name\n self.log(\n f\"Found dedicated embedding instance for '{embedding_model}' in available_models dict\"\n )\n else:\n # Traditional identifier match\n selected_embedding = emb_obj\n embedding_model = self._get_embedding_model_name(emb_obj)\n self.log(\n f\"Found matching embedding model: {embedding_model} (matched on: {target_model_name})\"\n )\n break\n\n if not selected_embedding:\n # Build detailed list of available embeddings with all their identifiers\n available_info = []\n for idx, emb in enumerate(embeddings_list):\n emb_type = type(emb).__name__\n identifiers = []\n deployment = getattr(emb, \"deployment\", None)\n model = getattr(emb, \"model\", None)\n model_id = getattr(emb, \"model_id\", None)\n model_name = getattr(emb, \"model_name\", None)\n available_models_attr = getattr(emb, \"available_models\", None)\n\n if deployment:\n identifiers.append(f\"deployment='{deployment}'\")\n if model:\n identifiers.append(f\"model='{model}'\")\n if model_id:\n identifiers.append(f\"model_id='{model_id}'\")\n if model_name:\n identifiers.append(f\"model_name='{model_name}'\")\n\n # Add combined identifier as an option\n if deployment and model and deployment != model:\n identifiers.append(f\"combined='{deployment}:{model}'\")\n\n # Add available_models dict if present\n if available_models_attr and isinstance(available_models_attr, dict):\n identifiers.append(f\"available_models={list(available_models_attr.keys())}\")\n\n available_info.append(\n f\" [{idx}] {emb_type}: {', '.join(identifiers) if identifiers else 'No identifiers'}\"\n )\n\n msg = (\n f\"Embedding model '{target_model_name}' not found in available embeddings.\\n\\n\"\n f\"Available embeddings:\\n\" + \"\\n\".join(available_info) + \"\\n\\n\"\n \"Please 
set 'embedding_model_name' to one of the identifier values shown above \"\n \"(use the value after the '=' sign, without quotes).\\n\"\n \"For duplicate deployments, use the 'combined' format.\\n\"\n \"Or leave it empty to use the first embedding.\"\n )\n raise ValueError(msg)\n else:\n # Use first embedding if no model name specified\n selected_embedding = embeddings_list[0]\n embedding_model = self._get_embedding_model_name(selected_embedding)\n self.log(f\"No embedding_model_name specified, using first embedding: {embedding_model}\")\n\n dynamic_field_name = get_embedding_field_name(embedding_model)\n\n logger.info(f\"Selected embedding model for ingestion: '{embedding_model}'\")\n self.log(f\"Using embedding model for ingestion: {embedding_model}\")\n self.log(f\"Dynamic vector field: {dynamic_field_name}\")\n\n # Log embedding details for debugging\n if hasattr(selected_embedding, \"deployment\"):\n logger.info(f\"Embedding deployment: {selected_embedding.deployment}\")\n if hasattr(selected_embedding, \"model\"):\n logger.info(f\"Embedding model: {selected_embedding.model}\")\n if hasattr(selected_embedding, \"model_id\"):\n logger.info(f\"Embedding model_id: {selected_embedding.model_id}\")\n if hasattr(selected_embedding, \"dimensions\"):\n logger.info(f\"Embedding dimensions: {selected_embedding.dimensions}\")\n if hasattr(selected_embedding, \"available_models\"):\n logger.info(f\"Embedding available_models: {selected_embedding.available_models}\")\n\n # No model switching needed - each model in available_models has its own dedicated instance\n # The selected_embedding is already configured correctly for the target model\n logger.info(\n f\"Using embedding instance for '{embedding_model}' - pre-configured and ready to use\"\n )\n\n # Extract texts and metadata from documents\n texts = []\n metadatas = []\n # Process docs_metadata table input into a dict\n additional_metadata = {}\n logger.debug(f\"[LF] Docs metadata {self.docs_metadata}\")\n if 
hasattr(self, \"docs_metadata\") and self.docs_metadata:\n logger.info(f\"[LF] Docs metadata {self.docs_metadata}\")\n if isinstance(self.docs_metadata[-1], Data):\n logger.info(f\"[LF] Docs metadata is a Data object {self.docs_metadata}\")\n self.docs_metadata = self.docs_metadata[-1].data\n logger.info(f\"[LF] Docs metadata is a Data object {self.docs_metadata}\")\n additional_metadata.update(self.docs_metadata)\n else:\n for item in self.docs_metadata:\n if isinstance(item, dict) and \"key\" in item and \"value\" in item:\n additional_metadata[item[\"key\"]] = item[\"value\"]\n # Replace string \"None\" values with actual None\n for key, value in additional_metadata.items():\n if value == \"None\":\n additional_metadata[key] = None\n logger.info(f\"[LF] Additional metadata {additional_metadata}\")\n for doc_obj in docs:\n data_copy = json.loads(doc_obj.model_dump_json())\n text = data_copy.pop(doc_obj.text_key, doc_obj.default_value)\n texts.append(text)\n\n # Merge additional metadata from table input\n data_copy.update(additional_metadata)\n\n metadatas.append(data_copy)\n self.log(metadatas)\n\n # Determine whether the selected embedding is watsonx/IBM. The watsonx\n # SDK ships its own rate-limit machinery (input batching, proactive\n # x-requests-limit-* TokenBucket throttling, and jittered exponential\n # backoff on 429), so we lean on it instead of retrying on top of it.\n # The type-name check also covers watsonx-hosted, non-\"ibm/\" models\n # (e.g. 
intfloat/multilingual-e5-large).\n is_ibm = (embedding_model and \"ibm\" in str(embedding_model).lower()) or (\n selected_embedding and \"watsonx\" in type(selected_embedding).__name__.lower()\n )\n logger.debug(f\"Is IBM/watsonx embedding: {is_ibm}\")\n\n if is_ibm:\n\n # Hand the full batch to the SDK and let it batch/throttle/retry.\n # Retry attempts and base backoff are tunable via the SDK's own\n # WATSONX_MAX_RETRIES / WATSONX_DELAY_TIME environment variables.\n logger.info(\n f\"Embedding {len(texts)} chunks via watsonx SDK batch (SDK-managed throttle + 429 retry)\"\n )\n try:\n vectors: list[list[float]] = selected_embedding.embed_documents(texts)\n logger.info(f\"Successfully embedded {len(vectors)} chunks via watsonx SDK\")\n except Exception as embed_error:\n _log_watsonx_rate_limit_headers(embed_error)\n logger.error(\n f\"Failed to embed {len(texts)} chunks via watsonx SDK. Error: {embed_error}\",\n )\n raise\n\n else:\n # Non-watsonx providers (OpenAI, Ollama) lack the watsonx SDK's\n # built-in rate-limit handling, so embed per chunk in parallel with\n # a generic rate-limit-aware tenacity retry.\n vectors: list[list[float]] = [None] * len(texts)\n from tenacity import (\n retry,\n retry_if_exception,\n stop_after_attempt,\n wait_exponential,\n )\n\n def is_rate_limit_error(exception: Exception) -> bool:\n \"\"\"Check if exception is a rate limit error (429).\"\"\"\n error_str = str(exception).lower()\n return \"429\" in error_str or \"rate_limit\" in error_str or \"rate limit\" in error_str\n\n def is_other_retryable_error(exception: Exception) -> bool:\n \"\"\"Check if exception is a transient network error worth retrying.\"\"\"\n if is_rate_limit_error(exception):\n return False\n return isinstance(exception, (ConnectionError, TimeoutError, OSError))\n\n # Retry decorator for rate limit errors (longer backoff)\n retry_on_rate_limit = retry(\n retry=retry_if_exception(is_rate_limit_error),\n stop=stop_after_attempt(5),\n 
wait=wait_exponential(multiplier=2, min=2, max=30),\n reraise=True,\n before_sleep=lambda retry_state: logger.warning(\n f\"Rate limit hit for chunk (attempt {retry_state.attempt_number}/5), \"\n f\"backing off for {retry_state.next_action.sleep:.1f}s\"\n ),\n )\n\n # Retry decorator for other errors (shorter backoff)\n retry_on_other_errors = retry(\n retry=retry_if_exception(is_other_retryable_error),\n stop=stop_after_attempt(3),\n wait=wait_exponential(multiplier=1, min=1, max=8),\n reraise=True,\n before_sleep=lambda retry_state: logger.warning(\n f\"Error embedding chunk (attempt {retry_state.attempt_number}/3), \"\n f\"retrying in {retry_state.next_action.sleep:.1f}s: {retry_state.outcome.exception()}\"\n ),\n )\n\n def embed_chunk_with_retry(chunk_text: str, chunk_idx: int) -> list[float]:\n \"\"\"Embed a single chunk with rate-limit-aware retry logic.\"\"\"\n\n @retry_on_rate_limit\n @retry_on_other_errors\n def _embed(text: str) -> list[float]:\n return selected_embedding.embed_documents([text])[0]\n\n try:\n return _embed(chunk_text)\n except Exception as e:\n logger.error(\n f\"Failed to embed chunk {chunk_idx} after all retries: {e}\",\n error=str(e),\n )\n raise\n\n max_workers = min(max(len(texts), 1), 8)\n logger.debug(f\"Using parallel processing with {max_workers} workers\")\n\n with ThreadPoolExecutor(max_workers=max_workers) as executor:\n futures = {\n executor.submit(embed_chunk_with_retry, chunk, idx): idx\n for idx, chunk in enumerate(texts)\n }\n for future in as_completed(futures):\n idx = futures[future]\n vectors[idx] = future.result()\n\n if not vectors:\n self.log(f\"No vectors generated from documents for model {embedding_model}.\")\n return\n\n # Get vector dimension for mapping\n dim = len(vectors[0]) if vectors else 768 # default fallback\n use_openrag_ingest_callback = self._openrag_ingest_callback_config() is not None\n\n is_aoss = False\n mapping: dict | None = None\n\n engine = getattr(self, \"engine\", \"jvector\")\n\n if 
use_openrag_ingest_callback:\n self.log(\"Using OpenRAG backend ingest callback; skipping direct OpenSearch writes.\")\n else:\n # Check for AOSS\n auth_kwargs = self._build_auth_kwargs()\n is_aoss = self._is_aoss_enabled(auth_kwargs.get(\"http_auth\"))\n\n # Validate engine with AOSS\n self._validate_aoss_with_engines(is_aoss=is_aoss, engine=engine)\n\n # Create mapping with proper KNN settings\n space_type = getattr(self, \"space_type\", \"l2\")\n ef_construction = getattr(self, \"ef_construction\", 512)\n m = getattr(self, \"m\", 16)\n\n mapping = self._default_text_mapping(\n dim=dim,\n engine=engine,\n space_type=space_type,\n ef_construction=ef_construction,\n m=m,\n vector_field=dynamic_field_name, # Use dynamic field name\n )\n\n # Ensure index exists with baseline mapping (index.knn: true is required for vector search)\n index_exists = True\n try:\n index_exists = bool(client.indices.exists(index=self.index_name))\n except OpenSearchException as exists_error:\n self._log_index_admin_skip(\"indices.exists\", exists_error)\n\n try:\n if not index_exists:\n self.log(f\"Creating index '{self.index_name}' with base mapping\")\n client.indices.create(index=self.index_name, body=mapping)\n except RequestError as creation_error:\n if creation_error.error == \"resource_already_exists_exception\":\n pass # Index was created concurrently\n else:\n error_msg = str(creation_error).lower()\n if \"invalid engine\" in error_msg or \"illegal_argument\" in error_msg:\n if \"jvector\" in error_msg:\n msg = (\n \"The 'jvector' engine is not available in your OpenSearch installation. \"\n \"Use 'nmslib' or 'faiss' for standard OpenSearch, or upgrade to 2.9+.\"\n )\n raise ValueError(msg) from creation_error\n if \"index.knn\" in error_msg:\n msg = (\n \"The index has index.knn: false. 
Delete the existing index and let the \"\n \"component recreate it, or create a new index with a different name.\"\n )\n raise ValueError(msg) from creation_error\n logger.warning(f\"Failed to create index '{self.index_name}': {creation_error}\")\n raise\n\n # Ensure the dynamic field exists in the index\n self._ensure_embedding_field_mapping(\n client=client,\n index_name=self.index_name,\n field_name=dynamic_field_name,\n dim=dim,\n engine=engine,\n space_type=space_type,\n ef_construction=ef_construction,\n m=m,\n )\n\n self.log(\n f\"Indexing {len(texts)} documents into '{self.index_name}' with model '{embedding_model}'...\"\n )\n logger.info(f\"Will store embeddings in field: {dynamic_field_name}\")\n logger.info(f\"Will tag documents with embedding_model: {embedding_model}\")\n\n # Use the bulk ingestion with model tracking\n return_ids = self._bulk_ingest_embeddings(\n client=client,\n index_name=self.index_name,\n embeddings=vectors,\n texts=texts,\n metadatas=metadatas,\n vector_field=dynamic_field_name, # Use dynamic field name\n text_field=\"text\",\n embedding_model=embedding_model, # Track the model\n mapping=mapping,\n is_aoss=is_aoss,\n )\n self.log(metadatas)\n\n logger.info(\n f\"Ingestion complete: Successfully indexed {len(return_ids)} documents with model '{embedding_model}'\"\n )\n self.log(f\"Successfully indexed {len(return_ids)} documents with model {embedding_model}.\")\n\n # ---------- helpers for filters ----------\n def _is_placeholder_term(self, term_obj: dict) -> bool:\n # term_obj like {\"filename\": \"__IMPOSSIBLE_VALUE__\"}\n return any(v == \"__IMPOSSIBLE_VALUE__\" for v in term_obj.values())\n\n def _coerce_filter_clauses(self, filter_obj: dict | None) -> list[dict]:\n \"\"\"Convert filter expressions into OpenSearch-compatible filter clauses.\n\n This method accepts two filter formats and converts them to standardized\n OpenSearch query clauses:\n\n Format A - Explicit filters:\n {\"filter\": [{\"term\": {\"field\": \"value\"}}, 
{\"terms\": {\"field\": [\"val1\", \"val2\"]}}],\n \"limit\": 10, \"score_threshold\": 1.5}\n\n Format B - Context-style mapping:\n {\"data_sources\": [\"file1.pdf\"], \"document_types\": [\"pdf\"], \"owners\": [\"user1\"]}\n\n Args:\n filter_obj: Filter configuration dictionary or None\n\n Returns:\n List of OpenSearch filter clauses (term/terms objects)\n Placeholder values with \"__IMPOSSIBLE_VALUE__\" are ignored\n \"\"\"\n if not filter_obj:\n return []\n\n # If it is a string, try to parse it once\n if isinstance(filter_obj, str):\n try:\n filter_obj = json.loads(filter_obj)\n except json.JSONDecodeError:\n # Not valid JSON - treat as no filters\n return []\n\n # Case A: already an explicit list/dict under \"filter\"\n if \"filter\" in filter_obj:\n raw = filter_obj[\"filter\"]\n if isinstance(raw, dict):\n raw = [raw]\n explicit_clauses: list[dict] = []\n for f in raw or []:\n if (\n \"term\" in f\n and isinstance(f[\"term\"], dict)\n and not self._is_placeholder_term(f[\"term\"])\n ):\n explicit_clauses.append(f)\n elif \"terms\" in f and isinstance(f[\"terms\"], dict):\n field, vals = next(iter(f[\"terms\"].items()))\n if isinstance(vals, list) and len(vals) > 0:\n explicit_clauses.append(f)\n return explicit_clauses\n\n # Case B: convert context-style maps into clauses\n field_mapping = {\n \"data_sources\": \"filename\",\n \"document_types\": \"mimetype\",\n \"owners\": \"owner\",\n }\n context_clauses: list[dict] = []\n for k, values in filter_obj.items():\n if not isinstance(values, list):\n continue\n field = field_mapping.get(k, k)\n if len(values) == 0:\n # Match-nothing placeholder (kept to mirror your tool semantics)\n context_clauses.append({\"term\": {field: \"__IMPOSSIBLE_VALUE__\"}})\n elif len(values) == 1:\n if values[0] != \"__IMPOSSIBLE_VALUE__\":\n context_clauses.append({\"term\": {field: values[0]}})\n else:\n context_clauses.append({\"terms\": {field: values}})\n return context_clauses\n\n\n def _parse_filter_expression(self) -> dict | 
None:\n \"\"\"Parse and validate optional filter_expression JSON.\n\n Returns:\n Parsed JSON object as a dict, or None when unset/blank.\n\n Raises:\n ValueError: If JSON is invalid or does not decode to an object.\n \"\"\"\n filter_expression = getattr(self, \"filter_expression\", \"\")\n if not isinstance(filter_expression, str) or not filter_expression.strip():\n return None\n try:\n filter_obj = json.loads(filter_expression)\n except json.JSONDecodeError as e:\n msg = f\"Invalid filter_expression JSON: {e}\"\n raise ValueError(msg) from e\n\n if not isinstance(filter_obj, dict):\n msg = \"Invalid filter_expression JSON type: expected a JSON object.\"\n raise TypeError(msg)\n return filter_obj\n\n def _resolve_limit(self, filter_obj: dict | None, default_limit: int | None) -> int | None:\n \"\"\"Resolve an integer result limit from filter settings.\"\"\"\n if not filter_obj:\n return default_limit\n raw_limit = filter_obj.get(\"limit\", default_limit)\n if raw_limit is None:\n return None\n if isinstance(raw_limit, bool):\n msg = \"Invalid filter_expression.limit: expected a positive integer.\"\n raise TypeError(msg)\n try:\n limit = int(raw_limit)\n except (TypeError, ValueError) as e:\n msg = \"Invalid filter_expression.limit: expected a positive integer.\"\n raise ValueError(msg) from e\n if limit <= 0:\n msg = \"Invalid filter_expression.limit: expected a positive integer.\"\n raise ValueError(msg)\n return limit\n\n def _resolve_score_threshold(self, filter_obj: dict | None) -> float | None:\n \"\"\"Resolve optional positive min score from filter settings.\"\"\"\n if not filter_obj:\n return None\n score_threshold = filter_obj.get(\"score_threshold\")\n if score_threshold is None:\n score_threshold = filter_obj.get(\"scoreThreshold\")\n if not isinstance(score_threshold, (int, float)) or score_threshold <= 0:\n return None\n return float(score_threshold)\n\n def _detect_available_models(self, client: OpenSearch, filter_clauses: list[dict] | None = None) -> 
list[str]:\n\n \"\"\"Detect which embedding models have documents in the index.\n\n Uses aggregation to find all unique embedding_model values, optionally\n filtered to only documents matching the user's filter criteria.\n\n Args:\n client: OpenSearch client instance\n filter_clauses: Optional filter clauses to scope model detection\n\n Returns:\n List of embedding model names found in the index\n \"\"\"\n try:\n agg_query = {\n \"size\": 0,\n \"aggs\": {\"embedding_models\": {\"terms\": {\"field\": \"embedding_model\", \"size\": 10}}},\n }\n\n # Apply filters to model detection if any exist\n if filter_clauses:\n agg_query[\"query\"] = {\"bool\": {\"filter\": filter_clauses}}\n\n logger.debug(f\"Model detection query: {agg_query}\")\n result = client.search(\n index=self.index_name,\n body=agg_query,\n params={\"terminate_after\": 0},\n )\n buckets = result.get(\"aggregations\", {}).get(\"embedding_models\", {}).get(\"buckets\", [])\n models = [b[\"key\"] for b in buckets if b[\"key\"]]\n\n # Log detailed bucket info for debugging\n logger.info(\n f\"Detected embedding models in corpus: {models}\"\n + (f\" (with {len(filter_clauses)} filters)\" if filter_clauses else \"\")\n )\n if not models:\n total_hits = result.get(\"hits\", {}).get(\"total\", {})\n total_count = (\n total_hits.get(\"value\", 0) if isinstance(total_hits, dict) else total_hits\n )\n logger.warning(\n f\"No embedding_model values found in index '{self.index_name}'. \"\n f\"Total docs in index: {total_count}. 
\"\n f\"This may indicate documents were indexed without the embedding_model field.\"\n )\n except (OpenSearchException, KeyError, ValueError) as e:\n logger.warning(f\"Failed to detect embedding models: {e}\")\n # Fallback to current model\n fallback_model = self._get_embedding_model_name()\n logger.info(f\"Using fallback model: {fallback_model}\")\n return [fallback_model]\n else:\n return models\n\n def _get_index_properties(self, client: OpenSearch) -> dict[str, Any] | None:\n \"\"\"Retrieve flattened mapping properties for the current index.\"\"\"\n try:\n mapping = client.indices.get_mapping(index=self.index_name)\n except OpenSearchException as e:\n logger.warning(\n f\"Failed to fetch mapping for index '{self.index_name}': {e}. Proceeding without mapping metadata.\"\n )\n return None\n\n properties: dict[str, Any] = {}\n for index_data in mapping.values():\n props = index_data.get(\"mappings\", {}).get(\"properties\", {})\n if isinstance(props, dict):\n properties.update(props)\n return properties\n\n def _is_knn_vector_field(self, properties: dict[str, Any] | None, field_name: str) -> bool:\n \"\"\"Check whether the field is mapped as a knn_vector.\"\"\"\n if not field_name:\n return False\n if properties is None:\n logger.warning(\n f\"Mapping metadata unavailable; assuming field '{field_name}' is usable.\"\n )\n return True\n field_def = properties.get(field_name)\n if not isinstance(field_def, dict):\n return False\n if field_def.get(\"type\") == \"knn_vector\":\n return True\n\n nested_props = field_def.get(\"properties\")\n return bool(isinstance(nested_props, dict) and nested_props.get(\"type\") == \"knn_vector\")\n\n def _get_field_dimension(\n self, properties: dict[str, Any] | None, field_name: str\n ) -> int | None:\n \"\"\"Get the dimension of a knn_vector field from the index mapping.\n\n Args:\n properties: Index properties from mapping\n field_name: Name of the vector field\n\n Returns:\n Dimension of the field, or None if not found\n 
\"\"\"\n if not field_name or properties is None:\n return None\n\n field_def = properties.get(field_name)\n if not isinstance(field_def, dict):\n return None\n\n # Check direct knn_vector field\n if field_def.get(\"type\") == \"knn_vector\":\n return field_def.get(\"dimension\")\n\n # Check nested properties\n nested_props = field_def.get(\"properties\")\n if isinstance(nested_props, dict) and nested_props.get(\"type\") == \"knn_vector\":\n return nested_props.get(\"dimension\")\n\n return None\n\n def _get_filename_agg_field(self, index_properties: dict[str, Any] | None) -> str:\n \"\"\"Choose the appropriate field for filename aggregations.\"\"\"\n if not index_properties:\n return \"filename.keyword\"\n\n filename_def = index_properties.get(\"filename\")\n if not isinstance(filename_def, dict):\n return \"filename.keyword\"\n\n field_type = filename_def.get(\"type\")\n fields_def = filename_def.get(\"fields\", {})\n\n # Top-level keyword with no subfields\n if field_type == \"keyword\" and not isinstance(fields_def, dict):\n return \"filename\"\n\n # Text field with keyword subfield\n if isinstance(fields_def, dict) and \"keyword\" in fields_def:\n return \"filename.keyword\"\n\n # Fallback: aggregate on filename directly\n return \"filename\"\n\n # ---------- search (multi-model hybrid) ----------\n def search(self, query: str | None = None) -> list[dict[str, Any]]:\n \"\"\"Perform multi-model hybrid search combining multiple vector similarities and keyword matching.\n\n This method executes a sophisticated search that:\n 1. Auto-detects all embedding models present in the index\n 2. Generates query embeddings for ALL detected models in parallel\n 3. Combines multiple KNN queries using dis_max (picks best match)\n 4. Adds keyword search with fuzzy matching (30% weight)\n 5. Applies optional filtering and score thresholds\n 6. 
Returns aggregations for faceted search\n\n Search weights:\n - Semantic search (dis_max across all models): 70%\n - Keyword search: 30%\n\n Args:\n query: Search query string (used for both vector embedding and keyword search)\n\n Returns:\n List of search results with page_content, metadata, and relevance scores\n\n Raises:\n ValueError: If embedding component is not provided or filter JSON is invalid\n \"\"\"\n logger.info(self.ingest_data)\n client = self.build_client()\n q = (query or \"\").strip()\n\n # Parse optional filter expression\n filter_obj = self._parse_filter_expression()\n\n if not self.embedding:\n msg = \"Embedding is required to run hybrid search (KNN + keyword).\"\n raise ValueError(msg)\n\n # Check if embedding is None (fail-safe mode)\n if self.embedding is None or (\n isinstance(self.embedding, list) and all(e is None for e in self.embedding)\n ):\n logger.error(\"Embedding returned None (fail-safe mode enabled). Cannot perform search.\")\n return []\n\n # Build filter clauses first so we can use them in model detection\n filter_clauses = self._coerce_filter_clauses(filter_obj)\n\n # Detect available embedding models in the index (scoped by filters)\n available_models = self._detect_available_models(client, filter_clauses)\n\n if not available_models:\n logger.warning(\"No embedding models found in index, using current model\")\n available_models = [self._get_embedding_model_name()]\n\n # Generate embeddings for ALL detected models\n query_embeddings = {}\n\n # Normalize embedding to list\n embeddings_list = self.embedding if isinstance(self.embedding, list) else [self.embedding]\n # Filter out None values (fail-safe mode)\n embeddings_list = [e for e in embeddings_list if e is not None]\n\n if not embeddings_list:\n logger.error(\n \"No valid embeddings available after filtering None values (fail-safe mode). 
Cannot perform search.\"\n )\n return []\n\n # Create a comprehensive map of model names to embedding objects\n # Check all possible identifiers (deployment, model, model_id, model_name)\n # Also leverage available_models list from EmbeddingsWithModels\n # Handle duplicate identifiers by creating combined keys\n embedding_by_model = {}\n identifier_conflicts = {} # Track which identifiers have conflicts\n\n for idx, emb_obj in enumerate(embeddings_list):\n # Get all possible identifiers for this embedding\n identifiers = []\n deployment = getattr(emb_obj, \"deployment\", None)\n model = getattr(emb_obj, \"model\", None)\n model_id = getattr(emb_obj, \"model_id\", None)\n model_name = getattr(emb_obj, \"model_name\", None)\n dimensions = getattr(emb_obj, \"dimensions\", None)\n available_models_attr = getattr(emb_obj, \"available_models\", None)\n\n logger.info(\n f\"Embedding object {idx}: deployment={deployment}, model={model}, \"\n f\"model_id={model_id}, model_name={model_name}, dimensions={dimensions}, \"\n f\"available_models={available_models_attr}\"\n )\n\n # If this embedding has available_models dict, map all models to their dedicated instances\n if available_models_attr and isinstance(available_models_attr, dict):\n logger.info(\n f\"Embedding object {idx} provides {len(available_models_attr)} models via available_models dict\"\n )\n for model_name_key, dedicated_embedding in available_models_attr.items():\n if model_name_key and str(model_name_key).strip():\n model_str = str(model_name_key).strip()\n if model_str not in embedding_by_model:\n # Use the dedicated embedding instance from the dict\n embedding_by_model[model_str] = dedicated_embedding\n logger.info(\n f\"Mapped available model '{model_str}' to dedicated embedding instance\"\n )\n else:\n # Conflict detected - track it\n if model_str not in identifier_conflicts:\n identifier_conflicts[model_str] = [embedding_by_model[model_str]]\n identifier_conflicts[model_str].append(dedicated_embedding)\n 
logger.warning(\n f\"Available model '{model_str}' has conflict - used by multiple embeddings\"\n )\n\n # Also map traditional identifiers (for backward compatibility)\n if deployment:\n identifiers.append(str(deployment))\n if model:\n identifiers.append(str(model))\n if model_id:\n identifiers.append(str(model_id))\n if model_name:\n identifiers.append(str(model_name))\n\n # Map all identifiers to this embedding object\n for identifier in identifiers:\n if identifier not in embedding_by_model:\n embedding_by_model[identifier] = emb_obj\n logger.info(f\"Mapped identifier '{identifier}' to embedding object {idx}\")\n else:\n # Conflict detected - track it\n if identifier not in identifier_conflicts:\n identifier_conflicts[identifier] = [embedding_by_model[identifier]]\n identifier_conflicts[identifier].append(emb_obj)\n logger.warning(\n f\"Identifier '{identifier}' has conflict - used by multiple embeddings\"\n )\n\n # For embeddings with model+deployment, create combined identifier\n # This helps when deployment is the same but model differs\n if deployment and model and deployment != model:\n combined_id = f\"{deployment}:{model}\"\n if combined_id not in embedding_by_model:\n embedding_by_model[combined_id] = emb_obj\n logger.info(\n f\"Created combined identifier '{combined_id}' for embedding object {idx}\"\n )\n\n # Log conflicts\n if identifier_conflicts:\n logger.warning(\n f\"Found {len(identifier_conflicts)} conflicting identifiers. 
\"\n f\"Consider using combined format 'deployment:model' or specifying unique model names.\"\n )\n for conflict_id, emb_list in identifier_conflicts.items():\n logger.warning(\n f\" Conflict on '{conflict_id}': {len(emb_list)} embeddings use this identifier\"\n )\n\n logger.info(f\"Generating embeddings for {len(available_models)} models in index\")\n logger.info(f\"Available embedding identifiers: {list(embedding_by_model.keys())}\")\n self.log(f\"[SEARCH] Models detected in index: {available_models}\")\n self.log(f\"[SEARCH] Available embedding identifiers: {list(embedding_by_model.keys())}\")\n\n # Track matching status for debugging\n matched_models = []\n unmatched_models = []\n\n for model_name in available_models:\n try:\n # Check if we have an embedding object for this model\n if model_name in embedding_by_model:\n # Use the matching embedding object directly\n emb_obj = embedding_by_model[model_name]\n emb_deployment = getattr(emb_obj, \"deployment\", None)\n emb_model = getattr(emb_obj, \"model\", None)\n emb_model_id = getattr(emb_obj, \"model_id\", None)\n emb_dimensions = getattr(emb_obj, \"dimensions\", None)\n emb_available_models = getattr(emb_obj, \"available_models\", None)\n\n logger.info(\n f\"Using embedding object for model '{model_name}': \"\n f\"deployment={emb_deployment}, model={emb_model}, model_id={emb_model_id}, \"\n f\"dimensions={emb_dimensions}\"\n )\n\n # Check if this is a dedicated instance from available_models dict\n if emb_available_models and isinstance(emb_available_models, dict):\n logger.info(\n f\"Model '{model_name}' using dedicated instance from available_models dict \"\n f\"(pre-configured with correct model and dimensions)\"\n )\n\n # Use the embedding instance directly - no model switching needed!\n vec = emb_obj.embed_query(q)\n query_embeddings[model_name] = vec\n matched_models.append(model_name)\n logger.info(\n f\"Generated embedding for model: {model_name} (actual dimensions: {len(vec)})\"\n )\n 
self.log(f\"[MATCH] Model '{model_name}' - generated {len(vec)}-dim embedding\")\n else:\n # No matching embedding found for this model\n unmatched_models.append(model_name)\n logger.warning(\n f\"No matching embedding found for model '{model_name}'. \"\n f\"This model will be skipped. Available identifiers: {list(embedding_by_model.keys())}\"\n )\n self.log(\n f\"[NO MATCH] Model '{model_name}' - available: {list(embedding_by_model.keys())}\"\n )\n except (\n RuntimeError,\n ValueError,\n ConnectionError,\n TimeoutError,\n AttributeError,\n KeyError,\n ) as e:\n logger.warning(f\"Failed to generate embedding for {model_name}: {e}\")\n self.log(f\"[ERROR] Embedding generation failed for '{model_name}': {e}\")\n\n # Log summary of model matching\n logger.info(\n f\"Model matching summary: {len(matched_models)} matched, {len(unmatched_models)} unmatched\"\n )\n self.log(\n f\"[SUMMARY] Model matching: {len(matched_models)} matched, {len(unmatched_models)} unmatched\"\n )\n if unmatched_models:\n self.log(f\"[WARN] Unmatched models in index: {unmatched_models}\")\n\n if not query_embeddings:\n msg = (\n f\"Failed to generate embeddings for any model. \"\n f\"Index has models: {available_models}, but no matching embedding objects found. 
\"\n f\"Available embedding identifiers: {list(embedding_by_model.keys())}\"\n )\n self.log(f\"[FAIL] Search failed: {msg}\")\n raise ValueError(msg)\n\n index_properties = self._get_index_properties(client)\n legacy_vector_field = getattr(self, \"vector_field\", \"chunk_embedding\")\n\n # Build KNN queries for each model\n embedding_fields: list[str] = []\n knn_queries_with_candidates = []\n knn_queries_without_candidates = []\n\n raw_num_candidates = getattr(self, \"num_candidates\", 1000)\n try:\n num_candidates = int(raw_num_candidates) if raw_num_candidates is not None else 0\n except (TypeError, ValueError):\n num_candidates = 0\n use_num_candidates = num_candidates > 0\n\n for model_name, embedding_vector in query_embeddings.items():\n field_name = get_embedding_field_name(model_name)\n selected_field = field_name\n vector_dim = len(embedding_vector)\n\n # Only use the expected dynamic field - no legacy fallback\n # This prevents dimension mismatches between models\n if not self._is_knn_vector_field(index_properties, selected_field):\n logger.warning(\n f\"Skipping model {model_name}: field '{field_name}' is not mapped as knn_vector. \"\n f\"Documents must be indexed with this embedding model before querying.\"\n )\n self.log(\n f\"[SKIP] Field '{selected_field}' not a knn_vector - skipping model '{model_name}'\"\n )\n continue\n\n # Validate vector dimensions match the field dimensions\n field_dim = self._get_field_dimension(index_properties, selected_field)\n if field_dim is not None and field_dim != vector_dim:\n logger.error(\n f\"Dimension mismatch for model '{model_name}': \"\n f\"Query vector has {vector_dim} dimensions but field '{selected_field}' expects {field_dim}. 
\"\n f\"Skipping this model to prevent search errors.\"\n )\n self.log(\n f\"[DIM MISMATCH] Model '{model_name}': query={vector_dim} vs field={field_dim} - skipping\"\n )\n continue\n\n logger.info(\n f\"Adding KNN query for model '{model_name}': field='{selected_field}', \"\n f\"query_dims={vector_dim}, field_dims={field_dim or 'unknown'}\"\n )\n embedding_fields.append(selected_field)\n\n base_query = {\n \"knn\": {\n selected_field: {\n \"vector\": embedding_vector,\n \"k\": 50,\n }\n }\n }\n\n if use_num_candidates:\n query_with_candidates = copy.deepcopy(base_query)\n query_with_candidates[\"knn\"][selected_field][\"num_candidates\"] = num_candidates\n else:\n query_with_candidates = base_query\n\n knn_queries_with_candidates.append(query_with_candidates)\n knn_queries_without_candidates.append(base_query)\n\n if not knn_queries_with_candidates:\n # No valid fields found - this can happen when:\n # 1. Index is empty (no documents yet)\n # 2. Embedding model has changed and field doesn't exist yet\n # Return empty results instead of failing\n logger.warning(\n \"No valid knn_vector fields found for embedding models. \"\n \"This may indicate an empty index or missing field mappings. \"\n \"Returning empty search results.\"\n )\n self.log(\n f\"[WARN] No valid KNN queries could be built. 
\"\n f\"Query embeddings generated: {list(query_embeddings.keys())}, \"\n f\"but no matching knn_vector fields found in index.\"\n )\n return []\n\n # Build exists filter - document must have at least one embedding field\n exists_any_embedding = {\n \"bool\": {\n \"should\": [{\"exists\": {\"field\": f}} for f in set(embedding_fields)],\n \"minimum_should_match\": 1,\n }\n }\n\n # Combine user filters with exists filter\n all_filters = [*filter_clauses, exists_any_embedding]\n\n # Get limit and score threshold\n limit = self._resolve_limit(filter_obj, default_limit=self.number_of_results)\n score_threshold = self._resolve_score_threshold(filter_obj)\n\n # Determine the best aggregation field for filename based on index mapping\n filename_agg_field = self._get_filename_agg_field(index_properties)\n\n # Build multi-model hybrid query\n body = {\n \"query\": {\n \"bool\": {\n \"should\": [\n {\n \"dis_max\": {\n \"tie_breaker\": 0.0, # Take only the best match, no blending\n \"boost\": 0.7, # 70% weight for semantic search\n \"queries\": knn_queries_with_candidates,\n }\n },\n {\n \"multi_match\": {\n \"query\": q,\n \"fields\": [\"text^2\", \"filename^1.5\"],\n \"type\": \"best_fields\",\n \"fuzziness\": \"AUTO\",\n \"boost\": 0.3, # 30% weight for keyword search\n }\n },\n ],\n \"minimum_should_match\": 1,\n \"filter\": all_filters,\n }\n },\n \"aggs\": {\n \"data_sources\": {\"terms\": {\"field\": filename_agg_field, \"size\": 20}},\n \"document_types\": {\"terms\": {\"field\": \"mimetype\", \"size\": 10}},\n \"owners\": {\"terms\": {\"field\": \"owner\", \"size\": 10}},\n \"embedding_models\": {\"terms\": {\"field\": \"embedding_model\", \"size\": 10}},\n },\n \"_source\": [\n \"filename\",\n \"mimetype\",\n \"page\",\n \"text\",\n \"source_url\",\n \"owner\",\n \"embedding_model\",\n \"allowed_users\",\n \"allowed_groups\",\n \"allowed_principals\",\n ],\n \"size\": limit,\n }\n\n if score_threshold is not None:\n body[\"min_score\"] = score_threshold\n\n 
logger.info(\n f\"Executing multi-model hybrid search with {len(knn_queries_with_candidates)} embedding models: \"\n f\"{list(query_embeddings.keys())}\"\n )\n self.log(\n f\"[EXEC] Executing search with {len(knn_queries_with_candidates)} KNN queries, limit={limit}\"\n )\n self.log(f\"[EXEC] Embedding models used: {list(query_embeddings.keys())}\")\n self.log(f\"[EXEC] KNN fields being queried: {embedding_fields}\")\n\n try:\n resp = client.search(index=self.index_name, body=body, params={\"terminate_after\": 0})\n except RequestError as e:\n error_message = str(e)\n lowered = error_message.lower()\n if use_num_candidates and \"num_candidates\" in lowered:\n logger.warning(\n \"Retrying search without num_candidates parameter due to cluster capabilities\",\n error=error_message,\n )\n fallback_body = copy.deepcopy(body)\n try:\n fallback_body[\"query\"][\"bool\"][\"should\"][0][\"dis_max\"][\"queries\"] = (\n knn_queries_without_candidates\n )\n except (KeyError, IndexError, TypeError) as inner_err:\n raise e from inner_err\n resp = client.search(\n index=self.index_name,\n body=fallback_body,\n params={\"terminate_after\": 0},\n )\n elif \"knn_vector\" in lowered or (\"field\" in lowered and \"knn\" in lowered):\n fallback_vector = next(iter(query_embeddings.values()), None)\n if fallback_vector is None:\n raise\n fallback_field = legacy_vector_field or \"chunk_embedding\"\n logger.warning(\n \"KNN search failed for dynamic fields; falling back to legacy field '%s'.\",\n fallback_field,\n )\n fallback_body = copy.deepcopy(body)\n fallback_body[\"query\"][\"bool\"][\"filter\"] = filter_clauses\n knn_fallback = {\n \"knn\": {\n fallback_field: {\n \"vector\": fallback_vector,\n \"k\": 50,\n }\n }\n }\n if use_num_candidates:\n knn_fallback[\"knn\"][fallback_field][\"num_candidates\"] = num_candidates\n fallback_body[\"query\"][\"bool\"][\"should\"][0][\"dis_max\"][\"queries\"] = [knn_fallback]\n resp = client.search(\n index=self.index_name,\n body=fallback_body,\n 
params={\"terminate_after\": 0},\n )\n else:\n raise\n hits = resp.get(\"hits\", {}).get(\"hits\", [])\n\n logger.info(f\"Found {len(hits)} results\")\n self.log(f\"[RESULT] Search complete: {len(hits)} results found\")\n\n if len(hits) == 0:\n self.log(\n f\"[EMPTY] Debug info: \"\n f\"models_in_index={available_models}, \"\n f\"matched_models={matched_models}, \"\n f\"knn_fields={embedding_fields}, \"\n f\"filters={len(filter_clauses)} clauses\"\n )\n\n return [\n {\n \"page_content\": hit[\"_source\"].get(\"text\", \"\"),\n \"metadata\": {k: v for k, v in hit[\"_source\"].items() if k != \"text\"},\n \"score\": hit.get(\"_score\"),\n }\n for hit in hits\n ]\n\n def search_documents(self) -> Table:\n\n \"\"\"Search documents and return results as a Table.\n\n This is the main interface method that performs the multi-model search using the\n configured search_query and returns results in Langflow's Table (DataFrame) format\n so downstream Parser components can consume them directly.\n\n Always builds the vector store (triggering ingestion if needed), then performs\n search only if a query is provided.\n\n Returns:\n Table containing search results with text and metadata\n\n Raises:\n Exception: If search operation fails\n \"\"\"\n\n try:\n # Always build/cache the vector store to ensure ingestion happens\n logger.info(f\"Search query: {self.search_query}\")\n if self._cached_vector_store is None:\n self.build_vector_store()\n\n # Only perform search if query is provided\n search_query = (self.search_query or \"\").strip()\n if not search_query:\n self.log(\"No search query provided - ingestion completed, returning empty results\")\n\n return Table(data=[])\n\n # Perform search with the provided query\n raw = self.search(search_query)\n raw_list = [Data(text=hit[\"page_content\"], **hit[\"metadata\"]) for hit in raw]\n return Table(data=raw_list)\n\n except Exception as e:\n self.log(f\"search_documents error: {e}\")\n raise\n\n # -------- dynamic UI handling (auth 
switch) --------\n async def update_build_config(\n self, build_config: dict, field_value: str, field_name: str | None = None\n ) -> dict:\n \"\"\"Dynamically update component configuration based on field changes.\n\n This method handles real-time UI updates, particularly for authentication\n mode changes that show/hide relevant input fields.\n\n Args:\n build_config: Current component configuration\n field_value: New value for the changed field\n field_name: Name of the field that changed\n\n Returns:\n Updated build configuration with appropriate field visibility\n \"\"\"\n try:\n if field_name == \"auth_mode\":\n mode = (field_value or \"basic\").strip().lower()\n is_basic = mode == \"basic\"\n is_jwt = mode == \"jwt\"\n is_openrag = mode == \"openrag\"\n\n build_config[\"username\"][\"show\"] = is_basic\n build_config[\"password\"][\"show\"] = is_basic\n\n build_config[\"jwt_token\"][\"show\"] = is_jwt\n build_config[\"jwt_header\"][\"show\"] = is_jwt\n build_config[\"bearer_prefix\"][\"show\"] = is_jwt\n\n build_config[\"username\"][\"required\"] = is_basic\n build_config[\"password\"][\"required\"] = is_basic\n\n # build_config[\"jwt_token\"][\"required\"] = is_jwt\n build_config[\"jwt_header\"][\"required\"] = is_jwt\n build_config[\"bearer_prefix\"][\"required\"] = False\n\n # In 'openrag' mode, expose the OPENRAG_* fields up front\n # since they are the only credentials required.\n for openrag_field in (\n \"openrag_ingest_url\",\n \"openrag_ingest_token\",\n \"openrag_ingest_run_id\",\n \"openrag_ingest_batch_size\",\n ):\n if openrag_field in build_config:\n build_config[openrag_field][\"advanced\"] = not is_openrag\n build_config[openrag_field][\"required\"] = (\n is_openrag and openrag_field != \"openrag_ingest_batch_size\"\n )\n\n if is_basic or is_openrag:\n build_config[\"jwt_token\"][\"value\"] = \"\"\n\n return build_config\n\n except (KeyError, ValueError) as e:\n self.log(f\"update_build_config error: {e}\")\n\n return build_config\n" + "value": 
"from __future__ import annotations\n\nimport copy\nimport json\nimport os\nimport uuid\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\nfrom typing import Any\n\nimport httpx\nfrom lfx.base.vectorstores.model import LCVectorStoreComponent, check_cached_vector_store\nfrom lfx.base.vectorstores.vector_store_connection_decorator import vector_store_connection\nfrom lfx.io import (\n BoolInput,\n DropdownInput,\n HandleInput,\n IntInput,\n MultilineInput,\n Output,\n SecretStrInput,\n StrInput,\n TableInput,\n)\nfrom lfx.log import logger\nfrom lfx.schema.data import Data\nfrom lfx.schema.dataframe import Table\nfrom opensearchpy import OpenSearch, helpers\nfrom opensearchpy.exceptions import OpenSearchException, RequestError\n\nREQUEST_TIMEOUT = 60\nMAX_RETRIES = 5\n\n\ndef _get_min_env_int(key: str, default: int, minimum: int) -> int:\n try:\n value = int(os.getenv(key, default))\n except (TypeError, ValueError):\n value = default\n return max(value, minimum)\n\n\nOPENSEARCH_NUMBER_OF_SHARDS = _get_min_env_int(\"OPENRAG_OPENSEARCH_NUMBER_OF_SHARDS\", 1, 1)\nOPENSEARCH_NUMBER_OF_REPLICAS = _get_min_env_int(\"OPENRAG_OPENSEARCH_NUMBER_OF_REPLICAS\", 0, 0)\n\n# watsonx.ai surfaces rate-limit state via these (mostly non-standard) response\n# headers. The IBM SDK acts on the x-requests-limit-* family directly; we log\n# them on a failed embedding call to aid plan/region tuning.\n_WATSONX_RATE_LIMIT_HEADERS = (\n \"x-requests-limit-rate\",\n \"x-requests-limit-remaining\",\n \"x-requests-limit-reset\",\n \"Retry-After\",\n)\n\n\ndef _log_watsonx_rate_limit_headers(error: Exception) -> None:\n \"\"\"Best-effort diagnostic: log watsonx rate-limit headers from a failed call.\n\n The watsonx SDK raises ``ApiRequestFailure``, which carries the originating\n httpx/requests ``Response`` as ``.response``. 
On a 429 exhaustion we surface\n the documented rate-limit headers so operators can tune throughput.\n \"\"\"\n try:\n response = getattr(error, \"response\", None)\n headers = getattr(response, \"headers\", None)\n if not headers:\n return\n status = getattr(response, \"status_code\", \"unknown\")\n observed = {\n h: headers.get(h) for h in _WATSONX_RATE_LIMIT_HEADERS if headers.get(h) is not None\n }\n if str(status) == \"429\" or observed:\n logger.warning(f\"watsonx rate-limit response (status={status}): {observed}\")\n except Exception as log_error: # never let diagnostics mask the real error\n logger.debug(f\"Could not extract watsonx rate-limit headers: {log_error}\")\n\n\ndef normalize_model_name(model_name: str) -> str:\n \"\"\"Normalize embedding model name for use as field suffix.\n\n Converts model names to valid OpenSearch field names by replacing\n special characters and ensuring alphanumeric format.\n\n Args:\n model_name: Original embedding model name (e.g., \"text-embedding-3-small\")\n\n Returns:\n Normalized field suffix (e.g., \"text_embedding_3_small\")\n \"\"\"\n normalized = model_name.lower()\n # Replace common separators with underscores\n normalized = normalized.replace(\"-\", \"_\").replace(\":\", \"_\").replace(\"/\", \"_\").replace(\".\", \"_\")\n # Remove any non-alphanumeric characters except underscores\n normalized = \"\".join(c if c.isalnum() or c == \"_\" else \"_\" for c in normalized)\n # Remove duplicate underscores\n while \"__\" in normalized:\n normalized = normalized.replace(\"__\", \"_\")\n return normalized.strip(\"_\")\n\n\ndef get_embedding_field_name(model_name: str) -> str:\n \"\"\"Get the dynamic embedding field name for a model.\n\n Args:\n model_name: Embedding model name\n\n Returns:\n Field name in format: chunk_embedding_{normalized_model_name}\n \"\"\"\n logger.info(f\"chunk_embedding_{normalize_model_name(model_name)}\")\n return 
f\"chunk_embedding_{normalize_model_name(model_name)}\"\n\n\n@vector_store_connection\nclass OpenSearchVectorStoreComponentMultimodalMultiEmbedding(LCVectorStoreComponent):\n \"\"\"OpenSearch Vector Store Component with Multi-Model Hybrid Search Capabilities.\n\n This component provides vector storage and retrieval using OpenSearch, combining semantic\n similarity search (KNN) with keyword-based search for optimal results. It supports:\n - Multiple embedding models per index with dynamic field names\n - Automatic detection and querying of all available embedding models\n - Parallel embedding generation for multi-model search\n - Document ingestion with model tracking\n - Advanced filtering and aggregations\n - Flexible authentication options\n\n Features:\n - Multi-model vector storage with dynamic fields (chunk_embedding_{model_name})\n - Hybrid search combining multiple KNN queries (dis_max) + keyword matching\n - Auto-detection of available models in the index\n - Parallel query embedding generation for all detected models\n - Vector storage with configurable engines (jvector, nmslib, faiss, lucene)\n - Flexible authentication (Basic auth, JWT tokens)\n\n Model Name Resolution:\n - Priority: deployment > model > model_name attributes\n - This ensures correct matching between embedding objects and index fields\n - When multiple embeddings are provided, specify embedding_model_name to select which one to use\n - During search, each detected model in the index is matched to its corresponding embedding object\n \"\"\"\n\n display_name: str = \"OpenSearch (Multi-Model Multi-Embedding)\"\n icon: str = \"OpenSearch\"\n description: str = (\n \"Store and search documents using OpenSearch with multi-model hybrid semantic and keyword search. \"\n \"To search use the tools search_documents and raw_search. 
\"\n \"Search documents takes a query for vector search, for example\\n\"\n ' {search_query: \"components in openrag\"}'\n )\n\n # Keys we consider baseline\n default_keys: list[str] = [\n \"opensearch_url\",\n \"index_name\",\n *[i.name for i in LCVectorStoreComponent.inputs], # search_query, add_documents, etc.\n \"embedding\",\n \"embedding_model_name\",\n \"vector_field\",\n \"number_of_results\",\n \"auth_mode\",\n \"username\",\n \"password\",\n \"jwt_token\",\n \"jwt_header\",\n \"bearer_prefix\",\n \"use_ssl\",\n \"verify_certs\",\n \"filter_expression\",\n \"engine\",\n \"space_type\",\n \"ef_construction\",\n \"m\",\n \"num_candidates\",\n \"docs_metadata\",\n \"request_timeout\",\n \"max_retries\",\n \"openrag_ingest_url\",\n \"openrag_ingest_token\",\n \"openrag_ingest_run_id\",\n \"openrag_ingest_batch_size\",\n ]\n _openrag_ingest_global_placeholders = {\n \"openrag_ingest_url\": \"OPENRAG_INGEST_URL\",\n \"openrag_ingest_token\": \"OPENRAG_INGEST_TOKEN\",\n \"openrag_ingest_run_id\": \"OPENRAG_INGEST_RUN_ID\",\n }\n\n inputs = [\n TableInput(\n name=\"docs_metadata\",\n display_name=\"Document Metadata\",\n info=(\n \"Additional metadata key-value pairs to be added to all ingested documents. 
\"\n \"Useful for tagging documents with source information, categories, or other custom attributes.\"\n ),\n table_schema=[\n {\n \"name\": \"key\",\n \"display_name\": \"Key\",\n \"type\": \"str\",\n \"description\": \"Key name\",\n },\n {\n \"name\": \"value\",\n \"display_name\": \"Value\",\n \"type\": \"str\",\n \"description\": \"Value of the metadata\",\n },\n ],\n value=[],\n input_types=[\"Data\", \"JSON\"],\n ),\n StrInput(\n name=\"openrag_ingest_token\",\n display_name=\"OpenRAG Ingest Token\",\n value=\"OPENRAG_INGEST_TOKEN\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n info=\"Short-lived token used only for OpenRAG ingest callbacks.\",\n ),\n StrInput(\n name=\"openrag_ingest_run_id\",\n display_name=\"OpenRAG Ingest Run ID\",\n value=\"OPENRAG_INGEST_RUN_ID\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n ),\n StrInput(\n name=\"opensearch_url\",\n display_name=\"OpenSearch URL\",\n value=\"http://localhost:9200\",\n info=(\n \"The connection URL for your OpenSearch cluster \"\n \"(e.g., http://localhost:9200 for local development or your cloud endpoint).\"\n ),\n ),\n StrInput(\n name=\"index_name\",\n display_name=\"Index Name\",\n value=\"langflow\",\n info=(\n \"The OpenSearch index name where documents will be stored and searched. \"\n \"Will be created automatically if it doesn't exist.\"\n ),\n ),\n DropdownInput(\n name=\"engine\",\n display_name=\"Vector Engine\",\n options=[\"nmslib\", \"faiss\", \"lucene\", \"jvector\"],\n value=\"jvector\",\n info=(\n \"Vector search engine for similarity calculations. 'nmslib' works with standard \"\n \"OpenSearch. 'jvector' requires OpenSearch 2.9+. 'lucene' requires index.knn: true. 
\"\n \"Amazon OpenSearch Serverless only supports 'nmslib' or 'faiss'.\"\n ),\n advanced=True,\n ),\n DropdownInput(\n name=\"space_type\",\n display_name=\"Distance Metric\",\n options=[\"l2\", \"l1\", \"cosinesimil\", \"linf\", \"innerproduct\"],\n value=\"l2\",\n info=(\n \"Distance metric for calculating vector similarity. 'l2' (Euclidean) is most common, \"\n \"'cosinesimil' for cosine similarity, 'innerproduct' for dot product.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"ef_construction\",\n display_name=\"EF Construction\",\n value=512,\n info=(\n \"Size of the dynamic candidate list during index construction. \"\n \"Higher values improve recall but increase indexing time and memory usage.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"m\",\n display_name=\"M Parameter\",\n value=16,\n info=(\n \"Number of bidirectional connections for each vector in the HNSW graph. \"\n \"Higher values improve search quality but increase memory usage and indexing time.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"num_candidates\",\n display_name=\"Candidate Pool Size\",\n value=1000,\n info=(\n \"Number of approximate neighbors to consider for each KNN query. \"\n \"Some OpenSearch deployments do not support this parameter; set to 0 to disable.\"\n ),\n advanced=True,\n ),\n *LCVectorStoreComponent.inputs, # includes search_query, add_documents, etc.\n HandleInput(\n name=\"embedding\", display_name=\"Embedding\", input_types=[\"Embeddings\"], is_list=True\n ),\n StrInput(\n name=\"embedding_model_name\",\n display_name=\"Embedding Model Name\",\n value=\"\",\n info=(\n \"Name of the embedding model to use for ingestion. This selects which embedding from the list \"\n \"will be used to embed documents. Matches on deployment, model, model_id, or model_name. \"\n \"For duplicate deployments, use combined format: 'deployment:model' \"\n \"(e.g., 'text-embedding-ada-002:text-embedding-3-large'). \"\n \"Leave empty to use the first embedding. 
Error message will show all available identifiers.\"\n ),\n advanced=False,\n ),\n StrInput(\n name=\"vector_field\",\n display_name=\"Legacy Vector Field Name\",\n value=\"chunk_embedding\",\n advanced=True,\n info=(\n \"Legacy field name for backward compatibility. New documents use dynamic fields \"\n \"(chunk_embedding_{model_name}) based on the embedding_model_name.\"\n ),\n ),\n IntInput(\n name=\"number_of_results\",\n display_name=\"Default Result Limit\",\n value=10,\n advanced=True,\n info=(\n \"Default maximum number of search results to return when no limit is \"\n \"specified in the filter expression.\"\n ),\n ),\n MultilineInput(\n name=\"filter_expression\",\n display_name=\"Search Filters (JSON)\",\n value=\"\",\n info=(\n \"Optional JSON configuration for search filtering, result limits, and score thresholds.\\n\\n\"\n \"Format 1 - Explicit filters:\\n\"\n '{\"filter\": [{\"term\": {\"filename\":\"doc.pdf\"}}, '\n '{\"terms\":{\"owner\":[\"user1\",\"user2\"]}}], \"limit\": 10, \"score_threshold\": 1.6}\\n\\n'\n \"Format 2 - Context-style mapping:\\n\"\n '{\"data_sources\":[\"file.pdf\"], \"document_types\":[\"application/pdf\"], \"owners\":[\"user123\"]}\\n\\n'\n \"Use __IMPOSSIBLE_VALUE__ as placeholder to ignore specific filters.\"\n ),\n ),\n # ----- Auth controls (dynamic) -----\n DropdownInput(\n name=\"auth_mode\",\n display_name=\"Authentication Mode\",\n value=\"openrag\",\n options=[\"basic\", \"jwt\", \"openrag\"],\n info=(\n \"Authentication method: 'basic' for username/password authentication, \"\n \"'jwt' for JSON Web Token (Bearer) authentication, or 'openrag' to \"\n \"delegate writes to the OpenRAG backend ingest callback (no direct \"\n \"OpenSearch credentials required — only OPENRAG_* fields).\"\n ),\n real_time_refresh=True,\n advanced=False,\n ),\n StrInput(\n name=\"username\",\n display_name=\"Username\",\n value=\"admin\",\n show=True,\n ),\n SecretStrInput(\n name=\"password\",\n display_name=\"OpenSearch Password\",\n 
value=\"admin\",\n show=True,\n ),\n SecretStrInput(\n name=\"jwt_token\",\n display_name=\"JWT Token\",\n value=\"JWT\",\n load_from_db=False,\n show=False,\n info=(\n \"Valid JSON Web Token for authentication. \"\n \"Will be sent in the Authorization header (with optional 'Bearer ' prefix).\"\n ),\n required=False,\n ),\n StrInput(\n name=\"jwt_header\",\n display_name=\"JWT Header Name\",\n value=\"Authorization\",\n show=False,\n advanced=True,\n ),\n BoolInput(\n name=\"bearer_prefix\",\n display_name=\"Prefix 'Bearer '\",\n value=False,\n show=False,\n advanced=True,\n ),\n # ----- TLS -----\n BoolInput(\n name=\"use_ssl\",\n display_name=\"Use SSL/TLS\",\n value=True,\n advanced=True,\n info=\"Enable SSL/TLS encryption for secure connections to OpenSearch.\",\n ),\n BoolInput(\n name=\"verify_certs\",\n display_name=\"Verify SSL Certificates\",\n value=False,\n advanced=True,\n info=(\n \"Verify SSL certificates when connecting. \"\n \"Disable for self-signed certificates in development environments.\"\n ),\n ),\n # ----- Timeout / Retry -----\n StrInput(\n name=\"request_timeout\",\n display_name=\"Request Timeout (seconds)\",\n value=\"60\",\n advanced=True,\n info=(\n \"Time in seconds to wait for a response from OpenSearch. 
\"\n \"Increase for large bulk ingestion or complex hybrid queries.\"\n ),\n ),\n StrInput(\n name=\"max_retries\",\n display_name=\"Max Retries\",\n value=\"3\",\n advanced=True,\n info=\"Number of retries for failed connections before raising an error.\",\n ),\n StrInput(\n name=\"openrag_ingest_url\",\n display_name=\"OpenRAG Ingest URL\",\n value=\"OPENRAG_INGEST_URL\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n info=\"Internal OpenRAG callback URL for backend-owned document indexing.\",\n ),\n StrInput(\n name=\"openrag_ingest_token\",\n display_name=\"OpenRAG Ingest Token\",\n value=\"OPENRAG_INGEST_TOKEN\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n info=\"Short-lived token used only for OpenRAG ingest callbacks.\",\n ),\n StrInput(\n name=\"openrag_ingest_run_id\",\n display_name=\"OpenRAG Ingest Run ID\",\n value=\"OPENRAG_INGEST_RUN_ID\",\n load_from_db=True,\n input_types=[\"Text\", \"Message\"],\n advanced=True,\n ),\n IntInput(\n name=\"openrag_ingest_batch_size\",\n display_name=\"OpenRAG Ingest Batch Size\",\n value=100,\n advanced=True,\n ),\n ]\n outputs = [\n Output(\n display_name=\"Search Results\",\n name=\"search_results\",\n method=\"search_documents\",\n ),\n Output(display_name=\"Raw Search\", name=\"raw_search\", method=\"raw_search\"),\n ]\n\n def raw_search(self, query: str | dict | None = None) -> Data:\n \"\"\"Execute a raw OpenSearch query against the target index.\n\n Args:\n query (dict[str, Any]): The OpenSearch query DSL dictionary.\n\n Returns:\n Data: Search results as a Data object.\n\n Raises:\n ValueError: If 'query' is not a valid OpenSearch query (must be a non-empty dict).\n \"\"\"\n raw_query = query if query is not None else self.search_query\n\n if raw_query is None or (isinstance(raw_query, str) and not raw_query.strip()):\n self.log(\"No query provided for raw search - returning empty results\")\n return Data(data={})\n\n if isinstance(raw_query, 
dict):\n query_body = copy.deepcopy(raw_query)\n elif isinstance(raw_query, str):\n s = raw_query.strip()\n\n # First, optimistically try to parse as JSON DSL\n try:\n query_body = json.loads(s)\n except json.JSONDecodeError:\n # Fallback: treat as a basic text query over common fields\n query_body = {\n \"query\": {\n \"multi_match\": {\n \"query\": s,\n \"fields\": [\"text^2\", \"filename^1.5\"],\n \"type\": \"best_fields\",\n \"fuzziness\": \"AUTO\",\n }\n }\n }\n else:\n msg = f\"Unsupported raw_search query type: {type(raw_query)!r}\"\n raise TypeError(msg)\n\n filter_obj = self._parse_filter_expression()\n filter_clauses = self._coerce_filter_clauses(filter_obj)\n\n if filter_clauses:\n if \"query\" in query_body:\n original_query = query_body[\"query\"]\n query_body[\"query\"] = {\n \"bool\": {\n \"must\": [original_query],\n \"filter\": filter_clauses,\n }\n }\n else:\n query_body[\"query\"] = {\n \"bool\": {\n \"must\": [{\"match_all\": {}}],\n \"filter\": filter_clauses,\n }\n }\n\n if filter_obj:\n # Apply limit if not already set in the raw query\n if \"size\" not in query_body:\n limit = self._resolve_limit(filter_obj, default_limit=None)\n if limit is not None:\n query_body[\"size\"] = limit\n\n # Apply score_threshold / scoreThreshold as min_score if not already set\n if \"min_score\" not in query_body:\n score_threshold = self._resolve_score_threshold(filter_obj)\n if score_threshold is not None:\n query_body[\"min_score\"] = score_threshold\n\n client = self.build_client()\n logger.info(f\"query: {query_body}\")\n resp = client.search(\n index=self.index_name,\n body=query_body,\n params={\"terminate_after\": 0},\n )\n # Remove any _source keys whose value is a list of floats (embedding vectors)\n # Minimum length threshold to identify embedding vectors\n min_vector_length = 100\n\n def is_vector(val):\n # Accepts if it's a list of numbers (float or int) and has reasonable vector length\n return (\n isinstance(val, list)\n and len(val) > 
min_vector_length\n and all(isinstance(x, (float, int)) for x in val)\n )\n\n if \"hits\" in resp and \"hits\" in resp[\"hits\"]:\n for hit in resp[\"hits\"][\"hits\"]:\n source = hit.get(\"_source\")\n if isinstance(source, dict):\n keys_to_remove = [k for k, v in source.items() if is_vector(v)]\n for k in keys_to_remove:\n source.pop(k)\n logger.info(f\"Raw search response (all embedding vectors removed): {resp}\")\n return Data(**resp)\n\n def _get_embedding_model_name(self, embedding_obj=None) -> str:\n \"\"\"Get the embedding model name from component config or embedding object.\n\n Priority: deployment > model > model_id > model_name\n This ensures we use the actual model being deployed, not just the configured model.\n Supports multiple embedding providers (OpenAI, Watsonx, Cohere, etc.)\n\n Args:\n embedding_obj: Specific embedding object to get name from (optional)\n\n Returns:\n Embedding model name\n\n Raises:\n ValueError: If embedding model name cannot be determined\n \"\"\"\n # First try explicit embedding_model_name input\n if hasattr(self, \"embedding_model_name\") and self.embedding_model_name:\n return self.embedding_model_name.strip()\n\n # Try to get from provided embedding object\n if embedding_obj:\n # Priority: deployment > model > model_id > model_name\n if hasattr(embedding_obj, \"deployment\") and embedding_obj.deployment:\n return str(embedding_obj.deployment)\n if hasattr(embedding_obj, \"model\") and embedding_obj.model:\n return str(embedding_obj.model)\n if hasattr(embedding_obj, \"model_id\") and embedding_obj.model_id:\n return str(embedding_obj.model_id)\n if hasattr(embedding_obj, \"model_name\") and embedding_obj.model_name:\n return str(embedding_obj.model_name)\n\n # Try to get from embedding component (legacy single embedding)\n if hasattr(self, \"embedding\") and self.embedding:\n # Handle list of embeddings\n if isinstance(self.embedding, list) and len(self.embedding) > 0:\n first_emb = self.embedding[0]\n if 
hasattr(first_emb, \"deployment\") and first_emb.deployment:\n return str(first_emb.deployment)\n if hasattr(first_emb, \"model\") and first_emb.model:\n return str(first_emb.model)\n if hasattr(first_emb, \"model_id\") and first_emb.model_id:\n return str(first_emb.model_id)\n if hasattr(first_emb, \"model_name\") and first_emb.model_name:\n return str(first_emb.model_name)\n # Handle single embedding\n elif not isinstance(self.embedding, list):\n if hasattr(self.embedding, \"deployment\") and self.embedding.deployment:\n return str(self.embedding.deployment)\n if hasattr(self.embedding, \"model\") and self.embedding.model:\n return str(self.embedding.model)\n if hasattr(self.embedding, \"model_id\") and self.embedding.model_id:\n return str(self.embedding.model_id)\n if hasattr(self.embedding, \"model_name\") and self.embedding.model_name:\n return str(self.embedding.model_name)\n\n msg = (\n \"Could not determine embedding model name. \"\n \"Please set the 'embedding_model_name' field or ensure the embedding component \"\n \"has a 'deployment', 'model', 'model_id', or 'model_name' attribute.\"\n )\n raise ValueError(msg)\n\n # ---------- helper functions for index management ----------\n def _default_text_mapping(\n self,\n dim: int,\n engine: str = \"jvector\",\n space_type: str = \"l2\",\n ef_search: int = 512,\n ef_construction: int = 100,\n m: int = 16,\n vector_field: str = \"vector_field\",\n ) -> dict[str, Any]:\n \"\"\"Create the default OpenSearch index mapping for vector search.\n\n This method generates the index configuration with k-NN settings optimized\n for approximate nearest neighbor search using the specified vector engine.\n Includes the embedding_model keyword field for tracking which model was used.\n\n Args:\n dim: Dimensionality of the vector embeddings\n engine: Vector search engine (jvector, nmslib, faiss, lucene)\n space_type: Distance metric for similarity calculation\n ef_search: Size of dynamic list used during search\n 
ef_construction: Size of dynamic list used during index construction\n m: Number of bidirectional links for each vector\n vector_field: Name of the field storing vector embeddings\n\n Returns:\n Dictionary containing OpenSearch index mapping configuration\n \"\"\"\n return {\n \"settings\": {\n \"index\": {\"knn\": True, \"knn.algo_param.ef_search\": ef_search},\n \"number_of_shards\": OPENSEARCH_NUMBER_OF_SHARDS,\n \"number_of_replicas\": OPENSEARCH_NUMBER_OF_REPLICAS,\n },\n \"mappings\": {\n \"properties\": {\n vector_field: {\n \"type\": \"knn_vector\",\n \"dimension\": dim,\n \"method\": {\n \"name\": \"disk_ann\",\n \"space_type\": space_type,\n \"engine\": engine,\n \"parameters\": {\"ef_construction\": ef_construction, \"m\": m},\n },\n },\n \"embedding_model\": {\"type\": \"keyword\"}, # Track which model was used\n \"embedding_dimensions\": {\"type\": \"integer\"},\n }\n },\n }\n\n def _ensure_embedding_field_mapping(\n self,\n client: OpenSearch,\n index_name: str,\n field_name: str,\n dim: int,\n engine: str,\n space_type: str,\n ef_construction: int,\n m: int,\n ) -> None:\n \"\"\"Lazily add a dynamic embedding field to the index if it doesn't exist.\n\n This allows adding new embedding models without recreating the entire index.\n Also ensures the embedding_model tracking field exists.\n\n Note: Some OpenSearch versions/configurations have issues with dynamically adding\n knn_vector mappings (NullPointerException). 
This method checks if the field\n already exists before attempting to add it, and gracefully skips if the field\n is already properly configured.\n\n Args:\n client: OpenSearch client instance\n index_name: Target index name\n field_name: Dynamic field name for this embedding model\n dim: Vector dimensionality\n engine: Vector search engine\n space_type: Distance metric\n ef_construction: Construction parameter\n m: HNSW parameter\n \"\"\"\n # First, check if the field already exists and is properly mapped\n properties = self._get_index_properties(client)\n if self._is_knn_vector_field(properties, field_name):\n # Field already exists as knn_vector - verify dimensions match\n existing_dim = self._get_field_dimension(properties, field_name)\n if existing_dim is not None and existing_dim != dim:\n logger.warning(\n f\"Field '{field_name}' exists with dimension {existing_dim}, \"\n f\"but current embedding has dimension {dim}. Using existing mapping.\"\n )\n else:\n logger.info(\n f\"[OpenSearchMultimodel] Field '{field_name}' already exists\"\n f\"as knn_vector with matching dimensions - skipping mapping update\"\n )\n return\n\n # Field doesn't exist, try to add the mapping\n try:\n mapping = {\n \"properties\": {\n field_name: {\n \"type\": \"knn_vector\",\n \"dimension\": dim,\n \"method\": {\n \"name\": \"disk_ann\",\n \"space_type\": space_type,\n \"engine\": engine,\n \"parameters\": {\"ef_construction\": ef_construction, \"m\": m},\n },\n },\n # Also ensure the embedding_model tracking field exists as keyword\n \"embedding_model\": {\"type\": \"keyword\"},\n \"embedding_dimensions\": {\"type\": \"integer\"},\n }\n }\n client.indices.put_mapping(index=index_name, body=mapping)\n logger.info(f\"Added/updated embedding field mapping: {field_name}\")\n except RequestError as e:\n error_str = str(e).lower()\n if \"invalid engine\" in error_str and \"jvector\" in error_str:\n msg = (\n \"The 'jvector' engine is not available in your OpenSearch installation. 
\"\n \"Use 'nmslib' or 'faiss' for standard OpenSearch, or upgrade to OpenSearch 2.9+.\"\n )\n raise ValueError(msg) from e\n if \"index.knn\" in error_str:\n msg = (\n \"The index has index.knn: false. Delete the existing index and let the \"\n \"component recreate it, or create a new index with a different name.\"\n )\n raise ValueError(msg) from e\n raise\n except Exception as e:\n # Check if this is the known OpenSearch k-NN NullPointerException issue\n error_str = str(e).lower()\n if \"null\" in error_str or \"nullpointerexception\" in error_str:\n logger.warning(\n f\"[OpenSearchMultimodel] Could not add embedding field mapping for {field_name}\"\n f\"due to OpenSearch k-NN plugin issue: {e}. \"\n f\"This is a known issue with some OpenSearch versions. \"\n f\"[OpenSearchMultimodel] Skipping mapping update. \"\n f\"Please ensure the index has the correct mapping for KNN search to work.\"\n )\n # Skip and continue - ingestion will proceed, but KNN search may fail if mapping doesn't exist\n return\n logger.warning(\n f\"[OpenSearchMultimodel] Could not add embedding field mapping for {field_name}: {e}\"\n )\n raise\n\n # Verify the field was added correctly\n properties = self._get_index_properties(client)\n if not self._is_knn_vector_field(properties, field_name):\n msg = f\"Field '{field_name}' is not mapped as knn_vector. Current mapping: {properties.get(field_name)}\"\n logger.error(msg)\n raise ValueError(msg)\n\n def _validate_aoss_with_engines(self, *, is_aoss: bool, engine: str) -> None:\n \"\"\"Validate engine compatibility with Amazon OpenSearch Serverless (AOSS).\n\n Amazon OpenSearch Serverless has restrictions on which vector engines\n can be used. 
This method ensures the selected engine is compatible.\n\n Args:\n is_aoss: Whether the connection is to Amazon OpenSearch Serverless\n engine: The selected vector search engine\n\n Raises:\n ValueError: If AOSS is used with an incompatible engine\n \"\"\"\n if is_aoss and engine not in {\"nmslib\", \"faiss\"}:\n msg = \"Amazon OpenSearch Service Serverless only supports `nmslib` or `faiss` engines\"\n raise ValueError(msg)\n\n def _is_aoss_enabled(self, http_auth: Any) -> bool:\n \"\"\"Determine if Amazon OpenSearch Serverless (AOSS) is being used.\n\n Args:\n http_auth: The HTTP authentication object\n\n Returns:\n True if AOSS is enabled, False otherwise\n \"\"\"\n return (\n http_auth is not None and hasattr(http_auth, \"service\") and http_auth.service == \"aoss\"\n )\n\n @staticmethod\n def _openrag_input_to_str(value: Any) -> str:\n if value is None:\n return \"\"\n if hasattr(value, \"get_secret_value\"):\n value = value.get_secret_value()\n if hasattr(value, \"text\"):\n value = value.text\n return str(value or \"\").strip()\n\n def _openrag_callback_value(self, attr_name: str) -> str:\n value = self._openrag_input_to_str(getattr(self, attr_name, \"\"))\n if value == self._openrag_ingest_global_placeholders.get(attr_name):\n return \"\"\n return value\n\n def _openrag_ingest_callback_config(self) -> tuple[str, str, str] | None:\n url = self._openrag_callback_value(\"openrag_ingest_url\")\n token = self._openrag_callback_value(\"openrag_ingest_token\")\n ingest_run_id = self._openrag_callback_value(\"openrag_ingest_run_id\")\n\n masked_token = (\n f\"{token[:4]}...{token[-4:]}\" if len(token) >= 8 else (\"\" if token else \"\")\n )\n debug_payload = {\n \"openrag_ingest_url\": url,\n \"openrag_ingest_url_len\": len(url),\n \"openrag_ingest_token_masked\": masked_token,\n \"openrag_ingest_token_len\": len(token),\n \"openrag_ingest_run_id\": ingest_run_id,\n \"raw_url_type\": type(self.openrag_ingest_url).__name__,\n \"raw_token_type\": 
type(self.openrag_ingest_token).__name__,\n \"raw_run_id_type\": type(self.openrag_ingest_run_id).__name__,\n }\n logger.warning(f\"[OpenRAG callback config] {debug_payload}\")\n try:\n self.log(f\"[OpenRAG callback config] {debug_payload}\")\n except Exception:\n pass\n\n if not url and not token and not ingest_run_id:\n return None\n if not url or not token or not ingest_run_id:\n msg = \"OpenRAG ingest callback requires url, token, and ingest_run_id.\"\n raise ValueError(msg)\n return url, token, ingest_run_id\n\n def _post_openrag_ingest_batches(\n self,\n *,\n requests: list[dict],\n vector_field: str,\n text_field: str,\n ) -> None:\n callback_config = self._openrag_ingest_callback_config()\n if callback_config is None:\n return\n\n url, token, ingest_run_id = callback_config\n batch_size = max(self._parse_int_param(\"openrag_ingest_batch_size\", 100), 1)\n timeout = self._parse_int_param(\"request_timeout\", REQUEST_TIMEOUT)\n headers = {\"Authorization\": f\"Bearer {token}\"}\n\n masked_token = (\n f\"{token[:4]}...{token[-4:]}\" if len(token) >= 8 else (\"\" if token else \"\")\n )\n request_summary = {\n \"url\": url,\n \"ingest_run_id\": ingest_run_id,\n \"token_masked\": masked_token,\n \"total_chunks\": len(requests),\n \"batch_size\": batch_size,\n \"timeout_s\": timeout,\n }\n logger.warning(f\"[OpenRAG ingest POST] {request_summary}\")\n try:\n self.log(f\"[OpenRAG ingest POST] {request_summary}\")\n except Exception:\n pass\n\n with httpx.Client(timeout=timeout) as client:\n total_batches = (len(requests) + batch_size - 1) // batch_size\n for batch_number, start in enumerate(range(0, len(requests), batch_size), start=1):\n batch = requests[start : start + batch_size]\n final = batch_number == total_batches\n payload = {\n \"ingest_run_id\": ingest_run_id,\n \"batch_id\": batch_number,\n \"final\": final,\n \"chunks\": [\n self._openrag_chunk_payload(\n request,\n vector_field=vector_field,\n text_field=text_field,\n )\n for request in batch\n ],\n 
}\n logger.warning(\n f\"[OpenRAG ingest POST] -> batch={batch_number}/{total_batches} \"\n f\"url={url} chunks={len(payload['chunks'])} final={final}\"\n )\n response = client.post(url, json=payload, headers=headers)\n response_summary = {\n \"batch\": batch_number,\n \"url\": url,\n \"status\": response.status_code,\n \"final_url\": str(response.request.url),\n \"response_headers\": dict(response.headers),\n \"body_preview\": response.text[:500],\n }\n logger.warning(f\"[OpenRAG ingest POST resp] {response_summary}\")\n try:\n self.log(f\"[OpenRAG ingest POST resp] {response_summary}\")\n except Exception:\n pass\n if response.status_code >= 400:\n msg = (\n \"OpenRAG ingest callback failed \"\n f\"(batch={batch_number}, status={response.status_code}, \"\n f\"url={url}): {response.text[:1000]}\"\n )\n raise RuntimeError(msg)\n\n self.log(f\"Posted {len(requests)} chunks to OpenRAG backend ingest callback.\")\n\n @staticmethod\n def _openrag_chunk_payload(\n request: dict,\n *,\n vector_field: str,\n text_field: str,\n ) -> dict:\n metadata = {\n key: value\n for key, value in request.items()\n if key not in {\"_op_type\", \"_index\", \"_id\", \"id\", vector_field, text_field}\n }\n page = metadata.get(\"page\")\n if isinstance(page, str) and page.isdigit():\n page = int(page)\n return {\n \"id\": request.get(\"_id\") or request.get(\"id\"),\n \"text\": request.get(text_field, \"\"),\n \"vector\": request[vector_field],\n \"page\": page if isinstance(page, int) else None,\n \"metadata\": metadata,\n }\n\n def _bulk_ingest_embeddings(\n self,\n client: OpenSearch,\n index_name: str,\n embeddings: list[list[float]],\n texts: list[str],\n metadatas: list[dict] | None = None,\n ids: list[str] | None = None,\n vector_field: str = \"vector_field\",\n text_field: str = \"text\",\n embedding_model: str = \"unknown\",\n mapping: dict | None = None,\n max_chunk_bytes: int | None = 1 * 1024 * 1024,\n *,\n is_aoss: bool = False,\n ) -> list[str]:\n \"\"\"Efficiently ingest 
multiple documents with embeddings into OpenSearch.\n\n This method uses bulk operations to insert documents with their vector\n embeddings and metadata into the specified OpenSearch index. Each document\n is tagged with the embedding_model name for tracking.\n\n Args:\n client: OpenSearch client instance\n index_name: Target index for document storage\n embeddings: List of vector embeddings for each document\n texts: List of document texts\n metadatas: Optional metadata dictionaries for each document\n ids: Optional document IDs (UUIDs generated if not provided)\n vector_field: Field name for storing vector embeddings\n text_field: Field name for storing document text\n embedding_model: Name of the embedding model used\n mapping: Optional index mapping configuration\n max_chunk_bytes: Maximum size per bulk request chunk\n is_aoss: Whether using Amazon OpenSearch Serverless\n\n Returns:\n List of document IDs that were successfully ingested\n \"\"\"\n logger.debug(f\"[OpenSearchMultimodel] Bulk ingesting embeddings for {index_name}\")\n if not mapping:\n mapping = {}\n\n requests = []\n return_ids = []\n vector_dimensions = len(embeddings[0]) if embeddings else None\n\n for i, text in enumerate(texts):\n metadata = metadatas[i] if metadatas else {}\n if vector_dimensions is not None and \"embedding_dimensions\" not in metadata:\n metadata = {**metadata, \"embedding_dimensions\": vector_dimensions}\n\n # Normalize ACL fields that may arrive as JSON strings from flows\n for key in (\"allowed_users\", \"allowed_groups\", \"allowed_principals\"):\n value = metadata.get(key)\n if isinstance(value, str):\n try:\n parsed = json.loads(value)\n if isinstance(parsed, list):\n metadata[key] = parsed\n except (json.JSONDecodeError, TypeError):\n # Leave value as-is if it isn't valid JSON\n pass\n\n metadata_document_id = str(metadata.get(\"document_id\") or \"\").strip()\n if metadata_document_id and metadata_document_id.lower() != \"none\":\n generated_id = 
f\"{metadata_document_id}_{i}\"\n else:\n generated_id = str(uuid.uuid4())\n _id = ids[i] if ids else generated_id\n request = {\n \"_op_type\": \"index\",\n \"_index\": index_name,\n vector_field: embeddings[i],\n text_field: text,\n \"embedding_model\": embedding_model, # Track which model was used\n **metadata,\n }\n if is_aoss:\n request[\"id\"] = _id\n else:\n request[\"_id\"] = _id\n requests.append(request)\n return_ids.append(_id)\n if metadatas:\n self.log(f\"Sample metadata: {metadatas[0] if metadatas else {}}\")\n if self._openrag_ingest_callback_config() is not None:\n self._post_openrag_ingest_batches(\n requests=requests,\n vector_field=vector_field,\n text_field=text_field,\n )\n return return_ids\n try:\n helpers.bulk(client, requests, max_chunk_bytes=max_chunk_bytes)\n except Exception as bulk_error:\n if \"Unsupported request type for filter level DLS\" not in str(bulk_error):\n raise\n logger.warning(\n \"[OpenSearchMultimodel] Bulk ingest is blocked by filter-level DLS; \"\n \"falling back to per-document index requests.\"\n )\n self._index_embeddings_individually(client, requests)\n return return_ids\n\n def _index_embeddings_individually(\n self,\n client: OpenSearch,\n requests: list[dict],\n ) -> None:\n \"\"\"Index documents one at a time when OpenSearch DLS rejects bulk writes.\"\"\"\n for request in requests:\n document_id = request.get(\"_id\") or request.get(\"id\")\n body = {\n key: value\n for key, value in request.items()\n if key not in {\"_op_type\", \"_index\", \"_id\", \"id\"}\n }\n client.index(index=request[\"_index\"], id=document_id, body=body)\n\n def _log_index_admin_skip(self, operation: str, error: Exception) -> None:\n \"\"\"Log index-admin operations that may be blocked under filter-level DLS.\"\"\"\n logger.warning(\n f\"[OpenSearchMultimodel] Could not run index-admin operation '{operation}': {error}. 
\"\n \"Assuming the backend pre-created the required index/mapping and continuing.\"\n )\n\n # ---------- param helpers ----------\n def _parse_int_param(self, attr_name: str, default: int) -> int:\n \"\"\"Parse a string attribute to int, returning *default* on failure.\"\"\"\n raw = getattr(self, attr_name, None)\n if raw is None or str(raw).strip() == \"\":\n return default\n try:\n value = int(str(raw).strip())\n except ValueError:\n logger.warning(\n f\"Invalid integer value '{raw}' for {attr_name}, using default {default}\"\n )\n return default\n\n if value < 0:\n logger.warning(f\"Negative value '{raw}' for {attr_name}, using default {default}\")\n return default\n\n return value\n\n # ---------- auth / client ----------\n def _build_auth_kwargs(self) -> dict[str, Any]:\n \"\"\"Build authentication configuration for OpenSearch client.\n\n Constructs the appropriate authentication parameters based on the\n selected auth mode (basic username/password or JWT token).\n\n Returns:\n Dictionary containing authentication configuration\n\n Raises:\n ValueError: If required authentication parameters are missing\n \"\"\"\n mode = (self.auth_mode or \"basic\").strip().lower()\n if mode == \"jwt\":\n token = (self.jwt_token or \"\").strip()\n if not token:\n msg = \"Auth Mode is 'jwt' but no jwt_token was provided.\"\n raise ValueError(msg)\n header_name = (self.jwt_header or \"Authorization\").strip()\n header_value = f\"Bearer {token}\" if self.bearer_prefix else token\n return {\"headers\": {header_name: header_value}}\n if mode == \"openrag\":\n # Writes are delegated to the OpenRAG backend ingest callback,\n # so no direct OpenSearch credentials are needed. 
Only the\n # OPENRAG_* fields are required for ingestion to function.\n missing = [\n name\n for name, value in (\n (\"openrag_ingest_url\", self.openrag_ingest_url),\n (\"openrag_ingest_token\", self.openrag_ingest_token),\n (\"openrag_ingest_run_id\", self.openrag_ingest_run_id),\n )\n if not (value or \"\").strip()\n ]\n if missing:\n msg = (\n \"Auth Mode is 'openrag' but required OPENRAG_* fields are \"\n f\"missing: {', '.join(missing)}.\"\n )\n raise ValueError(msg)\n return {}\n user = (self.username or \"\").strip()\n pwd = (self.password or \"\").strip()\n if not user or not pwd:\n msg = \"Auth Mode is 'basic' but username/password are missing.\"\n raise ValueError(msg)\n return {\"http_auth\": (user, pwd)}\n\n def build_client(self) -> OpenSearch:\n \"\"\"Create and configure an OpenSearch client instance.\n\n Returns:\n Configured OpenSearch client ready for operations\n \"\"\"\n logger.debug(\"[OpenSearchMultimodel] Building OpenSearch client\")\n auth_kwargs = self._build_auth_kwargs()\n return OpenSearch(\n hosts=[self.opensearch_url],\n use_ssl=self.use_ssl,\n verify_certs=self.verify_certs,\n ssl_assert_hostname=False,\n ssl_show_warn=False,\n timeout=self._parse_int_param(\"request_timeout\", REQUEST_TIMEOUT),\n max_retries=self._parse_int_param(\"max_retries\", MAX_RETRIES),\n retry_on_timeout=True,\n **auth_kwargs,\n )\n\n @check_cached_vector_store\n def build_vector_store(self) -> OpenSearch:\n # Return raw OpenSearch client as our \"vector store.\"\n client = self.build_client()\n\n # Check if we're in ingestion-only mode (no search query)\n has_search_query = bool((self.search_query or \"\").strip())\n if not has_search_query:\n logger.debug(\n \"[OpenSearchMultimodel] Ingestion-only mode activated: search operations will be skipped\"\n )\n logger.debug(\"[OpenSearchMultimodel] Starting ingestion mode...\")\n\n logger.debug(f\"[OpenSearchMultimodel] Embedding: {self.embedding}\")\n self._add_documents_to_vector_store(client=client)\n return 
client\n\n # ---------- ingest ----------\n def _add_documents_to_vector_store(self, client: OpenSearch) -> None:\n \"\"\"Process and ingest documents into the OpenSearch vector store.\n\n This method handles the complete document ingestion pipeline:\n - Prepares document data and metadata\n - Generates vector embeddings using the selected model\n - Creates appropriate index mappings with dynamic field names\n - Bulk inserts documents with vectors and model tracking\n\n Args:\n client: OpenSearch client for performing operations\n \"\"\"\n logger.debug(\"[OpenSearchMultimodel][INGESTION] _add_documents_to_vector_store called\")\n # Convert DataFrame to Data if needed using parent's method\n self.ingest_data = self._prepare_ingest_data()\n\n logger.debug(\n f\"[OpenSearchMultimodel][INGESTION] ingest_data type: \"\n f\"{type(self.ingest_data)}, length: {len(self.ingest_data) if self.ingest_data else 0}\"\n )\n logger.debug(\n f\"[OpenSearchMultimodel][INGESTION] ingest_data content: \"\n f\"{self.ingest_data[:2] if self.ingest_data and len(self.ingest_data) > 0 else 'empty'}\"\n )\n\n docs = self.ingest_data or []\n if not docs:\n logger.debug(\"Ingestion complete: No documents provided\")\n return\n\n if not self.embedding:\n msg = \"Embedding handle is required to embed documents.\"\n raise ValueError(msg)\n\n # Normalize embedding to list first\n embeddings_list = self.embedding if isinstance(self.embedding, list) else [self.embedding]\n\n # Filter out None values (fail-safe mode) - do this BEFORE checking if empty\n embeddings_list = [e for e in embeddings_list if e is not None]\n\n # NOW check if we have any valid embeddings left after filtering\n if not embeddings_list:\n logger.warning(\n \"All embeddings returned None (fail-safe mode enabled). Skipping document ingestion.\"\n )\n self.log(\n \"Embedding returned None (fail-safe mode enabled). 
Skipping document ingestion.\"\n )\n return\n\n logger.debug(\n f\"[OpenSearchMultimodel][INGESTION] Valid embeddings after filtering: {len(embeddings_list)}\"\n )\n self.log(\n f\"[OpenSearchMultimodel][INGESTION] Available embedding models: {len(embeddings_list)}\"\n )\n\n # Select the embedding to use for ingestion\n selected_embedding = None\n embedding_model = None\n\n # If embedding_model_name is specified, find matching embedding\n if (\n hasattr(self, \"embedding_model_name\")\n and self.embedding_model_name\n and self.embedding_model_name.strip()\n ):\n target_model_name = self.embedding_model_name.strip()\n self.log(f\"Looking for embedding model: {target_model_name}\")\n\n for emb_obj in embeddings_list:\n # Check all possible model identifiers (deployment, model, model_id, model_name)\n # Also check available_models list from EmbeddingsWithModels\n possible_names = []\n deployment = getattr(emb_obj, \"deployment\", None)\n model = getattr(emb_obj, \"model\", None)\n model_id = getattr(emb_obj, \"model_id\", None)\n model_name = getattr(emb_obj, \"model_name\", None)\n available_models_attr = getattr(emb_obj, \"available_models\", None)\n\n if deployment:\n possible_names.append(str(deployment))\n if model:\n possible_names.append(str(model))\n if model_id:\n possible_names.append(str(model_id))\n if model_name:\n possible_names.append(str(model_name))\n\n # Also add combined identifier\n if deployment and model and deployment != model:\n possible_names.append(f\"{deployment}:{model}\")\n\n # Add all models from available_models dict\n if available_models_attr and isinstance(available_models_attr, dict):\n possible_names.extend(\n str(model_key).strip()\n for model_key in available_models_attr\n if model_key and str(model_key).strip()\n )\n\n # Match if target matches any of the possible names\n if target_model_name in possible_names:\n # Check if target is in available_models dict - use dedicated instance\n if (\n available_models_attr\n and 
isinstance(available_models_attr, dict)\n and target_model_name in available_models_attr\n ):\n # Use the dedicated embedding instance from the dict\n selected_embedding = available_models_attr[target_model_name]\n embedding_model = target_model_name\n self.log(\n f\"Found dedicated embedding instance for '{embedding_model}' in available_models dict\"\n )\n else:\n # Traditional identifier match\n selected_embedding = emb_obj\n embedding_model = self._get_embedding_model_name(emb_obj)\n self.log(\n f\"Found matching embedding model: {embedding_model} (matched on: {target_model_name})\"\n )\n break\n\n if not selected_embedding:\n # Build detailed list of available embeddings with all their identifiers\n available_info = []\n for idx, emb in enumerate(embeddings_list):\n emb_type = type(emb).__name__\n identifiers = []\n deployment = getattr(emb, \"deployment\", None)\n model = getattr(emb, \"model\", None)\n model_id = getattr(emb, \"model_id\", None)\n model_name = getattr(emb, \"model_name\", None)\n available_models_attr = getattr(emb, \"available_models\", None)\n\n if deployment:\n identifiers.append(f\"deployment='{deployment}'\")\n if model:\n identifiers.append(f\"model='{model}'\")\n if model_id:\n identifiers.append(f\"model_id='{model_id}'\")\n if model_name:\n identifiers.append(f\"model_name='{model_name}'\")\n\n # Add combined identifier as an option\n if deployment and model and deployment != model:\n identifiers.append(f\"combined='{deployment}:{model}'\")\n\n # Add available_models dict if present\n if available_models_attr and isinstance(available_models_attr, dict):\n identifiers.append(f\"available_models={list(available_models_attr.keys())}\")\n\n available_info.append(\n f\" [{idx}] {emb_type}: {', '.join(identifiers) if identifiers else 'No identifiers'}\"\n )\n\n msg = (\n f\"Embedding model '{target_model_name}' not found in available embeddings.\\n\\n\"\n f\"Available embeddings:\\n\" + \"\\n\".join(available_info) + \"\\n\\n\"\n \"Please 
set 'embedding_model_name' to one of the identifier values shown above \"\n \"(use the value after the '=' sign, without quotes).\\n\"\n \"For duplicate deployments, use the 'combined' format.\\n\"\n \"Or leave it empty to use the first embedding.\"\n )\n raise ValueError(msg)\n else:\n # Use first embedding if no model name specified\n selected_embedding = embeddings_list[0]\n embedding_model = self._get_embedding_model_name(selected_embedding)\n self.log(f\"No embedding_model_name specified, using first embedding: {embedding_model}\")\n\n dynamic_field_name = get_embedding_field_name(embedding_model)\n\n logger.info(f\"Selected embedding model for ingestion: '{embedding_model}'\")\n self.log(f\"Using embedding model for ingestion: {embedding_model}\")\n self.log(f\"Dynamic vector field: {dynamic_field_name}\")\n\n # Log embedding details for debugging\n if hasattr(selected_embedding, \"deployment\"):\n logger.info(f\"Embedding deployment: {selected_embedding.deployment}\")\n if hasattr(selected_embedding, \"model\"):\n logger.info(f\"Embedding model: {selected_embedding.model}\")\n if hasattr(selected_embedding, \"model_id\"):\n logger.info(f\"Embedding model_id: {selected_embedding.model_id}\")\n if hasattr(selected_embedding, \"dimensions\"):\n logger.info(f\"Embedding dimensions: {selected_embedding.dimensions}\")\n if hasattr(selected_embedding, \"available_models\"):\n logger.info(f\"Embedding available_models: {selected_embedding.available_models}\")\n\n # No model switching needed - each model in available_models has its own dedicated instance\n # The selected_embedding is already configured correctly for the target model\n logger.info(\n f\"Using embedding instance for '{embedding_model}' - pre-configured and ready to use\"\n )\n\n # Extract texts and metadata from documents\n texts = []\n metadatas = []\n # Process docs_metadata table input into a dict\n additional_metadata = {}\n logger.debug(f\"[LF] Docs metadata {self.docs_metadata}\")\n if 
hasattr(self, \"docs_metadata\") and self.docs_metadata:\n logger.info(f\"[LF] Docs metadata {self.docs_metadata}\")\n if isinstance(self.docs_metadata[-1], Data):\n logger.info(f\"[LF] Docs metadata is a Data object {self.docs_metadata}\")\n self.docs_metadata = self.docs_metadata[-1].data\n logger.info(f\"[LF] Docs metadata is a Data object {self.docs_metadata}\")\n additional_metadata.update(self.docs_metadata)\n else:\n for item in self.docs_metadata:\n if isinstance(item, dict) and \"key\" in item and \"value\" in item:\n additional_metadata[item[\"key\"]] = item[\"value\"]\n # Replace string \"None\" values with actual None\n for key, value in additional_metadata.items():\n if value == \"None\":\n additional_metadata[key] = None\n logger.info(f\"[LF] Additional metadata {additional_metadata}\")\n for doc_obj in docs:\n data_copy = json.loads(doc_obj.model_dump_json())\n text = data_copy.pop(doc_obj.text_key, doc_obj.default_value)\n texts.append(text)\n\n # Merge additional metadata from table input\n data_copy.update(additional_metadata)\n\n metadatas.append(data_copy)\n self.log(metadatas)\n\n # Determine whether the selected embedding is watsonx/IBM. The watsonx\n # SDK ships its own rate-limit machinery (input batching, proactive\n # x-requests-limit-* TokenBucket throttling, and jittered exponential\n # backoff on 429), so we lean on it instead of retrying on top of it.\n # The type-name check also covers watsonx-hosted, non-\"ibm/\" models\n # (e.g. 
intfloat/multilingual-e5-large).\n is_ibm = (embedding_model and \"ibm\" in str(embedding_model).lower()) or (\n selected_embedding and \"watsonx\" in type(selected_embedding).__name__.lower()\n )\n logger.debug(f\"Is IBM/watsonx embedding: {is_ibm}\")\n\n if is_ibm:\n # Hand the full batch to the SDK and let it batch/throttle/retry.\n # Retry attempts and base backoff are tunable via the SDK's own\n # WATSONX_MAX_RETRIES / WATSONX_DELAY_TIME environment variables.\n logger.info(\n f\"Embedding {len(texts)} chunks via watsonx SDK batch (SDK-managed throttle + 429 retry)\"\n )\n try:\n vectors: list[list[float]] = selected_embedding.embed_documents(texts)\n logger.info(f\"Successfully embedded {len(vectors)} chunks via watsonx SDK\")\n except Exception as embed_error:\n _log_watsonx_rate_limit_headers(embed_error)\n logger.error(\n f\"Failed to embed {len(texts)} chunks via watsonx SDK. Error: {embed_error}\",\n )\n raise\n\n else:\n # Non-watsonx providers (OpenAI, Ollama) lack the watsonx SDK's\n # built-in rate-limit handling, so embed per chunk in parallel with\n # a generic rate-limit-aware tenacity retry.\n vectors: list[list[float]] = [None] * len(texts)\n from tenacity import (\n retry,\n retry_if_exception,\n stop_after_attempt,\n wait_exponential,\n )\n\n def is_rate_limit_error(exception: Exception) -> bool:\n \"\"\"Check if exception is a rate limit error (429).\"\"\"\n error_str = str(exception).lower()\n return \"429\" in error_str or \"rate_limit\" in error_str or \"rate limit\" in error_str\n\n def is_other_retryable_error(exception: Exception) -> bool:\n \"\"\"Check if exception is a transient network error worth retrying.\"\"\"\n if is_rate_limit_error(exception):\n return False\n return isinstance(exception, (ConnectionError, TimeoutError, OSError))\n\n # Retry decorator for rate limit errors (longer backoff)\n retry_on_rate_limit = retry(\n retry=retry_if_exception(is_rate_limit_error),\n stop=stop_after_attempt(5),\n 
wait=wait_exponential(multiplier=2, min=2, max=30),\n reraise=True,\n before_sleep=lambda retry_state: logger.warning(\n f\"Rate limit hit for chunk (attempt {retry_state.attempt_number}/5), \"\n f\"backing off for {retry_state.next_action.sleep:.1f}s\"\n ),\n )\n\n # Retry decorator for other errors (shorter backoff)\n retry_on_other_errors = retry(\n retry=retry_if_exception(is_other_retryable_error),\n stop=stop_after_attempt(3),\n wait=wait_exponential(multiplier=1, min=1, max=8),\n reraise=True,\n before_sleep=lambda retry_state: logger.warning(\n f\"Error embedding chunk (attempt {retry_state.attempt_number}/3), \"\n f\"retrying in {retry_state.next_action.sleep:.1f}s: {retry_state.outcome.exception()}\"\n ),\n )\n\n def embed_chunk_with_retry(chunk_text: str, chunk_idx: int) -> list[float]:\n \"\"\"Embed a single chunk with rate-limit-aware retry logic.\"\"\"\n\n @retry_on_rate_limit\n @retry_on_other_errors\n def _embed(text: str) -> list[float]:\n return selected_embedding.embed_documents([text])[0]\n\n try:\n return _embed(chunk_text)\n except Exception as e:\n logger.error(\n f\"Failed to embed chunk {chunk_idx} after all retries: {e}\",\n error=str(e),\n )\n raise\n\n max_workers = min(max(len(texts), 1), 8)\n logger.debug(f\"Using parallel processing with {max_workers} workers\")\n\n with ThreadPoolExecutor(max_workers=max_workers) as executor:\n futures = {\n executor.submit(embed_chunk_with_retry, chunk, idx): idx\n for idx, chunk in enumerate(texts)\n }\n for future in as_completed(futures):\n idx = futures[future]\n vectors[idx] = future.result()\n\n if not vectors:\n self.log(f\"No vectors generated from documents for model {embedding_model}.\")\n return\n\n # Get vector dimension for mapping\n dim = len(vectors[0]) if vectors else 768 # default fallback\n use_openrag_ingest_callback = self._openrag_ingest_callback_config() is not None\n\n is_aoss = False\n mapping: dict | None = None\n\n engine = getattr(self, \"engine\", \"jvector\")\n\n if 
use_openrag_ingest_callback:\n self.log(\"Using OpenRAG backend ingest callback; skipping direct OpenSearch writes.\")\n else:\n # Check for AOSS\n auth_kwargs = self._build_auth_kwargs()\n is_aoss = self._is_aoss_enabled(auth_kwargs.get(\"http_auth\"))\n\n # Validate engine with AOSS\n self._validate_aoss_with_engines(is_aoss=is_aoss, engine=engine)\n\n # Create mapping with proper KNN settings\n space_type = getattr(self, \"space_type\", \"l2\")\n ef_construction = getattr(self, \"ef_construction\", 512)\n m = getattr(self, \"m\", 16)\n\n mapping = self._default_text_mapping(\n dim=dim,\n engine=engine,\n space_type=space_type,\n ef_construction=ef_construction,\n m=m,\n vector_field=dynamic_field_name, # Use dynamic field name\n )\n\n # Ensure index exists with baseline mapping (index.knn: true is required for vector search)\n index_exists = True\n try:\n index_exists = bool(client.indices.exists(index=self.index_name))\n except OpenSearchException as exists_error:\n self._log_index_admin_skip(\"indices.exists\", exists_error)\n\n try:\n if not index_exists:\n self.log(f\"Creating index '{self.index_name}' with base mapping\")\n client.indices.create(index=self.index_name, body=mapping)\n except RequestError as creation_error:\n if creation_error.error == \"resource_already_exists_exception\":\n pass # Index was created concurrently\n else:\n error_msg = str(creation_error).lower()\n if \"invalid engine\" in error_msg or \"illegal_argument\" in error_msg:\n if \"jvector\" in error_msg:\n msg = (\n \"The 'jvector' engine is not available in your OpenSearch installation. \"\n \"Use 'nmslib' or 'faiss' for standard OpenSearch, or upgrade to 2.9+.\"\n )\n raise ValueError(msg) from creation_error\n if \"index.knn\" in error_msg:\n msg = (\n \"The index has index.knn: false. 
Delete the existing index and let the \"\n \"component recreate it, or create a new index with a different name.\"\n )\n raise ValueError(msg) from creation_error\n logger.warning(f\"Failed to create index '{self.index_name}': {creation_error}\")\n raise\n\n # Ensure the dynamic field exists in the index\n self._ensure_embedding_field_mapping(\n client=client,\n index_name=self.index_name,\n field_name=dynamic_field_name,\n dim=dim,\n engine=engine,\n space_type=space_type,\n ef_construction=ef_construction,\n m=m,\n )\n\n self.log(\n f\"Indexing {len(texts)} documents into '{self.index_name}' with model '{embedding_model}'...\"\n )\n logger.info(f\"Will store embeddings in field: {dynamic_field_name}\")\n logger.info(f\"Will tag documents with embedding_model: {embedding_model}\")\n\n # Use the bulk ingestion with model tracking\n return_ids = self._bulk_ingest_embeddings(\n client=client,\n index_name=self.index_name,\n embeddings=vectors,\n texts=texts,\n metadatas=metadatas,\n vector_field=dynamic_field_name, # Use dynamic field name\n text_field=\"text\",\n embedding_model=embedding_model, # Track the model\n mapping=mapping,\n is_aoss=is_aoss,\n )\n self.log(metadatas)\n\n logger.info(\n f\"Ingestion complete: Successfully indexed {len(return_ids)} documents with model '{embedding_model}'\"\n )\n self.log(f\"Successfully indexed {len(return_ids)} documents with model {embedding_model}.\")\n\n # ---------- helpers for filters ----------\n def _is_placeholder_term(self, term_obj: dict) -> bool:\n # term_obj like {\"filename\": \"__IMPOSSIBLE_VALUE__\"}\n return any(v == \"__IMPOSSIBLE_VALUE__\" for v in term_obj.values())\n\n def _coerce_filter_clauses(self, filter_obj: dict | None) -> list[dict]:\n \"\"\"Convert filter expressions into OpenSearch-compatible filter clauses.\n\n This method accepts two filter formats and converts them to standardized\n OpenSearch query clauses:\n\n Format A - Explicit filters:\n {\"filter\": [{\"term\": {\"field\": \"value\"}}, 
{\"terms\": {\"field\": [\"val1\", \"val2\"]}}],\n \"limit\": 10, \"score_threshold\": 1.5}\n\n Format B - Context-style mapping:\n {\"data_sources\": [\"file1.pdf\"], \"document_types\": [\"pdf\"], \"owners\": [\"user1\"]}\n\n Args:\n filter_obj: Filter configuration dictionary or None\n\n Returns:\n List of OpenSearch filter clauses (term/terms objects)\n Placeholder values with \"__IMPOSSIBLE_VALUE__\" are ignored\n \"\"\"\n if not filter_obj:\n return []\n\n # If it is a string, try to parse it once\n if isinstance(filter_obj, str):\n try:\n filter_obj = json.loads(filter_obj)\n except json.JSONDecodeError:\n # Not valid JSON - treat as no filters\n return []\n\n # Case A: already an explicit list/dict under \"filter\"\n if \"filter\" in filter_obj:\n raw = filter_obj[\"filter\"]\n if isinstance(raw, dict):\n raw = [raw]\n explicit_clauses: list[dict] = []\n for f in raw or []:\n if (\n \"term\" in f\n and isinstance(f[\"term\"], dict)\n and not self._is_placeholder_term(f[\"term\"])\n ):\n explicit_clauses.append(f)\n elif \"terms\" in f and isinstance(f[\"terms\"], dict):\n field, vals = next(iter(f[\"terms\"].items()))\n if isinstance(vals, list) and len(vals) > 0:\n explicit_clauses.append(f)\n return explicit_clauses\n\n # Case B: convert context-style maps into clauses\n field_mapping = {\n \"data_sources\": \"filename\",\n \"document_types\": \"mimetype\",\n \"owners\": \"owner\",\n }\n context_clauses: list[dict] = []\n for k, values in filter_obj.items():\n if not isinstance(values, list):\n continue\n field = field_mapping.get(k, k)\n if len(values) == 0:\n # Match-nothing placeholder (kept to mirror your tool semantics)\n context_clauses.append({\"term\": {field: \"__IMPOSSIBLE_VALUE__\"}})\n elif len(values) == 1:\n if values[0] != \"__IMPOSSIBLE_VALUE__\":\n context_clauses.append({\"term\": {field: values[0]}})\n else:\n context_clauses.append({\"terms\": {field: values}})\n return context_clauses\n\n def _parse_filter_expression(self) -> dict | 
None:\n \"\"\"Parse and validate optional filter_expression JSON.\n\n Returns:\n Parsed JSON object as a dict, or None when unset/blank.\n\n Raises:\n ValueError: If JSON is invalid or does not decode to an object.\n \"\"\"\n filter_expression = getattr(self, \"filter_expression\", \"\")\n if not isinstance(filter_expression, str) or not filter_expression.strip():\n return None\n try:\n filter_obj = json.loads(filter_expression)\n except json.JSONDecodeError as e:\n msg = f\"Invalid filter_expression JSON: {e}\"\n raise ValueError(msg) from e\n\n if not isinstance(filter_obj, dict):\n msg = \"Invalid filter_expression JSON type: expected a JSON object.\"\n raise TypeError(msg)\n return filter_obj\n\n def _resolve_limit(self, filter_obj: dict | None, default_limit: int | None) -> int | None:\n \"\"\"Resolve an integer result limit from filter settings.\"\"\"\n if not filter_obj:\n return default_limit\n raw_limit = filter_obj.get(\"limit\", default_limit)\n if raw_limit is None:\n return None\n if isinstance(raw_limit, bool):\n msg = \"Invalid filter_expression.limit: expected a positive integer.\"\n raise TypeError(msg)\n try:\n limit = int(raw_limit)\n except (TypeError, ValueError) as e:\n msg = \"Invalid filter_expression.limit: expected a positive integer.\"\n raise ValueError(msg) from e\n if limit <= 0:\n msg = \"Invalid filter_expression.limit: expected a positive integer.\"\n raise ValueError(msg)\n return limit\n\n def _resolve_score_threshold(self, filter_obj: dict | None) -> float | None:\n \"\"\"Resolve optional positive min score from filter settings.\"\"\"\n if not filter_obj:\n return None\n score_threshold = filter_obj.get(\"score_threshold\")\n if score_threshold is None:\n score_threshold = filter_obj.get(\"scoreThreshold\")\n if not isinstance(score_threshold, (int, float)) or score_threshold <= 0:\n return None\n return float(score_threshold)\n\n def _detect_available_models(\n self, client: OpenSearch, filter_clauses: list[dict] | None = None\n 
) -> list[str]:\n \"\"\"Detect which embedding models have documents in the index.\n\n Uses aggregation to find all unique embedding_model values, optionally\n filtered to only documents matching the user's filter criteria.\n\n Args:\n client: OpenSearch client instance\n filter_clauses: Optional filter clauses to scope model detection\n\n Returns:\n List of embedding model names found in the index\n \"\"\"\n try:\n agg_query = {\n \"size\": 0,\n \"aggs\": {\"embedding_models\": {\"terms\": {\"field\": \"embedding_model\", \"size\": 10}}},\n }\n\n # Apply filters to model detection if any exist\n if filter_clauses:\n agg_query[\"query\"] = {\"bool\": {\"filter\": filter_clauses}}\n\n logger.debug(f\"Model detection query: {agg_query}\")\n result = client.search(\n index=self.index_name,\n body=agg_query,\n params={\"terminate_after\": 0},\n )\n buckets = result.get(\"aggregations\", {}).get(\"embedding_models\", {}).get(\"buckets\", [])\n models = [b[\"key\"] for b in buckets if b[\"key\"]]\n\n # Log detailed bucket info for debugging\n logger.info(\n f\"Detected embedding models in corpus: {models}\"\n + (f\" (with {len(filter_clauses)} filters)\" if filter_clauses else \"\")\n )\n if not models:\n total_hits = result.get(\"hits\", {}).get(\"total\", {})\n total_count = (\n total_hits.get(\"value\", 0) if isinstance(total_hits, dict) else total_hits\n )\n logger.warning(\n f\"No embedding_model values found in index '{self.index_name}'. \"\n f\"Total docs in index: {total_count}. 
\"\n f\"This may indicate documents were indexed without the embedding_model field.\"\n )\n except (OpenSearchException, KeyError, ValueError) as e:\n logger.warning(f\"Failed to detect embedding models: {e}\")\n # Fallback to current model\n fallback_model = self._get_embedding_model_name()\n logger.info(f\"Using fallback model: {fallback_model}\")\n return [fallback_model]\n else:\n return models\n\n def _get_index_properties(self, client: OpenSearch) -> dict[str, Any] | None:\n \"\"\"Retrieve flattened mapping properties for the current index.\"\"\"\n try:\n mapping = client.indices.get_mapping(index=self.index_name)\n except OpenSearchException as e:\n logger.warning(\n f\"Failed to fetch mapping for index '{self.index_name}': {e}. Proceeding without mapping metadata.\"\n )\n return None\n\n properties: dict[str, Any] = {}\n for index_data in mapping.values():\n props = index_data.get(\"mappings\", {}).get(\"properties\", {})\n if isinstance(props, dict):\n properties.update(props)\n return properties\n\n def _is_knn_vector_field(self, properties: dict[str, Any] | None, field_name: str) -> bool:\n \"\"\"Check whether the field is mapped as a knn_vector.\"\"\"\n if not field_name:\n return False\n if properties is None:\n logger.warning(\n f\"Mapping metadata unavailable; assuming field '{field_name}' is usable.\"\n )\n return True\n field_def = properties.get(field_name)\n if not isinstance(field_def, dict):\n return False\n if field_def.get(\"type\") == \"knn_vector\":\n return True\n\n nested_props = field_def.get(\"properties\")\n return bool(isinstance(nested_props, dict) and nested_props.get(\"type\") == \"knn_vector\")\n\n def _get_field_dimension(\n self, properties: dict[str, Any] | None, field_name: str\n ) -> int | None:\n \"\"\"Get the dimension of a knn_vector field from the index mapping.\n\n Args:\n properties: Index properties from mapping\n field_name: Name of the vector field\n\n Returns:\n Dimension of the field, or None if not found\n 
\"\"\"\n if not field_name or properties is None:\n return None\n\n field_def = properties.get(field_name)\n if not isinstance(field_def, dict):\n return None\n\n # Check direct knn_vector field\n if field_def.get(\"type\") == \"knn_vector\":\n return field_def.get(\"dimension\")\n\n # Check nested properties\n nested_props = field_def.get(\"properties\")\n if isinstance(nested_props, dict) and nested_props.get(\"type\") == \"knn_vector\":\n return nested_props.get(\"dimension\")\n\n return None\n\n def _get_filename_agg_field(self, index_properties: dict[str, Any] | None) -> str:\n \"\"\"Choose the appropriate field for filename aggregations.\"\"\"\n if not index_properties:\n return \"filename.keyword\"\n\n filename_def = index_properties.get(\"filename\")\n if not isinstance(filename_def, dict):\n return \"filename.keyword\"\n\n field_type = filename_def.get(\"type\")\n fields_def = filename_def.get(\"fields\", {})\n\n # Top-level keyword with no subfields\n if field_type == \"keyword\" and not isinstance(fields_def, dict):\n return \"filename\"\n\n # Text field with keyword subfield\n if isinstance(fields_def, dict) and \"keyword\" in fields_def:\n return \"filename.keyword\"\n\n # Fallback: aggregate on filename directly\n return \"filename\"\n\n # ---------- search (multi-model hybrid) ----------\n def search(self, query: str | None = None) -> list[dict[str, Any]]:\n \"\"\"Perform multi-model hybrid search combining multiple vector similarities and keyword matching.\n\n This method executes a sophisticated search that:\n 1. Auto-detects all embedding models present in the index\n 2. Generates query embeddings for ALL detected models in parallel\n 3. Combines multiple KNN queries using dis_max (picks best match)\n 4. Adds keyword search with fuzzy matching (30% weight)\n 5. Applies optional filtering and score thresholds\n 6. 
Returns aggregations for faceted search\n\n Search weights:\n - Semantic search (dis_max across all models): 70%\n - Keyword search: 30%\n\n Args:\n query: Search query string (used for both vector embedding and keyword search)\n\n Returns:\n List of search results with page_content, metadata, and relevance scores\n\n Raises:\n ValueError: If embedding component is not provided or filter JSON is invalid\n \"\"\"\n logger.info(self.ingest_data)\n client = self.build_client()\n q = (query or \"\").strip()\n\n # Parse optional filter expression\n filter_obj = self._parse_filter_expression()\n\n if not self.embedding:\n msg = \"Embedding is required to run hybrid search (KNN + keyword).\"\n raise ValueError(msg)\n\n # Check if embedding is None (fail-safe mode)\n if self.embedding is None or (\n isinstance(self.embedding, list) and all(e is None for e in self.embedding)\n ):\n logger.error(\"Embedding returned None (fail-safe mode enabled). Cannot perform search.\")\n return []\n\n # Build filter clauses first so we can use them in model detection\n filter_clauses = self._coerce_filter_clauses(filter_obj)\n\n # Detect available embedding models in the index (scoped by filters)\n available_models = self._detect_available_models(client, filter_clauses)\n\n if not available_models:\n logger.warning(\"No embedding models found in index, using current model\")\n available_models = [self._get_embedding_model_name()]\n\n # Generate embeddings for ALL detected models\n query_embeddings = {}\n\n # Normalize embedding to list\n embeddings_list = self.embedding if isinstance(self.embedding, list) else [self.embedding]\n # Filter out None values (fail-safe mode)\n embeddings_list = [e for e in embeddings_list if e is not None]\n\n if not embeddings_list:\n logger.error(\n \"No valid embeddings available after filtering None values (fail-safe mode). 
Cannot perform search.\"\n )\n return []\n\n # Create a comprehensive map of model names to embedding objects\n # Check all possible identifiers (deployment, model, model_id, model_name)\n # Also leverage available_models list from EmbeddingsWithModels\n # Handle duplicate identifiers by creating combined keys\n embedding_by_model = {}\n identifier_conflicts = {} # Track which identifiers have conflicts\n\n for idx, emb_obj in enumerate(embeddings_list):\n # Get all possible identifiers for this embedding\n identifiers = []\n deployment = getattr(emb_obj, \"deployment\", None)\n model = getattr(emb_obj, \"model\", None)\n model_id = getattr(emb_obj, \"model_id\", None)\n model_name = getattr(emb_obj, \"model_name\", None)\n dimensions = getattr(emb_obj, \"dimensions\", None)\n available_models_attr = getattr(emb_obj, \"available_models\", None)\n\n logger.info(\n f\"Embedding object {idx}: deployment={deployment}, model={model}, \"\n f\"model_id={model_id}, model_name={model_name}, dimensions={dimensions}, \"\n f\"available_models={available_models_attr}\"\n )\n\n # If this embedding has available_models dict, map all models to their dedicated instances\n if available_models_attr and isinstance(available_models_attr, dict):\n logger.info(\n f\"Embedding object {idx} provides {len(available_models_attr)} models via available_models dict\"\n )\n for model_name_key, dedicated_embedding in available_models_attr.items():\n if model_name_key and str(model_name_key).strip():\n model_str = str(model_name_key).strip()\n if model_str not in embedding_by_model:\n # Use the dedicated embedding instance from the dict\n embedding_by_model[model_str] = dedicated_embedding\n logger.info(\n f\"Mapped available model '{model_str}' to dedicated embedding instance\"\n )\n else:\n # Conflict detected - track it\n if model_str not in identifier_conflicts:\n identifier_conflicts[model_str] = [embedding_by_model[model_str]]\n identifier_conflicts[model_str].append(dedicated_embedding)\n 
logger.warning(\n f\"Available model '{model_str}' has conflict - used by multiple embeddings\"\n )\n\n # Also map traditional identifiers (for backward compatibility)\n if deployment:\n identifiers.append(str(deployment))\n if model:\n identifiers.append(str(model))\n if model_id:\n identifiers.append(str(model_id))\n if model_name:\n identifiers.append(str(model_name))\n\n # Map all identifiers to this embedding object\n for identifier in identifiers:\n if identifier not in embedding_by_model:\n embedding_by_model[identifier] = emb_obj\n logger.info(f\"Mapped identifier '{identifier}' to embedding object {idx}\")\n else:\n # Conflict detected - track it\n if identifier not in identifier_conflicts:\n identifier_conflicts[identifier] = [embedding_by_model[identifier]]\n identifier_conflicts[identifier].append(emb_obj)\n logger.warning(\n f\"Identifier '{identifier}' has conflict - used by multiple embeddings\"\n )\n\n # For embeddings with model+deployment, create combined identifier\n # This helps when deployment is the same but model differs\n if deployment and model and deployment != model:\n combined_id = f\"{deployment}:{model}\"\n if combined_id not in embedding_by_model:\n embedding_by_model[combined_id] = emb_obj\n logger.info(\n f\"Created combined identifier '{combined_id}' for embedding object {idx}\"\n )\n\n # Log conflicts\n if identifier_conflicts:\n logger.warning(\n f\"Found {len(identifier_conflicts)} conflicting identifiers. 
\"\n f\"Consider using combined format 'deployment:model' or specifying unique model names.\"\n )\n for conflict_id, emb_list in identifier_conflicts.items():\n logger.warning(\n f\" Conflict on '{conflict_id}': {len(emb_list)} embeddings use this identifier\"\n )\n\n logger.info(f\"Generating embeddings for {len(available_models)} models in index\")\n logger.info(f\"Available embedding identifiers: {list(embedding_by_model.keys())}\")\n self.log(f\"[SEARCH] Models detected in index: {available_models}\")\n self.log(f\"[SEARCH] Available embedding identifiers: {list(embedding_by_model.keys())}\")\n\n # Track matching status for debugging\n matched_models = []\n unmatched_models = []\n\n for model_name in available_models:\n try:\n # Check if we have an embedding object for this model\n if model_name in embedding_by_model:\n # Use the matching embedding object directly\n emb_obj = embedding_by_model[model_name]\n emb_deployment = getattr(emb_obj, \"deployment\", None)\n emb_model = getattr(emb_obj, \"model\", None)\n emb_model_id = getattr(emb_obj, \"model_id\", None)\n emb_dimensions = getattr(emb_obj, \"dimensions\", None)\n emb_available_models = getattr(emb_obj, \"available_models\", None)\n\n logger.info(\n f\"Using embedding object for model '{model_name}': \"\n f\"deployment={emb_deployment}, model={emb_model}, model_id={emb_model_id}, \"\n f\"dimensions={emb_dimensions}\"\n )\n\n # Check if this is a dedicated instance from available_models dict\n if emb_available_models and isinstance(emb_available_models, dict):\n logger.info(\n f\"Model '{model_name}' using dedicated instance from available_models dict \"\n f\"(pre-configured with correct model and dimensions)\"\n )\n\n # Use the embedding instance directly - no model switching needed!\n vec = emb_obj.embed_query(q)\n query_embeddings[model_name] = vec\n matched_models.append(model_name)\n logger.info(\n f\"Generated embedding for model: {model_name} (actual dimensions: {len(vec)})\"\n )\n 
self.log(f\"[MATCH] Model '{model_name}' - generated {len(vec)}-dim embedding\")\n else:\n # No matching embedding found for this model\n unmatched_models.append(model_name)\n logger.warning(\n f\"No matching embedding found for model '{model_name}'. \"\n f\"This model will be skipped. Available identifiers: {list(embedding_by_model.keys())}\"\n )\n self.log(\n f\"[NO MATCH] Model '{model_name}' - available: {list(embedding_by_model.keys())}\"\n )\n except (\n RuntimeError,\n ValueError,\n ConnectionError,\n TimeoutError,\n AttributeError,\n KeyError,\n ) as e:\n logger.warning(f\"Failed to generate embedding for {model_name}: {e}\")\n self.log(f\"[ERROR] Embedding generation failed for '{model_name}': {e}\")\n\n # Log summary of model matching\n logger.info(\n f\"Model matching summary: {len(matched_models)} matched, {len(unmatched_models)} unmatched\"\n )\n self.log(\n f\"[SUMMARY] Model matching: {len(matched_models)} matched, {len(unmatched_models)} unmatched\"\n )\n if unmatched_models:\n self.log(f\"[WARN] Unmatched models in index: {unmatched_models}\")\n\n if not query_embeddings:\n msg = (\n f\"Failed to generate embeddings for any model. \"\n f\"Index has models: {available_models}, but no matching embedding objects found. 
\"\n f\"Available embedding identifiers: {list(embedding_by_model.keys())}\"\n )\n self.log(f\"[FAIL] Search failed: {msg}\")\n raise ValueError(msg)\n\n index_properties = self._get_index_properties(client)\n legacy_vector_field = getattr(self, \"vector_field\", \"chunk_embedding\")\n\n # Build KNN queries for each model\n embedding_fields: list[str] = []\n knn_queries_with_candidates = []\n knn_queries_without_candidates = []\n\n raw_num_candidates = getattr(self, \"num_candidates\", 1000)\n try:\n num_candidates = int(raw_num_candidates) if raw_num_candidates is not None else 0\n except (TypeError, ValueError):\n num_candidates = 0\n use_num_candidates = num_candidates > 0\n\n for model_name, embedding_vector in query_embeddings.items():\n field_name = get_embedding_field_name(model_name)\n selected_field = field_name\n vector_dim = len(embedding_vector)\n\n # Only use the expected dynamic field - no legacy fallback\n # This prevents dimension mismatches between models\n if not self._is_knn_vector_field(index_properties, selected_field):\n logger.warning(\n f\"Skipping model {model_name}: field '{field_name}' is not mapped as knn_vector. \"\n f\"Documents must be indexed with this embedding model before querying.\"\n )\n self.log(\n f\"[SKIP] Field '{selected_field}' not a knn_vector - skipping model '{model_name}'\"\n )\n continue\n\n # Validate vector dimensions match the field dimensions\n field_dim = self._get_field_dimension(index_properties, selected_field)\n if field_dim is not None and field_dim != vector_dim:\n logger.error(\n f\"Dimension mismatch for model '{model_name}': \"\n f\"Query vector has {vector_dim} dimensions but field '{selected_field}' expects {field_dim}. 
\"\n f\"Skipping this model to prevent search errors.\"\n )\n self.log(\n f\"[DIM MISMATCH] Model '{model_name}': query={vector_dim} vs field={field_dim} - skipping\"\n )\n continue\n\n logger.info(\n f\"Adding KNN query for model '{model_name}': field='{selected_field}', \"\n f\"query_dims={vector_dim}, field_dims={field_dim or 'unknown'}\"\n )\n embedding_fields.append(selected_field)\n\n base_query = {\n \"knn\": {\n selected_field: {\n \"vector\": embedding_vector,\n \"k\": 50,\n }\n }\n }\n\n if use_num_candidates:\n query_with_candidates = copy.deepcopy(base_query)\n query_with_candidates[\"knn\"][selected_field][\"num_candidates\"] = num_candidates\n else:\n query_with_candidates = base_query\n\n knn_queries_with_candidates.append(query_with_candidates)\n knn_queries_without_candidates.append(base_query)\n\n if not knn_queries_with_candidates:\n # No valid fields found - this can happen when:\n # 1. Index is empty (no documents yet)\n # 2. Embedding model has changed and field doesn't exist yet\n # Return empty results instead of failing\n logger.warning(\n \"No valid knn_vector fields found for embedding models. \"\n \"This may indicate an empty index or missing field mappings. \"\n \"Returning empty search results.\"\n )\n self.log(\n f\"[WARN] No valid KNN queries could be built. 
\"\n f\"Query embeddings generated: {list(query_embeddings.keys())}, \"\n f\"but no matching knn_vector fields found in index.\"\n )\n return []\n\n # Build exists filter - document must have at least one embedding field\n exists_any_embedding = {\n \"bool\": {\n \"should\": [{\"exists\": {\"field\": f}} for f in set(embedding_fields)],\n \"minimum_should_match\": 1,\n }\n }\n\n # Combine user filters with exists filter\n all_filters = [*filter_clauses, exists_any_embedding]\n\n # Get limit and score threshold\n limit = self._resolve_limit(filter_obj, default_limit=self.number_of_results)\n score_threshold = self._resolve_score_threshold(filter_obj)\n\n # Determine the best aggregation field for filename based on index mapping\n filename_agg_field = self._get_filename_agg_field(index_properties)\n\n # Build multi-model hybrid query\n body = {\n \"query\": {\n \"bool\": {\n \"should\": [\n {\n \"dis_max\": {\n \"tie_breaker\": 0.0, # Take only the best match, no blending\n \"boost\": 0.7, # 70% weight for semantic search\n \"queries\": knn_queries_with_candidates,\n }\n },\n {\n \"multi_match\": {\n \"query\": q,\n \"fields\": [\"text^2\", \"filename^1.5\"],\n \"type\": \"best_fields\",\n \"fuzziness\": \"AUTO\",\n \"boost\": 0.3, # 30% weight for keyword search\n }\n },\n ],\n \"minimum_should_match\": 1,\n \"filter\": all_filters,\n }\n },\n \"aggs\": {\n \"data_sources\": {\"terms\": {\"field\": filename_agg_field, \"size\": 20}},\n \"document_types\": {\"terms\": {\"field\": \"mimetype\", \"size\": 10}},\n \"owners\": {\"terms\": {\"field\": \"owner\", \"size\": 10}},\n \"embedding_models\": {\"terms\": {\"field\": \"embedding_model\", \"size\": 10}},\n },\n \"_source\": [\n \"filename\",\n \"mimetype\",\n \"page\",\n \"text\",\n \"source_url\",\n \"owner\",\n \"embedding_model\",\n \"allowed_users\",\n \"allowed_groups\",\n \"allowed_principals\",\n ],\n \"size\": limit,\n }\n\n if score_threshold is not None:\n body[\"min_score\"] = score_threshold\n\n 
logger.info(\n f\"Executing multi-model hybrid search with {len(knn_queries_with_candidates)} embedding models: \"\n f\"{list(query_embeddings.keys())}\"\n )\n self.log(\n f\"[EXEC] Executing search with {len(knn_queries_with_candidates)} KNN queries, limit={limit}\"\n )\n self.log(f\"[EXEC] Embedding models used: {list(query_embeddings.keys())}\")\n self.log(f\"[EXEC] KNN fields being queried: {embedding_fields}\")\n\n try:\n resp = client.search(index=self.index_name, body=body, params={\"terminate_after\": 0})\n except RequestError as e:\n error_message = str(e)\n lowered = error_message.lower()\n if use_num_candidates and \"num_candidates\" in lowered:\n logger.warning(\n \"Retrying search without num_candidates parameter due to cluster capabilities\",\n error=error_message,\n )\n fallback_body = copy.deepcopy(body)\n try:\n fallback_body[\"query\"][\"bool\"][\"should\"][0][\"dis_max\"][\"queries\"] = (\n knn_queries_without_candidates\n )\n except (KeyError, IndexError, TypeError) as inner_err:\n raise e from inner_err\n resp = client.search(\n index=self.index_name,\n body=fallback_body,\n params={\"terminate_after\": 0},\n )\n elif \"knn_vector\" in lowered or (\"field\" in lowered and \"knn\" in lowered):\n fallback_vector = next(iter(query_embeddings.values()), None)\n if fallback_vector is None:\n raise\n fallback_field = legacy_vector_field or \"chunk_embedding\"\n logger.warning(\n \"KNN search failed for dynamic fields; falling back to legacy field '%s'.\",\n fallback_field,\n )\n fallback_body = copy.deepcopy(body)\n fallback_body[\"query\"][\"bool\"][\"filter\"] = filter_clauses\n knn_fallback = {\n \"knn\": {\n fallback_field: {\n \"vector\": fallback_vector,\n \"k\": 50,\n }\n }\n }\n if use_num_candidates:\n knn_fallback[\"knn\"][fallback_field][\"num_candidates\"] = num_candidates\n fallback_body[\"query\"][\"bool\"][\"should\"][0][\"dis_max\"][\"queries\"] = [knn_fallback]\n resp = client.search(\n index=self.index_name,\n body=fallback_body,\n 
params={\"terminate_after\": 0},\n )\n else:\n raise\n hits = resp.get(\"hits\", {}).get(\"hits\", [])\n\n logger.info(f\"Found {len(hits)} results\")\n self.log(f\"[RESULT] Search complete: {len(hits)} results found\")\n\n if len(hits) == 0:\n self.log(\n f\"[EMPTY] Debug info: \"\n f\"models_in_index={available_models}, \"\n f\"matched_models={matched_models}, \"\n f\"knn_fields={embedding_fields}, \"\n f\"filters={len(filter_clauses)} clauses\"\n )\n\n return [\n {\n \"page_content\": hit[\"_source\"].get(\"text\", \"\"),\n \"metadata\": {k: v for k, v in hit[\"_source\"].items() if k != \"text\"},\n \"score\": hit.get(\"_score\"),\n }\n for hit in hits\n ]\n\n def search_documents(self) -> Table:\n \"\"\"Search documents and return results as a Table.\n\n This is the main interface method that performs the multi-model search using the\n configured search_query and returns results in Langflow's Table (DataFrame) format\n so downstream Parser components can consume them directly.\n\n Always builds the vector store (triggering ingestion if needed), then performs\n search only if a query is provided.\n\n Returns:\n Table containing search results with text and metadata\n\n Raises:\n Exception: If search operation fails\n \"\"\"\n\n try:\n # Always build/cache the vector store to ensure ingestion happens\n logger.info(f\"Search query: {self.search_query}\")\n if self._cached_vector_store is None:\n self.build_vector_store()\n\n # Only perform search if query is provided\n search_query = (self.search_query or \"\").strip()\n if not search_query:\n self.log(\"No search query provided - ingestion completed, returning empty results\")\n\n return Table(data=[])\n\n # Perform search with the provided query\n raw = self.search(search_query)\n raw_list = [Data(text=hit[\"page_content\"], **hit[\"metadata\"]) for hit in raw]\n return Table(data=raw_list)\n\n except Exception as e:\n self.log(f\"search_documents error: {e}\")\n raise\n\n # -------- dynamic UI handling (auth 
switch) --------\n async def update_build_config(\n self, build_config: dict, field_value: str, field_name: str | None = None\n ) -> dict:\n \"\"\"Dynamically update component configuration based on field changes.\n\n This method handles real-time UI updates, particularly for authentication\n mode changes that show/hide relevant input fields.\n\n Args:\n build_config: Current component configuration\n field_value: New value for the changed field\n field_name: Name of the field that changed\n\n Returns:\n Updated build configuration with appropriate field visibility\n \"\"\"\n try:\n if field_name == \"auth_mode\":\n mode = (field_value or \"basic\").strip().lower()\n is_basic = mode == \"basic\"\n is_jwt = mode == \"jwt\"\n is_openrag = mode == \"openrag\"\n\n build_config[\"username\"][\"show\"] = is_basic\n build_config[\"password\"][\"show\"] = is_basic\n\n build_config[\"jwt_token\"][\"show\"] = is_jwt\n build_config[\"jwt_header\"][\"show\"] = is_jwt\n build_config[\"bearer_prefix\"][\"show\"] = is_jwt\n\n build_config[\"username\"][\"required\"] = is_basic\n build_config[\"password\"][\"required\"] = is_basic\n\n # build_config[\"jwt_token\"][\"required\"] = is_jwt\n build_config[\"jwt_header\"][\"required\"] = is_jwt\n build_config[\"bearer_prefix\"][\"required\"] = False\n\n # In 'openrag' mode, expose the OPENRAG_* fields up front\n # since they are the only credentials required.\n for openrag_field in (\n \"openrag_ingest_url\",\n \"openrag_ingest_token\",\n \"openrag_ingest_run_id\",\n \"openrag_ingest_batch_size\",\n ):\n if openrag_field in build_config:\n build_config[openrag_field][\"advanced\"] = not is_openrag\n build_config[openrag_field][\"required\"] = (\n is_openrag and openrag_field != \"openrag_ingest_batch_size\"\n )\n\n if is_basic or is_openrag:\n build_config[\"jwt_token\"][\"value\"] = \"\"\n\n return build_config\n\n except (KeyError, ValueError) as e:\n self.log(f\"update_build_config error: {e}\")\n\n return build_config\n" }, 
"docs_metadata": { "_input_type": "TableInput", diff --git a/kubernetes/helm/openrag/templates/backend/backend-dotenv.yaml b/kubernetes/helm/openrag/templates/backend/backend-dotenv.yaml index 6cbdbfd65..ff362b2dc 100644 --- a/kubernetes/helm/openrag/templates/backend/backend-dotenv.yaml +++ b/kubernetes/helm/openrag/templates/backend/backend-dotenv.yaml @@ -83,6 +83,8 @@ stringData: {{- if .Values.global.opensearch.indexName }} OPENSEARCH_INDEX_NAME={{ .Values.global.opensearch.indexName | quote }} {{- end }} + OPENRAG_OPENSEARCH_NUMBER_OF_SHARDS={{ .Values.global.opensearch.numberOfShards | quote }} + OPENRAG_OPENSEARCH_NUMBER_OF_REPLICAS={{ .Values.global.opensearch.numberOfReplicas | quote }} LANGFLOW_OPENSEARCH_HOST={{ include "openrag.langflow.opensearch.host" . | quote }} LANGFLOW_OPENSEARCH_PORT={{ include "openrag.langflow.opensearch.port" . | quote }} diff --git a/kubernetes/helm/openrag/templates/langflow/langflow-dotenv.yaml b/kubernetes/helm/openrag/templates/langflow/langflow-dotenv.yaml index 88a6b86a6..c977d56fc 100644 --- a/kubernetes/helm/openrag/templates/langflow/langflow-dotenv.yaml +++ b/kubernetes/helm/openrag/templates/langflow/langflow-dotenv.yaml @@ -113,6 +113,8 @@ stringData: OPENSEARCH_PORT={{ include "openrag.langflow.opensearch.port" . | quote }} OPENSEARCH_URL={{ include "openrag.langflow.opensearch.url" . 
| quote }} OPENSEARCH_INDEX_NAME={{ .Values.global.opensearch.indexName | quote }} + OPENRAG_OPENSEARCH_NUMBER_OF_SHARDS={{ .Values.global.opensearch.numberOfShards | quote }} + OPENRAG_OPENSEARCH_NUMBER_OF_REPLICAS={{ .Values.global.opensearch.numberOfReplicas | quote }} {{- if .Values.global.opensearch.password }} OPENSEARCH_PASSWORD={{ .Values.global.opensearch.password | quote }} {{- end }} diff --git a/kubernetes/helm/openrag/values.yaml b/kubernetes/helm/openrag/values.yaml index 8528442ff..7eb793eae 100644 --- a/kubernetes/helm/openrag/values.yaml +++ b/kubernetes/helm/openrag/values.yaml @@ -28,6 +28,8 @@ global: username: "admin" # OpenSearch username password: "" # OpenSearch password (stored in secret) indexName: "documents" # OpenSearch index name + numberOfShards: 1 # Primary shards for newly-created OpenRAG indices + numberOfReplicas: 0 # Replica shards for OpenRAG indices docling: host: docling-serve.docling.svc.cluster.local diff --git a/kubernetes/operator/README.md b/kubernetes/operator/README.md index d99c30fed..614c7693e 100644 --- a/kubernetes/operator/README.md +++ b/kubernetes/operator/README.md @@ -289,6 +289,8 @@ spec: size: 10Gi opensearch: host: opensearch-coordinating.opensearch.svc.cluster.local + numberOfShards: 1 + numberOfReplicas: 0 credentialsSecret: opensearch-credentials # keys: username, password # docling: # optional # host: docling-serve.docling.svc.cluster.local diff --git a/kubernetes/operator/api/v1alpha1/openrag_types.go b/kubernetes/operator/api/v1alpha1/openrag_types.go index f2bfa0ef6..62348aefc 100644 --- a/kubernetes/operator/api/v1alpha1/openrag_types.go +++ b/kubernetes/operator/api/v1alpha1/openrag_types.go @@ -319,6 +319,18 @@ type OpenSearchSpec struct { // +kubebuilder:default="documents" IndexName string `json:"indexName,omitempty"` + // NumberOfShards configures primary shards for newly-created OpenRAG indices. 
+ // +optional + // +kubebuilder:default=1 + // +kubebuilder:validation:Minimum=1 + NumberOfShards int32 `json:"numberOfShards,omitempty"` + + // NumberOfReplicas configures replica shards for OpenRAG indices. + // +optional + // +kubebuilder:default=0 + // +kubebuilder:validation:Minimum=0 + NumberOfReplicas int32 `json:"numberOfReplicas,omitempty"` + // CredentialsSecret is the name of a Secret with keys "username" and "password". // +optional CredentialsSecret string `json:"credentialsSecret,omitempty"` diff --git a/kubernetes/operator/config/crd/bases/openr.ag_openrags.yaml b/kubernetes/operator/config/crd/bases/openr.ag_openrags.yaml index 9f8ad5f19..8c0e385f5 100644 --- a/kubernetes/operator/config/crd/bases/openr.ag_openrags.yaml +++ b/kubernetes/operator/config/crd/bases/openr.ag_openrags.yaml @@ -13276,6 +13276,20 @@ spec: default: documents description: IndexName used for document storage. type: string + numberOfReplicas: + default: 0 + description: NumberOfReplicas configures replica shards for + OpenRAG indices. + format: int32 + minimum: 0 + type: integer + numberOfShards: + default: 1 + description: NumberOfShards configures primary shards for newly-created + OpenRAG indices. 
+ format: int32 + minimum: 1 + type: integer port: default: 9200 format: int32 diff --git a/kubernetes/operator/config/samples/kind-cluster-openrag-cr.yaml b/kubernetes/operator/config/samples/kind-cluster-openrag-cr.yaml index 9d50a94b5..7ea87804d 100644 --- a/kubernetes/operator/config/samples/kind-cluster-openrag-cr.yaml +++ b/kubernetes/operator/config/samples/kind-cluster-openrag-cr.yaml @@ -54,6 +54,8 @@ spec: # port: 9200 # scheme: https # indexName: documents + # numberOfShards: 1 + # numberOfReplicas: 0 # credentialsSecret: opensearch-credentials # keys: username, password # Operator-managed Docling components — optional document processing diff --git a/kubernetes/operator/config/samples/openrag_v1alpha1_openrag.yaml b/kubernetes/operator/config/samples/openrag_v1alpha1_openrag.yaml index a3365dbb6..2856c983e 100644 --- a/kubernetes/operator/config/samples/openrag_v1alpha1_openrag.yaml +++ b/kubernetes/operator/config/samples/openrag_v1alpha1_openrag.yaml @@ -69,6 +69,8 @@ spec: # port: 9200 # scheme: https # indexName: documents + # numberOfShards: 1 + # numberOfReplicas: 0 # credentialsSecret: opensearch-credentials # keys: username, password # Operator-managed Docling components — optional document processing diff --git a/kubernetes/operator/internal/controller/env.go b/kubernetes/operator/internal/controller/env.go index df9af75c4..5f09d95ee 100644 --- a/kubernetes/operator/internal/controller/env.go +++ b/kubernetes/operator/internal/controller/env.go @@ -75,9 +75,11 @@ func NewEnvVarManager() *EnvVarManager { "SELECTED_EMBEDDING_MODEL": "", // OpenSearch defaults (for variables in LANGFLOW_VARIABLES_TO_GET_FROM_ENVIRONMENT) - "OPENSEARCH_PASSWORD": "None", - "OPENSEARCH_URL": "None", - "OPENSEARCH_INDEX_NAME": "None", + "OPENSEARCH_PASSWORD": "None", + "OPENSEARCH_URL": "None", + "OPENSEARCH_INDEX_NAME": "None", + "OPENRAG_OPENSEARCH_NUMBER_OF_SHARDS": "1", + "OPENRAG_OPENSEARCH_NUMBER_OF_REPLICAS": "0", // Docling defaults (for variables in 
LANGFLOW_VARIABLES_TO_GET_FROM_ENVIRONMENT) "DOCLING_SERVE_URL": "None", @@ -114,7 +116,9 @@ func NewEnvVarManager() *EnvVarManager { "OPENRAG_VERSION": "latest", // OpenSearch configuration - "OPENSEARCH_DATA_PATH": "", + "OPENSEARCH_DATA_PATH": "", + "OPENRAG_OPENSEARCH_NUMBER_OF_SHARDS": "1", + "OPENRAG_OPENSEARCH_NUMBER_OF_REPLICAS": "0", // Logging configuration "LOG_LEVEL": "DEBUG", diff --git a/kubernetes/operator/internal/controller/openrag_controller.go b/kubernetes/operator/internal/controller/openrag_controller.go index 5bf46ddae..86a8324cf 100644 --- a/kubernetes/operator/internal/controller/openrag_controller.go +++ b/kubernetes/operator/internal/controller/openrag_controller.go @@ -306,6 +306,12 @@ func (r *OpenRAGReconciler) buildBackendEnv(ctx context.Context, o *openragv1alp if os.IndexName != "" { envVars["OPENSEARCH_INDEX_NAME"] = os.IndexName } + if os.NumberOfShards > 0 { + envVars["OPENRAG_OPENSEARCH_NUMBER_OF_SHARDS"] = fmt.Sprintf("%d", os.NumberOfShards) + } + if os.NumberOfReplicas >= 0 { + envVars["OPENRAG_OPENSEARCH_NUMBER_OF_REPLICAS"] = fmt.Sprintf("%d", os.NumberOfReplicas) + } // Read OpenSearch credentials from user-provided secret if os.CredentialsSecret != "" { @@ -480,6 +486,12 @@ func (r *OpenRAGReconciler) buildLangflowEnv(ctx context.Context, o *openragv1al if os.IndexName != "" { envVars["OPENSEARCH_INDEX_NAME"] = os.IndexName } + if os.NumberOfShards > 0 { + envVars["OPENRAG_OPENSEARCH_NUMBER_OF_SHARDS"] = fmt.Sprintf("%d", os.NumberOfShards) + } + if os.NumberOfReplicas >= 0 { + envVars["OPENRAG_OPENSEARCH_NUMBER_OF_REPLICAS"] = fmt.Sprintf("%d", os.NumberOfReplicas) + } } // WatsonX configuration from CR spec diff --git a/kubernetes/operator/internal/controller/openrag_controller_test.go b/kubernetes/operator/internal/controller/openrag_controller_test.go index 986c3e874..2f304aa5f 100644 --- a/kubernetes/operator/internal/controller/openrag_controller_test.go +++ 
b/kubernetes/operator/internal/controller/openrag_controller_test.go @@ -1359,6 +1359,28 @@ func TestEnvHash_ChangesWhenEnvChanges(t *testing.T) { assert.NotEqual(t, hash1, hash2, "Hash should change when env vars change") } +func TestBuildEnv_IncludesOpenSearchIndexSettings(t *testing.T) { + s := newScheme(t) + cr := minimalCR("test-openrag", "test-ns") + cr.Spec.OpenSearch = &openragv1alpha1.OpenSearchSpec{ + Host: "opensearch.example.com", + NumberOfShards: 3, + NumberOfReplicas: 2, + } + + r, _ := reconciler(s, cr) + + backendEnvContent, err := r.buildBackendEnv(context.Background(), cr, "test-ns") + require.NoError(t, err) + langflowEnvContent, err := r.buildLangflowEnv(context.Background(), cr, "test-ns") + require.NoError(t, err) + + for _, envContent := range []string{backendEnvContent, langflowEnvContent} { + assert.Equal(t, "3", parseEnvValue(envContent, "OPENRAG_OPENSEARCH_NUMBER_OF_SHARDS")) + assert.Equal(t, "2", parseEnvValue(envContent, "OPENRAG_OPENSEARCH_NUMBER_OF_REPLICAS")) + } +} + func TestDeployment_ContainsEnvHashAnnotation(t *testing.T) { // Test that backend deployment has env hash annotation s := newScheme(t) diff --git a/src/config/settings.py b/src/config/settings.py index 280667e73..9cffd827c 100644 --- a/src/config/settings.py +++ b/src/config/settings.py @@ -447,16 +447,24 @@ def is_no_auth_mode(): # actual frontend origin that is carried in the OAuth state parameter. 
OAUTH_BROKER_URL = os.getenv("OAUTH_BROKER_URL") + +def _get_min_env_int(key: str, default: int, minimum: int) -> int: + """Read an integer env var, clamped to a minimum valid value.""" + return max(get_env_int(key, default), minimum) + + # OpenSearch configuration VECTOR_DIM = 1536 KNN_EF_CONSTRUCTION = 100 KNN_M = 16 +OPENSEARCH_NUMBER_OF_SHARDS = _get_min_env_int("OPENRAG_OPENSEARCH_NUMBER_OF_SHARDS", 1, 1) +OPENSEARCH_NUMBER_OF_REPLICAS = _get_min_env_int("OPENRAG_OPENSEARCH_NUMBER_OF_REPLICAS", 0, 0) INDEX_BODY = { "settings": { "index": {"knn": True}, - "number_of_shards": 1, - "number_of_replicas": 0, + "number_of_shards": OPENSEARCH_NUMBER_OF_SHARDS, + "number_of_replicas": OPENSEARCH_NUMBER_OF_REPLICAS, }, "mappings": { "properties": { @@ -492,7 +500,10 @@ def is_no_auth_mode(): DLS_PRINCIPAL_INDEX_NAME = "openrag_dls_principals" DLS_PRINCIPAL_INDEX_BODY: dict[str, Any] = { "settings": { - "index": {"number_of_replicas": 0, "number_of_shards": 1}, + "index": { + "number_of_replicas": OPENSEARCH_NUMBER_OF_REPLICAS, + "number_of_shards": OPENSEARCH_NUMBER_OF_SHARDS, + }, }, "mappings": { "properties": { @@ -511,8 +522,8 @@ def is_no_auth_mode(): API_KEYS_INDEX_NAME = "api_keys" API_KEYS_INDEX_BODY = { "settings": { - "number_of_shards": 1, - "number_of_replicas": 0, + "number_of_shards": OPENSEARCH_NUMBER_OF_SHARDS, + "number_of_replicas": OPENSEARCH_NUMBER_OF_REPLICAS, }, "mappings": { "properties": { diff --git a/src/tui/config_fields.py b/src/tui/config_fields.py index f3e0290ac..21be6d0a1 100644 --- a/src/tui/config_fields.py +++ b/src/tui/config_fields.py @@ -92,6 +92,16 @@ class ConfigSection: placeholder="documents", default="documents", helper_text="Name of the index to use in OpenSearch", ), + ConfigField( + "opensearch_number_of_shards", "OPENRAG_OPENSEARCH_NUMBER_OF_SHARDS", "Primary Shards", + placeholder="1", default="1", advanced=True, + helper_text="Primary shard count for newly-created OpenRAG indices", + ), + ConfigField( + 
"opensearch_number_of_replicas", "OPENRAG_OPENSEARCH_NUMBER_OF_REPLICAS", "Replica Shards", + placeholder="0", default="0", advanced=True, + helper_text="Replica shard count for OpenRAG indices", + ), ]), # ── Langflow ──────────────────────────────────────────────── diff --git a/src/tui/managers/env_manager.py b/src/tui/managers/env_manager.py index 7f13e66e1..f93604975 100644 --- a/src/tui/managers/env_manager.py +++ b/src/tui/managers/env_manager.py @@ -37,6 +37,8 @@ class EnvConfig: opensearch_host: str = "opensearch" opensearch_port: str = "9200" opensearch_index_name: str = "documents" + opensearch_number_of_shards: str = "1" + opensearch_number_of_replicas: str = "0" langflow_secret_key: str = "" langflow_superuser: str = "admin" langflow_superuser_password: str = "" @@ -193,6 +195,8 @@ def _env_attr_map(self) -> Dict[str, str]: "OPENSEARCH_HOST": "opensearch_host", "OPENSEARCH_PORT": "opensearch_port", "OPENSEARCH_INDEX_NAME": "opensearch_index_name", + "OPENRAG_OPENSEARCH_NUMBER_OF_SHARDS": "opensearch_number_of_shards", + "OPENRAG_OPENSEARCH_NUMBER_OF_REPLICAS": "opensearch_number_of_replicas", "LANGFLOW_SECRET_KEY": "langflow_secret_key", # pragma: allowlist secret "LANGFLOW_SUPERUSER": "langflow_superuser", "LANGFLOW_SUPERUSER_PASSWORD": "langflow_superuser_password", # pragma: allowlist secret @@ -478,6 +482,14 @@ def save_env_file(self) -> bool: if self.config.opensearch_port and self.config.opensearch_port != "9200": f.write(f"OPENSEARCH_PORT={self._quote_env_value(self.config.opensearch_port)}\n") f.write(f"OPENSEARCH_INDEX_NAME={self._quote_env_value(self.config.opensearch_index_name)}\n") + f.write( + "OPENRAG_OPENSEARCH_NUMBER_OF_SHARDS=" + f"{self._quote_env_value(self.config.opensearch_number_of_shards)}\n" + ) + f.write( + "OPENRAG_OPENSEARCH_NUMBER_OF_REPLICAS=" + f"{self._quote_env_value(self.config.opensearch_number_of_replicas)}\n" + ) # Expand $HOME in paths before writing to .env # This ensures paths work with all compose implementations 
(docker, podman) diff --git a/src/utils/embeddings.py b/src/utils/embeddings.py index 4226324d8..01b6c5086 100644 --- a/src/utils/embeddings.py +++ b/src/utils/embeddings.py @@ -16,6 +16,8 @@ async def create_index_body( from config.embedding_constants import OPENAI_DEFAULT_EMBEDDING_MODEL from config.settings import ( ACL_PRINCIPAL_LABELS_MAPPING, + OPENSEARCH_NUMBER_OF_REPLICAS, + OPENSEARCH_NUMBER_OF_SHARDS, VECTOR_DIM, get_openrag_config, ) @@ -59,6 +61,10 @@ async def create_index_body( ) return { - "settings": {"index": {"knn": True}, "number_of_shards": 1, "number_of_replicas": 0}, + "settings": { + "index": {"knn": True}, + "number_of_shards": OPENSEARCH_NUMBER_OF_SHARDS, + "number_of_replicas": OPENSEARCH_NUMBER_OF_REPLICAS, + }, "mappings": {"properties": properties}, } diff --git a/src/utils/opensearch_init.py b/src/utils/opensearch_init.py index f365e1f5e..b5718ba3c 100644 --- a/src/utils/opensearch_init.py +++ b/src/utils/opensearch_init.py @@ -14,6 +14,8 @@ IBM_AUTH_ENABLED, INDEX_BODY, OPENRAG_SKIP_OS_SECURITY_SETUP, + OPENSEARCH_NUMBER_OF_REPLICAS, + OPENSEARCH_NUMBER_OF_SHARDS, PLATFORM_AUTH_DEV_MODE, clients, get_index_name, @@ -102,6 +104,37 @@ async def _ensure_field_mappings( ) +async def _ensure_index_replicas(os_client, index_name: str) -> None: + """Set an existing index's replica count to the configured value.""" + if IBM_AUTH_ENABLED and PLATFORM_AUTH_DEV_MODE: + return + + try: + current = await os_client.indices.get_settings(index=index_name) + current_replicas = int( + current[index_name]["settings"]["index"].get( + "number_of_replicas", OPENSEARCH_NUMBER_OF_REPLICAS + ) + ) + if current_replicas != OPENSEARCH_NUMBER_OF_REPLICAS: + await os_client.indices.put_settings( + index=index_name, + body={"index": {"number_of_replicas": OPENSEARCH_NUMBER_OF_REPLICAS}}, + ) + logger.info( + "Updated OpenSearch index replica setting", + index_name=index_name, + number_of_replicas=OPENSEARCH_NUMBER_OF_REPLICAS, + ) + except Exception as e: + 
logger.warning( + "Failed to check or update index replicas", + index_name=index_name, + desired_replicas=OPENSEARCH_NUMBER_OF_REPLICAS, + error=str(e), + ) + + async def wait_for_opensearch(opensearch_client=None): """Wait for OpenSearch to be ready, delegating to the shared utility.""" from utils.opensearch_utils import ( @@ -162,6 +195,7 @@ async def _ensure_opensearch_index(): index_name, {"allowed_principal_labels": ACL_PRINCIPAL_LABELS_MAPPING}, ) + await _ensure_index_replicas(clients.opensearch, index_name) return await clients.opensearch.indices.create(index=index_name, body=INDEX_BODY) @@ -236,25 +270,7 @@ async def init_index(opensearch_client=None, admin_username: str = None): index_name, {"allowed_principal_labels": ACL_PRINCIPAL_LABELS_MAPPING}, ) - if not (IBM_AUTH_ENABLED and PLATFORM_AUTH_DEV_MODE): - # Set number of replicas to 0 to not create unused nodes in OpenSearch, in case it was created with more replicas - try: - current = await os_client.indices.get_settings(index=index_name) - current_replicas = int( - current[index_name]["settings"]["index"].get("number_of_replicas", 1) - ) - if current_replicas != 0: - await os_client.indices.put_settings( - index=index_name, - body={"index": {"number_of_replicas": 0}}, - ) - logger.info("Updated documents index settings") - except Exception as e: - logger.warning( - "Failed to check or update index replicas", - index_name=index_name, - error=str(e), - ) + await _ensure_index_replicas(os_client, index_name) await TelemetryClient.send_event( Category.OPENSEARCH_INDEX, MessageId.ORB_OS_INDEX_EXISTS ) @@ -262,7 +278,10 @@ async def init_index(opensearch_client=None, admin_username: str = None): knowledge_filter_index_name = "knowledge_filters" knowledge_filter_index_body = { "settings": { - "index": {"number_of_replicas": 0, "number_of_shards": 1}, + "index": { + "number_of_replicas": OPENSEARCH_NUMBER_OF_REPLICAS, + "number_of_shards": OPENSEARCH_NUMBER_OF_SHARDS, + }, }, "mappings": { "properties": { @@ 
-303,28 +322,7 @@ async def init_index(opensearch_client=None, admin_username: str = None): ["allowed_users", "allowed_groups", "allowed_principals"], ) - if not (IBM_AUTH_ENABLED and PLATFORM_AUTH_DEV_MODE): - try: - current = await os_client.indices.get_settings( - index=knowledge_filter_index_name - ) - current_replicas = int( - current[knowledge_filter_index_name]["settings"]["index"].get( - "number_of_replicas", 1 - ) - ) - if current_replicas != 0: - await os_client.indices.put_settings( - index=knowledge_filter_index_name, - body={"index": {"number_of_replicas": 0}}, - ) - logger.info("Updated knowledge filters index settings") - except Exception as e: - logger.warning( - "Failed to check or update knowledge filter index replicas", - index_name=knowledge_filter_index_name, - error=str(e), - ) + await _ensure_index_replicas(os_client, knowledge_filter_index_name) if not await os_client.indices.exists(index=API_KEYS_INDEX_NAME): await os_client.indices.create(index=API_KEYS_INDEX_NAME, body=API_KEYS_INDEX_BODY) @@ -334,6 +332,7 @@ async def init_index(opensearch_client=None, admin_username: str = None): "API keys index already exists, skipping creation", index_name=API_KEYS_INDEX_NAME, ) + await _ensure_index_replicas(os_client, API_KEYS_INDEX_NAME) if not await os_client.indices.exists(index=DLS_PRINCIPAL_INDEX_NAME): await os_client.indices.create( @@ -356,6 +355,7 @@ async def init_index(opensearch_client=None, admin_username: str = None): DLS_PRINCIPAL_INDEX_NAME, {"principal_labels": ACL_PRINCIPAL_LABELS_MAPPING}, ) + await _ensure_index_replicas(os_client, DLS_PRINCIPAL_INDEX_NAME) await configure_alerting_security() diff --git a/tests/unit/test_embedding_fields.py b/tests/unit/test_embedding_fields.py index 1fa1acbbb..3d56e89cf 100644 --- a/tests/unit/test_embedding_fields.py +++ b/tests/unit/test_embedding_fields.py @@ -124,3 +124,17 @@ async def test_create_index_body_precreates_configured_embedding_field( assert properties[embedding_field] == 
build_knn_vector_field(3072) assert properties["owner_email"] == {"type": "keyword"} + + @pytest.mark.asyncio + async def test_create_index_body_uses_configured_shards_and_replicas( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + monkeypatch.setattr("config.settings.OPENSEARCH_NUMBER_OF_SHARDS", 3) + monkeypatch.setattr("config.settings.OPENSEARCH_NUMBER_OF_REPLICAS", 2) + + from utils.embeddings import create_index_body + + body = await create_index_body("text-embedding-3-small", 1536) + + assert body["settings"]["number_of_shards"] == 3 + assert body["settings"]["number_of_replicas"] == 2 From f0901d2c98425215fc2a62e7550ed9c80122bdfe Mon Sep 17 00:00:00 2001 From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com> Date: Tue, 2 Jun 2026 18:00:46 +0000 Subject: [PATCH 2/2] style: ruff autofix (auto) --- src/tui/config_fields.py | 611 +++++++++++++++++++------------- src/tui/managers/env_manager.py | 129 ++++--- 2 files changed, 448 insertions(+), 292 deletions(-) diff --git a/src/tui/config_fields.py b/src/tui/config_fields.py index 21be6d0a1..412def39e 100644 --- a/src/tui/config_fields.py +++ b/src/tui/config_fields.py @@ -4,13 +4,15 @@ Both the TUI config screen and the CLI wizard consume these definitions. 
""" -from dataclasses import dataclass, field as dataclass_field -from typing import Callable, Optional +from collections.abc import Callable +from dataclasses import dataclass +from dataclasses import field as dataclass_field +from typing import Optional from .utils.validation import ( - validate_openai_api_key, validate_anthropic_api_key, validate_ollama_endpoint, + validate_openai_api_key, validate_watsonx_endpoint, ) @@ -28,7 +30,7 @@ class ConfigField: required: bool = False advanced: bool = False # only shown in full/advanced mode helper_text: str = "" - validator: Optional[Callable[[str], bool]] = None + validator: Callable[[str], bool] | None = None validator_error: str = "" @@ -44,253 +46,378 @@ class ConfigSection: CONFIG_SECTIONS: list[ConfigSection] = [ # ── Security ──────────────────────────────────────────────── - ConfigSection("Security", [ - ConfigField( - "openrag_encryption_key", "OPENRAG_ENCRYPTION_KEY", "OpenRAG Master Key", - placeholder="Auto-generated secure Base64 key", - secret=True, required=True, - helper_text="32-byte Base64 key for securing your database credentials (auto-generates if empty)", - ), - ConfigField( - "openrag_tenant_id", "OPENRAG_TENANT_ID", "Tenant ID", - placeholder="openrag", default="openrag", - helper_text="Identifier for AAD tenant binding (default: openrag)", - ), - ConfigField( - "openrag_enforce_prerequisites", "OPENRAG_ENFORCE_PREREQUISITES", "Enforce Prerequisites", - placeholder="false", default="false", - advanced=True, - helper_text="If true, application will fail to start if the encryption key is missing", - ), - ]), - + ConfigSection( + "Security", + [ + ConfigField( + "openrag_encryption_key", + "OPENRAG_ENCRYPTION_KEY", + "OpenRAG Master Key", + placeholder="Auto-generated secure Base64 key", + secret=True, + required=True, + helper_text="32-byte Base64 key for securing your database credentials (auto-generates if empty)", + ), + ConfigField( + "openrag_tenant_id", + "OPENRAG_TENANT_ID", + "Tenant ID", 
+ placeholder="openrag", + default="openrag", + helper_text="Identifier for AAD tenant binding (default: openrag)", + ), + ConfigField( + "openrag_enforce_prerequisites", + "OPENRAG_ENFORCE_PREREQUISITES", + "Enforce Prerequisites", + placeholder="false", + default="false", + advanced=True, + helper_text="If true, application will fail to start if the encryption key is missing", + ), + ], + ), # ── OpenSearch ────────────────────────────────────────────── - ConfigSection("OpenSearch", [ - ConfigField( - "opensearch_password", "OPENSEARCH_PASSWORD", "Admin Password", - placeholder="Auto-generated secure password", - secret=True, required=True, - helper_text="Validate your password here: https://lowe.github.io/tryzxcvbn/", - ), - ConfigField( - "opensearch_username", "OPENSEARCH_USERNAME", "Admin Username", - placeholder="admin", default="admin", - helper_text="OpenSearch admin username (default: admin)", - ), - ConfigField( - "opensearch_host", "OPENSEARCH_HOST", "Host", - placeholder="opensearch", default="opensearch", - helper_text="Override for remote OpenSearch instances (default: opensearch)", - ), - ConfigField( - "opensearch_port", "OPENSEARCH_PORT", "Port", - placeholder="9200", default="9200", - helper_text="Override for remote OpenSearch instances (default: 9200)", - ), - ConfigField( - "opensearch_index_name", "OPENSEARCH_INDEX_NAME", "Index Name", - placeholder="documents", default="documents", - helper_text="Name of the index to use in OpenSearch", - ), - ConfigField( - "opensearch_number_of_shards", "OPENRAG_OPENSEARCH_NUMBER_OF_SHARDS", "Primary Shards", - placeholder="1", default="1", advanced=True, - helper_text="Primary shard count for newly-created OpenRAG indices", - ), - ConfigField( - "opensearch_number_of_replicas", "OPENRAG_OPENSEARCH_NUMBER_OF_REPLICAS", "Replica Shards", - placeholder="0", default="0", advanced=True, - helper_text="Replica shard count for OpenRAG indices", - ), - ]), - + ConfigSection( + "OpenSearch", + [ + ConfigField( + 
"opensearch_password", + "OPENSEARCH_PASSWORD", + "Admin Password", + placeholder="Auto-generated secure password", + secret=True, + required=True, + helper_text="Validate your password here: https://lowe.github.io/tryzxcvbn/", + ), + ConfigField( + "opensearch_username", + "OPENSEARCH_USERNAME", + "Admin Username", + placeholder="admin", + default="admin", + helper_text="OpenSearch admin username (default: admin)", + ), + ConfigField( + "opensearch_host", + "OPENSEARCH_HOST", + "Host", + placeholder="opensearch", + default="opensearch", + helper_text="Override for remote OpenSearch instances (default: opensearch)", + ), + ConfigField( + "opensearch_port", + "OPENSEARCH_PORT", + "Port", + placeholder="9200", + default="9200", + helper_text="Override for remote OpenSearch instances (default: 9200)", + ), + ConfigField( + "opensearch_index_name", + "OPENSEARCH_INDEX_NAME", + "Index Name", + placeholder="documents", + default="documents", + helper_text="Name of the index to use in OpenSearch", + ), + ConfigField( + "opensearch_number_of_shards", + "OPENRAG_OPENSEARCH_NUMBER_OF_SHARDS", + "Primary Shards", + placeholder="1", + default="1", + advanced=True, + helper_text="Primary shard count for newly-created OpenRAG indices", + ), + ConfigField( + "opensearch_number_of_replicas", + "OPENRAG_OPENSEARCH_NUMBER_OF_REPLICAS", + "Replica Shards", + placeholder="0", + default="0", + advanced=True, + helper_text="Replica shard count for OpenRAG indices", + ), + ], + ), # ── Langflow ──────────────────────────────────────────────── - ConfigSection("Langflow", [ - ConfigField( - "langflow_superuser_password", "LANGFLOW_SUPERUSER_PASSWORD", - "Admin Password", - placeholder="Langflow password", secret=True, - helper_text="Leave empty for autologin (no password required)", - ), - ConfigField( - "langflow_superuser", "LANGFLOW_SUPERUSER", "Admin Username", - placeholder="admin", default="admin", - ), - ConfigField( - "langflow_data_path", "LANGFLOW_DATA_PATH", "Data Path", - 
placeholder="~/.openrag/data/langflow-data", - default="$HOME/.openrag/data/langflow-data", - helper_text="Directory to persist Langflow flows and state across restarts", - ), - ConfigField( - "langflow_public_url", "LANGFLOW_PUBLIC_URL", "Public URL", - placeholder="http://localhost:7860", - helper_text="External URL for Langflow access", - advanced=True, - ), - ]), - + ConfigSection( + "Langflow", + [ + ConfigField( + "langflow_superuser_password", + "LANGFLOW_SUPERUSER_PASSWORD", + "Admin Password", + placeholder="Langflow password", + secret=True, + helper_text="Leave empty for autologin (no password required)", + ), + ConfigField( + "langflow_superuser", + "LANGFLOW_SUPERUSER", + "Admin Username", + placeholder="admin", + default="admin", + ), + ConfigField( + "langflow_data_path", + "LANGFLOW_DATA_PATH", + "Data Path", + placeholder="~/.openrag/data/langflow-data", + default="$HOME/.openrag/data/langflow-data", + helper_text="Directory to persist Langflow flows and state across restarts", + ), + ConfigField( + "langflow_public_url", + "LANGFLOW_PUBLIC_URL", + "Public URL", + placeholder="http://localhost:7860", + helper_text="External URL for Langflow access", + advanced=True, + ), + ], + ), # ── AI Providers ──────────────────────────────────────────── - ConfigSection("AI Providers", [ - ConfigField( - "openai_api_key", "OPENAI_API_KEY", "OpenAI API Key", - placeholder="sk-...", secret=True, - helper_text="Get a key: https://platform.openai.com/api-keys", - validator=validate_openai_api_key, - validator_error="Invalid OpenAI API key format (should start with sk-)", - ), - ConfigField( - "anthropic_api_key", "ANTHROPIC_API_KEY", "Anthropic API Key", - placeholder="sk-ant-...", secret=True, - helper_text="Get a key: https://console.anthropic.com/settings/keys", - validator=validate_anthropic_api_key, - validator_error="Invalid Anthropic API key format (should start with sk-ant-)", - ), - ConfigField( - "ollama_endpoint", "OLLAMA_ENDPOINT", "Ollama Base URL", - 
placeholder="http://localhost:11434", - helper_text="Endpoint of your Ollama server", - validator=validate_ollama_endpoint, - validator_error="Invalid Ollama endpoint URL format", - ), - ConfigField( - "watsonx_api_key", "WATSONX_API_KEY", "IBM watsonx.ai API Key", - placeholder="", secret=True, - helper_text="Get a key: https://cloud.ibm.com/iam/apikeys", - ), - ConfigField( - "watsonx_endpoint", "WATSONX_ENDPOINT", "IBM watsonx.ai Endpoint", - placeholder="https://us-south.ml.cloud.ibm.com", - helper_text="Example: https://us-south.ml.cloud.ibm.com", - validator=validate_watsonx_endpoint, - validator_error="Invalid watsonx.ai endpoint URL format", - ), - ConfigField( - "watsonx_project_id", "WATSONX_PROJECT_ID", "IBM watsonx.ai Project ID", - placeholder="", - helper_text="Find in your IBM Cloud project settings", - ), - ]), - + ConfigSection( + "AI Providers", + [ + ConfigField( + "openai_api_key", + "OPENAI_API_KEY", + "OpenAI API Key", + placeholder="sk-...", + secret=True, + helper_text="Get a key: https://platform.openai.com/api-keys", + validator=validate_openai_api_key, + validator_error="Invalid OpenAI API key format (should start with sk-)", + ), + ConfigField( + "anthropic_api_key", + "ANTHROPIC_API_KEY", + "Anthropic API Key", + placeholder="sk-ant-...", + secret=True, + helper_text="Get a key: https://console.anthropic.com/settings/keys", + validator=validate_anthropic_api_key, + validator_error="Invalid Anthropic API key format (should start with sk-ant-)", + ), + ConfigField( + "ollama_endpoint", + "OLLAMA_ENDPOINT", + "Ollama Base URL", + placeholder="http://localhost:11434", + helper_text="Endpoint of your Ollama server", + validator=validate_ollama_endpoint, + validator_error="Invalid Ollama endpoint URL format", + ), + ConfigField( + "watsonx_api_key", + "WATSONX_API_KEY", + "IBM watsonx.ai API Key", + placeholder="", + secret=True, + helper_text="Get a key: https://cloud.ibm.com/iam/apikeys", + ), + ConfigField( + "watsonx_endpoint", + 
"WATSONX_ENDPOINT", + "IBM watsonx.ai Endpoint", + placeholder="https://us-south.ml.cloud.ibm.com", + helper_text="Example: https://us-south.ml.cloud.ibm.com", + validator=validate_watsonx_endpoint, + validator_error="Invalid watsonx.ai endpoint URL format", + ), + ConfigField( + "watsonx_project_id", + "WATSONX_PROJECT_ID", + "IBM watsonx.ai Project ID", + placeholder="", + helper_text="Find in your IBM Cloud project settings", + ), + ], + ), # ── Google OAuth ──────────────────────────────────────────── - ConfigSection("Google OAuth", [ - ConfigField( - "google_oauth_client_id", "GOOGLE_OAUTH_CLIENT_ID", "Client ID", - placeholder="xxx.apps.googleusercontent.com", - helper_text="Create credentials: https://console.cloud.google.com/apis/credentials", - ), - ConfigField( - "google_oauth_client_secret", "GOOGLE_OAUTH_CLIENT_SECRET", - "Client Secret", - placeholder="", secret=True, - ), - ], advanced=True, gate_prompt="Configure Google OAuth?"), - + ConfigSection( + "Google OAuth", + [ + ConfigField( + "google_oauth_client_id", + "GOOGLE_OAUTH_CLIENT_ID", + "Client ID", + placeholder="xxx.apps.googleusercontent.com", + helper_text="Create credentials: https://console.cloud.google.com/apis/credentials", + ), + ConfigField( + "google_oauth_client_secret", + "GOOGLE_OAUTH_CLIENT_SECRET", + "Client Secret", + placeholder="", + secret=True, + ), + ], + advanced=True, + gate_prompt="Configure Google OAuth?", + ), # ── Microsoft Graph OAuth ─────────────────────────────────── - ConfigSection("Microsoft Graph OAuth", [ - ConfigField( - "microsoft_graph_oauth_client_id", "MICROSOFT_GRAPH_OAUTH_CLIENT_ID", - "Client ID", - placeholder="", - helper_text="Create app: https://portal.azure.com/#view/Microsoft_AAD_RegisteredApps/ApplicationsListBlade", - ), - ConfigField( - "microsoft_graph_oauth_client_secret", - "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET", "Client Secret", - placeholder="", secret=True, - ), - ], advanced=True, gate_prompt="Configure Microsoft Graph OAuth?"), - + 
ConfigSection( + "Microsoft Graph OAuth", + [ + ConfigField( + "microsoft_graph_oauth_client_id", + "MICROSOFT_GRAPH_OAUTH_CLIENT_ID", + "Client ID", + placeholder="", + helper_text="Create app: https://portal.azure.com/#view/Microsoft_AAD_RegisteredApps/ApplicationsListBlade", + ), + ConfigField( + "microsoft_graph_oauth_client_secret", + "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET", + "Client Secret", + placeholder="", + secret=True, + ), + ], + advanced=True, + gate_prompt="Configure Microsoft Graph OAuth?", + ), # ── AWS ───────────────────────────────────────────────────── - ConfigSection("AWS", [ - ConfigField( - "aws_access_key_id", "AWS_ACCESS_KEY_ID", "Access Key ID", - placeholder="", - helper_text="Create keys: https://console.aws.amazon.com/iam/home#/security_credentials", - ), - ConfigField( - "aws_secret_access_key", "AWS_SECRET_ACCESS_KEY", "Secret Access Key", - placeholder="", secret=True, - ), - ConfigField( - "aws_s3_endpoint", "AWS_S3_ENDPOINT", "S3 Endpoint URL (optional)", - placeholder="", - helper_text="Leave empty for AWS S3. For MinIO, R2, or other S3-compatible services, enter the endpoint URL.", - ), - ConfigField( - "aws_region", "AWS_REGION", "AWS Region (optional)", - placeholder="us-east-1", - default="us-east-1", - helper_text="AWS region (e.g. us-east-1, eu-west-1). Default: us-east-1.", - ), - ], advanced=True, gate_prompt="Configure AWS credentials?"), - + ConfigSection( + "AWS", + [ + ConfigField( + "aws_access_key_id", + "AWS_ACCESS_KEY_ID", + "Access Key ID", + placeholder="", + helper_text="Create keys: https://console.aws.amazon.com/iam/home#/security_credentials", + ), + ConfigField( + "aws_secret_access_key", + "AWS_SECRET_ACCESS_KEY", + "Secret Access Key", + placeholder="", + secret=True, + ), + ConfigField( + "aws_s3_endpoint", + "AWS_S3_ENDPOINT", + "S3 Endpoint URL (optional)", + placeholder="", + helper_text="Leave empty for AWS S3. 
For MinIO, R2, or other S3-compatible services, enter the endpoint URL.", + ), + ConfigField( + "aws_region", + "AWS_REGION", + "AWS Region (optional)", + placeholder="us-east-1", + default="us-east-1", + helper_text="AWS region (e.g. us-east-1, eu-west-1). Default: us-east-1.", + ), + ], + advanced=True, + gate_prompt="Configure AWS credentials?", + ), # ── IBM Cloud Object Storage ───────────────────────────────── - ConfigSection("IBM Cloud Object Storage", [ - ConfigField( - "ibm_cos_api_key", "IBM_COS_API_KEY", "API Key", - placeholder="", - helper_text="Create API key at https://cloud.ibm.com/iam/apikeys", - secret=True, - ), - ConfigField( - "ibm_cos_service_instance_id", "IBM_COS_SERVICE_INSTANCE_ID", - "Service Instance ID (CRN)", - placeholder="crn:v1:bluemix:...", - ), - ConfigField( - "ibm_cos_endpoint", "IBM_COS_ENDPOINT", "Service Endpoint", - placeholder="https://s3.us-south.cloud-object-storage.appdomain.cloud", - helper_text="Endpoints: https://cloud.ibm.com/docs/cloud-object-storage?topic=cloud-object-storage-endpoints", - ), - ConfigField( - "ibm_cos_hmac_access_key_id", "IBM_COS_HMAC_ACCESS_KEY_ID", - "HMAC Access Key ID (optional)", - placeholder="", - ), - ConfigField( - "ibm_cos_hmac_secret_access_key", "IBM_COS_HMAC_SECRET_ACCESS_KEY", - "HMAC Secret Access Key (optional)", - placeholder="", secret=True, - ), - ], advanced=True, gate_prompt="Configure IBM Cloud Object Storage?"), - + ConfigSection( + "IBM Cloud Object Storage", + [ + ConfigField( + "ibm_cos_api_key", + "IBM_COS_API_KEY", + "API Key", + placeholder="", + helper_text="Create API key at https://cloud.ibm.com/iam/apikeys", + secret=True, + ), + ConfigField( + "ibm_cos_service_instance_id", + "IBM_COS_SERVICE_INSTANCE_ID", + "Service Instance ID (CRN)", + placeholder="crn:v1:bluemix:...", + ), + ConfigField( + "ibm_cos_endpoint", + "IBM_COS_ENDPOINT", + "Service Endpoint", + placeholder="https://s3.us-south.cloud-object-storage.appdomain.cloud", + helper_text="Endpoints: 
https://cloud.ibm.com/docs/cloud-object-storage?topic=cloud-object-storage-endpoints", + ), + ConfigField( + "ibm_cos_hmac_access_key_id", + "IBM_COS_HMAC_ACCESS_KEY_ID", + "HMAC Access Key ID (optional)", + placeholder="", + ), + ConfigField( + "ibm_cos_hmac_secret_access_key", + "IBM_COS_HMAC_SECRET_ACCESS_KEY", + "HMAC Secret Access Key (optional)", + placeholder="", + secret=True, + ), + ], + advanced=True, + gate_prompt="Configure IBM Cloud Object Storage?", + ), # ── Langfuse ──────────────────────────────────────────────── - ConfigSection("Langfuse", [ - ConfigField( - "langfuse_secret_key", "LANGFUSE_SECRET_KEY", "Secret Key", - placeholder="sk-lf-...", secret=True, - helper_text="Get keys from your Langfuse project settings", - ), - ConfigField( - "langfuse_public_key", "LANGFUSE_PUBLIC_KEY", "Public Key", - placeholder="pk-lf-...", secret=True, - ), - ConfigField( - "langfuse_host", "LANGFUSE_HOST", "Host", - placeholder="https://cloud.langfuse.com", - helper_text="Leave empty for Langfuse Cloud, or set for self-hosted", - ), - ], gate_prompt="Configure Langfuse tracing?"), - + ConfigSection( + "Langfuse", + [ + ConfigField( + "langfuse_secret_key", + "LANGFUSE_SECRET_KEY", + "Secret Key", + placeholder="sk-lf-...", + secret=True, + helper_text="Get keys from your Langfuse project settings", + ), + ConfigField( + "langfuse_public_key", + "LANGFUSE_PUBLIC_KEY", + "Public Key", + placeholder="pk-lf-...", + secret=True, + ), + ConfigField( + "langfuse_host", + "LANGFUSE_HOST", + "Host", + placeholder="https://cloud.langfuse.com", + helper_text="Leave empty for Langfuse Cloud, or set for self-hosted", + ), + ], + gate_prompt="Configure Langfuse tracing?", + ), # ── Storage ───────────────────────────────────────────────── - ConfigSection("Storage", [ - ConfigField( - "openrag_documents_paths", "OPENRAG_DOCUMENTS_PATHS", "Documents Paths", - placeholder="~/.openrag/documents", - default="$HOME/.openrag/documents", - helper_text="Directories containing 
documents to ingest (comma-separated)", - ), - ]), - + ConfigSection( + "Storage", + [ + ConfigField( + "openrag_documents_paths", + "OPENRAG_DOCUMENTS_PATHS", + "Documents Paths", + placeholder="~/.openrag/documents", + default="$HOME/.openrag/documents", + helper_text="Directories containing documents to ingest (comma-separated)", + ), + ], + ), # ── Advanced ──────────────────────────────────────────────── - ConfigSection("Advanced", [ - ConfigField( - "webhook_base_url", "WEBHOOK_BASE_URL", "Webhook Base URL", - placeholder="https://your-domain.com", - helper_text="External URL for continuous ingestion webhooks", - ), - ], advanced=True), + ConfigSection( + "Advanced", + [ + ConfigField( + "webhook_base_url", + "WEBHOOK_BASE_URL", + "Webhook Base URL", + placeholder="https://your-domain.com", + helper_text="External URL for continuous ingestion webhooks", + ), + ], + advanced=True, + ), ] @@ -299,7 +426,7 @@ def get_all_fields() -> list[ConfigField]: return [f for section in CONFIG_SECTIONS for f in section.fields] -def get_field(name: str) -> Optional[ConfigField]: +def get_field(name: str) -> ConfigField | None: """Look up a config field by attribute name.""" for f in get_all_fields(): if f.name == name: diff --git a/src/tui/managers/env_manager.py b/src/tui/managers/env_manager.py index f93604975..0ac3358f6 100644 --- a/src/tui/managers/env_manager.py +++ b/src/tui/managers/env_manager.py @@ -10,6 +10,7 @@ from typing import Dict, List, Optional from dotenv import load_dotenv + from utils.logging_config import get_logger from ..utils.validation import ( @@ -106,20 +107,21 @@ class EnvConfig: openrag_version: str = "" # Validation errors - validation_errors: Dict[str, str] = field(default_factory=dict) + validation_errors: dict[str, str] = field(default_factory=dict) class EnvManager: """Manages environment configuration for OpenRAG.""" assignment_pattern = re.compile(r"^\s*(?:export\s+)?([A-Za-z_][A-Za-z0-9_]*)\s*=") - - def __init__(self, env_file: 
Optional[Path] = None): + + def __init__(self, env_file: Path | None = None): if env_file: self.env_file = env_file else: # Use centralized location for TUI .env file - from utils.paths import get_tui_env_file, get_legacy_paths + from utils.paths import get_legacy_paths, get_tui_env_file + self.env_file = get_tui_env_file() # Check for legacy .env in current directory and migrate if needed @@ -127,17 +129,17 @@ def __init__(self, env_file: Optional[Path] = None): if not self.env_file.exists() and legacy_env.exists(): try: import shutil + self.env_file.parent.mkdir(parents=True, exist_ok=True) shutil.copy2(legacy_env, self.env_file) os.chmod(self.env_file, 0o600) logger.info(f"Migrated .env from {legacy_env} to {self.env_file}") - except Exception as e: logger.warning(f"Failed to migrate .env file: {e}") self.config = EnvConfig() - + def generate_secure_password(self) -> str: """Generate a secure password for OpenSearch.""" # Ensure at least one character from each category @@ -167,6 +169,7 @@ def generate_langflow_secret_key(self) -> str: def generate_openrag_encryption_key(self) -> str: """Generate a secure AES-256 base64 master key for OpenRAG.""" import base64 + return base64.b64encode(secrets.token_bytes(32)).decode("ascii") def _quote_env_value(self, value: str) -> str: @@ -178,7 +181,7 @@ def _quote_env_value(self, value: str) -> str: escaped_value = value.replace("'", "'\\''") return f"'{escaped_value}'" - def _env_attr_map(self) -> Dict[str, str]: + def _env_attr_map(self) -> dict[str, str]: """Map env vars to EnvConfig attribute names.""" return { # pragma: allowlist secret "OPENAI_API_KEY": "openai_api_key", # pragma: allowlist secret @@ -317,6 +320,7 @@ def setup_secure_defaults(self) -> None: if not self.config.openrag_version: try: from ..utils.version_check import get_current_version + current_version = get_current_version() if current_version != "unknown": self.config.openrag_version = current_version @@ -378,17 +382,13 @@ def validate_config(self, 
mode: str = "full") -> bool: # Validate documents paths only if provided (optional) if self.config.openrag_documents_paths: - is_valid, error_msg, _ = validate_documents_paths( - self.config.openrag_documents_paths - ) + is_valid, error_msg, _ = validate_documents_paths(self.config.openrag_documents_paths) if not is_valid: self.config.validation_errors["openrag_documents_paths"] = error_msg # Validate required fields if not validate_non_empty(self.config.opensearch_password): - self.config.validation_errors["opensearch_password"] = ( - "OpenSearch password is required" - ) + self.config.validation_errors["opensearch_password"] = "OpenSearch password is required" # Langflow secret key is auto-generated; no user input required @@ -396,11 +396,8 @@ def validate_config(self, mode: str = "full") -> bool: if mode == "full": # Validate OAuth settings if provided - if ( + if self.config.google_oauth_client_id and not validate_google_oauth_client_id( self.config.google_oauth_client_id - and not validate_google_oauth_client_id( - self.config.google_oauth_client_id - ) ): self.config.validation_errors["google_oauth_client_id"] = ( "Invalid Google OAuth client ID format" @@ -421,12 +418,8 @@ def validate_config(self, mode: str = "full") -> bool: ) # Validate optional URLs if provided - if self.config.webhook_base_url and not validate_url( - self.config.webhook_base_url - ): - self.config.validation_errors["webhook_base_url"] = ( - "Invalid webhook URL format" - ) + if self.config.webhook_base_url and not validate_url(self.config.webhook_base_url): + self.config.validation_errors["webhook_base_url"] = "Invalid webhook URL format" if self.config.langflow_public_url and not validate_url( self.config.langflow_public_url @@ -458,30 +451,54 @@ def save_env_file(self) -> bool: # Core settings f.write("# Core settings\n") - f.write(f"LANGFLOW_SECRET_KEY={self._quote_env_value(self.config.langflow_secret_key)}\n") + f.write( + 
f"LANGFLOW_SECRET_KEY={self._quote_env_value(self.config.langflow_secret_key)}\n" + ) # Only write LANGFLOW_SUPERUSER and password if password is set if self.config.langflow_superuser_password: - f.write(f"LANGFLOW_SUPERUSER={self._quote_env_value(self.config.langflow_superuser)}\n") + f.write( + f"LANGFLOW_SUPERUSER={self._quote_env_value(self.config.langflow_superuser)}\n" + ) f.write( f"LANGFLOW_SUPERUSER_PASSWORD={self._quote_env_value(self.config.langflow_superuser_password)}\n" ) - f.write(f"LANGFLOW_CHAT_FLOW_ID={self._quote_env_value(self.config.langflow_chat_flow_id)}\n") + f.write( + f"LANGFLOW_CHAT_FLOW_ID={self._quote_env_value(self.config.langflow_chat_flow_id)}\n" + ) f.write( f"LANGFLOW_INGEST_FLOW_ID={self._quote_env_value(self.config.langflow_ingest_flow_id)}\n" ) - f.write(f"LANGFLOW_URL_INGEST_FLOW_ID={self._quote_env_value(self.config.langflow_url_ingest_flow_id)}\n") + f.write( + f"LANGFLOW_URL_INGEST_FLOW_ID={self._quote_env_value(self.config.langflow_url_ingest_flow_id)}\n" + ) f.write(f"NUDGES_FLOW_ID={self._quote_env_value(self.config.nudges_flow_id)}\n") - f.write(f"OPENRAG_ENCRYPTION_KEY={self._quote_env_value(self.config.openrag_encryption_key)}\n") - f.write(f"OPENRAG_TENANT_ID={self._quote_env_value(self.config.openrag_tenant_id)}\n") - f.write(f"OPENRAG_ENFORCE_PREREQUISITES={self._quote_env_value(self.config.openrag_enforce_prerequisites)}\n") - f.write(f"OPENSEARCH_PASSWORD={self._quote_env_value(self.config.opensearch_password)}\n") + f.write( + f"OPENRAG_ENCRYPTION_KEY={self._quote_env_value(self.config.openrag_encryption_key)}\n" + ) + f.write( + f"OPENRAG_TENANT_ID={self._quote_env_value(self.config.openrag_tenant_id)}\n" + ) + f.write( + f"OPENRAG_ENFORCE_PREREQUISITES={self._quote_env_value(self.config.openrag_enforce_prerequisites)}\n" + ) + f.write( + f"OPENSEARCH_PASSWORD={self._quote_env_value(self.config.opensearch_password)}\n" + ) if self.config.opensearch_username and self.config.opensearch_username != "admin": - 
f.write(f"OPENSEARCH_USERNAME={self._quote_env_value(self.config.opensearch_username)}\n") + f.write( + f"OPENSEARCH_USERNAME={self._quote_env_value(self.config.opensearch_username)}\n" + ) if self.config.opensearch_host and self.config.opensearch_host != "opensearch": - f.write(f"OPENSEARCH_HOST={self._quote_env_value(self.config.opensearch_host)}\n") + f.write( + f"OPENSEARCH_HOST={self._quote_env_value(self.config.opensearch_host)}\n" + ) if self.config.opensearch_port and self.config.opensearch_port != "9200": - f.write(f"OPENSEARCH_PORT={self._quote_env_value(self.config.opensearch_port)}\n") - f.write(f"OPENSEARCH_INDEX_NAME={self._quote_env_value(self.config.opensearch_index_name)}\n") + f.write( + f"OPENSEARCH_PORT={self._quote_env_value(self.config.opensearch_port)}\n" + ) + f.write( + f"OPENSEARCH_INDEX_NAME={self._quote_env_value(self.config.opensearch_index_name)}\n" + ) f.write( "OPENRAG_OPENSEARCH_NUMBER_OF_SHARDS=" f"{self._quote_env_value(self.config.opensearch_number_of_shards)}\n" @@ -494,6 +511,7 @@ def save_env_file(self) -> bool: # Expand $HOME in paths before writing to .env # This ensures paths work with all compose implementations (docker, podman) from utils.paths import expand_path + f.write( f"OPENRAG_DOCUMENTS_PATHS={self._quote_env_value(expand_path(self.config.openrag_documents_paths))}\n" ) @@ -521,11 +539,14 @@ def save_env_file(self) -> bool: ) # Set OPENRAG_VERSION to TUI version if self.config.openrag_version: - f.write(f"OPENRAG_VERSION={self._quote_env_value(self.config.openrag_version)}\n") + f.write( + f"OPENRAG_VERSION={self._quote_env_value(self.config.openrag_version)}\n" + ) else: # Fallback: try to get current version try: from ..utils.version_check import get_current_version + current_version = get_current_version() if current_version != "unknown": f.write(f"OPENRAG_VERSION={self._quote_env_value(current_version)}\n") @@ -556,13 +577,19 @@ def save_env_file(self) -> bool: # Ingestion settings f.write("# Ingestion 
settings\n") - f.write(f"DISABLE_INGEST_WITH_LANGFLOW={self._quote_env_value(self.config.disable_ingest_with_langflow)}\n") - f.write(f"INGEST_SAMPLE_DATA={self._quote_env_value(self.config.ingest_sample_data)}\n") + f.write( + f"DISABLE_INGEST_WITH_LANGFLOW={self._quote_env_value(self.config.disable_ingest_with_langflow)}\n" + ) + f.write( + f"INGEST_SAMPLE_DATA={self._quote_env_value(self.config.ingest_sample_data)}\n" + ) f.write("\n") # Langflow auth settings f.write("# Langflow auth settings\n") - f.write(f"LANGFLOW_AUTO_LOGIN={self._quote_env_value(self.config.langflow_auto_login)}\n") + f.write( + f"LANGFLOW_AUTO_LOGIN={self._quote_env_value(self.config.langflow_auto_login)}\n" + ) f.write( f"LANGFLOW_NEW_USER_IS_ACTIVE={self._quote_env_value(self.config.langflow_new_user_is_active)}\n" ) @@ -572,10 +599,7 @@ def save_env_file(self) -> bool: f.write("\n") # OAuth settings - if ( - self.config.google_oauth_client_id - or self.config.google_oauth_client_secret - ): + if self.config.google_oauth_client_id or self.config.google_oauth_client_secret: f.write("# Google OAuth settings\n") f.write( f"GOOGLE_OAUTH_CLIENT_ID={self._quote_env_value(self.config.google_oauth_client_id)}\n" @@ -658,7 +682,7 @@ def save_env_file(self) -> bool: logger.error("Error saving .env file", error=str(e)) return False - def get_no_auth_setup_fields(self) -> List[tuple[str, str, str, bool]]: + def get_no_auth_setup_fields(self) -> list[tuple[str, str, str, bool]]: """Get fields required for no-auth setup mode. Returns (field_name, display_name, placeholder, can_generate).""" return [ ("openai_api_key", "OpenAI API Key", "sk-... 
or leave empty", False), @@ -688,7 +712,7 @@ def get_no_auth_setup_fields(self) -> List[tuple[str, str, str, bool]]: ), ] - def get_full_setup_fields(self) -> List[tuple[str, str, str, bool]]: + def get_full_setup_fields(self) -> list[tuple[str, str, str, bool]]: """Get all fields for full setup mode.""" base_fields = self.get_no_auth_setup_fields() @@ -759,6 +783,7 @@ def ensure_openrag_version(self) -> None: """Ensure OPENRAG_VERSION is set in .env file to match TUI version.""" try: from ..utils.version_check import get_current_version + current_version = get_current_version() if current_version == "unknown": return @@ -785,7 +810,9 @@ def ensure_openrag_version(self) -> None: for line in lines: if line.strip().startswith("OPENRAG_VERSION"): # Replace existing line - new_lines.append(f"OPENRAG_VERSION={self._quote_env_value(current_version)}") + new_lines.append( + f"OPENRAG_VERSION={self._quote_env_value(current_version)}" + ) updated = True else: new_lines.append(line) @@ -797,19 +824,21 @@ def ensure_openrag_version(self) -> None: if "LANGFLOW_DATA_PATH" in line: insert_pos = i + 1 break - new_lines.insert(insert_pos, f"OPENRAG_VERSION={self._quote_env_value(current_version)}") + new_lines.insert( + insert_pos, f"OPENRAG_VERSION={self._quote_env_value(current_version)}" + ) fd = os.open(self.env_file, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600) # Ensure pre-existing files get restricted permissions os.chmod(self.env_file, 0o600) - with os.fdopen(fd, 'w') as f: + with os.fdopen(fd, "w") as f: f.write("\n".join(new_lines) + "\n") f.flush() os.fsync(f.fileno()) else: # Create new .env file with just OPENRAG_VERSION fd = os.open(self.env_file, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600) - with os.fdopen(fd, 'w') as f: + with os.fdopen(fd, "w") as f: content = ( f"# OpenRAG Environment Configuration\n" f"# Generated by OpenRAG TUI\n\n" @@ -821,7 +850,7 @@ def ensure_openrag_version(self) -> None: except Exception as e: logger.error(f"Error ensuring 
OPENRAG_VERSION: {e}") - def generate_compose_volume_mounts(self) -> List[str]: + def generate_compose_volume_mounts(self) -> list[str]: """Generate Docker Compose volume mount strings from documents paths.""" # Expand $HOME before validation paths_str = self.config.openrag_documents_paths.replace("$HOME", str(Path.home()))