vanilla_rag/rag_system.py at main · UFResearchComputing/vanilla_rag · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import os
import json
import requests
import time
from typing import List, Dict, Any, Optional, Tuple, Iterator
from tqdm import tqdm
import re
from pymilvus import MilvusClient
from sentence_transformers import SentenceTransformer
import hashlib
from pathlib import Path
import glob

from config import *
from temporal_analysis import TemporalAnalyzer
from pattern_discovery import PatternDiscoveryAgent

# Configuration
MILVUS_URI = os.getenv("MILVUS_URI", MILVUS_URI)
COLLECTION_NAME = os.getenv("COLLECTION_NAME", COLLECTION_NAME)
LLM_API_URL = os.getenv("LLM_API_URL", LLM_API_URL)
BRAVE_API_KEY = os.getenv("BRAVE_API_KEY", BRAVE_API_KEY)
FILES_DIR = os.getenv("FILES_DIR", FILES_DIR)
META_DIR = os.getenv("META_DIR", META_DIR)
LLM_MODEL_NAME = os.getenv("LLM_MODEL_NAME", LLM_MODEL_NAME)

WOS_API_URL = os.getenv("WOS_API_URL", WOS_API_URL)
WOS_API_KEY = os.getenv("WOS_API_KEY", WOS_API_KEY)


# Initialize embedding model
embedding_model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2', device='cpu')


class RAGSystem:
    """
    Enhanced RAG system:

    - Conversation memory
    - Query expansion
    - Chain of thought reasoning
    - Tool use (Brave Search API)
    """

    def __init__(
            self,
            milvus_uri: str = MILVUS_URI,
            collection_name: str = COLLECTION_NAME,
            llm_api_url: str = LLM_API_URL,
            max_tokens: int = 2048,
            temperature: float = 0.7,
            search_limit: int = 5,
            dense_weight: float = 0.6,
            sparse_weight: float = 0.4,
            memory_enabled: bool = True,
            memory_turns: int = 3,
            query_expansion_enabled: bool = True,
            auto_load_metadata: bool = False
    ):
        """Initialize the RAG system"""
        self.milvus_client = MilvusClient(uri=milvus_uri)
        self.collection_name = collection_name

        # LLM configuration
        self.llm_api_url = llm_api_url
        self.max_tokens = max_tokens
        self.temperature = temperature

        # Search parameters
        self.search_limit = search_limit
        self.dense_weight = dense_weight
        self.sparse_weight = sparse_weight

        # Tool use
        self.use_web_search = False
        self.brave_api_key = BRAVE_API_KEY

        # Memory
        self.memory_enabled = memory_enabled
        self.memory_turns = memory_turns

        # Query expansion toggle
        self.query_expansion_enabled = query_expansion_enabled
        self.query_expansion_mode = "conservative"

        self.metadata = {}
        self.metadata_cache = {}

        self.temporal_analyzer = TemporalAnalyzer(self)
        self.pattern_discovery = PatternDiscoveryAgent(self)

        self.files_dir = Path(FILES_DIR) if FILES_DIR else None
        self.meta_dir = Path(META_DIR) if META_DIR else None

        self.use_wos_search = False
        self.wos_api_key = WOS_API_KEY
        self.wos_api_url = WOS_API_URL

        if not self._check_wos_connection():
            print("Warning: Could not connect to Web of Science API. WoS search will not work.")
            self.use_wos_search = False

        # Check if Milvus collection exists
        if not self._check_collection_exists():
            raise ValueError(f"Collection '{collection_name}' does not exist in Milvus")

        if auto_load_metadata and self.files_dir and self.meta_dir:
            try:
                print("Loading document metadata...")
                metadata_start = time.time()
                self.load_all_metadata()
                print(f"Metadata loaded in {time.time() - metadata_start:.2f}s")
            except Exception as e:
                print(f"Error loading metadata: {e}")

        # Check if LLM is accessible
        if not self._check_llm_connection():
            print("Warning: Could not connect to LLM API. Generation will not work.")

    # Check API Endpoints
    def _check_collection_exists(self) -> bool:
        """Check if Milvus collection exists"""
        try:
            collections = self.milvus_client.list_collections()
            return self.collection_name in collections
        except Exception as e:
            print(f"Error checking Milvus collection: {e}")
            return False

    def _check_llm_connection(self) -> bool:
        """Check if LLM API is accessible"""
        try:
            # health check
            response = requests.get(self.llm_api_url.replace("/v1/chat/completions", "/health"))
            return response.status_code == 200
        except Exception as e:
            print(f"Error connecting to LLM API: {e}")
            return False

    def _check_wos_connection(self) -> bool:
        """Check if WOS API accessible"""
        if not self.wos_api_key:
            print("Web of Science API key not provided")
            return False

        try:
            headers = {
                "X-ApiKey": self.wos_api_key,
                "Accept": "application/json"
            }

            # Use documents endpoint for WoS API
            test_url = f"{self.wos_api_url}/documents"

            # proper WoS query syntax with tags
            params = {
                "q": "TS=(science)",
                "limit": 1,
                "page": 1
            }

            print(f"Testing WoS connection with URL: {test_url}")
            print(f"Test query: {params['q']}")

            response = requests.get(
                test_url,
                headers=headers,
                params=params,
                timeout=10
            )

            print(f"WoS API Response Status: {response.status_code}")

            if response.status_code == 200:
                print("Web of Science API connection successful")
                data = response.json()
                total_results = data.get('metadata', {}).get('total', 'unknown')
                print(f"Test query returned {total_results} total results")
                return True
            else:
                print(f"WoS API Response: {response.text}")
                if response.status_code == 401:
                    print("Web of Science API authentication failed - check API key")
                elif response.status_code == 400:
                    print("Web of Science API bad request - query syntax issue")
                elif response.status_code == 403:
                    print("Web of Science API access forbidden - check your subscription")
                return False

        except Exception as e:
            print(f"Error connecting to Web of Science API: {e}")
            return False

    def switch_collection(self, new_collection_name: str, auto_load_metadata: bool = False) -> bool:
        """
        Switch to different Milvus collection
        :new_collection_name: name of collection
        :auto_load_metadata: load metadata immediately or not
        """

        try:
            collections = self.milvus_client.list_collections()
            if new_collection_name not in collections:
                print(f"Collection '{new_collection_name}' does not exist in Milvus")
                return False
        except Exception as e:
            print(f"Error checking collections: {e}")
            return False

        self.collection_name = new_collection_name

        # Load cached metadata
        if new_collection_name in self.metadata_cache:
            self.metadata = self.metadata_cache[new_collection_name]
            print(
                f"Switched to collection '{new_collection_name}' (loaded {len(self.metadata)} cached metadata entries)")
        else:
            self.metadata = {}
            if auto_load_metadata and self.files_dir and self.meta_dir:
                try:
                    print(f"Loading metadata for collection '{new_collection_name}'...")
                    self.load_all_metadata()
                except Exception as e:
                    print(f"Error loading metadata: {e}")
            print(f"Switched to collection '{new_collection_name}'")

        return True

    # Metadata Handling is Below
    def _get_first_value(self, text):
        """Extract first value from semicolon-separated text"""
        if not text:
            return ''
        return text.split(';')[0].strip()

    def _get_first_match(self, metadata, candidates):
        """Get first matching field from list of candidates"""
        for candidate in candidates:
            if candidate in metadata and metadata[candidate]:
                return metadata[candidate]
        return ''

    def _get_publication_date(self, date_text):
        """Enhanced publication date extraction"""
        if not date_text:
            return ''

        # Look for 4-digit years in historical range
        year_match = re.search(r'\b(1[4-8]\d{2})\b', date_text)
        if year_match:
            return year_match.group(1)

        # Handle complex EEBO format
        dates = date_text.split(';')
        for date in dates:
            date = date.strip()
            # Look for dates in brackets like [1586]
            if date.startswith('[') and date.endswith(']'):
                bracket_content = date[1:-1]
                # Extract year from bracket content
                year_match = re.search(r'\b(1[4-8]\d{2})\b', bracket_content)
                if year_match:
                    return year_match.group(1)
                return bracket_content

            # Look for plain 4-digit years in historical range
            year_match = re.search(r'\b(1[4-8]\d{2})\b', date)
            if year_match:
                return year_match.group(1)

        # Fallback to first date if no year pattern found
        return dates[0].strip() if dates else ''

    def _load_metadata(self, file_path):
        """Load metadata for specific file with field mapping"""
        try:
            # Convert text file path to metadata file path
            relative_path = Path(file_path).relative_to(self.files_dir)
            meta_file = self.meta_dir / relative_path

            if meta_file.exists():
                with open(meta_file, 'r', encoding='utf-8') as f:
                    meta_content = f.read().strip()

                # Parse the metadata text file
                metadata = {}
                current_key = None
                current_value = []

                for line in meta_content.split('\n'):
                    if ':' in line:
                        if current_key:
                            metadata[current_key] = '; '.join(current_value)

                        key, value = line.split(':', 1)
                        current_key = key.strip()
                        current_value = [value.strip()]
                    else:
                        if current_key and line.strip():
                            current_value.append(line.strip())

                if current_key:
                    metadata[current_key] = '; '.join(current_value)

                # Enhanced metadata mapping
                clean_metadata = self._map_metadata_fields(metadata)
                return clean_metadata

        except Exception as e:
            print(f"Error loading metadata for {file_path}: {e}")

        return None

    def load_all_metadata(self):
        """Load metadata for all files in directory"""
        if not self.files_dir or not self.meta_dir:
            print("Files directory and metadata directory must be set")
            return

        file_paths = glob.glob(os.path.join(self.files_dir, "**"), recursive=True)
        file_paths = [f for f in file_paths if os.path.isfile(f)]

        print(f"Loading metadata for {len(file_paths)} files...")

        for file_path in tqdm(file_paths):
            filename = os.path.basename(file_path)
            metadata = self._load_metadata(file_path)
            if metadata:
                self.metadata[filename] = metadata

        self.metadata_cache[self.collection_name] = self.metadata.copy()

        print(f"Loaded metadata for {len(self.metadata)} files")

    def _map_metadata_fields(self, metadata):
        clean_metadata = {}

        # Title mapping
        title_candidates = ['TITLE', 'Title', 'title']
        title = self._get_first_match(metadata, title_candidates)
        clean_metadata['title'] = title[:400] + '...' if len(title) > 400 else title

        # Author mapping
        author_candidates = ['AUTHOR', 'Author', 'author', 'Person', 'PERSON', 'person']
        clean_metadata['author'] = self._get_first_match(metadata, author_candidates)

        # Date mapping
        date_candidates = ['DATE', 'Date', 'date']
        date_value = self._get_first_match(metadata, date_candidates)
        clean_metadata['date'] = self._get_publication_date(date_value)

        # Language mapping
        language_candidates = ['LANGUAGE', 'Language', 'language']
        clean_metadata['language'] = self._get_first_match(metadata, language_candidates)

        # Publisher mapping
        publisher_candidates = ['PUBLISHER', 'Publisher', 'publisher']
        clean_metadata['publisher'] = self._get_first_value(self._get_first_match(metadata, publisher_candidates))

        # Publication place mapping
        pubplace_candidates = ['PUBPLACE', 'Pubplace', 'pubplace', 'Publication Place']
        clean_metadata['pubplace'] = self._get_first_value(self._get_first_match(metadata, pubplace_candidates))

        # ID mapping
        id_candidates = ['IDNO', 'ID', 'id', 'Identifier']
        clean_metadata['id'] = self._get_first_value(self._get_first_match(metadata, id_candidates))

        # Additional enhanced fields for historical documents
        collections_candidates = ['Collections', 'COLLECTIONS', 'collections']
        clean_metadata['collections'] = self._get_first_match(metadata, collections_candidates)

        category_candidates = ['Category', 'CATEGORY', 'category']
        clean_metadata['category'] = self._get_first_match(metadata, category_candidates)

        typology_candidates = ['Typology', 'TYPOLOGY', 'typology', 'Type', 'TYPE', 'type']
        clean_metadata['typology'] = self._get_first_match(metadata, typology_candidates)

        topic_candidates = ['Topic', 'TOPIC', 'topic', 'Subject', 'SUBJECT', 'subject']
        clean_metadata['topic'] = self._get_first_match(metadata, topic_candidates)

        clean_metadata = {k: v for k, v in clean_metadata.items() if v and v != 'N/A'}

        return clean_metadata

    def display_enhanced_metadata(self, doc: Dict[str, Any], show_all_fields: bool = False):
        """
        Display metadata information for a document
        """
        metadata = doc.get("metadata", {})
        base_filename = doc["filename"].split('_chunk_')[0] if '_chunk_' in doc["filename"] else doc["filename"]

        print(f"\n=== Metadata for {base_filename} ===")

        if not metadata:
            print("No metadata available for this document.")
            return

        # Essential fields
        essential_fields = {
            'title': 'Title',
            'author': 'Author',
            'date': 'Date',
            'publisher': 'Publisher',
            'pubplace': 'Publication Place',
            'language': 'Language'
        }

        # Additional fields
        additional_fields = {
            'collections': 'Collections',
            'category': 'Category',
            'typology': 'Document Type',
            'topic': 'Topic/Subject',
            'id': 'Identifier'
        }

        # Show essential fields
        for field, label in essential_fields.items():
            if field in metadata and metadata[field]:
                print(f"{label}: {metadata[field]}")

        # Show additional fields
        if show_all_fields:
            print("\nAdditional Information:")
            for field, label in additional_fields.items():
                if field in metadata and metadata[field]:
                    print(f"{label}: {metadata[field]}")

    # Query Embeddings
    def _embed_text(self, text: str) -> List[float]:
        """Generate embeddings for input"""
        embedding = embedding_model.encode(text)
        embedding = embedding / (embedding ** 2).sum() ** 0.5
        return embedding.tolist()

    # character handling
    def _escape_query(self, query: str) -> str:
        """Escape special characters in query text for TEXT_MATCH"""
        return query.replace("'", "''")

    # User query expansion - open to suggestions
    def expand_query(self, query: str) -> str:
        """
        Use the LLM to rewrite the query for better search recall, with memory awareness
        """
        # If query expansion is disabled, return original query
        if not self.query_expansion_enabled:
            return query

        # Determine prompt based on expansion mode
        if self.query_expansion_mode == "conservative":
            system_prompt = """You are a helpful assistant that rewrites queries for better search accuracy.
            Extract the core search intent from this query in a short phrase.
            RULES:
            - Identify only the main topic/concept/entity
            - Keep it EXTREMELY concise (5-10 words max)
            - Don't add concepts not in the original
            - Focus on nouns, proper names, and specific terms
            - If the original is already concise, return it unchanged
            - Return ONLY the search query without additional text
            - make it easy for an LLM and RAG system to understand"""

        elif self.query_expansion_mode == "moderate":
            system_prompt = """You are a helpful assistant that rewrites queries for better search accuracy.
            Extract the core search intent from this query in a moderate phrase.
            Your job is to make moderate improvements to the original query:
            RULES:
            - Identify only the main topic/concept/entity
            - Keep it concise (10-15 words max)
            - Don't add concepts not in the original
            - Focus on nouns, proper names, and specific terms
            - If the original is already concise, return it unchanged
            - Return ONLY the search query without additional text
            - make it easy for an LLM and RAG system to understand"""

        else:  # aggressive
            system_prompt = """You are a helpful assistant that rewrites queries for better search accuracy.
            Extract the core search intent from this query in a longer phrase.
            Your job is to expand the query to improve recall:
            RULES:
            - Identify only the main topic/concept/entity
            - Don't add concepts not in the original
            - Focus on nouns, proper names, and specific terms
            - make it easy for an LLM and RAG system to understand"""

        user_prompt = f"Rewrite the following query for better search recall: \n\n {query}. \n\n If appropriate, maintain references to entities mentioned in previous conversation."

        request_data = {
            "model": LLM_MODEL_NAME,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            "temperature": 0.2,
            "max_tokens": 100
        }

        try:
            response = requests.post(self.llm_api_url, json=request_data)
            if response.status_code == 200:
                expanded = response.json()["choices"][0]["message"]["content"].strip()

                # For conservative mode, if the expansion is too long, fall back to original
                if self.query_expansion_mode == "conservative" and len(expanded) > len(query) * 5:
                    print(f"Expansion too verbose for conservative mode, using original query")
                    return query

                print(f"Original query: {query}")
                print(f"Expanded query: {expanded}")
                return expanded
        except Exception as e:
            print(f"Query expansion failed: {e}")
        return query

    def _ensure_metadata_loaded(self, filenames: List[str]):
        """
        Lazy load metadata only for files that were retrieved
        """
        if not self.files_dir or not self.meta_dir:
            return

        missing_files = []
        for filename in filenames:
            base_filename = filename.split('_chunk_')[0] if '_chunk_' in filename else filename
            if base_filename not in self.metadata:
                missing_files.append(base_filename)

        if missing_files:
            # Remove duplicates
            missing_files = list(set(missing_files))
            print(f"Lazy loading metadata for {len(missing_files)} files...")

            for base_filename in missing_files:
                file_path = self.files_dir / base_filename
                if file_path.exists():
                    metadata = self._load_metadata(file_path)
                    if metadata:
                        self.metadata[base_filename] = metadata

            # Update cache
            self.metadata_cache[self.collection_name] = self.metadata.copy()

    # Document searching in Milvus --> (R)AG
    def retrieve(self, query: str, text_match_filter: Optional[str] = None,
                 limit: Optional[int] = None, apply_feedback: bool = False) -> List[Dict[str, Any]]:
        """
        Standard retrieve method - temporal filtering now handled separately in temporal_analyzer
        """

        if limit is None:
            limit = self.search_limit

        # Generate dense embedding for query
        query_embedding = self._embed_text(query)

        ef_value = limit * 3
        limit_multiplier = 3

        # Handle text match filter only
        expr = None
        if text_match_filter:
            escaped_filter = self._escape_query(text_match_filter)
            words = escaped_filter.split()

            if len(words) > 1:
                word_expressions = []
                for word in words:
                    word_expressions.append(f"TEXT_MATCH(text, '{word}')")

                expr = " && ".join(word_expressions)
                expr = f"({expr}) && TEXT_MATCH(text, '{escaped_filter}')"
            else:
                expr = f"TEXT_MATCH(text, '{escaped_filter}')"

        try:
            from pymilvus import AnnSearchRequest, RRFRanker, WeightedRanker

            dense_params = {
                "data": [query_embedding],
                "anns_field": "dense",
                "param": {
                    "metric_type": "COSINE",
                    "params": {"ef": ef_value}
                },
                "limit": limit * limit_multiplier
            }

            if expr:
                dense_params["expr"] = expr

            dense_request = AnnSearchRequest(**dense_params)

            sparse_params = {
                "data": [query],
                "anns_field": "sparse",
                "param": {
                    "metric_type": "BM25",
                    "params": {"drop_ratio_search": 0.2}
                },
                "limit": limit * limit_multiplier
            }

            if expr:
                sparse_params["expr"] = expr

            sparse_request = AnnSearchRequest(**sparse_params)

            ranker = WeightedRanker(self.dense_weight, self.sparse_weight)

            # Execute hybrid search
            start_time = time.time()
            results = self.milvus_client.hybrid_search(
                collection_name=self.collection_name,
                reqs=[dense_request, sparse_request],
                ranker=ranker,
                output_fields=["filename", "text"],
                limit=limit * limit_multiplier
            )
            search_time = time.time() - start_time
            print(f"Search completed in {search_time:.4f} seconds")

            processed_results = []
            seen_files = set()

            for hit in results[0]:
                filename = hit['entity']['filename']
                text = hit['entity']['text']
                score = hit.get('distance', 0.0)

                base_filename = filename.split('_chunk_')[0] if '_chunk_' in filename else filename
                document_metadata = self.metadata.get(base_filename, {})

                if base_filename not in seen_files and len(processed_results) < limit:
                    seen_files.add(base_filename)
                    processed_results.append({
                        "filename": filename,
                        "text": text,
                        "score": score,
                        "metadata": document_metadata
                    })

            processed_results.sort(key=lambda x: x["score"], reverse=True)

            if processed_results:
                retrieved_filenames = [result["filename"] for result in processed_results]
                self._ensure_metadata_loaded(retrieved_filenames)

                for result in processed_results:
                    base_filename = result["filename"].split('_chunk_')[0] if '_chunk_' in result["filename"] else \
                    result["filename"]
                    if base_filename in self.metadata:
                        result["metadata"] = self.metadata[base_filename]

            print(f"Retrieved {len(processed_results)} results")
            return processed_results

        except Exception as e:
            print(f"Error in hybrid search: {e}")

            # Fall back to direct text match if hybrid fails
            if text_match_filter:
                try:
                    results = self.text_match_search(text_match_filter, limit)
                    return results
                except Exception as e2:
                    print(f"Text match fallback also failed: {e2}")

            # Fall back to standard dense search
            try:
                search_params = {
                    "metric_type": "COSINE",
                    "params": {"ef": ef_value}
                }

                results = self.milvus_client.search(
                    collection_name=self.collection_name,
                    data=[query_embedding],
                    anns_field="dense",
                    search_params=search_params,
                    limit=limit,
                    output_fields=["filename", "text"],
                    expr=expr
                )

                processed_results = []
                seen_files = set()

                for hit in results[0]:
                    filename = hit["entity"]["filename"]
                    text = hit["entity"]["text"]
                    score = hit.get("distance", 0.0)

                    base_filename = filename.split('_chunk_')[0] if '_chunk_' in filename else filename

                    if base_filename not in seen_files and len(processed_results) < limit:
                        seen_files.add(base_filename)
                        processed_results.append({
                            "filename": filename,
                            "text": text,
                            "score": score,
                            "metadata": self.metadata.get(base_filename, {})
                        })

                print(f"Retrieved {len(processed_results)} results using fallback search")
                return processed_results

            except Exception as e3:
                print(f"All search methods failed: {e3}")
                return []

    # Entire RAG pipeline from R->A->G
    def rag_query(
            self,
            query: str,
            text_match_filter: Optional[str] = None,
            retrieval_limit: Optional[int] = None,
            use_web_search: bool = False,
            generate_report: bool = False,
            use_wos_search: bool = False,
            wos_timespan: str = "1400-01-01+1740-12-31",
    ) -> Tuple[str, Dict[str, Any]]:
        """
        Complete RAG pipeline with simplified context referencing
        """
        # Parse query for length preference
        length_preference = self._detect_length_preference(query)

        # Clean the query
        clean_query = self._clean_query(query)

        # Expand the query for better recall
        expanded_query = self.expand_query(clean_query) if self.query_expansion_enabled else clean_query

        # Retrieve relevant documents
        retrieved_docs = self.retrieve(
            query=expanded_query,
            text_match_filter=text_match_filter,
            limit=retrieval_limit
        )

        # Perform web search
        web_results = []
        if use_web_search and self.use_web_search and self.brave_api_key:
            web_results = self.web_search(clean_query, limit=2)

        if use_wos_search and WOS_API_KEY:
            wos_results = self.web_of_science_search(clean_query, limit=3, timespan=wos_timespan)

        # Combine results, prioritizing knowledge base
        combined_docs = retrieved_docs + web_results + wos_results

        # Sort by score
        combined_docs.sort(key=lambda x: x.get("score", 0), reverse=True)

        # Limit to retrieval limit
        if retrieval_limit:
            combined_docs = combined_docs[:retrieval_limit]

        if not combined_docs:
            return ("I couldn't find any relevant information to answer your question.",
                    {"error": "No relevant documents found"})

        # Check if COT should be used
        cot_used = self._should_use_cot(clean_query)

        # Generate answer using simplified method
        answer, context_data = self.generate_answer(
            clean_query, combined_docs, length_preference
        )

        # Add additional metadata to context_data
        context_data.update({
            "query": clean_query,
            "expanded_query": expanded_query if expanded_query != clean_query else None,
            "web_search_used": use_web_search and len(web_results) > 0,
            "wos_search_used": use_wos_search and len(wos_results) > 0,
            "wos_results_count": len(wos_results),
            "sources": [
                {
                    "filename": doc["filename"],
                    "score": doc.get("score", 0),
                    "metadata": doc.get("metadata", {}),
                    "source_type": "wos" if doc["filename"].startswith("wos:") else
                    "web" if doc["filename"].startswith("web:") else "knowledge_base"
                }
                for doc in combined_docs
            ]
        })

        # Generate report
        if generate_report:
            report_path, conversation_id = self.generate_report(
                query=clean_query,
                expanded_query=expanded_query,
                answer=answer,
                contexts=combined_docs,
                context_data=context_data
            )
            print(f"Report generated: {report_path}")
            context_data["report_path"] = report_path
            context_data["conversation_id"] = conversation_id

        return answer, context_data

    # Milvus 2.5 text match search implementation
    def text_match_search(self, term: str, limit: int = 20) -> List[Dict[str, Any]]:
        """
        Perform a direct text match search using TEXT_MATCH
        """
        escaped_term = self._escape_query(term)
        words = escaped_term.split()

        if len(words) > 1:
            word_expressions = []
            for word in words:
                word_expressions.append(f"TEXT_MATCH(text, '{word}')")

            expr = " && ".join(word_expressions)
            expr = f"({expr}) && TEXT_MATCH(text, '{escaped_term}')"
        else:
            # Single word query
            expr = f"TEXT_MATCH(text, '{escaped_term}')"

        print(f"Searching with expression: {expr}")

        try:
            # Use query method
            results = self.milvus_client.query(
                collection_name=self.collection_name,
                filter=expr,
                output_fields=["filename", "text"],
                limit=limit
            )

            processed_results = []
            seen_files = set()

            for doc in results:
                filename = doc["filename"]
                text = doc["text"]
                base_filename = filename.split('_chunk_')[0] if '_chunk_' in filename else filename

                # Only add first occurrence of each file to avoid duplicates
                if base_filename not in seen_files:
                    seen_files.add(base_filename)

                    # Get metadata for document
                    document_metadata = self.metadata.get(base_filename, {})

                    processed_results.append({
                        "filename": filename,
                        "text": text,
                        "score": 1.0,
                        "metadata": document_metadata
                    })

            # Load metadata for retrieved documents
            if processed_results:
                retrieved_filenames = [result["filename"] for result in processed_results]
                self._ensure_metadata_loaded(retrieved_filenames)

                # Update results with fresh metadata
                for result in processed_results:
                    base_filename = result["filename"].split('_chunk_')[0] if '_chunk_' in result["filename"] else \
                    result["filename"]
                    if base_filename in self.metadata:
                        result["metadata"] = self.metadata[base_filename]

            print(f"Found {len(processed_results)} results using query method")
            return processed_results

        except Exception as e:
            print(f"Query method failed: {e}")

            try:
                # Use search method with corrected parameters
                results = self.milvus_client.search(
                    collection_name=self.collection_name,
                    data=[""],
                    anns_field="sparse",
                    search_params={"metric_type": "BM25", "params": {}},
                    limit=limit,
                    expr=expr,
                    output_fields=["filename", "text"]
                )

                processed_results = []
                seen_files = set()

                for hit in results[0]:
                    filename = hit["entity"]["filename"]
                    text = hit["entity"]["text"]
                    base_filename = filename.split('_chunk_')[0] if '_chunk_' in filename else filename

                    if base_filename not in seen_files:
                        seen_files.add(base_filename)

                        # Get metadata for document
                        document_metadata = self.metadata.get(base_filename, {})

                        processed_results.append({
                            "filename": filename,
                            "text": text,
                            "score": hit.get("distance", 0.0),
                            "metadata": document_metadata
                        })

                print(f"Found {len(processed_results)} results using search method")
                return processed_results

            except Exception as e2:
                print(f"Search method also failed: {e2}")

                # Simple fallback without TEXT_MATCH
                try:
                    print("Attempting simple fallback search...")

                    # Use basic BM25 search on sparse field
                    results = self.milvus_client.search(
                        collection_name=self.collection_name,
                        data=[term],
                        anns_field="sparse",
                        search_params={"metric_type": "BM25"},
                        limit=limit,
                        output_fields=["filename", "text"]
                    )

                    processed_results = []
                    seen_files = set()

                    for hit in results[0]:
                        filename = hit["entity"]["filename"]
                        text = hit["entity"]["text"]

                        # Only include if text actually contains the term (manual filter)
                        if term.lower() in text.lower():
                            base_filename = filename.split('_chunk_')[0] if '_chunk_' in filename else filename

                            if base_filename not in seen_files:
                                seen_files.add(base_filename)

                                document_metadata = self.metadata.get(base_filename, {})

                                processed_results.append({
                                    "filename": filename,
                                    "text": text,
                                    "score": hit.get("distance", 0.0),
                                    "metadata": document_metadata
                                })

                    print(f"Found {len(processed_results)} results using fallback method")
                    return processed_results

                except Exception as e3:
                    print(f"All search methods failed: {e3}")
                    return []

    # Brave API for web searches. --> soon to change to WoS
    def web_search(self, query: str, limit: int = 3) -> List[Dict[str, Any]]:
        """
        Perform a web search using Brave Search API
        """
        if not self.brave_api_key:
            print("Brave API key not set. Skipping web search.")
            return []

        headers = {
            "Accept": "application/json",
            "X-Subscription-Token": self.brave_api_key
        }

        params = {
            "q": query,
            "count": limit
        }

        try:
            response = requests.get(
                BRAVE_SEARCH_API_URL,
                headers=headers,
                params=params
            )

            if response.status_code == 200:
                results = response.json()

                web_results = []
                for web in results.get("web", {}).get("results", []):
                    web_results.append({
                        "filename": f"web:{web.get('url', '')}",
                        "text": f"{web.get('title', '')}\n{web.get('description', '')}\n{web.get('url', '')}",
                        "score": 0.9,
                        "doc_id": f"web:{hashlib.md5(web.get('url', '').encode()).hexdigest()[:8]}"
                    })

                return web_results
            else:
                print(f"Web search failed with status code: {response.status_code}")
                return []