Brain/test_brain.py at main · sss777999/Brain · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3
"""Unified Brain model test file.

Usage:
    python3 test_brain.py              # ALL tests (curriculum + preschool + grade1 + fineweb + paraphrase + babi)
    python3 test_brain.py --curriculum # Only curriculum tests
    python3 test_brain.py --preschool  # Only preschool tests (3-6 years)
    python3 test_brain.py --grade1     # Only grade 1 tests
    python3 test_brain.py --fineweb    # Only FineWeb-Edu tests
    python3 test_brain.py --paraphrase # Only paraphrase robustness tests
    python3 test_brain.py --compare-baselines  # Compare Brain vs TF-IDF/BM25 baselines
    python3 test_brain.py --train      # Train curriculum from scratch
    python3 test_brain.py --strict     # Strict tests with verification
    python3 test_brain.py --raw        # Without LLM postprocessing
    python3 test_brain.py --no-gpt     # Without GPT answer quality evaluation
    python3 test_brain.py --no-llm     # Without LLM postprocessing (shows raw model output)
    python3 test_brain.py --skip-babi  # Skip bAbI tests (slow, needed only for PFC changes)

GPT evaluation:
    Each answer is evaluated by GPT-4o-mini on criteria:
    - Coherence (1-10): coherence and logic of the answer
    - Relevance (1-10): relevance to the question
    - Score (1-10): overall quality score

    Requires OPENAI_API_KEY environment variable.
    To disable: --no-gpt or GPT_EVAL_ENABLED=False in config.py
"""

import sys
import time
import os
import re
from typing import Any, Dict, List, Sequence, Set, Union
from datetime import datetime
from hippocampus import Hippocampus
from train import train_on_curriculum, ask, get_statistics
from llm_postprocess import postprocess_answer
from gpt_evaluator import evaluate_answer_quality
from config import print_config, CONFIG


# ANCHOR: LOG_FILE_SETUP
# Global file for logging test results
LOG_FILE = None
NO_LLM_MODE = False  # Set by --no-llm flag


def setup_log_file():
    """
    Creates log file with date in logs/ folder.

    Intent: Save test results for history and analysis.

    Returns:
        Path to log file
    """
    global LOG_FILE
    os.makedirs('logs', exist_ok=True)
    date_str = datetime.now().strftime('%d.%m.%Y_%H-%M-%S')
    LOG_FILE = f'logs/test_results_{date_str}.txt'
    return LOG_FILE


def log(message: str):
    """
    Outputs message to console and writes to log file.

    Args:
        message: Message to output
    """
    print(message)
    if LOG_FILE:
        with open(LOG_FILE, 'a', encoding='utf-8') as f:
            f.write(message + '\n')


# ANCHOR: DEFAULT_QUESTIONS
DEFAULT_QUESTIONS = [
    'What is a dog?',
    'What color is the sky?',
    'What is the capital of France?',
    'What does a dog say?',
    'What is the sun?',
    'Where is Paris?',
    'Who wrote Hamlet?',
    'What is water?',
    'What is a cat?',
    'What is the Earth?',
]

# ANCHOR: CURRICULUM_TESTS
# Tests for basic knowledge from curriculum.py
# Format: (question, list of acceptable keywords in answer)
CURRICULUM_TESTS = [
    # === CATEGORIES (IS-A) ===
    ("What is a dog?", ["animal", "pet", "mammal"]),
    ("What is a cat?", ["animal", "pet", "mammal"]),
    ("What is a lion?", ["animal", "wild", "predator"]),
    ("What is a whale?", ["animal", "mammal", "ocean"]),
    ("What is an apple?", ["fruit"]),
    ("What is a carrot?", ["vegetable"]),
    ("What is a car?", ["vehicle"]),
    ("What is a piano?", ["instrument"]),

    # === PROPERTIES ===
    ("What color is the sky?", ["blue"]),
    ("What color is grass?", ["green"]),
    ("What color is the sun?", ["yellow"]),
    ("What color is snow?", ["white"]),
    ("What color is a banana?", ["yellow"]),
    ("What color is an orange?", ["orange"]),

    # === ANIMAL SOUNDS ===
    ("What does a dog say?", ["woof", "bark"]),
    ("What does a cat say?", ["meow", "purr"]),
    ("What does a cow say?", ["moo"]),
    ("What does a duck say?", ["quack"]),
    ("What does a lion say?", ["roar"]),

    # === OPPOSITES ===
    ("What is the opposite of hot?", ["cold"]),
    ("What is the opposite of big?", ["small", "little"]),
    ("What is the opposite of fast?", ["slow"]),
    ("What is the opposite of up?", ["down"]),
    ("What is the opposite of happy?", ["sad"]),
    ("What is the opposite of day?", ["night"]),

    # === GEOGRAPHY ===
    ("What is the capital of France?", ["paris"]),
    ("What is the capital of England?", ["london"]),
    ("What is the capital of Japan?", ["tokyo"]),
    ("Where is Paris?", ["france"]),
    ("Where is London?", ["england", "uk"]),

    # === SCIENCE ===
    ("What is the sun?", ["star"]),
    ("What is the Earth?", ["planet"]),
    ("What is the moon?", ["satellite", "round", "night"]),
    ("What is water?", ["liquid", "h2o", "drink"]),
    ("What is ice?", ["solid", "cold", "frozen"]),

    # === BABY ANIMALS ===
    ("What is a puppy?", {"any_of": ["baby dog", "young dog"]}),
    ("What is a kitten?", {"any_of": ["baby cat", "young cat"]}),
    ("What is a calf?", {"any_of": ["baby cow", "young cow"]}),
    ("What is a chick?", {"any_of": ["baby chicken", "baby bird"]}),

    # === BODY PARTS ===
    ("What do we see with?", ["eyes"]),
    ("What do we hear with?", ["ears"]),
    ("What do we smell with?", ["nose"]),

    # === TIME ===
    ("When do we wake up?", ["morning"]),
    ("When do we sleep?", ["night"]),
    ("What comes after Monday?", ["tuesday"]),

    # === SHAPES ===
    ("What shape is a ball?", ["round", "circle", "sphere"]),
    ("How many sides does a triangle have?", ["three", "3"]),
    ("How many sides does a square have?", ["four", "4"]),

    # === NUMBERS ===
    ("What comes after one?", ["two", "2", "after one comes two"]),
    ("What comes after five?", ["six", "6", "after five comes six"]),
]

# ANCHOR: STRICT_TESTS
# Additional strict tests (philosophy, hallucinations)
STRICT_TESTS = [
    # === PHILOSOPHICAL QUESTIONS ===
    ("What is the meaning of life?", ["love", "happiness"]),

    # === SHOULD NOT KNOW (hallucination check) ===
    ("Who wrote Hamlet?", ["not know", "don't know", "unknown"]),
    ("Who is the president of Mars?", ["not know", "don't know", "unknown"]),
]

# ANCHOR: CATEGORY_TESTS
# Tests by categories for detailed analysis
CATEGORY_TESTS = {
    "IS-A (categories)": [
        ("What is a dog?", ["animal"]),
        ("What is a cat?", ["animal"]),
        ("What is a bird?", ["animal"]),
        ("What is an apple?", ["fruit"]),
        ("What is a carrot?", ["vegetable"]),
    ],
    "Colors": [
        ("What color is the sky?", ["blue"]),
        ("What color is grass?", ["green"]),
        ("What color is the sun?", ["yellow"]),
        ("What color is snow?", ["white"]),
    ],
    "Animal sounds": [
        ("What does a dog say?", ["woof", "bark"]),
        ("What does a cat say?", ["meow"]),
        ("What does a cow say?", ["moo"]),
    ],
    "Opposites": [
        ("What is the opposite of hot?", ["cold"]),
        ("What is the opposite of big?", ["small"]),
        ("What is the opposite of fast?", ["slow"]),
        ("What is the opposite of up?", ["down"]),
    ],
    "Geography": [
        ("What is the capital of France?", ["paris"]),
        ("What is the capital of England?", ["london"]),
        ("Where is Paris?", ["france"]),
    ],
    "Science": [
        ("What is the sun?", ["star"]),
        ("What is the Earth?", ["planet"]),
        ("What is water?", ["liquid"]),
    ],
    "Baby animals": [
        ("What is a puppy?", {"any_of": ["baby dog", "young dog"]}),
        ("What is a kitten?", {"any_of": ["baby cat", "young cat"]}),
    ],
    "Hallucinations (should NOT know)": [
        ("Who wrote Hamlet?", ["not know", "don't know"]),
    ],
    "Philosophical questions": [
        ("What is the meaning of life?", ["love", "happiness"]),
    ],
}

# ANCHOR: FINEWEB_TESTS
# Tests based on DIRECT FACTS from FineWeb-Edu texts
# Do NOT require inference - only extraction of what is written in text
FINEWEB_TESTS = [
    # === LINCOLN (Article 35) ===
    # Text: "John Wilkes Booth was 26 years old"
    # Text: "one of the nation's most famous actors"
    # Text: "shot President Lincoln"
    ("Who shot Lincoln?", ["booth"]),
    ("How old was Booth?", {"all_of": ["26", "years"]}),
    ("What was Booth?", ["actor", "famous"]),

    # === SOHO (Article 6) ===
    # NOTE: Commented out - Article 6 not in first 1000 FineWeb-Edu articles
    # When training on 5000+ articles - uncomment
    # Text: "SOHO spacecraft is expected to discover its 1,000TH comet"
    # Text: "Solar and Heliospheric Observatory"
    # ("What is SOHO?", ["spacecraft", "observatory"]),
    # ("What does SOHO discover?", ["comet"]),

    # === LEAVES/CHLOROPHYLL (Article 459) ===
    # Text: "The green chlorophyll disappears from the leaves"
    ("What color is chlorophyll?", ["green"]),
    ("What disappears from leaves?", ["chlorophyll", "green"]),

    # === SEDIMENTARY ROCK (Article 297) ===
    # Text: "sedimentary rock... formed from accumulation of bones, shells"
    # Also valid: types of sedimentary rock (sandstone, limestone, shale)
    ("What is sedimentary rock made of?", ["bones", "shells", "organic", "sandstone", "limestone", "shale"]),

    # === DARWIN (Article 166) ===
    # Text: "Darwin... Origin of Species... natural selection"
    ("What is the origin of species?", ["darwin", "selection"]),

    # === "I DO NOT KNOW" CHECK ===
    # Facts that are NOT in the texts
    ("Who invented the telephone?", ["not know"]),
    ("Who discovered America?", ["not know"]),
]

# ANCHOR: PARAPHRASE_TESTS
# Paraphrased versions of existing questions to test robustness
# Each group: (original_id, [(paraphrase, expected), ...])
# Tests: passive voice, synonyms, word order changes, connector variations
PARAPHRASE_TESTS = [
    # === CATEGORIES - alternative phrasings ===
    # Original: "What is a dog?" -> "animal"
    ("A dog is what kind of thing?", ["animal", "pet", "mammal"]),
    ("Dogs belong to what category?", ["animal", "pet", "mammal"]),
    ("Tell me what a dog is", ["animal", "pet", "mammal"]),
    ("What category does a dog belong to?", ["animal", "pet", "mammal"]),

    # Original: "What is an apple?" -> "fruit"
    ("An apple is a type of what?", ["fruit"]),
    ("Apples are classified as what?", ["fruit"]),
    ("What kind of food is an apple?", ["fruit"]),

    # === PROPERTIES - alternative phrasings ===
    # Original: "What color is the sky?" -> "blue"
    ("The sky is what color?", ["blue"]),
    ("What is the color of the sky?", ["blue"]),
    ("Tell me the sky's color", ["blue"]),
    ("The color of the sky is what?", ["blue"]),

    # Original: "What color is grass?" -> "green"
    ("Grass is what color?", ["green"]),
    ("What is the color of grass?", ["green"]),

    # === OPPOSITES - alternative phrasings ===
    # Original: "What is the opposite of hot?" -> "cold"
    ("Hot is the opposite of what?", ["cold"]),
    ("What word is opposite to hot?", ["cold"]),
    ("The opposite of hot is what?", ["cold"]),
    ("What is hot's opposite?", ["cold"]),

    # Original: "What is the opposite of big?" -> "small"
    ("Big is the opposite of what?", ["small", "little"]),
    ("What word means the opposite of big?", ["small", "little"]),

    # === GEOGRAPHY - alternative phrasings ===
    # Original: "What is the capital of France?" -> "paris"
    ("France has what capital?", ["paris"]),
    ("The capital of France is what?", ["paris"]),
    ("Which city is the capital of France?", ["paris"]),
    ("Name the capital of France", ["paris"]),

    # Original: "Where is Paris?" -> "france"
    ("Paris is located where?", ["france"]),
    ("In what country is Paris?", ["france"]),
    ("Paris is in what country?", ["france"]),

    # === SCIENCE - alternative phrasings ===
    # Original: "What is the sun?" -> "star"
    ("The sun is a type of what?", ["star"]),
    ("What kind of celestial body is the sun?", ["star"]),
    ("Tell me what the sun is", ["star"]),

    # Original: "What is water?" -> "liquid"
    ("Water is what state of matter?", ["liquid"]),
    ("What kind of substance is water?", ["liquid", "drink"]),

    # === ANIMAL SOUNDS - alternative phrasings ===
    # Original: "What does a dog say?" -> "woof/bark"
    ("What sound does a dog make?", ["woof", "bark"]),
    ("A dog says what?", ["woof", "bark"]),
    ("Dogs make what sound?", ["woof", "bark"]),

    # Original: "What does a cat say?" -> "meow"
    ("What sound does a cat make?", ["meow", "purr"]),
    ("A cat says what?", ["meow", "purr"]),

    # === BABY ANIMALS - alternative phrasings ===
    # Original: "What is a puppy?" -> "baby dog"
    ("A puppy is what kind of animal?", {"any_of": ["baby dog", "young dog"]}),
    ("What is a puppy called?", {"any_of": ["baby dog", "young dog"]}),

    # === BODY PARTS - alternative phrasings ===
    # Original: "What do we see with?" -> "eyes"
    ("We see using what?", ["eyes"]),
    ("What body part do we use to see?", ["eyes"]),
    ("Seeing is done with what?", ["eyes"]),

    # Original: "What do we hear with?" -> "ears"
    ("We hear using what?", ["ears"]),
    ("Hearing is done with what?", ["ears"]),

    # === PASSIVE VOICE variants ===
    ("By what is sound heard?", ["ears"]),
    ("By what organ do we smell?", ["nose"]),

    # === TIME - alternative phrasings ===
    # Original: "When do we wake up?" -> "morning"
    ("We wake up at what time of day?", ["morning"]),
    ("What time of day do people wake up?", ["morning"]),

    # Original: "What comes after Monday?" -> "tuesday"
    ("After Monday comes what?", ["tuesday"]),
    ("Monday is followed by what day?", ["tuesday"]),
    ("The day after Monday is what?", ["tuesday"]),
]


def check_answer_with_llm(question: str, answer: str, expected: list) -> bool:
    """
    Uses LLM to check answer correctness.

    Args:
        question: Question
        answer: Model answer (Brain raw)
        expected: List of expected keywords

    Returns:
        True if answer is correct
    """
    import requests
    from config import CONFIG

    prompt = f"""Question: {question}
Answer: {answer}
Expected keywords: {_format_expectation_text(expected)}

Does the answer correctly respond to the question with the expected meaning?
Reply only YES or NO."""

    try:
        response = requests.post(
            CONFIG["LLM_OLLAMA_URL"],
            json={"model": CONFIG["LLM_MODEL"], "prompt": prompt, "stream": False},
            timeout=CONFIG["LLM_TIMEOUT"]
        )
        if response.status_code == 200:
            result = response.json().get("response", "").strip().upper()
            return result.startswith("YES")
    except:
        pass

    # Fallback: simple keyword check
    return check_answer_simple(answer, expected)


# ANCHOR: QA_EVAL_STOPWORDS
_QA_EVAL_STOPWORDS: Set[str] = {
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'been', 'being', 'but', 'by', 'can',
    'could', 'did', 'do', 'does', 'for', 'from', 'had', 'has', 'have', 'he', 'her',
    'his', 'i', 'if', 'in', 'into', 'is', 'it', 'its', 'me', 'might', 'my', 'of',
    'on', 'or', 'our', 'she', 'so', 'than', 'that', 'the', 'their', 'them', 'there',
    'they', 'this', 'to', 'too', 'up', 'us', 'was', 'we', 'were', 'what', 'when',
    'where', 'which', 'who', 'why', 'with', 'would', 'you', 'your', 'am', 'not',
}


# ANCHOR: QA_EVAL_UNKNOWN_MARKERS
_QA_EVAL_UNKNOWN_MARKERS: Set[str] = {
    'do not know',
    'not know',
    'unknown',
    'i do not know',
}


# ANCHOR: QA_EVAL_NORMALIZE_TOKENS
# API_PRIVATE
def _normalize_eval_tokens(text: str) -> List[str]:
    """
    Normalize text for strict whole-token answer evaluation.

    Intent:
        Test evaluation must judge whether the answer expresses the expected
        concept rather than rewarding accidental substring overlap inside noisy
        readouts.

    Args:
        text: Raw answer or expectation text.

    Returns:
        Normalized list of tokens.

    Raises:
        AssertionError: If text is None.
    """
    assert text is not None, "text cannot be None because answer evaluation needs a concrete linguistic trace"
    normalized = text.lower()
    replacements = {
        "don't": 'do not',
        "dont": 'do not',
        "doesn't": 'does not',
        "didn't": 'did not',
        "can't": 'can not',
        "cannot": 'can not',
        "won't": 'will not',
        "i'm": 'i am',
    }
    for source, target in replacements.items():
        normalized = normalized.replace(source, target)
    normalized = re.sub(r'[^a-z0-9\s]', ' ', normalized)
    normalized = re.sub(r'\s+', ' ', normalized).strip()
    tokens = [token for token in normalized.split(' ') if token]
    assert all(token and token == token.strip() for token in tokens), "normalized tokens must stay non-empty because evaluation compares explicit token identities"
    return tokens


# ANCHOR: QA_EVAL_CONTENT_TOKENS
# API_PRIVATE
def _extract_content_tokens(tokens: Sequence[str]) -> List[str]:
    """
    Remove low-information function words from normalized evaluation tokens.

    Intent:
        Precision checks should estimate how focused the answer is on the target
        concept, which requires measuring informative lexical content rather than
        counting articles and auxiliaries.

    Args:
        tokens: Normalized tokens.

    Returns:
        Content-bearing tokens.

    Raises:
        AssertionError: If tokens is None.
    """
    assert tokens is not None, "tokens cannot be None because evaluation precision depends on answer content density"
    content_tokens = [token for token in tokens if token not in _QA_EVAL_STOPWORDS]
    assert len(content_tokens) <= len(tokens), "content token filtering must never invent extra lexical evidence"
    return content_tokens


# ANCHOR: QA_EVAL_EXPECTATION_SPEC
# API_PRIVATE
def _coerce_expectation_spec(expected_keywords: Union[List[str], Dict[str, Any], str]) -> Dict[str, Any]:
    """
    Convert legacy expectation formats into an explicit evaluation spec.

    Intent:
        The test corpus historically used plain keyword lists. Converting them to
        a single normalized schema allows stricter matching without breaking the
        existing datasets.

    Args:
        expected_keywords: Legacy list/string or structured expectation spec.

    Returns:
        Normalized expectation dictionary.

    Raises:
        AssertionError: If no expected answer content is provided.
    """
    assert expected_keywords is not None, "expected_keywords cannot be None because a test must define what counts as success"
    if isinstance(expected_keywords, dict):
        spec: Dict[str, Any] = {
            'any_of': list(expected_keywords.get('any_of', [])),
            'all_of': list(expected_keywords.get('all_of', [])),
            'min_any_matches': expected_keywords.get('min_any_matches', None),
            'max_content_tokens': expected_keywords.get('max_content_tokens', None),
            'min_match_ratio': expected_keywords.get('min_match_ratio', None),
        }
    elif isinstance(expected_keywords, str):
        spec = {
            'any_of': [expected_keywords],
            'all_of': [],
            'min_any_matches': None,
            'max_content_tokens': None,
            'min_match_ratio': None,
        }
    else:
        spec = {
            'any_of': list(expected_keywords),
            'all_of': [],
            'min_any_matches': None,
            'max_content_tokens': None,
            'min_match_ratio': None,
        }
    assert spec['any_of'] or spec['all_of'], "expectation spec must define at least one acceptable concept because pass/fail depends on it"
    return spec


# ANCHOR: QA_EVAL_EXPECTATION_FORMAT
# API_PRIVATE
def _format_expectation_text(expected_keywords: Union[List[str], Dict[str, Any], str]) -> str:
    """
    Convert an expectation spec into a readable text description.

    Intent:
        Human-facing logs and LLM/GPT evaluators should see the semantic target in
        explicit form even when tests use structured expectation dictionaries.

    Args:
        expected_keywords: Legacy or structured expectation specification.

    Returns:
        Readable expectation string.

    Raises:
        AssertionError: If the resulting text is empty.
    """
    spec = _coerce_expectation_spec(expected_keywords)
    parts: List[str] = []
    if spec['all_of']:
        parts.append(f"ALL OF: {', '.join(spec['all_of'])}")
    if spec['any_of']:
        parts.append(f"ANY OF: {', '.join(spec['any_of'])}")
    if spec['min_any_matches']:
        parts.append(f"MIN ANY MATCHES: {spec['min_any_matches']}")
    result = ' | '.join(parts)
    assert result, "formatted expectation text must stay non-empty because evaluation prompts need explicit success criteria"
    return result


# ANCHOR: QA_EVAL_REGULAR_PLURAL_LEMMAS
# API_PRIVATE
def _regular_plural_lemmas(token: str) -> Set[str]:
    """
    Generate simple lemma candidates for regular English plural morphology.

    Intent:
        The evaluator should treat regular plural/singular surface forms as the
        same lexical concept when that distinction is not semantically relevant
        to answer correctness.

    Args:
        token: Normalized token.

    Returns:
        Set of plausible singular/plural lemma variants.

    Raises:
        AssertionError: If token is empty.
    """
    assert token, "token must be non-empty because morphological decomposition requires a concrete lexical form"
    lemmas: Set[str] = set()
    if len(token) > 4 and token.endswith('ies'):
        lemmas.add(token[:-3] + 'y')
    if len(token) > 4 and token.endswith('es'):
        lemmas.add(token[:-2])
    if len(token) > 3 and token.endswith('s') and not token.endswith('ss'):
        lemmas.add(token[:-1])
    result = {lemma for lemma in lemmas if lemma}
    assert all(lemma for lemma in result), "plural decomposition must preserve only valid lexical variants because empty lemmas cannot guide matching"
    return result


# ANCHOR: QA_EVAL_MORPH_TOKEN_MATCH
# API_PRIVATE
def _tokens_match_with_morphology(answer_token: str, expected_token: str) -> bool:
    """
    Check token equivalence using existing hippocampal morphology mappings.

    Intent:
        Strict evaluation should distinguish conceptually correct plural/singular
        forms from unrelated noise, reusing the same lexical-variant knowledge
        already present in the retrieval system.

    Args:
        answer_token: Token produced by the model.
        expected_token: Token required by the test expectation.

    Returns:
        True when the two tokens are equivalent under the morphology map.

    Raises:
        AssertionError: If either token is empty.
    """
    assert answer_token, "answer_token must be non-empty because lexical matching depends on concrete word forms"
    assert expected_token, "expected_token must be non-empty because empty expectation tokens cannot define correctness"
    if answer_token == expected_token:
        return True
    answer_forms = {answer_token} | set(Hippocampus.VERB_FORMS.get(answer_token, set())) | _regular_plural_lemmas(answer_token)
    expected_forms = {expected_token} | set(Hippocampus.VERB_FORMS.get(expected_token, set())) | _regular_plural_lemmas(expected_token)
    result = bool(answer_forms & expected_forms)
    assert isinstance(result, bool), "morphology-aware token comparison must yield a boolean because evaluator decisions are binary"
    return result


# ANCHOR: QA_EVAL_EXPECTATION_MATCH
# API_PRIVATE
def _matches_expectation(answer_tokens: Sequence[str], expectation: str) -> bool:
    """
    Check whether the answer expresses a full expected concept.

    Intent:
        Multi-word expectations such as `baby dog` or `do not know` should only
        match when all concept tokens are present, preventing partial-credit
        matches like `dog` for `baby dog`.

    Args:
        answer_tokens: Normalized answer tokens.
        expectation: Expected token or phrase.

    Returns:
        True when all expectation tokens are present.

    Raises:
        AssertionError: If expectation is empty.
    """
    expectation_tokens = _normalize_eval_tokens(expectation)
    assert expectation_tokens, "expectation must contain at least one token because empty targets cannot define correctness"
    result = all(
        any(_tokens_match_with_morphology(answer_token, expectation_token) for answer_token in answer_tokens)
        for expectation_token in expectation_tokens
    )
    assert isinstance(result, bool), "expectation matching must return a boolean because pass/fail accounting depends on it"
    return result


# ANCHOR: QA_EVAL_STRICT_SCORING
# API_PRIVATE
def evaluate_answer_strict(answer: str, expected_keywords: Union[List[str], Dict[str, Any], str], question: str = "") -> Dict[str, Any]:
    """
    Evaluate answer correctness with semantic matching and precision gating.

    Intent:
        A biologically plausible model can transiently co-activate related traces,
        but the QA benchmark should only credit focused readouts that express the
        requested concept rather than long noisy blends containing one lucky word.

    Args:
        answer: Model answer to evaluate.
        expected_keywords: Expected answer specification.
        question: Original question for debugging context.

    Returns:
        Evaluation details including correctness, matched expectations, and
        precision statistics.

    Raises:
        AssertionError: If answer is None.
    """
    assert answer is not None, "answer cannot be None because QA evaluation needs an observable behavioral output"
    spec = _coerce_expectation_spec(expected_keywords)
    answer_tokens = _normalize_eval_tokens(answer)
    answer_content_tokens = _extract_content_tokens(answer_tokens)
    question_tokens = _normalize_eval_tokens(question)
    question_content_tokens = _extract_content_tokens(question_tokens)

    unique_answer_content_tokens = list(dict.fromkeys(answer_content_tokens))
    unique_question_content_tokens = list(dict.fromkeys(question_content_tokens))
    precision_content_tokens = [
        token for token in unique_answer_content_tokens
        if not any(
            _tokens_match_with_morphology(token, question_token)
            for question_token in unique_question_content_tokens
        )
    ]
    if not precision_content_tokens and unique_answer_content_tokens:
        precision_content_tokens = list(unique_answer_content_tokens)

    any_matches = [candidate for candidate in spec['any_of'] if _matches_expectation(answer_tokens, candidate)]
    all_matches = [candidate for candidate in spec['all_of'] if _matches_expectation(answer_tokens, candidate)]

    matched_tokens: Set[str] = set()
    for candidate in any_matches + all_matches:
        matched_tokens.update(_normalize_eval_tokens(candidate))

    min_any_matches = spec['min_any_matches']
    if min_any_matches is None:
        min_any_matches = 1 if spec['any_of'] else 0

    semantic_ok = len(any_matches) >= int(min_any_matches) and len(all_matches) == len(spec['all_of'])

    matched_content_tokens = [
        token for token in precision_content_tokens
        if any(_tokens_match_with_morphology(token, matched_token) for matched_token in matched_tokens)
    ]
    matched_content_count = len(set(matched_content_tokens))
    content_count = len(precision_content_tokens)
    answer_is_unknown = any(' '.join(_normalize_eval_tokens(candidate)) in _QA_EVAL_UNKNOWN_MARKERS for candidate in spec['any_of'] + spec['all_of'])

    if answer_is_unknown:
        max_content_tokens = int(spec['max_content_tokens'] or 5)
        precision_ok = content_count <= max_content_tokens
    else:
        default_max_content_tokens = max(2, matched_content_count + 1)
        if spec['all_of']:
            default_max_content_tokens = max(default_max_content_tokens, matched_content_count + 1)
        max_content_tokens = int(spec['max_content_tokens'] or default_max_content_tokens)
        default_min_match_ratio = 0.50 if matched_content_count <= 1 else 0.34
        if spec['all_of']:
            default_min_match_ratio = max(default_min_match_ratio, 0.50)
        min_match_ratio = float(spec['min_match_ratio'] or default_min_match_ratio)
        short_clause_ok = semantic_ok and content_count <= min(4, max_content_tokens + 1)
        precision_ok = content_count <= max_content_tokens or (
            matched_content_count / max(1, content_count)
        ) >= min_match_ratio or short_clause_ok

    is_correct = semantic_ok and precision_ok
    result = {
        'is_correct': is_correct,
        'semantic_ok': semantic_ok,
        'precision_ok': precision_ok,
        'matched_any': any_matches,
        'matched_all': all_matches,
        'content_count': content_count,
        'matched_content_count': matched_content_count,
        'question': question,
    }
    assert isinstance(result['is_correct'], bool), "strict evaluator must return a boolean decision because test accounting depends on binary pass/fail"
    return result


def check_answer_simple(answer: str, expected_keywords: Union[List[str], Dict[str, Any], str]) -> bool:
    """
    Strict check: require semantic match and sufficient answer precision.
    """
    evaluation = evaluate_answer_strict(answer, expected_keywords)
    return evaluation['is_correct']


def check_answer(answer: str, expected_keywords: Union[List[str], Dict[str, Any], str], question: str = "") -> bool:
    """
    Checks answer correctness.

    Applies strict semantic matching and rejects noisy partial answers that only
    contain one lucky keyword.
    """
    evaluation = evaluate_answer_strict(answer, expected_keywords, question)
    return evaluation['is_correct']


def run_tests(questions: list = None, show_llm: bool = True):
    """
    Runs Q&A tests.

    Args:
        questions: List of questions (default DEFAULT_QUESTIONS)
        show_llm: Show LLM postprocessing
    """
    if questions is None:
        questions = DEFAULT_QUESTIONS

    print()
    print('=' * 70)
    if show_llm:
        print("TESTS - Brain raw -> Broca's area (LLM)")
    else:
        print('TESTS - Brain raw output')
    print('=' * 70)

    for q in questions:
        raw = ask(q)
        print(f'Q: {q}')
        print(f'Brain raw: {raw}')
        if show_llm:
            fixed = postprocess_answer(raw, q)
            print(f"Broca's area (LLM): {fixed}")
        print()


def run_test_suite(tests: list, suite_name: str):
    """
    Runs test suite with answer correctness checking.

    Args:
        tests: List of tests [(question, expected_keywords), ...]
        suite_name: Test suite name for output

    Returns:
        dict with statistics: passed, failed, total, accuracy, timing, gpt_scores
    """
    log('')
    log('=' * 70)
    log(f'TESTS {suite_name}')
    log('=' * 70)

    # Load QA baselines (TF-IDF, BM25). MemNet/NTM are for bAbI only.
    from baselines.tfidf_baseline import get_all_baselines
    all_baselines = get_all_baselines(use_openai=False)
    qa_baselines = {
        'tfidf': all_baselines.get('tfidf'),
        'bm25': all_baselines.get('bm25'),
    }

    passed = 0
    failed = 0
    failed_tests = []
    total_brain_time = 0.0
    total_llm_time = 0.0
    total_gpt_time = 0.0
    gpt_scores = []  # List of GPT scores
    gpt_enabled = CONFIG.get("GPT_EVAL_ENABLED", False)

    # Baseline stats for QA baselines (MemNet/NTM only for bAbI)
    baseline_passed = {name: 0 for name in ['tfidf', 'bm25']}
    baseline_time = {name: 0.0 for name in ['tfidf', 'bm25']}

    for question, expected in tests:
        # Measure Brain time (ask)
        t0 = time.time()
        raw = ask(question)
        t_brain = time.time() - t0

        # Measure LLM time (postprocess)
        t1 = time.time()
        verbalized = postprocess_answer(raw, question)
        t_llm = time.time() - t1

        # GPT answer quality evaluation
        t2 = time.time()
        if gpt_enabled:
            gpt_eval = evaluate_answer_quality(
                question=question,
                brain_raw=raw,
                llm_fixed=verbalized,
                expected=expected
            )
            gpt_scores.append(gpt_eval)
        else:
            gpt_eval = None
        t_gpt = time.time() - t2

        t_total = t_brain + t_llm + t_gpt
        total_brain_time += t_brain
        total_llm_time += t_llm
        total_gpt_time += t_gpt

        is_correct = check_answer(raw, expected, question)

        # Test QA baselines (TF-IDF, BM25)
        baseline_results = {}
        for bl_name, bl in qa_baselines.items():
            if bl is None:
                baseline_results[bl_name] = ("N/A", False, 0.0)
                continue
            try:
                t_bl = time.time()
                bl_ans = bl.answer(question)
                bl_time = time.time() - t_bl
                bl_ok = check_answer(bl_ans, expected, question)
                baseline_results[bl_name] = (bl_ans[:60] if bl_ans else "N/A", bl_ok, bl_time)
                baseline_time[bl_name] += bl_time
                if bl_ok:
                    baseline_passed[bl_name] += 1
            except Exception:
                baseline_results[bl_name] = ("error", False, 0.0)

        if is_correct:
            passed += 1
            status = "✅ PASS"
        else:
            failed += 1
            status = "❌ FAIL"
            failed_tests.append((question, raw, verbalized, expected, gpt_eval))

        # GPT score: 🧠 raw -> 🗣 final
        gpt_str = ""
        if gpt_eval and not gpt_eval.get("error"):
            r, f = gpt_eval.get('raw', 0), gpt_eval.get('final', 0)
            r_emoji = "🟢" if r >= 8 else "🟡" if r >= 5 else "🔴"
            f_emoji = "🟢" if f >= 8 else "🟡" if f >= 5 else "🔴"
            gpt_str = f" | Score: raw={r}{r_emoji} LLM={f}{f_emoji}"
            if gpt_eval.get("issue"):
                gpt_str += f" ({gpt_eval['issue'][:35]})"
        elif gpt_eval and gpt_eval.get("error"):
            gpt_str = f" | ⚠️GPT: {str(gpt_eval['error'])[:25]}"

        gpt_str_full = gpt_str
        log(f'{status} | Q: {question}')
        brain_status = "✅" if is_correct else "❌"
        log(f'         {brain_status} Brain  [{t_brain:.3f}s]: {raw}')
        if not NO_LLM_MODE:
            log(f'         📝 LLM    [{t_llm:.3f}s]: {verbalized}')
        if t_gpt > 0.01:
            log(f'         🤖 GPT    [{t_gpt:.3f}s]{gpt_str_full}')
        # Show QA baselines with time
        for bl_name in ['tfidf', 'bm25']:
            if bl_name in baseline_results:
                ans, ok, bl_t = baseline_results[bl_name]
                label = {'tfidf': 'TF-IDF', 'bm25': 'BM25'}[bl_name]
                log(f'         {"✅" if ok else "❌"} {label:6} [{bl_t:.3f}s]: {ans}')
        log(f'         Expected: {_format_expectation_text(expected)}')

        log('')

    total = passed + failed
    accuracy = (passed / total * 100) if total > 0 else 0
    total_time = total_brain_time + total_llm_time + total_gpt_time

    # Average GPT score (raw and final)
    avg_raw, avg_final = 0.0, 0.0
    avg_gpt_score = 0.0
    if gpt_scores:
        valid = [s for s in gpt_scores if not s.get("error") and s.get("raw", 0) > 0]
        if valid:
            avg_raw = sum(s["raw"] for s in valid) / len(valid)
            avg_final = sum(s["final"] for s in valid) / len(valid)
            avg_gpt_score = (avg_raw + avg_final) / 2

    log('=' * 70)
    result_str = f'RESULT {suite_name}: {passed}/{total} ({accuracy:.1f}%)'
    if avg_raw > 0:
        result_str += f' | GPT: 🧠{avg_raw:.1f}→🗣{avg_final:.1f}'
    result_str += f' | Brain: {total_brain_time:.2f}s'
    if not NO_LLM_MODE:
        result_str += f' | LLM: {total_llm_time:.2f}s'
    if total_gpt_time > 0:
        result_str += f' | GPT: {total_gpt_time:.2f}s'
    log(result_str)

    # Baseline comparison summary
    if qa_baselines and total > 0:
        tfidf_passed = baseline_passed.get('tfidf', 0)
        bm25_passed = baseline_passed.get('bm25', 0)
        tfidf_acc = tfidf_passed / total * 100
        bm25_acc = bm25_passed / total * 100
        log(f'BASELINES: TF-IDF {tfidf_passed}/{total} ({tfidf_acc:.1f}%) | BM25 {bm25_passed}/{total} ({bm25_acc:.1f}%)')
        log(f'Brain advantage: vs TF-IDF {accuracy - tfidf_acc:+.1f}% | vs BM25 {accuracy - bm25_acc:+.1f}%')
    log('=' * 70)

    if failed_tests:
        log('')
        log(f'FAILED ({suite_name}):')
        for item in failed_tests:
            q, raw, llm, exp = item[0], item[1], item[2], item[3]
            gpt_e = item[4] if len(item) > 4 else None
            log(f'  Q: {q}')
            log(f'  Brain raw: {raw}')
            log(f"  Broca's area (LLM): {llm}")
            log(f'  Expected: {_format_expectation_text(exp)}')
            if gpt_e and not gpt_e.get("error"):
                score = gpt_e.get("score") or gpt_e.get("final") or 0
                log(f'  GPT Score: {score}/10 — {gpt_e.get("explanation", "")[:60]}')
            log('')

    # Calculate accuracies and times for QA baselines (MemNet/NTM only for bAbI)
    bl_accuracies = {}
    bl_times = {}
    for bl_name in ['tfidf', 'bm25']:
        bl_accuracies[f'{bl_name}_accuracy'] = (baseline_passed.get(bl_name, 0) / total * 100) if total > 0 else 0
        bl_times[f'{bl_name}_time'] = baseline_time.get(bl_name, 0)
    # MemNet/NTM are N/A for standard QA tests
    bl_accuracies['memnet_accuracy'] = None
    bl_accuracies['ntm_accuracy'] = None

    return {