multi_model_comparison.py
#!/usr/bin/env python3
"""
Multi-Model VRRE Comparison Tool
Runs VRRE evaluation across multiple models to compare reasoning capabilities.
Generates comprehensive analysis reports showing relative strengths and weaknesses.
Usage:
python multi_model_comparison.py --models apollo-reasoning-enhanced mistral:7b
python multi_model_comparison.py --all-models # Test all available models
python multi_model_comparison.py --apollo-only # Just Apollo variants
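python multi_model_comparison.py --all-models --output report # Also save the full report to a timestamped .txt file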
"""
import ollama
import json
import time
import argparse
from typing import List, Dict, Any, Optional
from datetime import datetime
import sys
import os
from dataclasses import dataclass, asdict
# Import VRRE components
from vrre_eval import VRREvaluator, ReasoningTask
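# NOTE: this script assumes the vrre_eval module provides VRREvaluator(model_name, verbose=...)
# with an evaluate_task(task) method that returns a dict containing 'score', 'extracted_answer',
# 'confidence', and 'response' keys (see evaluate_model below).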
@dataclass
class ModelPerformance:
"""Stores comprehensive performance metrics for a single model"""
model_name: str
total_score: float
accuracy_by_type: Dict[str, float]
accuracy_by_difficulty: Dict[str, float]
avg_response_time: float
total_tasks: int
successful_extractions: int
reasoning_quality_score: float
detailed_results: List[Dict[str, Any]]
class MultiModelComparator:
"""Orchestrates VRRE evaluation across multiple models"""
def __init__(self):
self.results = {}
def get_available_models(self) -> List[str]:
"""Get list of all available Ollama models"""
try:
models = ollama.list()
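# Older ollama Python clients expose the model name under a 'name' key; newer
# releases may use 'model' instead, so adjust the key below if lookup fails.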
return [model['name'] for model in models['models']]
except Exception as e:
print(f"Error fetching models: {e}")
return []
def filter_apollo_models(self, all_models: List[str]) -> List[str]:
"""Filter to only Apollo-related models"""
apollo_keywords = ['apollo', 'vanta-apollo']
return [model for model in all_models
if any(keyword in model.lower() for keyword in apollo_keywords)]
def create_comparative_test_suite(self) -> List[ReasoningTask]:
"""Create a test suite designed to reveal model differences"""
# Standard reasoning tasks
tasks = [
# Logical reasoning - should show differences in reasoning capability
ReasoningTask(
question="All roses are flowers. Some flowers are red. Therefore, some roses are red. Is this conclusion logically valid?",
correct_answer="false",
explanation="This is a classic logical fallacy. While all roses are flowers, and some flowers are red, we cannot conclude that some roses are red because the red flowers might not be roses.",
task_type="logical",
difficulty="medium"
),
# Mathematical reasoning
ReasoningTask(
question="If a train travels 60 miles in 45 minutes, what is its speed in miles per hour?",
correct_answer="80",
explanation="60 miles ÷ 0.75 hours = 80 mph",
task_type="mathematical",
difficulty="easy"
),
# Complex logical chain
ReasoningTask(
question="In a group of 100 people, 70 like coffee, 60 like tea, and 40 like both. How many like neither coffee nor tea?",
correct_answer="10",
explanation="Using set theory: |Coffee ∪ Tea| = |Coffee| + |Tea| - |Coffee ∩ Tea| = 70 + 60 - 40 = 90. So 100 - 90 = 10 like neither.",
task_type="mathematical",
difficulty="hard"
),
# Reading comprehension with inference
ReasoningTask(
question="The ancient library contained thousands of scrolls, but after the fire, only charred fragments remained. Scholars spent decades trying to reconstruct the lost knowledge. What can we infer about the fire's impact?",
correct_answer="devastating",
explanation="The fire was devastating because it destroyed most of the library's contents, leaving only fragments and requiring decades of reconstruction effort.",
task_type="reading_comprehension",
difficulty="medium"
),
# Boolean logic with negation
ReasoningTask(
question="If it's not true that all birds can fly, does this mean no birds can fly?",
correct_answer="false",
explanation="If 'all birds can fly' is false, it only means that at least one bird cannot fly. Many birds can still fly.",
task_type="boolean",
difficulty="medium"
),
# Probabilistic reasoning
ReasoningTask(
question="You flip a fair coin 3 times and get heads each time. What's the probability of getting heads on the 4th flip?",
correct_answer="0.5",
explanation="Each coin flip is independent. Previous results don't affect future flips. The probability remains 50%.",
task_type="mathematical",
difficulty="medium"
),
# Causal reasoning
ReasoningTask(
question="Studies show that people who exercise regularly have lower rates of heart disease. Does this prove that exercise prevents heart disease?",
correct_answer="false",
explanation="Correlation does not imply causation. There could be confounding variables, or people with better health might be more likely to exercise.",
task_type="logical",
difficulty="hard"
),
# Counterfactual reasoning
ReasoningTask(
question="If Shakespeare had been born in the 20th century instead of the 16th, would his plays still be considered masterpieces?",
correct_answer="uncertain",
explanation="This is counterfactual reasoning with no definitive answer. His genius might have adapted to modern times, or the cultural context was essential to his success.",
task_type="logical",
difficulty="hard"
)
]
return tasks
def evaluate_model(self, model_name: str, tasks: List[ReasoningTask], verbose: bool = True) -> ModelPerformance:
"""Evaluate a single model on the test suite"""
if verbose:
print(f"\n🔍 Evaluating {model_name}...")
print("=" * 50)
start_time = time.time()
detailed_results = []
successful_extractions = 0
total_score = 0.0
response_times = []
# Track performance by category
type_scores = {}
type_counts = {}
difficulty_scores = {}
difficulty_counts = {}
# Create evaluator for this specific model
evaluator = VRREvaluator(model_name, verbose=False)
for i, task in enumerate(tasks, 1):
if verbose:
print(f"Task {i}/{len(tasks)}: {task.task_type.title()} ({task.difficulty})")
# Evaluate this task
task_start = time.time()
result = evaluator.evaluate_task(task)
task_time = time.time() - task_start
response_times.append(task_time)
# Extract the score and other metrics from result
score = result.get('score', 0.0)
extracted_answer = result.get('extracted_answer')
confidence = result.get('confidence', 0.0)
raw_response = result.get('response', '')
# Track success
if extracted_answer is not None:
successful_extractions += 1
# Update scores
total_score += score
# Update category tracking
task_type = task.task_type
if task_type not in type_scores:
type_scores[task_type] = 0
type_counts[task_type] = 0
type_scores[task_type] += score
type_counts[task_type] += 1
difficulty = task.difficulty
if difficulty not in difficulty_scores:
difficulty_scores[difficulty] = 0
difficulty_counts[difficulty] = 0
difficulty_scores[difficulty] += score
difficulty_counts[difficulty] += 1
# Store detailed result
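# task.id is assumed to be supplied by ReasoningTask itself (e.g. a default or
# auto-generated value), since the tasks above are constructed without one.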
detailed_results.append({
'task_id': task.id,
'question': task.question,
'correct_answer': task.correct_answer,
'model_response': raw_response,
'extracted_answer': extracted_answer,
'score': score,
'confidence': confidence,
'response_time': task_time,
'task_type': task.task_type,
'difficulty': task.difficulty
})
if verbose:
status = "✓" if score > 0.5 else "✗"
print(f" {status} Score: {score:.2f} | Time: {task_time:.1f}s")
# Calculate final metrics
total_time = time.time() - start_time
avg_response_time = sum(response_times) / len(response_times)
# Calculate accuracy by type and difficulty
accuracy_by_type = {t: type_scores[t] / type_counts[t] for t in type_scores}
accuracy_by_difficulty = {d: difficulty_scores[d] / difficulty_counts[d] for d in difficulty_scores}
# Calculate reasoning quality score: accuracy weighted at 70% plus extraction success at 30%
extraction_rate = successful_extractions / len(tasks)
avg_accuracy = total_score / len(tasks)
reasoning_quality_score = (avg_accuracy * 0.7) + (extraction_rate * 0.3)
performance = ModelPerformance(
model_name=model_name,
total_score=total_score,
accuracy_by_type=accuracy_by_type,
accuracy_by_difficulty=accuracy_by_difficulty,
avg_response_time=avg_response_time,
total_tasks=len(tasks),
successful_extractions=successful_extractions,
reasoning_quality_score=reasoning_quality_score,
detailed_results=detailed_results
)
if verbose:
print(f"\n📊 Results for {model_name}:")
print(f" Overall Score: {avg_accuracy:.1%}")
print(f" Extraction Success: {extraction_rate:.1%}")
print(f" Reasoning Quality: {reasoning_quality_score:.1%}")
print(f" Avg Response Time: {avg_response_time:.1f}s")
return performance
def compare_models(self, model_names: List[str], output_file: Optional[str] = None) -> Dict[str, ModelPerformance]:
"""Compare multiple models and generate analysis"""
print(f"Starting VRRE Multi-Model Comparison")
print(f"Models to test: {', '.join(model_names)}")
print(f"Timestamp: {datetime.now().isoformat()}")
# Create test suite
tasks = self.create_comparative_test_suite()
print(f"Test suite: {len(tasks)} reasoning tasks")
# Evaluate each model
results = {}
for model_name in model_names:
try:
results[model_name] = self.evaluate_model(model_name, tasks)
except Exception as e:
print(f"❌ Error evaluating {model_name}: {e}")
continue
# Generate comparative analysis
self.generate_analysis_report(results, output_file)
return results
def generate_analysis_report(self, results: Dict[str, ModelPerformance], output_file: Optional[str] = None):
"""Generate comprehensive analysis report"""
if not results:
print("No results to analyze!")
return
report = []
report.append("🔬 VRRE Multi-Model Analysis Report")
report.append("=" * 50)
report.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
report.append(f"Models evaluated: {len(results)}")
report.append("")
# Overall rankings
report.append("🏆 Overall Rankings (by Reasoning Quality Score)")
report.append("-" * 30)
sorted_models = sorted(results.items(), key=lambda x: x[1].reasoning_quality_score, reverse=True)
for i, (model_name, perf) in enumerate(sorted_models, 1):
report.append(f"{i}. {model_name}")
report.append(f" Reasoning Quality: {perf.reasoning_quality_score:.1%}")
report.append(f" Accuracy: {perf.total_score / perf.total_tasks:.1%}")
report.append(f" Extraction Rate: {perf.successful_extractions / perf.total_tasks:.1%}")
report.append("")
# Performance by task type
report.append("📊 Performance by Task Type")
report.append("-" * 30)
# Get all task types
all_types = set()
for perf in results.values():
all_types.update(perf.accuracy_by_type.keys())
for task_type in sorted(all_types):
report.append(f"\n{task_type.title()} Reasoning:")
type_results = [(name, perf.accuracy_by_type.get(task_type, 0))
for name, perf in results.items()]
type_results.sort(key=lambda x: x[1], reverse=True)
for name, score in type_results:
report.append(f" {name}: {score:.1%}")
# Performance by difficulty
report.append("\n🎯 Performance by Difficulty")
report.append("-" * 30)
all_difficulties = set()
for perf in results.values():
all_difficulties.update(perf.accuracy_by_difficulty.keys())
for difficulty in ['easy', 'medium', 'hard']: # Ordered by difficulty
if difficulty in all_difficulties:
report.append(f"\n{difficulty.title()} Tasks:")
diff_results = [(name, perf.accuracy_by_difficulty.get(difficulty, 0))
for name, perf in results.items()]
diff_results.sort(key=lambda x: x[1], reverse=True)
for name, score in diff_results:
report.append(f" {name}: {score:.1%}")
# Response time analysis
report.append("\n⏱️ Response Time Analysis")
report.append("-" * 30)
time_results = [(name, perf.avg_response_time) for name, perf in results.items()]
time_results.sort(key=lambda x: x[1])
for name, avg_time in time_results:
report.append(f"{name}: {avg_time:.1f}s average")
# Key insights
report.append("\n💡 Key Insights")
report.append("-" * 30)
best_overall = sorted_models[0]
worst_overall = sorted_models[-1]
report.append(f"• Best overall reasoning: {best_overall[0]} ({best_overall[1].reasoning_quality_score:.1%})")
report.append(f"• Needs improvement: {worst_overall[0]} ({worst_overall[1].reasoning_quality_score:.1%})")
# Find the model with biggest gap between easy and hard tasks
difficulty_gaps = []
for name, perf in results.items():
if 'easy' in perf.accuracy_by_difficulty and 'hard' in perf.accuracy_by_difficulty:
gap = perf.accuracy_by_difficulty['easy'] - perf.accuracy_by_difficulty['hard']
difficulty_gaps.append((name, gap))
if difficulty_gaps:
difficulty_gaps.sort(key=lambda x: x[1], reverse=True)
largest_gap = difficulty_gaps[0]
smallest_gap = difficulty_gaps[-1]
report.append(f"• Most difficulty-sensitive: {largest_gap[0]} ({largest_gap[1]:.1%} gap)")
report.append(f"• Most consistent across difficulty: {smallest_gap[0]} ({smallest_gap[1]:.1%} gap)")
# Find fastest model
fastest = min(time_results, key=lambda x: x[1])
slowest = max(time_results, key=lambda x: x[1])
report.append(f"• Fastest responses: {fastest[0]} ({fastest[1]:.1f}s avg)")
report.append(f"• Slowest responses: {slowest[0]} ({slowest[1]:.1f}s avg)")
# Print report
full_report = "\n".join(report)
print("\n" + full_report)
# Save to file if requested
if output_file:
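# NOTE: output_file currently acts as an opt-in flag; the report is always written
# to a timestamped vrre_comparison_*.txt file rather than to the given path.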
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"vrre_comparison_{timestamp}.txt"
with open(filename, 'w') as f:
f.write(full_report)
f.write("\n\n" + "="*50)
f.write("\nDetailed Results JSON:\n")
f.write(json.dumps({name: asdict(perf) for name, perf in results.items()}, indent=2))
print(f"\n📄 Full report saved to: {filename}")
return full_report
def main():
parser = argparse.ArgumentParser(description="Multi-Model VRRE Comparison")
parser.add_argument("--models", nargs="+", help="Specific models to test")
parser.add_argument("--all-models", action="store_true", help="Test all available models")
parser.add_argument("--apollo-only", action="store_true", help="Test only Apollo variants")
parser.add_argument("--output", help="Save detailed report to file")
parser.add_argument("--quick", action="store_true", help="Run shorter test suite")
args = parser.parse_args()
comparator = MultiModelComparator()
# Determine which models to test
if args.models:
models_to_test = args.models
elif args.all_models:
models_to_test = comparator.get_available_models()
elif args.apollo_only:
all_models = comparator.get_available_models()
models_to_test = comparator.filter_apollo_models(all_models)
else:
# Default: test a few key models
all_models = comparator.get_available_models()
priority_models = ['apollo-reasoning-enhanced', 'apollo-system-prompt', 'mistral:7b']
models_to_test = [m for m in priority_models if m in all_models]
if not models_to_test:
print("No priority models found. Use --all-models to test everything available.")
return
if not models_to_test:
print("No models to test!")
return
print(f"Testing {len(models_to_test)} models with VRRE...")
# Run comparison
results = comparator.compare_models(models_to_test, args.output)
print(f"\n✅ Comparison complete! Tested {len(results)} models successfully.")
if __name__ == "__main__":
main()