multi_model_comparison.py
#!/usr/bin/env python3
"""
Multi-Model VRRE Comparison Tool
Runs VRRE evaluation across multiple models to compare reasoning capabilities.
Generates comprehensive analysis reports showing relative strengths and weaknesses.
Usage:
python multi_model_comparison.py --models apollo-reasoning-enhanced mistral:7b
python multi_model_comparison.py --all-models # Test all available models
python multi_model_comparison.py --apollo-only # Just Apollo variants
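python multi_model_comparison.py --all-models --output report # Also save the full report to a timestamped .txt file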
"""
import ollama
import json
import time
import argparse
from typing import List, Dict, Any, Optional
from datetime import datetime
import sys
import os
from dataclasses import dataclass, asdict
# Import VRRE components
from vrre_eval import VRREvaluator, ReasoningTask
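# NOTE: this script assumes the vrre_eval module provides VRREvaluator(model_name, verbose=...)
# with an evaluate_task(task) method that returns a dict containing 'score', 'extracted_answer',
# 'confidence', and 'response' keys (see evaluate_model below).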
@dataclass
class ModelPerformance:
"""Stores comprehensive performance metrics for a single model"""
model_name: str
total_score: float
accuracy_by_type: Dict[str, float]
accuracy_by_difficulty: Dict[str, float]
avg_response_time: float
total_tasks: int
successful_extractions: int
reasoning_quality_score: float
detailed_results: List[Dict[str, Any]]
class MultiModelComparator:
"""Orchestrates VRRE evaluation across multiple models"""
def __init__(self):
self.results = {}
def get_available_models(self) -> List[str]:
"""Get list of all available Ollama models"""
try:
models = ollama.list()
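# Older ollama Python clients expose the model name under a 'name' key; newer
# releases may use 'model' instead, so adjust the key below if lookup fails.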
return [model['name'] for model in models['models']]
except Exception as e:
print(f"Error fetching models: {e}")
return []
def filter_apollo_models(self, all_models: List[str]) -> List[str]:
"""Filter to only Apollo-related models"""
apollo_keywords = ['apollo', 'vanta-apollo']
return [model for model in all_models
if any(keyword in model.lower() for keyword in apollo_keywords)]
def create_comparative_test_suite(self) -> List[ReasoningTask]:
"""Create a test suite designed to reveal model differences"""
# Standard reasoning tasks
tasks = [
# Logical reasoning - should show differences in reasoning capability
ReasoningTask(
question="All roses are flowers. Some flowers are red. Therefore, some roses are red. Is this conclusion logically valid?",
correct_answer="false",
explanation="This is a classic logical fallacy. While all roses are flowers, and some flowers are red, we cannot conclude that some roses are red because the red flowers might not be roses.",
task_type="logical",
difficulty="medium"
),
# Mathematical reasoning
ReasoningTask(
question="If a train travels 60 miles in 45 minutes, what is its speed in miles per hour?",
correct_answer="80",
explanation="60 miles ÷ 0.75 hours = 80 mph",
task_type="mathematical",
difficulty="easy"
),
# Complex logical chain
ReasoningTask(
question="In a group of 100 people, 70 like coffee, 60 like tea, and 40 like both. How many like neither coffee nor tea?",
correct_answer="10",
explanation="Using set theory: |Coffee ∪ Tea| = |Coffee| + |Tea| - |Coffee ∩ Tea| = 70 + 60 - 40 = 90. So 100 - 90 = 10 like neither.",
task_type="mathematical",
difficulty="hard"
),
# Reading comprehension with inference
ReasoningTask(
question="The ancient library contained thousands of scrolls, but after the fire, only charred fragments remained. Scholars spent decades trying to reconstruct the lost knowledge. What can we infer about the fire's impact?",
correct_answer="devastating",
explanation="The fire was devastating because it destroyed most of the library's contents, leaving only fragments and requiring decades of reconstruction effort.",
task_type="reading_comprehension",
difficulty="medium"
),
# Boolean logic with negation
ReasoningTask(
question="If it's not true that all birds can fly, does this mean no birds can fly?",
correct_answer="false",
explanation="If 'all birds can fly' is false, it only means that at least one bird cannot fly. Many birds can still fly.",
task_type="boolean",
difficulty="medium"
),
# Probabilistic reasoning
ReasoningTask(
question="You flip a fair coin 3 times and get heads each time. What's the probability of getting heads on the 4th flip?",
correct_answer="0.5",
explanation="Each coin flip is independent. Previous results don't affect future flips. The probability remains 50%.",
task_type="mathematical",
difficulty="medium"
),
# Causal reasoning
ReasoningTask(
question="Studies show that people who exercise regularly have lower rates of heart disease. Does this prove that exercise prevents heart disease?",
correct_answer="false",
explanation="Correlation does not imply causation. There could be confounding variables, or people with better health might be more likely to exercise.",
task_type="logical",
difficulty="hard"
),
# Counterfactual reasoning
ReasoningTask(
question="If Shakespeare had been born in the 20th century instead of the 16th, would his plays still be considered masterpieces?",
correct_answer="uncertain",
explanation="This is counterfactual reasoning with no definitive answer. His genius might have adapted to modern times, or the cultural context was essential to his success.",
task_type="logical",
difficulty="hard"
)
]
return tasks
def evaluate_model(self, model_name: str, tasks: List[ReasoningTask], verbose: bool = True) -> ModelPerformance:
"""Evaluate a single model on the test suite"""
if verbose:
print(f"\n🔍 Evaluating {model_name}...")
print("=" * 50)
start_time = time.time()
detailed_results = []
successful_extractions = 0
total_score = 0.0
response_times = []
# Track performance by category
type_scores = {}
type_counts = {}
difficulty_scores = {}
difficulty_counts = {}
# Create evaluator for this specific model
evaluator = VRREvaluator(model_name, verbose=False)
for i, task in enumerate(tasks, 1):
if verbose:
print(f"Task {i}/{len(tasks)}: {task.task_type.title()} ({task.difficulty})")
# Evaluate this task
task_start = time.time()
result = evaluator.evaluate_task(task)
task_time = time.time() - task_start
response_times.append(task_time)
# Extract the score and other metrics from result
score = result.get('score', 0.0)
extracted_answer = result.get('extracted_answer')
confidence = result.get('confidence', 0.0)
raw_response = result.get('response', '')
# Track success
if extracted_answer is not None:
successful_extractions += 1
# Update scores
total_score += score
# Update category tracking
task_type = task.task_type
if task_type not in type_scores:
type_scores[task_type] = 0
type_counts[task_type] = 0
type_scores[task_type] += score
type_counts[task_type] += 1
difficulty = task.difficulty
if difficulty not in difficulty_scores:
difficulty_scores[difficulty] = 0
difficulty_counts[difficulty] = 0
difficulty_scores[difficulty] += score
difficulty_counts[difficulty] += 1
# Store detailed result
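# task.id is assumed to be supplied by ReasoningTask itself (e.g. a default or
# auto-generated value), since the tasks above are constructed without one.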
detailed_results.append({
'task_id': task.id,
'question': task.question,
'correct_answer': task.correct_answer,
'model_response': raw_response,
'extracted_answer': extracted_answer,
'score': score,
'confidence': confidence,
'response_time': task_time,
'task_type': task.task_type,
'difficulty': task.difficulty
})
if verbose:
status = "✓" if score > 0.5 else "✗"
print(f" {status} Score: {score:.2f} | Time: {task_time:.1f}s")
# Calculate final metrics
total_time = time.time() - start_time
avg_response_time = sum(response_times) / len(response_times)
# Calculate accuracy by type and difficulty
accuracy_by_type = {t: type_scores[t] / type_counts[t] for t in type_scores}
accuracy_by_difficulty = {d: difficulty_scores[d] / difficulty_counts[d] for d in difficulty_scores}
# Calculate reasoning quality score: accuracy weighted at 70% plus extraction success at 30%
extraction_rate = successful_extractions / len(tasks)
avg_accuracy = total_score / len(tasks)
reasoning_quality_score = (avg_accuracy * 0.7) + (extraction_rate * 0.3)
performance = ModelPerformance(
model_name=model_name,
total_score=total_score,
accuracy_by_type=accuracy_by_type,
accuracy_by_difficulty=accuracy_by_difficulty,
avg_response_time=avg_response_time,
total_tasks=len(tasks),
successful_extractions=successful_extractions,
reasoning_quality_score=reasoning_quality_score,
detailed_results=detailed_results
)
if verbose:
print(f"\n📊 Results for {model_name}:")
print(f" Overall Score: {avg_accuracy:.1%}")
print(f" Extraction Success: {extraction_rate:.1%}")
print(f" Reasoning Quality: {reasoning_quality_score:.1%}")
print(f" Avg Response Time: {avg_response_time:.1f}s")
return performance
def compare_models(self, model_names: List[str], output_file: Optional[str] = None) -> Dict[str, ModelPerformance]:
"""Compare multiple models and generate analysis"""
print(f"Starting VRRE Multi-Model Comparison")
print(f"Models to test: {', '.join(model_names)}")
print(f"Timestamp: {datetime.now().isoformat()}")
# Create test suite
tasks = self.create_comparative_test_suite()
print(f"Test suite: {len(tasks)} reasoning tasks")
# Evaluate each model
results = {}
for model_name in model_names:
try:
results[model_name] = self.evaluate_model(model_name, tasks)
except Exception as e:
print(f"❌ Error evaluating {model_name}: {e}")
continue
# Generate comparative analysis
self.generate_analysis_report(results, output_file)
return results
def generate_analysis_report(self, results: Dict[str, ModelPerformance], output_file: Optional[str] = None):
"""Generate comprehensive analysis report"""
if not results:
print("No results to analyze!")
return
report = []
report.append("🔬 VRRE Multi-Model Analysis Report")
report.append("=" * 50)
report.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
report.append(f"Models evaluated: {len(results)}")
report.append("")
# Overall rankings
report.append("🏆 Overall Rankings (by Reasoning Quality Score)")
report.append("-" * 30)
sorted_models = sorted(results.items(), key=lambda x: x[1].reasoning_quality_score, reverse=True)
for i, (model_name, perf) in enumerate(sorted_models, 1):
report.append(f"{i}. {model_name}")
report.append(f" Reasoning Quality: {perf.reasoning_quality_score:.1%}")
report.append(f" Accuracy: {perf.total_score / perf.total_tasks:.1%}")
report.append(f" Extraction Rate: {perf.successful_extractions / perf.total_tasks:.1%}")
report.append("")
# Performance by task type
report.append("📊 Performance by Task Type")
report.append("-" * 30)
# Get all task types
all_types = set()
for perf in results.values():
all_types.update(perf.accuracy_by_type.keys())
for task_type in sorted(all_types):
report.append(f"\n{task_type.title()} Reasoning:")
type_results = [(name, perf.accuracy_by_type.get(task_type, 0))
for name, perf in results.items()]
type_results.sort(key=lambda x: x[1], reverse=True)
for name, score in type_results:
report.append(f" {name}: {score:.1%}")
# Performance by difficulty
report.append("\n🎯 Performance by Difficulty")
report.append("-" * 30)
all_difficulties = set()
for perf in results.values():
all_difficulties.update(perf.accuracy_by_difficulty.keys())
for difficulty in ['easy', 'medium', 'hard']: # Ordered by difficulty
if difficulty in all_difficulties:
report.append(f"\n{difficulty.title()} Tasks:")
diff_results = [(name, perf.accuracy_by_difficulty.get(difficulty, 0))
for name, perf in results.items()]
diff_results.sort(key=lambda x: x[1], reverse=True)
for name, score in diff_results:
report.append(f" {name}: {score:.1%}")
# Response time analysis
report.append("\n⏱️ Response Time Analysis")
report.append("-" * 30)
time_results = [(name, perf.avg_response_time) for name, perf in results.items()]
time_results.sort(key=lambda x: x[1])
for name, avg_time in time_results:
report.append(f"{name}: {avg_time:.1f}s average")
# Key insights
report.append("\n💡 Key Insights")
report.append("-" * 30)
best_overall = sorted_models[0]
worst_overall = sorted_models[-1]
report.append(f"• Best overall reasoning: {best_overall[0]} ({best_overall[1].reasoning_quality_score:.1%})")
report.append(f"• Needs improvement: {worst_overall[0]} ({worst_overall[1].reasoning_quality_score:.1%})")
# Find the model with biggest gap between easy and hard tasks
difficulty_gaps = []
for name, perf in results.items():
if 'easy' in perf.accuracy_by_difficulty and 'hard' in perf.accuracy_by_difficulty:
gap = perf.accuracy_by_difficulty['easy'] - perf.accuracy_by_difficulty['hard']
difficulty_gaps.append((name, gap))
if difficulty_gaps:
difficulty_gaps.sort(key=lambda x: x[1], reverse=True)
largest_gap = difficulty_gaps[0]
smallest_gap = difficulty_gaps[-1]
report.append(f"• Most difficulty-sensitive: {largest_gap[0]} ({largest_gap[1]:.1%} gap)")
report.append(f"• Most consistent across difficulty: {smallest_gap[0]} ({smallest_gap[1]:.1%} gap)")
# Find fastest model
fastest = min(time_results, key=lambda x: x[1])
slowest = max(time_results, key=lambda x: x[1])
report.append(f"• Fastest responses: {fastest[0]} ({fastest[1]:.1f}s avg)")
report.append(f"• Slowest responses: {slowest[0]} ({slowest[1]:.1f}s avg)")
# Print report
full_report = "\n".join(report)
print("\n" + full_report)
# Save to file if requested
if output_file:
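# NOTE: output_file currently acts as an opt-in flag; the report is always written
# to a timestamped vrre_comparison_*.txt file rather than to the given path.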
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"vrre_comparison_{timestamp}.txt"
with open(filename, 'w') as f:
f.write(full_report)
f.write("\n\n" + "="*50)
f.write("\nDetailed Results JSON:\n")
f.write(json.dumps({name: asdict(perf) for name, perf in results.items()}, indent=2))
print(f"\n📄 Full report saved to: {filename}")
return full_report
def main():
parser = argparse.ArgumentParser(description="Multi-Model VRRE Comparison")
parser.add_argument("--models", nargs="+", help="Specific models to test")
parser.add_argument("--all-models", action="store_true", help="Test all available models")
parser.add_argument("--apollo-only", action="store_true", help="Test only Apollo variants")
parser.add_argument("--output", help="Save detailed report to file")
parser.add_argument("--quick", action="store_true", help="Run shorter test suite")
args = parser.parse_args()
comparator = MultiModelComparator()
# Determine which models to test
if args.models:
models_to_test = args.models
elif args.all_models:
models_to_test = comparator.get_available_models()
elif args.apollo_only:
all_models = comparator.get_available_models()
models_to_test = comparator.filter_apollo_models(all_models)
else:
# Default: test a few key models
all_models = comparator.get_available_models()
priority_models = ['apollo-reasoning-enhanced', 'apollo-system-prompt', 'mistral:7b']
models_to_test = [m for m in priority_models if m in all_models]
if not models_to_test:
print("No priority models found. Use --all-models to test everything available.")
return
if not models_to_test:
print("No models to test!")
return
print(f"Testing {len(models_to_test)} models with VRRE...")
# Run comparison
results = comparator.compare_models(models_to_test, args.output)
print(f"\n✅ Comparison complete! Tested {len(results)} models successfully.")
if __name__ == "__main__":
main()