#!/usr/bin/env python3
# Source: vrre/examples.py at main · vanta-research/vrre · GitHub
"""
Example usage of the VANTA Research Reasoning Evaluation (VRRE)

This script demonstrates various ways to use the evaluation framework
for measuring LLM reasoning capabilities.
"""

from vrre_eval import (
    VRREvaluator,
    ReasoningTask,
    compare_models,
    save_results
)

def example_basic_evaluation():
    """Example 1: evaluate a single model and print its summary.

    Builds a ``VRREvaluator`` for the "apollo-reasoning-enhanced" model, runs
    ``run_evaluation()`` with no arguments, and hands the outcome to the
    evaluator's ``print_summary``.

    Returns:
        The value returned by ``VRREvaluator.run_evaluation()``.
    """
    print("🔬 Example 1: Basic Model Evaluation")

    model_evaluator = VRREvaluator("apollo-reasoning-enhanced")
    outcome = model_evaluator.run_evaluation()
    model_evaluator.print_summary(outcome)
    return outcome

def example_comparative_evaluation():
    """Example 2: run the same evaluation across several models.

    Returns:
        The value returned by ``compare_models`` for the two model names
        listed below.
    """
    print("\n🔬 Example 2: Comparative Model Evaluation")

    return compare_models(["apollo-reasoning-enhanced", "apollo-system-prompt"])

def example_custom_tasks():
    """Example 3: evaluate a model on three hand-written reasoning tasks.

    The tasks (a deduction, an arithmetic word problem, and a disjunctive
    syllogism) are declared as keyword mappings, converted to
    ``ReasoningTask`` objects, and passed to ``run_evaluation`` as a list.

    Returns:
        The value returned by ``VRREvaluator.run_evaluation(custom_tasks)``.
    """
    print("\n🔬 Example 3: Custom Reasoning Tasks")

    # One mapping per task; the keys are the keyword arguments given to
    # ReasoningTask, in the same order for every task.
    task_specs = (
        {
            "question": "If all swans are white, and this bird is a swan, what color is this bird?",
            "correct_answer": "white",
            "explanation": "Valid deductive reasoning from universal statement",
            "task_type": "boolean",
            "difficulty": "easy",
            "id": "custom_swan_logic",
        },
        {
            "question": "A store sells apples for $2 each. If you buy 3 apples and pay with a $10 bill, how much change do you get?",
            "correct_answer": "4",
            "explanation": "Cost: 3 × $2 = $6, Change: $10 - $6 = $4",
            "task_type": "mathematical",
            "difficulty": "medium",
            "id": "custom_apple_math",
        },
        {
            "question": "Either it's raining or it's sunny. It's not raining. What's the weather?",
            "correct_answer": "sunny",
            "explanation": "Disjunctive syllogism: A or B, not A, therefore B",
            "task_type": "logical",
            "difficulty": "easy",
            "id": "custom_weather_logic",
        },
    )
    custom_tasks = [ReasoningTask(**spec) for spec in task_specs]

    task_evaluator = VRREvaluator("apollo-reasoning-enhanced")
    outcome = task_evaluator.run_evaluation(custom_tasks)
    task_evaluator.print_summary(outcome)
    return outcome

def example_save_and_analyze():
    """Example 4: run an evaluation, save it, and print a breakdown.

    Steps:
      1. Evaluate "apollo-reasoning-enhanced" with ``verbose=False``.
      2. Pass ``{model_name: results}`` to ``save_results`` with the file name
         "example_results.json" and print whatever it returns.
      3. Print one line per entry of ``results['results']`` that has no
         'error' key (id, score, confidence, and a pass/fail marker).
      4. If there is at least one correct and one incorrect entry carrying a
         confidence, print the mean confidence of each group and label the
         calibration "Good" when the correct group's mean is higher.

    Returns:
        The value returned by ``VRREvaluator.run_evaluation()``.
    """
    print("\n🔬 Example 4: Save and Analyze Results")

    # Run evaluation
    model_name = "apollo-reasoning-enhanced"
    quiet_evaluator = VRREvaluator(model_name, verbose=False)
    report = quiet_evaluator.run_evaluation()

    # Save to file
    saved_to = save_results({model_name: report}, "example_results.json")
    print(f"Results saved to: {saved_to}")

    # Analyze specific aspects
    print("\n📊 Detailed Analysis:")

    # Task-by-task breakdown; entries flagged with an 'error' key are skipped.
    print("\nTask Performance:")
    for entry in report['results']:
        if 'error' in entry:
            continue
        task_info = entry['task']
        marker = "✅" if entry['correct'] else "❌"
        print(f"  {marker} {task_info['id']}: {entry['score']:.2f} (confidence: {entry['confidence']:.2f})")

    # Confidence correlation: split confidences by whether the answer was
    # correct, considering only entries that carry both keys.
    confidences_when_right = []
    confidences_when_wrong = []
    for entry in report['results']:
        if 'confidence' in entry and 'correct' in entry:
            bucket = confidences_when_right if entry['correct'] else confidences_when_wrong
            bucket.append(entry['confidence'])

    # The comparison only makes sense when both groups are non-empty.
    if not (confidences_when_right and confidences_when_wrong):
        return report

    mean_right = sum(confidences_when_right) / len(confidences_when_right)
    mean_wrong = sum(confidences_when_wrong) / len(confidences_when_wrong)
    print("\nConfidence Analysis:")
    print(f"  Average confidence when correct: {mean_right:.3f}")
    print(f"  Average confidence when incorrect: {mean_wrong:.3f}")
    verdict = 'Good' if mean_right > mean_wrong else 'Poor'
    print(f"  Confidence calibration: {verdict}")

    return report

def example_quiet_mode():
    """Example 5: evaluate with ``verbose=False`` and print only key metrics.

    Reads 'model_name', 'accuracy', 'average_score', 'valid_results' and
    'total_tasks' from the evaluation result and prints one line for each
    metric (the last two share a line).

    Returns:
        The value returned by ``VRREvaluator.run_evaluation()``.
    """
    print("\n🔬 Example 5: Quiet Mode (for automation)")

    quiet_evaluator = VRREvaluator("apollo-reasoning-enhanced", verbose=False)
    metrics = quiet_evaluator.run_evaluation()

    # Just print key metrics; each value is looked up right before its line.
    model_label = metrics['model_name']
    print(f"Model: {model_label}")
    accuracy = metrics['accuracy']
    print(f"Accuracy: {accuracy:.1%}")
    mean_score = metrics['average_score']
    print(f"Average Score: {mean_score:.3f}")
    completed, total = metrics['valid_results'], metrics['total_tasks']
    print(f"Tasks Completed: {completed}/{total}")

    return metrics

def main():
    """Run all examples in order, reporting failures without stopping.

    Each example runs inside its own ``try`` so that one failing example
    (for instance, because a model is unavailable) does not prevent the
    remaining ones from running. A failure is reported as
    ``Example <n> failed: <error>`` where ``<n>`` is the example's 1-based
    position in the list below.
    """
    print("VANTA Research Reasoning Evaluation (VRRE) - Example Usage")
    print("=" * 70)

    # Run examples (comment out any you don't want to run). Note that the
    # number reported on failure is the position in this tuple.
    examples = (
        example_basic_evaluation,
        example_comparative_evaluation,
        example_custom_tasks,
        example_save_and_analyze,
        example_quiet_mode,
    )
    for number, run_example in enumerate(examples, start=1):
        try:
            run_example()
        except Exception as exc:
            # Top-level boundary of a demo script: report and keep going.
            print(f"Example {number} failed: {exc}")

    print("\n✅ Examples completed!")
    print("\nNext steps:")
    print("1. Modify the custom tasks for your specific use case")
    print("2. Add your own models to the comparison")
    print("3. Integrate the evaluator into your model development pipeline")
    print("4. Use the saved JSON results for further analysis")

# Run the examples only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()