-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path main.py
More file actions
90 lines (70 loc) · 2.57 KB
/
main.py
File metadata and controls
90 lines (70 loc) · 2.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
"""
Main entry point for the Agentic Stack Testing Framework
Usage:
python main.py
"""
import os
import sys
import datetime
from tester import Mastif
from config import ConfigExpert
def main():
    """Main execution function with Mind2Web support.

    Reads an optional config-file path from ``argv[1]`` (falling back to
    ``experiments/example.yaml``), verifies that the ``HF_TOKEN`` and
    ``OPENAI_API_KEY`` environment variables are set, then runs either the
    Mind2Web benchmark or the standard test suite depending on the config's
    ``test_mode`` value. Results are exported as timestamped JSON files
    under ``./logs/``.

    Returns:
        int: 0 on success, 1 when a required environment variable is missing.
    """
    # Check for config file argument
    if len(sys.argv) > 1:
        config_path = sys.argv[1]
    else:
        config_path = "experiments/example.yaml"  # Default config file

    config = ConfigExpert.get_instance(config_path)
    MODE = config.get("test_mode", "standard")  # "standard" or "mind2web"

    # Fail fast before doing any work if credentials are missing.
    if not os.getenv("HF_TOKEN"):
        print("ERROR: HF_TOKEN environment variable not set!")
        print("Please set it with: export HF_TOKEN='your_token_here'")
        return 1
    if not os.getenv("OPENAI_API_KEY"):
        print("ERROR: OPENAI_API_KEY environment variable not set!")
        print("Please set it with: export OPENAI_API_KEY='your_key_here'")
        return 1

    # Initialize tester
    tester = Mastif(config_path)
    try:
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        if MODE == "mind2web":
            # Run Mind2Web evaluation
            print("="*70)
            print("MIND2WEB BENCHMARK MODE")
            print("="*70)
            tester.run_mind2web_evaluation()
            tester.print_summary()

            # Export Mind2Web results
            filename = f"./logs/mind2web-results-{timestamp}.json"
            tester.export_mind2web_results(filename)
            # Also export standard results
            standard_filename = f"./logs/results-{timestamp}.json"
            tester.export_results(standard_filename)

            print(f"\n{'='*70}")
            print(f"Mind2Web evaluation complete!")
            # BUGFIX: previously printed a literal placeholder instead of the path.
            print(f"Results: {filename}")
            print(f"Full logs: {standard_filename}")
            print(f"{'='*70}\n")
        else:
            # Run standard evaluation
            print("="*70)
            print("STANDARD TESTING MODE")
            print("="*70)
            tester.run_comprehensive_test()
            tester.print_summary()

            filename = f"./logs/results-{timestamp}.json"
            tester.export_results(filename)

            print(f"\n{'='*70}")
            # BUGFIX: previously printed a literal placeholder instead of the path.
            print(f"Testing complete! Check {filename} for detailed results.")
            print(f"{'='*70}\n")
    finally:
        # Cleanup of resources, file handles, etc. — now guaranteed even if a
        # run raises, which the original flow skipped.
        tester.close()
    return 0
if __name__ == "__main__":
    # Use sys.exit rather than the exit() builtin: exit() is injected by the
    # `site` module for interactive use and may be unavailable (e.g. under
    # `python -S`); sys.exit is the canonical way to propagate main()'s
    # integer exit status to the shell.
    sys.exit(main())