-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path main.py
More file actions
90 lines (70 loc) · 2.57 KB
/
main.py
File metadata and controls
90 lines (70 loc) · 2.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
"""
Main entry point for the Agentic Stack Testing Framework
Usage:
python main.py
"""
import os
import sys
import datetime
from tester import Mastif
from config import ConfigExpert
def main():
    """Main execution function with Mind2Web support.

    Reads an optional config-file path from ``argv[1]`` (falling back to
    ``experiments/example.yaml``), verifies that the ``HF_TOKEN`` and
    ``OPENAI_API_KEY`` environment variables are set, then runs either the
    Mind2Web benchmark or the standard test suite depending on the config's
    ``test_mode`` value. Results are exported as timestamped JSON files
    under ``./logs/``.

    Returns:
        int: 0 on success, 1 when a required environment variable is missing.
    """
    # Check for config file argument
    if len(sys.argv) > 1:
        config_path = sys.argv[1]
    else:
        config_path = "experiments/example.yaml"  # Default config file

    config = ConfigExpert.get_instance(config_path)
    MODE = config.get("test_mode", "standard")  # "standard" or "mind2web"

    # Fail fast before doing any work if credentials are missing.
    if not os.getenv("HF_TOKEN"):
        print("ERROR: HF_TOKEN environment variable not set!")
        print("Please set it with: export HF_TOKEN='your_token_here'")
        return 1
    if not os.getenv("OPENAI_API_KEY"):
        print("ERROR: OPENAI_API_KEY environment variable not set!")
        print("Please set it with: export OPENAI_API_KEY='your_key_here'")
        return 1

    # Initialize tester
    tester = Mastif(config_path)
    try:
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        if MODE == "mind2web":
            # Run Mind2Web evaluation
            print("="*70)
            print("MIND2WEB BENCHMARK MODE")
            print("="*70)
            tester.run_mind2web_evaluation()
            tester.print_summary()

            # Export Mind2Web results
            filename = f"./logs/mind2web-results-{timestamp}.json"
            tester.export_mind2web_results(filename)
            # Also export standard results
            standard_filename = f"./logs/results-{timestamp}.json"
            tester.export_results(standard_filename)

            print(f"\n{'='*70}")
            print(f"Mind2Web evaluation complete!")
            # BUGFIX: previously printed a literal placeholder instead of the path.
            print(f"Results: {filename}")
            print(f"Full logs: {standard_filename}")
            print(f"{'='*70}\n")
        else:
            # Run standard evaluation
            print("="*70)
            print("STANDARD TESTING MODE")
            print("="*70)
            tester.run_comprehensive_test()
            tester.print_summary()

            filename = f"./logs/results-{timestamp}.json"
            tester.export_results(filename)

            print(f"\n{'='*70}")
            # BUGFIX: previously printed a literal placeholder instead of the path.
            print(f"Testing complete! Check {filename} for detailed results.")
            print(f"{'='*70}\n")
    finally:
        # Cleanup of resources, file handles, etc. — now guaranteed even if a
        # run raises, which the original flow skipped.
        tester.close()
    return 0
if __name__ == "__main__":
    # Use sys.exit rather than the exit() builtin: exit() is injected by the
    # `site` module for interactive use and may be unavailable (e.g. under
    # `python -S`); sys.exit is the canonical way to propagate main()'s
    # integer exit status to the shell.
    sys.exit(main())