# =============================================================================
# MASTIF — Multi-Agent System TestIng Framework
# Example configuration file — copy and edit to create your own experiment.
# =============================================================================
experiment:
  name: "Accessibility Expert WCAG"
  description: "Accessibility Expert on ABNT NBR 17225 and WCAG 2.2."
# -----------------------------------------------------------------------------
# Model settings
# -----------------------------------------------------------------------------
temperature: 0.7 # Sampling temperature (0.0 = deterministic, 1.0 = creative; this influences tool use)
max_tokens: 1024 # Max tokens generated per request
max_steps: 3 # Max research iterations per agent workflow (default: 3)
max_tool_rounds: 5 # Max tool execution rounds per generate() call (default: 5)
# -----------------------------------------------------------------------------
# Workflow
# Defines the sequence of nodes each agent executes.
# Each node references a prompt template file under the prompts/ folder.
#
# Node fields:
# name : Unique node identifier
# prompt_template : Path to the prompt template file (relative to this config)
# output_key : State dict key where the node's output is stored
# loop : If true, the node repeats until step > max_steps
#
# entry_node : First node to execute
# exit_node : Last node; its output_key value is returned as the final answer
# -----------------------------------------------------------------------------
workflow:
  nodes:
    - name: "plan"
      prompt_template: "prompts/plan.txt"
      output_key: "plan"
      loop: false
    - name: "research"
      prompt_template: "prompts/research.txt"
      output_key: "research_results"
      loop: true
    - name: "report"
      prompt_template: "prompts/report.txt"
      output_key: "final_report"
      loop: false
  entry_node: "plan"
  exit_node: "report"
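# Example (a sketch of the resulting execution, assuming max_steps: 3 as set above):
#   plan -> research -> research -> research -> report
# The "research" node has loop: true, so it repeats until step > max_steps;
# the final answer is the value stored under the exit node's output_key ("final_report").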
# -----------------------------------------------------------------------------
# Test settings
# -----------------------------------------------------------------------------
test_mode: "standard" # "standard" | "mind2web"
requests_soft_limit: 1000 # Prompts for confirmation if total API calls exceed this value
# Total calls = models × protocols × frameworks × tasks × max_steps
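# Worked example with the selections in this file (1 model, 1 protocol,
# 1 framework, 7 tasks, max_steps 3): 1 x 1 x 1 x 7 x 3 = 21 calls,
# well under the soft limit, so no confirmation prompt is shown.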
# -----------------------------------------------------------------------------
# Models
# Any model ID listed here must be available on HuggingFace or OpenAI.
# Prefix OpenAI models with "gpt-". All others are treated as HuggingFace models.
# TIP: Test models independently with a small number of tasks before scaling up to larger benchmarks.
# IMPORTANT: If you want to study tool use, select models that support tool calling by agents (e.g., DeepSeek, GPT-4o)
# -----------------------------------------------------------------------------
models:
# - "gpt-4o"
# - "meta-llama/Llama-3.3-70B-Instruct"
# - "meta-llama/Llama-4-Scout-17B-16E-Instruct"
- "deepseek-ai/DeepSeek-V3.2"
# - "openai/gpt-oss-120b"
# -----------------------------------------------------------------------------
# Protocols
# Controls how messages are structured between agents and the model.
# STANDARD : Direct API call with no protocol overhead
# MCP : Model Context Protocol
# A2A : Agent-to-Agent Protocol
# ACP : Agent Communication Protocol
# -----------------------------------------------------------------------------
protocols:
- "STANDARD"
# - "MCP"
# - "A2A"
# - "ACP"
# -----------------------------------------------------------------------------
# Frameworks
# Agentic frameworks to test. Each runs the same workflow independently.
# Supported frameworks are listed below.
# -----------------------------------------------------------------------------
frameworks:
- "CrewAI"
# - "Smolagents"
# - "LangChain"
# - "LangGraph"
# - "LlamaIndex"
# - "SemanticKernel"
# -----------------------------------------------------------------------------
# Tools
# Tools available to all agents. Must match names registered in tool_pool.py.
# Available builtin and custom tools:
# web_search — DuckDuckGo web search (no API key required)
# web_browser — Playwright headless browser navigation
# wikipedia — Wikipedia topic lookup
# arxiv — Academic paper search on arXiv
# python_repl — Sandboxed Python code execution
# requests_get — Simple HTTP GET requests
# beautifulsoup_scraper — Extract structured HTML content from a URL
# pdf_reader — Extract text from PDF files or URLs
# datetime — Get current UTC date and time
# json_parser — Parse and query JSON strings
# pubmed — Search biomedical literature on PubMed
# youtube_transcript — Retrieve YouTube video transcripts
# sympy — Evaluate or simplify symbolic math expressions
# web_interaction — Custom tool for complex web interactions (e.g. Mind2Web tasks)
# keyboard_interaction — Custom tool for keyboard-driven web interactions (e.g. Mind2Web tasks)
# -----------------------------------------------------------------------------
tools:
  # Core & utility tools
  - name: "web_search"
    description: >
      Use this tool when you need to find information that is not already known or may be outdated.
      This includes current events, recent facts, or unknown topics.
      Input should be a clear search query.
      Do NOT use this for simple reasoning or when the answer is already known.
  - name: "web_browser"
    description: >
      Use this tool to open and read the content of a specific webpage when you already have a URL.
      Useful after performing a search.
      Do NOT use this for searching; use web_search first.
  - name: "requests_get"
    description: >
      Use this to retrieve raw content from a specific URL (API, webpage, or endpoint).
      Prefer this for structured or programmatic access.
      Do NOT use for general browsing or search.
  - name: "beautifulsoup_scraper"
    description: >
      Use this to extract structured information from the HTML content of a webpage.
      Typically used after fetching a page via web_browser or requests_get.
      Do NOT use if you only need raw text.
  - name: "datetime"
    description: >
      Use this when you need the current date or time.
      Do NOT guess or fabricate current timestamps.
  # Knowledge tools
  - name: "wikipedia"
    description: >
      Use this for general knowledge lookups on well-known topics, people, or concepts.
      Faster and more reliable than web_search for encyclopedic content.
      Do NOT use for recent or niche information.
  - name: "arxiv"
    description: >
      Use this to find academic papers in technical or scientific domains.
      Input should be a research topic or keywords.
      Do NOT use for general knowledge or non-academic queries.
  - name: "pubmed"
    description: >
      Use this for biomedical or clinical research queries.
      Input should be medical terms or research questions.
      Do NOT use for general health advice or non-scientific queries.
  # Computation & parsing
  - name: "python_repl"
    description: >
      Use this for calculations, data processing, or executing code.
      Required for precise arithmetic, transformations, or simulations.
      Do NOT use for simple math that can be done mentally.
  - name: "sympy"
    description: >
      Use this for symbolic mathematics such as algebra, equation solving, or simplification.
      Prefer this over python_repl for symbolic operations.
  - name: "json_parser"
    description: >
      Use this to parse, extract, or query structured JSON data.
      Input should be a JSON string and query instructions.
      Do NOT use for plain text.
  # Content extraction
  - name: "pdf_reader"
    description: >
      Use this to extract text from PDF files or PDF URLs.
      Required when the content is not directly readable as text.
      Do NOT use for HTML pages.
  - name: "youtube_transcript"
    description: >
      Use this to retrieve the transcript of a YouTube video.
      Input should be a video URL.
      Do NOT use if you only need metadata or summaries.
  # Mind2Web-style tools
  - name: "web_interaction"
    description: >
      Use this to interact with web pages in multi-step tasks.
      Includes clicking elements, filling forms, navigating pages, and extracting results.
      Required for tasks involving user workflows on websites.
      Do NOT use for simple retrieval; use web_search or web_browser instead.
  - name: "keyboard_interaction"
    description: >
      Use this for keyboard-based navigation of web interfaces.
      Includes tabbing between elements, pressing enter, typing text, and triggering shortcuts.
      Prefer this when interaction must simulate accessibility or non-mouse workflows.
      Do NOT use for static content extraction.
# =============================================================================
# STANDARD test mode settings
# Used when test_mode is "standard".
# =============================================================================
# prompt_template wraps each task before it is sent to the agent.
# Use {task} as the placeholder for the actual task text.
prompt_template: |
  You are an Accessibility Expert (WCAG Specialist) with knowledge about WCAG 2.2 and ABNT NBR 17225.
  Your expertise is crucial in making the web more accessible for everyone.
  Your job is to perform the following task:
  {task}
  Be confident in your expertise.
  Do not provide hypothetical answers.
  Use any available tool that is a fit for performing the given task.
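# Example: the first entry under tasks below would be sent to the agent as the
# prompt above, with {task} replaced by the task text, i.e. ending in:
#   "Your job is to perform the following task:
#    Provide me a set of accessibility tools I can use while accessing the web
#    in a way to overcome existing web accessibility barriers."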
tasks:
- "Provide me a set of accessibility tools I can use while accessing the web in a way to overcome existing web accessibility barriers."
- "Create a report about accessibility issues for website www.globo.com based on requirements A and AA from ABNT NBR 17225."
- "Assess ABNT NBR 17225 and recommend how to make it less susceptible to short-term technology advances."
- "Simulate access to the website www.globo.com using a screen reader. Identify and list all accessibility barriers found during the navigation."
- "Create a figure description for the image located at URL https://www.terra.com.br/noticias/educacao/fuvest-divulga-lista-de-convocados-e-locais-da-2-fase,3f7901dcac109963816c46d327a225efgi55074i.html. Consider the context of the surrounding text on the webpage."
- "Evaluate the HTML code of the webpage located at URL https://www.terra.com.br/noticias/educacao/fuvest-divulga-lista-de-convocados-e-locais-da-2-fase,3f7901dcac109963816c46d327a225efgi55074i.html and provide suggestions on how to improve code semantics following ABNT NBR 17225."
- "Create a heading structure for the webpage located at URL https://www.terra.com.br/noticias/educacao/fuvest-divulga-lista-de-convocados-e-locais-da-2-fase,3f7901dcac109963816c46d327a225efgi55074i.html following best practices for web accessibility and ABNT NBR 17225 guidelines."
# =============================================================================
# MIND2WEB test mode settings
# Used when test_mode is "mind2web".
# =============================================================================
mind2web_num_tasks: 10 # Number of tasks to sample (max: 2350 for full benchmark)
# judge_model: "gpt-4o" # Model used to evaluate task completion quality
judge_model: "gpt-4o-mini" # Model used to evaluate task completion quality