22 changes: 22 additions & 0 deletions .github/issue_template.md
@@ -0,0 +1,22 @@
## Type

- [ ] Bug
- [ ] Feature request
- [ ] Question

## Summary

<!-- One-sentence description -->

## Details

<!--
For bugs: Steps to reproduce, expected vs actual behavior
For features: Motivation and proposed design
For questions: Context and what you've tried
-->

## Environment (if applicable)

- maseval version:
- Python version:
19 changes: 19 additions & 0 deletions CHANGELOG.md
@@ -9,6 +9,23 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

**Parallel Execution**

- Added parallel task execution with `num_workers` parameter in `Benchmark.run()` using `ThreadPoolExecutor` (PR: #14); a usage sketch follows this list
- Added `ComponentRegistry` class for thread-safe component registration with thread-local storage (PR: #14)
- Added `TaskContext` for cooperative timeout checking with `check_timeout()`, `elapsed`, `remaining`, and `is_expired` properties (PR: #14)
- Added `TaskProtocol` dataclass with `timeout_seconds`, `timeout_action`, `max_retries`, `priority`, and `tags` fields for task-level execution control (PR: #14)
- Added `TimeoutAction` enum (`SKIP`, `RETRY`, `RAISE`) for configurable timeout behavior (PR: #14)
- Added `TaskTimeoutError` exception with `elapsed`, `timeout`, and `partial_traces` attributes (PR: #14)
- Added `TASK_TIMEOUT` to `TaskExecutionStatus` enum for timeout classification (PR: #14)
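
Taken together, a minimal sketch of the intended usage. Only the names listed above come from this changelog; the task contents, the helper function, and how a `TaskProtocol` attaches to a `Task` are assumptions:

```python
# A sketch, not canonical usage: TaskProtocol's fields, TimeoutAction,
# TaskTimeoutError, and run(num_workers=...) are named in this changelog;
# everything else here is an illustrative assumption.
from maseval import Task, TaskQueue, TaskProtocol, TimeoutAction, TaskTimeoutError

# Per-task execution control (how a protocol attaches to a Task is not
# shown in this diff).
protocol = TaskProtocol(
    timeout_seconds=60.0,
    timeout_action=TimeoutAction.RETRY,  # SKIP, RETRY, or RAISE
    max_retries=2,
    priority=5,
    tags=["smoke"],
)

def run_with_timeouts(benchmark, tasks: TaskQueue):
    """`benchmark` is any concrete Benchmark subclass."""
    try:
        # Tasks are fanned out to a ThreadPoolExecutor.
        return benchmark.run(tasks=tasks, num_workers=4)
    except TaskTimeoutError as err:  # surfaced when timeout_action=RAISE
        print(f"timed out after {err.elapsed:.1f}s (limit: {err.timeout}s)")
        return err.partial_traces
```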

**Task Queue Abstraction**

- Added `BaseTaskQueue` abstract base class with an iterator interface for flexible task scheduling (PR: #14); a usage sketch follows this list
- Added `SequentialTaskQueue` for simple FIFO task ordering (PR: #14)
- Added `PriorityTaskQueue` for priority-based task scheduling using `TaskProtocol.priority` (PR: #14)
- Added `AdaptiveTaskQueue` placeholder for future feedback-based scheduling (PR: #14)
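
A sketch of the queue interface, assuming the constructors mirror the `TaskQueue(list_of_tasks)` calls shown in the updated examples later in this PR:

```python
# A sketch: the class names and the iterator interface come from this
# changelog; constructor signatures and Task fields are assumptions.
from maseval import Task, SequentialTaskQueue, PriorityTaskQueue

tasks = [Task(query="first"), Task(query="second")]

fifo = SequentialTaskQueue(tasks)    # simple FIFO ordering
ranked = PriorityTaskQueue(tasks)    # ordered by TaskProtocol.priority

# Every queue shares BaseTaskQueue's iterator interface:
for task in fifo:
    print(task.query)
```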

**ModelAdapter Chat Interface**

- Added `chat()` method to `ModelAdapter` as the primary interface for LLM inference, accepting a list of messages in OpenAI format together with optional tools and returning a `ChatResponse` object; a sketch of the call shape follows
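
A sketch of the call shape: `chat()`, OpenAI-format messages, tool acceptance, and `ChatResponse` come from this entry, while the `tools=` keyword and the tool schema are assumptions:

```python
# A sketch: the exact keyword for passing tools is an assumption.
from maseval import ModelAdapter

def ask(model: ModelAdapter):
    """`model` is any concrete ModelAdapter implementation."""
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"},
    ]
    tools = [{
        "type": "function",
        "function": {
            "name": "search_web",  # hypothetical tool
            "description": "Search the web for a query.",
            "parameters": {"type": "object", "properties": {"q": {"type": "string"}}},
        },
    }]
    return model.chat(messages, tools=tools)  # returns a ChatResponse
```
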
@@ -48,6 +65,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
**Benchmark**

- `Benchmark.agent_data` parameter is now optional (defaults to empty dict) (PR: #16)
- Refactored `Benchmark` to delegate registry operations to `ComponentRegistry` class (PR: #)
- `Benchmark.run()` now accepts optional `queue` parameter for custom task scheduling (PR: #14); a sketch follows this list
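
A sketch of the updated call pattern, consistent with the example scripts changed later in this PR; whether `queue` composes with `tasks` in a single call is an assumption:

```python
# A sketch: the optional agent_data and the queue parameter come from this
# changelog; combining them in one call as shown is an assumption.
from maseval import PriorityTaskQueue

def run_scheduled(benchmark, tasks, agent_configs):
    """`benchmark` is any concrete Benchmark subclass."""
    return benchmark.run(
        tasks=tasks,
        agent_data=agent_configs,              # passed at run time in this PR's examples
        queue=PriorityTaskQueue(list(tasks)),  # custom scheduling; signature assumed
    )
```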

**Task**

2 changes: 1 addition & 1 deletion docs/getting-started/quickstart.md
@@ -117,7 +117,7 @@ Once implemented, run your benchmark:

```python
# Define your tasks
tasks = TaskCollection([Task(query="...", expected="..."), ...])
tasks = TaskQueue([Task(query="..."), ...])

# Configure your agents (e.g., model parameters, tool settings)
agent_config = {"model": "gpt-4", "temperature": 0.7}
37 changes: 16 additions & 21 deletions docs/guides/exception-handling.md
@@ -90,27 +90,22 @@ class SimulatedUser:

One approach to exception handling places the boundary between agent responsibility and infrastructure responsibility at input validation:

```
┌─────────────────────────────────────────────────────────────┐
│ TOOL EXECUTION │
├─────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────────┐ │
│ │ INPUT │ Agent passes arguments │
│ │ VALIDATION │ │
│ │ │ ❌ Fails → AgentError │
│ │ │ ✓ Passes ↓ │
│ └─────────────────┘ │
│ │ │
│ ▼ │
│ ┌─────────────────┐ │
│ │ EXECUTION │ Tool runs its logic │
│ │ │ │
│ │ │ ❌ Fails → EnvironmentError │
│ │ │ ✓ Passes → Result │
│ └─────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────┘
```
```mermaid
flowchart TD
subgraph TOOL_EXECUTION[" "]
A[Agent passes arguments] --> B{INPUT VALIDATION}
B -->|Fails| C[AgentError]
B -->|Passes| D{EXECUTION}
D -->|Fails| E[EnvironmentError]
D -->|Passes| F[Result]
end

style TOOL_EXECUTION fill:none,stroke:#888
style B fill:#f5f5f5,stroke:#333
style D fill:#f5f5f5,stroke:#333
style C fill:#ffebee,stroke:#c62828
style E fill:#ffebee,stroke:#c62828
style F fill:#e8f5e9,stroke:#2e7d32
```

With this pattern (a minimal code sketch follows):
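The sketch below assumes a hypothetical `add_numbers` tool. `AgentError`, `EnvironmentError`, and the `validate_*` argument helpers are real maseval exports (the helpers could replace the manual checks here); the wiring itself is illustrative:

```python
# Sketch of the validation-boundary pattern: argument problems are the
# agent's fault, failures after validation are the environment's fault.
from maseval import AgentError, EnvironmentError

def add_numbers(**kwargs):
    # INPUT VALIDATION -> AgentError
    for key in ("a", "b"):
        if key not in kwargs:
            raise AgentError(f"missing required argument: {key!r}")
        if not isinstance(kwargs[key], (int, float)):
            raise AgentError(f"argument {key!r} must be a number")

    # EXECUTION -> EnvironmentError
    try:
        return kwargs["a"] + kwargs["b"]
    except Exception as exc:
        raise EnvironmentError(f"tool execution failed: {exc}") from exc
```
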
4 changes: 2 additions & 2 deletions docs/reference/task.md
@@ -1,9 +1,9 @@
# Task

Tasks define individual benchmark scenarios including inputs, expected outputs, and any metadata needed for evaluation. TaskCollections group related tasks together.
Tasks define individual benchmark scenarios including inputs, expected outputs, and any metadata needed for evaluation. TaskQueues group related tasks together.

[:material-github: View source](https://github.com/parameterlab/maseval/blob/main/maseval/core/task.py){ .md-source-file }

::: maseval.core.task.Task

::: maseval.core.task.TaskCollection
::: maseval.core.task.TaskQueue
22 changes: 6 additions & 16 deletions examples/five_a_day_benchmark/five_a_day_benchmark.ipynb
@@ -124,7 +124,7 @@
"from smolagents import ToolCallingAgent, LiteLLMModel, FinalAnswerTool\n",
"\n",
"# MASEval core components\n",
"from maseval import Benchmark, Environment, Task, TaskCollection, AgentAdapter, Evaluator, ModelAdapter\n",
"from maseval import Benchmark, Environment, Task, TaskQueue, AgentAdapter, Evaluator, ModelAdapter\n",
"from maseval.interface.agents.smolagents import SmolAgentAdapter\n",
"\n",
"# Import evaluators module (dynamically loaded later)\n",
@@ -139,7 +139,7 @@
" limit: int | None = None,\n",
" seed: int | None = None,\n",
" task_indices: list[int] | None = None,\n",
") -> tuple[TaskCollection, list[Dict[str, Any]]]:\n",
") -> tuple[TaskQueue, list[Dict[str, Any]]]:\n",
" \"\"\"Load tasks and agent configurations.\n",
"\n",
" Args:\n",
@@ -152,7 +152,7 @@
" task_indices: Optional list of task indices to load (e.g., [0, 2, 4])\n",
"\n",
" Returns:\n",
" Tuple of (TaskCollection, list of agent configs)\n",
" Tuple of (TaskQueue, list of agent configs)\n",
" \"\"\"\n",
" data_dir = Path(\"examples/five_a_day_benchmark/data\")\n",
"\n",
@@ -200,7 +200,7 @@
"\n",
" configs_data.append(config)\n",
"\n",
" return TaskCollection(tasks_data), configs_data"
" return TaskQueue(tasks_data), configs_data"
]
},
{
@@ -745,17 +745,7 @@
"id": "3764c0be",
"metadata": {},
"outputs": [],
"source": [
"# Create and run benchmark (will take approx. 2 min)\n",
"benchmark = FiveADayBenchmark(\n",
" agent_data=agent_configs,\n",
" fail_on_setup_error=True,\n",
" fail_on_task_error=True,\n",
" fail_on_evaluation_error=True,\n",
")\n",
"\n",
"results = benchmark.run(tasks=tasks)"
]
"source": "# Create and run benchmark (will take approx. 2 min)\nbenchmark = FiveADayBenchmark(\n fail_on_setup_error=True,\n fail_on_task_error=True,\n fail_on_evaluation_error=True,\n)\n\nresults = benchmark.run(tasks=tasks, agent_data=agent_configs)"
},
{
"cell_type": "markdown",
@@ -899,4 +889,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
}
}
11 changes: 5 additions & 6 deletions examples/five_a_day_benchmark/five_a_day_benchmark.py
@@ -26,7 +26,7 @@

from utils import derive_seed, sanitize_name # type: ignore[unresolved-import]

from maseval import Benchmark, Environment, Evaluator, Task, TaskCollection, AgentAdapter, ModelAdapter
from maseval import Benchmark, Environment, Evaluator, Task, TaskQueue, AgentAdapter, ModelAdapter
from maseval.core.callbacks.result_logger import FileResultLogger

# Import tool implementations
@@ -825,7 +825,7 @@ def load_benchmark_data(
limit: Optional[int] = None,
specific_task: Optional[int] = None,
seed: Optional[int] = None,
) -> tuple[TaskCollection, List[Dict[str, Any]]]:
) -> tuple[TaskQueue, List[Dict[str, Any]]]:
"""Load tasks and agent configurations with validation.

Args:
@@ -838,7 +838,7 @@ def load_benchmark_data(
seed: Base random seed for reproducibility (None for non-deterministic)

Returns:
Tuple of (TaskCollection, agent_configs_list)
Tuple of (TaskQueue, agent_configs_list)
"""
if limit is not None and specific_task is not None:
raise ValueError("Cannot specify both limit and specific_task")
@@ -897,7 +897,7 @@

print(f"Loaded {len(tasks_data)} tasks and {len(configs_data)} agent configs\n")

return TaskCollection(tasks_data), configs_data
return TaskQueue(tasks_data), configs_data


# ============================================================================
@@ -935,13 +935,12 @@ def load_benchmark_data(
)

benchmark = FiveADayBenchmark(
agent_data=agent_configs,
callbacks=[logger],
fail_on_setup_error=True,
fail_on_task_error=True,
fail_on_evaluation_error=True,
)
results = benchmark.run(tasks=tasks)
results = benchmark.run(tasks=tasks, agent_data=agent_configs)

print("\n--- Benchmark Complete ---")
print(f"Total tasks: {len(tasks)}")
22 changes: 3 additions & 19 deletions examples/introduction/tutorial.ipynb
@@ -330,7 +330,7 @@
"metadata": {},
"outputs": [],
"source": [
"from maseval import Benchmark, Environment, Evaluator, Task, TaskCollection\n",
"from maseval import Benchmark, Environment, Evaluator, Task, TaskQueue\n",
"from maseval.interface.agents.smolagents import SmolAgentAdapter\n",
"\n",
"print(\"MASEval components imported successfully!\")"
@@ -634,23 +634,7 @@
"id": "b3ee60a7",
"metadata": {},
"outputs": [],
"source": [
"# Create benchmark instance with agent configuration\n",
"agent_data = {\"model_id\": \"gemini/gemini-2.5-flash\", \"temperature\": 0.7}\n",
"\n",
"benchmark = SimpleBenchmark(agent_data=agent_data, progress_bar=False)\n",
"\n",
"# Create task collection\n",
"tasks = TaskCollection([task])\n",
"\n",
"# Run the benchmark\n",
"print(\"Running benchmark...\\n\")\n",
"reports = benchmark.run(tasks=tasks)\n",
"\n",
"print(\"\\n\" + \"=\" * 60)\n",
"print(\"BENCHMARK COMPLETE\")\n",
"print(\"=\" * 60)"
]
"source": "# Create benchmark instance\nagent_data = {\"model_id\": \"gemini/gemini-2.5-flash\", \"temperature\": 0.7}\n\nbenchmark = SimpleBenchmark(progress_bar=False)\n\n# Create task queue\ntasks = TaskQueue([task])\n\n# Run the benchmark\nprint(\"Running benchmark...\\n\")\nreports = benchmark.run(tasks=tasks, agent_data=agent_data)\n\nprint(\"\\n\" + \"=\" * 60)\nprint(\"BENCHMARK COMPLETE\")\nprint(\"=\" * 60)"
},
{
"cell_type": "markdown",
@@ -746,4 +730,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
}
}
3 changes: 1 addition & 2 deletions examples/macs_benchmark/macs_benchmark.py
@@ -737,7 +737,6 @@ def run_benchmark(
# Get benchmark class and instantiate
BenchmarkClass = get_benchmark_class(framework)
benchmark = BenchmarkClass(
agent_data=agent_config,
callbacks=[logger],
n_task_repeats=n_task_repeats,
fail_on_setup_error=True,
@@ -747,7 +746,7 @@

# Run benchmark
print(f"\nRunning {framework} benchmark on {domain} domain...")
results = benchmark.run(tasks=tasks)
results = benchmark.run(tasks=tasks, agent_data=agent_config)

# Compute summary metrics
summary = compute_benchmark_metrics(results)
28 changes: 26 additions & 2 deletions maseval/__init__.py
@@ -8,7 +8,17 @@
Benchmarks sit in the `maseval.benchmark` submodule.
"""

from .core.task import Task, TaskCollection
from .core.task import (
Task,
TaskProtocol,
TimeoutAction,
# Task queue classes
BaseTaskQueue,
TaskQueue,
SequentialTaskQueue,
PriorityTaskQueue,
AdaptiveTaskQueue,
)
from .core.environment import Environment
from .core.agent import AgentAdapter
from .core.benchmark import Benchmark, TaskExecutionStatus
@@ -27,11 +37,14 @@
from .core.evaluator import Evaluator
from .core.history import MessageHistory, ToolInvocationHistory
from .core.tracing import TraceableMixin
from .core.registry import ComponentRegistry
from .core.context import TaskContext
from .core.exceptions import (
MASEvalError,
AgentError,
EnvironmentError,
UserError,
TaskTimeoutError,
validate_argument_type,
validate_required_arguments,
validate_no_extra_arguments,
@@ -41,7 +54,8 @@
__all__ = [
# Tasks
"Task",
"TaskCollection",
"TaskProtocol",
"TimeoutAction",
# Core abstractions
"Environment",
"AgentAdapter",
@@ -68,6 +82,15 @@
"MessageHistory",
"ToolInvocationHistory",
"TraceableMixin",
# Registry and execution context
"ComponentRegistry",
"TaskContext",
# Task queues
"BaseTaskQueue",
"TaskQueue",
"SequentialTaskQueue",
"PriorityTaskQueue",
"AdaptiveTaskQueue",
# Model adapters
"ModelAdapter",
"ChatResponse",
@@ -76,6 +99,7 @@
"AgentError",
"EnvironmentError",
"UserError",
"TaskTimeoutError",
"validate_argument_type",
"validate_required_arguments",
"validate_no_extra_arguments",