diff --git a/.gitignore b/.gitignore index 7f31157..02fc08f 100644 --- a/.gitignore +++ b/.gitignore @@ -245,3 +245,6 @@ local_experiments/ .claude/** !.claude/CLAUDE.md !.claude/settings.json + +experiments/kdd 2026 + diff --git a/README.md b/README.md index e997429..21ef75d 100644 --- a/README.md +++ b/README.md @@ -149,6 +149,12 @@ client.delete_env(envId=env.environmentId) SDK provides **code execution proxies** - tools for AI agents. You add it to your toolbox in Vercel AI SDK, Langchain or OpenAI Agents, making LLM write Python or Bash code to talk with Slack or Linear API. Requests will automatically be intercepted and routed to isolated test environments. This enables agents to interact with service replicas without any code changes. See more in: **[Python SDK](sdk/agent-diff-python/README.md)** +## Benchmark & Training + +- **HuggingFace Dataset**: [hubertmarek/agent-diff-bench](https://huggingface.co/datasets/hubertmarek/agent-diff-bench) — 224 tasks across all 4 services (80/20 train/test split, stratified by service) +- **Prime Intellect Environment**: [agent-diff-bench on Prime Lab](https://app.primeintellect.ai/dashboard/environments/hubert-marek/agent-diff-bench) — run evaluations or RL training via Hosted Training +- **Paper**: [AgentDiff: Agentic API Evaluation via State Differencing (KDD 2026 pre-print)](https://drive.google.com/file/d/1BlmJTSMX7ohwvD1aYBByg7_Y815fgsxp/view?usp=sharing) + ## Evaluations & Test Suites Collections of test cases with assertions that you can run against agent runs using evaluations. 
diff --git a/backend/Dockerfile b/backend/Dockerfile index bd16ba6..901a8c7 100644 --- a/backend/Dockerfile +++ b/backend/Dockerfile @@ -29,6 +29,7 @@ RUN echo '#!/bin/sh\n\ python utils/seed_slack_template.py\n\ python utils/seed_linear_template.py\n\ python utils/seed_box_template.py\n\ + python utils/seed_calendar_template.py\n\ python utils/seed_tests.py\n\ else\n\ echo "=== Skipping seed (set SEED=true to enable) ==="\n\ diff --git a/backend/src/services/slack/api/methods.py b/backend/src/services/slack/api/methods.py index 368d748..f622a7a 100644 --- a/backend/src/services/slack/api/methods.py +++ b/backend/src/services/slack/api/methods.py @@ -153,6 +153,23 @@ def _slack_error( raise SlackAPIError(code, status_code, extra) +def _parse_bool_param(value: Any, default: bool = False) -> bool: + """Safely parse a boolean parameter from JSON (bool) or form data (string). + + Handles: + - Boolean values: True/False + - String values: "true"/"false" (case-insensitive) + - None/missing: returns default + """ + if value is None: + return default + if isinstance(value, bool): + return value + if isinstance(value, str): + return value.lower() == "true" + return default + + def _resolve_channel_id(channel: str, session=None) -> str: """Resolve channel name or ID to channel ID. 
@@ -1033,7 +1050,7 @@ async def conversations_list(request: Request) -> JSONResponse: except ValueError: _slack_error("invalid_arguments") - exclude_archived = params.get("exclude_archived", "false").lower() == "true" + exclude_archived = _parse_bool_param(params.get("exclude_archived"), default=False) types_param = params.get("types", "public_channel") # Default: public_channel session = _session(request) @@ -1146,7 +1163,7 @@ async def conversations_history(request: Request) -> JSONResponse: _slack_error("invalid_cursor") oldest_param = params.get("oldest") latest_param = params.get("latest") - inclusive = params.get("inclusive", "false").lower() == "true" + inclusive = _parse_bool_param(params.get("inclusive"), default=False) # Validate channel (required) if not channel: @@ -1270,7 +1287,7 @@ async def conversations_replies(request: Request) -> JSONResponse: oldest_param = params.get("oldest") latest_param = params.get("latest") - inclusive = params.get("inclusive", "false").lower() == "true" + inclusive = _parse_bool_param(params.get("inclusive"), default=False) oldest_dt = None latest_dt = None @@ -1710,8 +1727,8 @@ async def conversations_open(request: Request) -> JSONResponse: async def conversations_info(request: Request) -> JSONResponse: params = await _get_params_async(request) channel = params.get("channel") - include_locale = params.get("include_locale", "false").lower() == "true" - include_num_members = params.get("include_num_members", "false").lower() == "true" + include_locale = _parse_bool_param(params.get("include_locale"), default=False) + include_num_members = _parse_bool_param(params.get("include_num_members"), default=False) # Validate channel (required) if not channel: @@ -2283,7 +2300,7 @@ async def users_info(request: Request) -> JSONResponse: if user is None: _slack_error("user_not_found") - include_locale = params.get("include_locale", "false").lower() == "true" + include_locale = _parse_bool_param(params.get("include_locale"), 
default=False) session = _session(request) @@ -2317,7 +2334,7 @@ async def users_list(request: Request) -> JSONResponse: except ValueError: _slack_error("invalid_cursor") - include_locale = params.get("include_locale", "false").lower() == "true" + include_locale = _parse_bool_param(params.get("include_locale"), default=False) session = _session(request) actor = _principal_user_id(request) team_id = _get_env_team_id(request, channel_id=None, actor_user_id=actor) @@ -2857,7 +2874,7 @@ async def search_messages(request: Request) -> JSONResponse: if not query_str: _slack_error("No query passed") - highlight = str(params.get("highlight", "false")).lower() == "true" + highlight = _parse_bool_param(params.get("highlight"), default=False) sort = (params.get("sort") or "score").lower() sort_dir = (params.get("sort_dir") or "desc").lower() count_param = params.get("count") diff --git a/datasets/agent-diff-bench/test.jsonl b/datasets/agent-diff-bench/test.jsonl new file mode 100644 index 0000000..d3a95d7 --- /dev/null +++ b/datasets/agent-diff-bench/test.jsonl @@ -0,0 +1,45 @@ +{"question": "In the history readings under digital humanities, there is a markdown file whose filename misspells the word 'computational' (letters swapped). 
Find it (try a couple search queries) and fix the typo in the filename without changing the content.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"box_files\",\"where\":{\"id\":{\"eq\":\"3266469077\"}},\"expected_changes\":{\"name\":{\"to\":{\"eq\":\"computational approaches to hist research.md\"}}}}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_145", "test_name": "Level 3: Typo Fix (Computational)", "service": "box", "task_horizon": 3, "operation_type": "search+U", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "high", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"actionEval\",\"tools_required\":[\"GET /search\",\"GET /search\",\"PUT /files/{id}\"]}"} +{"question": "Hey, I uploaded some ethics and philosophy notes to history readings recently. I'm dyslexic so I probably made spelling mistakes in the filenames - could you find them and fix any typos? 
I think there were a few files about moral philosophy, judgment, research ethics, that kind of stuff.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"box_files\",\"where\":{\"id\":{\"eq\":\"6478815895\"}},\"expected_changes\":{\"name\":{\"to\":{\"regex\":\"moral judge?ment in history\\\\.md\"}}}},{\"diff_type\":\"changed\",\"entity\":\"box_files\",\"where\":{\"id\":{\"eq\":\"2228856784\"}},\"expected_changes\":{\"name\":{\"to\":{\"regex\":\"moral judge?ment in history\\\\.docx\"}}}},{\"diff_type\":\"changed\",\"entity\":\"box_files\",\"where\":{\"id\":{\"eq\":\"1956298215\"}},\"expected_changes\":{\"name\":{\"to\":{\"regex\":\"moral judge?ment in history\\\\.pdf\"}}}},{\"diff_type\":\"changed\",\"entity\":\"box_files\",\"where\":{\"id\":{\"eq\":\"8847291035\"}},\"expected_changes\":{\"name\":{\"to\":{\"eq\":\"philosophy of science.md\"}}}},{\"diff_type\":\"changed\",\"entity\":\"box_files\",\"where\":{\"id\":{\"eq\":\"9958302146\"}},\"expected_changes\":{\"name\":{\"to\":{\"eq\":\"research ethics guidelines.txt\"}}}}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_139", "test_name": "Level 3: Dyslexic User Typo Fix", "service": "box", "task_horizon": 6, "operation_type": "search+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "high", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"actionEval\",\"tools_required\":[\"GET /search\",\"PUT /files/{id}\",\"PUT /files/{id}\",\"PUT /files/{id}\",\"PUT /files/{id}\",\"PUT /files/{id}\"]}"} +{"question": "In the personal_final history area, there is a digital humanities reading that was stored directly under the main 'history' folder instead of under 'readings/digital 
humanities'. Find the misfiled text reading about digital history methods and move it into the 'digital humanities' folder under 'readings'.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"box_files\",\"where\":{\"id\":{\"eq\":\"2797160615\"}},\"expected_changes\":{\"parent_id\":{\"to\":{\"eq\":\"7905906319\"}}}}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_140", "test_name": "Level 3: Sort Misfiled Reading", "service": "box", "task_horizon": 3, "operation_type": "search+U", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"actionEval\",\"tools_required\":[\"GET /search\",\"GET /search\",\"PUT /files/{id}\"]}"} +{"question": "Create a folder 'Project_Beta' in root, then create a subfolder 'Docs' inside it, and move 'interviewing tips FINAL.txt' into 'Docs'.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"box_folders\",\"where\":{\"name\":{\"eq\":\"Project_Beta\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"box_folders\",\"where\":{\"name\":{\"eq\":\"Docs\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"box_files\",\"where\":{\"id\":{\"eq\":\"1364279594\"}},\"expected_changes\":{\"parent_id\":{\"to\":{\"ne\":\"1088403890\"}}}}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_126", "test_name": "Level 2: Nested Folders", "service": "box", "task_horizon": 4, "operation_type": 
"C+search+U", "entity_scope": "multi", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"actionEval\",\"tools_required\":[\"POST /folders\",\"POST /folders\",\"GET /search\",\"PUT /files/{id}\"]}"} +{"question": "Look at the files in the 'investments' folder. Rename the smallest file to 'smallest_file' (keep extension) and the largest file to 'largest_file' (keep extension).", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"box_files\",\"where\":{\"id\":{\"eq\":\"1064362959\"}},\"expected_changes\":{\"name\":{\"to\":{\"eq\":\"smallest_file.csv\"}}}},{\"diff_type\":\"changed\",\"entity\":\"box_files\",\"where\":{\"id\":{\"eq\":\"1490177849\"}},\"expected_changes\":{\"name\":{\"to\":{\"eq\":\"largest_file.csv\"}}}}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_137", "test_name": "Level 3: Ambiguous Sorting", "service": "box", "task_horizon": 4, "operation_type": "search+R+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"actionEval\",\"tools_required\":[\"GET /search\",\"GET /folders/{id}/items\",\"PUT /files/{id}\",\"PUT /files/{id}\"]}"} +{"question": "In the history area, find the plain-text study notes about Argentina's 2001 economic crisis. 
Add a comment 'Please review this note' to that file and then create a task 'Review content' for the same file.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"box_comments\",\"where\":{\"item_id\":{\"eq\":\"5696874158\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"box_tasks\",\"where\":{\"item_id\":{\"eq\":\"5696874158\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_125", "test_name": "Level 2: Comment and Task", "service": "box", "task_horizon": 3, "operation_type": "search+C", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"actionEval\",\"tools_required\":[\"GET /search\",\"POST /comments\",\"POST /tasks\"]}"} +{"question": "Upload a txt note saying 'Hi, I am working on history project' inside the history folder.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"box_files\",\"where\":{\"parent_id\":{\"eq\":\"1660804823\"},\"extension\":{\"eq\":\"txt\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_146", "test_name": "Level 1: Upload History Note", "service": "box", "task_horizon": 2, "operation_type": "search+C", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "low", "info": 
"{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"actionEval\",\"tools_required\":[\"GET /search\",\"POST /files/content\"]}"} +{"question": "In the investments/macroeconomics area, locate all CSV files containing economic time-series data (files with columns 'Series_reference', 'Period', 'Data_value'). For each CSV file, download it and extract the domain code from the first data row's Series_reference value (the domain is everything before the first dot, e.g., 'CPIM' from 'CPIM.SE9A', or 'TTRC' from 'TTRC.S1A1A'). Create a new folder called 'Economic_Domains' in the root directory. Inside it, create one subfolder for each unique domain you discover, named exactly Domain_<DOMAIN> (e.g., 'Domain_CPIM'). Move each CSV file into its corresponding domain subfolder, and add the tag domain:<DOMAIN> to each file (e.g., tag domain:CPIM for files in the CPIM domain). After organising all files, create a Hub named 'Economic Data Index' and add only the domain subfolders that contain 2 or more files to this hub. Finally, upload a new text file named 'domain_manifest.txt' into the 'Economic_Domains' folder. 
This manifest should list each domain alphabetically, one per line, in the format: <DOMAIN>: <COUNT> file(s) | Hub: <YES/NO>.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"box_folders\",\"where\":{\"name\":{\"eq\":\"Economic_Domains\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"box_folders\",\"where\":{\"name\":{\"contains\":\"Domain_\"}},\"expected_count\":{\"min\":2}},{\"diff_type\":\"added\",\"entity\":\"box_hubs\",\"where\":{\"title\":{\"contains\":\"Economic Data Index\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"box_files\",\"where\":{\"name\":{\"eq\":\"domain_manifest.txt\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_155", "test_name": "Level 5: Economic Domain Organization", "service": "box", "task_horizon": 11, "operation_type": "search+R+C+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"compositeEval\",\"tools_required\":[\"GET /search\",\"GET /files/{id}/content\",\"GET /files/{id}/content\",\"POST /folders\",\"POST /folders\",\"POST /folders\",\"PUT /files/{id}\",\"PUT /files/{id}\",\"POST /hubs\",\"POST /hubs/{id}/manage_items\",\"POST /files/content\"]}"} +{"question": "List all accessible hubs and create a folder named 'Hubs_Found_<N>' in the root, where <N> is the number of hubs found.", "answer": 
"{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"box_folders\",\"where\":{\"name\":{\"contains\":\"Hubs_Found_\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_128", "test_name": "Level 1: List Hubs", "service": "box", "task_horizon": 2, "operation_type": "R+C", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"compositeEval\",\"tools_required\":[\"GET /hubs\",\"POST /folders\"]}"} +{"question": "Add a comment 'Needs review' to the Google earnings report PDF in the investments folder.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"box_comments\",\"where\":{\"message\":{\"contains\":\"Needs review\"},\"item_id\":{\"eq\":\"2748861636\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_119", "test_name": "Level 1: Add Comment", "service": "box", "task_horizon": 2, "operation_type": "search+C", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"actionEval\",\"tools_required\":[\"GET /search\",\"POST /comments\"]}"} +{"question": "Please put a hold on my primary calendar for Glassmoth Conservatory Candle-Lighting and invite Ngozi (ngozi@test.com). 
After you add it, move it one hour later and set the location to Hothouse Lantern Atrium. Also, set up a watch so I get notified whenever my calendar settings change.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"calendar_id\":{\"eq\":\"test.user@test.com\"},\"summary\":{\"contains\":\"Glassmoth Conservatory Candle-Lighting\"},\"location\":{\"contains\":\"Hothouse Lantern Atrium\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_event_attendees\",\"where\":{\"email\":{\"eq\":\"ngozi@test.com\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_channels\",\"where\":{\"resource_uri\":{\"eq\":\"/users/user_agent/settings\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_180", "test_name": "Glassmoth Conservatory Candle-Lighting - simple hold update", "service": "calendar", "task_horizon": 3, "operation_type": "C+U", "entity_scope": "multi", "information_availability": "explicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"events.insert\",\"events.patch\",\"settings.watch\"]}"} +{"question": "We are reorganizing the Traveling Museum of Whispered Relics. Find the main calendar called Whispered Relics Mainline and the old Embargoed Vault calendar first. Create two new route calendars: Relic Transit - Northern Route and Relic Transit - Coastal Route. Set the Northern Route description to 'Northern caravan route logistics' and the Coastal Route description to 'Coastal caravan route logistics'. Use distinct colors from the calendar palette: set Northern Route to color ID 7 and Coastal Route to color ID 11. 
Subscribe me to the external calendar with ID cal_city_archives_access (titled 'City Archives Access Windows'). In my calendar list, hide Embargoed Vault from view and ensure both new route calendars are visible with their assigned colors. Access changes: Niamh (niamh@test.com) should be an owner on both route calendars, and Farid (farid@test.com) should be a reader on both. Remove Olena (olena@test.com) from the Embargoed Vault entirely.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"calendars\",\"where\":{\"summary\":{\"contains\":\"Relic Transit - Northern Route\"},\"description\":{\"contains\":\"Northern caravan route logistics\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendars\",\"where\":{\"summary\":{\"contains\":\"Relic Transit - Coastal Route\"},\"description\":{\"contains\":\"Coastal caravan route logistics\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_list_entries\",\"where\":{\"calendar_id\":{\"eq\":\"cal_city_archives_access\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_list_entries\",\"where\":{\"calendar_id\":{\"eq\":\"cal_embargoed_vault\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_changes\":{\"hidden\":{\"to\":true}}},{\"diff_type\":\"added\",\"entity\":\"calendar_list_entries\",\"where\":{\"user_id\":{\"eq\":\"user_agent\"},\"color_id\":{\"eq\":\"7\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_list_entries\",\"where\":{\"user_id\":{\"eq\":\"user_agent\"},\"color_id\":{\"eq\":\"11\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_acl_rules\",\"where\":{\"scope_value\":{\"eq\":\"niamh@test.com\"},\"role\":{\"eq\":\"owner\"}},\"expected_count\":2},{\"diff_type\":\"added\",\"entity\":\"calendar_acl_rules\",\"where\":{\"scope_value\":{\"eq\":\"farid@test.com\"},\"role\":{\"eq\":\"reader\"}},\"expected_count\":2},{\"diff_type\":\"changed\",\"entity\":\"calendar_acl_
rules\",\"where\":{\"calendar_id\":{\"eq\":\"cal_embargoed_vault\"},\"scope_value\":{\"eq\":\"olena@test.com\"}},\"expected_changes\":{\"deleted\":{\"to\":true}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_173", "test_name": "Museum of Whispered Relics - Calendar governance and access", "service": "calendar", "task_horizon": 12, "operation_type": "search+C+R+U+D", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"calendarList.list\",\"calendars.insert\",\"calendarList.insert\",\"calendarList.patch\",\"acl.insert\",\"acl.delete\"]}"} +{"question": "Tomato season waits for no one - create 'Sacred Tomato Planting Ritual' on my calendar for Sunday morning immediately. Then show me what else I have this week so we don't double-book the garden crew. That tomato event needs more details - update it to say 'Bring your own seedlings and prayers' in the description. For the compost committee, I need to find a time Saturday when both Kenji (kenji@test.com) and Oksana (oksana@test.com) can make it. Speaking of which, which of my calendars is the 'Harvest Schedule' one? Once you find it, create 'Compost Communion: The Turning of the Heap' on that calendar at whatever time works for Kenji and Oksana (set in Kenji's timezone, please). Make sure the duration of the event is 2 hours. Bad news - 'Weed Warrior Wednesday' got rained out, so delete it from the Harvest Schedule and confirm it's actually gone. We have a new collective member, Chisom (chisom@test.com), who needs to see the harvest calendar but shouldn't edit it - set that up. 
Also, I completely forgot to invite Dariush (dariush@test.com) to the compost thing - he's our soil whisperer, please add him. One last thing: we're starting a new experimental growing project and need a calendar called 'Greenhouse Experiments' for it.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"Sacred Tomato Planting Ritual\"},\"calendar_id\":{\"eq\":\"test.user@test.com\"},\"description\":{\"i_contains\":\"seedlings\"},\"start.dateTime\":{\"contains\":\"2018-06-17\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"Compost Communion\"},\"calendar_id\":{\"eq\":\"cal_harvest_schedule\"},\"start.dateTime\":{\"contains\":\"2018-06-23\"},\"end.dateTime\":{\"contains\":\"2018-06-23\"},\"start.timeZone\":{\"eq\":\"Asia/Tokyo\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_event_attendees\",\"where\":{\"email\":{\"eq\":\"dariush@test.com\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"event_weed_warrior\"}},\"expected_changes\":{\"status\":{\"to\":{\"eq\":\"cancelled\"}}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_acl_rules\",\"where\":{\"scope_value\":{\"eq\":\"chisom@test.com\"},\"role\":{\"eq\":\"reader\"},\"calendar_id\":{\"eq\":\"cal_harvest_schedule\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendars\",\"where\":{\"summary\":{\"contains\":\"Greenhouse Experiments\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_165", "test_name": "Green Thumbs Urban Garden Collective - Complex garden coordination", "service": "calendar", "task_horizon": 8, "operation_type": "search+C+R+U+D", 
"entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"calendarList.list\",\"events.insert\",\"events.list\",\"events.patch\",\"freeBusy.query\",\"events.delete\",\"acl.insert\",\"calendars.insert\"]}"} +{"question": "Create a new calendar named Windchord Cartotheca. Pull the calendar color palette and set it to color ID 11. Then fetch its calendar list entry and patch it so the calendar is visible and selected. Also update the calendar description to 'Atlas repair bays.' Fully replace Aiko’s ACL rule (user:aiko@test.com) on this calendar to writer. At the very end, on my primary calendar, count the attendees on event evt_cartotheca_intake_huddle (Cartotheca Intake Huddle) and copy those attendees as invitees to event evt_atlas_crate_sync (Atlas Crate Sync). Add the note 'Copied <N> attendees' to Atlas Crate Sync’s description.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"calendars\",\"where\":{\"summary\":{\"contains\":\"Windchord Cartotheca\"},\"description\":{\"contains\":\"Atlas repair bays\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_list_entries\",\"where\":{\"user_id\":{\"eq\":\"user_agent\"}},\"expected_changes\":{\"color_id\":{\"to\":{\"eq\":\"11\"}}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_acl_rules\",\"where\":{\"scope_value\":{\"eq\":\"aiko@test.com\"},\"role\":{\"eq\":\"writer\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_atlas_crate_sync\"},\"calendar_id\":{\"eq\":\"test.user@test.com\"}},\"expected_changes\":{\"description\":{\"to\":{\"contains\":\"Copied 3 
attendees\"}}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_event_attendees\",\"where\":{\"event_id\":{\"eq\":\"evt_atlas_crate_sync\"},\"email\":{\"in\":[\"lucia@test.com\",\"zahra@test.com\",\"adebayo@test.com\"]}},\"expected_count\":3},{\"diff_type\":\"added\",\"entity\":\"calendar_event_attendees\",\"where\":{\"event_id\":{\"eq\":\"evt_atlas_crate_sync\"},\"email\":{\"not_in\":[\"lucia@test.com\",\"zahra@test.com\",\"adebayo@test.com\"]}},\"expected_count\":0}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_213", "test_name": "Windchord Cartotheca - create, color, list patch, ACL update", "service": "calendar", "task_horizon": 8, "operation_type": "search+C+R+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"calendars.insert\",\"colors.get\",\"calendarList.patch\",\"calendarList.get\",\"calendars.patch\",\"acl.update\",\"events.get\",\"events.patch\"]}"} +{"question": "Please import this legacy entry into my primary calendar (do not recreate it manually): 'Emberwharf Tide Log' on June 29, 2018 from 5:00pm-5:30pm, location 'Pier Lantern Desk,' iCalUID emberwharf-tide-20180629@ledger.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"calendar_id\":{\"eq\":\"test.user@test.com\"},\"summary\":{\"contains\":\"Emberwharf Tide Log\"},\"start.dateTime\":{\"contains\":\"2018-06-29T17:00\"},\"end.dateTime\":{\"contains\":\"2018-06-29T17:30\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"},\"location\":{\"contains\":\"Pier Lantern 
Desk\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_195", "test_name": "Emberwharf Ledger - import event only", "service": "calendar", "task_horizon": 1, "operation_type": "C", "entity_scope": "multi", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"events.import\"]}"} +{"question": "We’re setting up a proper schedule for the Monastery of Echoing Bells. First, show me my calendars so I don’t duplicate anything, and create a calendar with that name if needed. Give Linh (linh@test.com) edit access. The Dawn Bell Rite must recur daily at 5:30am, starting June 18, 2018, and it should continue indefinitely until we cancel it. I need two exceptions: the June 20, 2018 occurrence should start at 6:30am with the note ‘Storm quiet hours,’ and the June 23, 2018 occurrence should be cancelled entirely. Also, check when Kwame (kwame@test.com) is free on the evening of June 24. 
After you confirm the schedule, delete the entire Dawn Bell Rite series.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"calendars\",\"where\":{\"id\":{\"eq\":\"cal_monastery_echoing_bells\"}},\"expected_count\":0},{\"diff_type\":\"removed\",\"entity\":\"calendars\",\"where\":{\"id\":{\"eq\":\"cal_monastery_echoing_bells\"}},\"expected_count\":0},{\"diff_type\":\"changed\",\"entity\":\"calendars\",\"where\":{\"id\":{\"eq\":\"cal_monastery_echoing_bells\"}},\"expected_count\":0},{\"diff_type\":\"added\",\"entity\":\"calendar_acl_rules\",\"where\":{\"calendar_id\":{\"eq\":\"cal_monastery_echoing_bells\"},\"scope_value\":{\"eq\":\"linh@test.com\"},\"role\":{\"eq\":\"writer\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"calendar_id\":{\"eq\":\"cal_monastery_echoing_bells\"},\"summary\":{\"contains\":\"Dawn Bell Rite\"},\"start.dateTime\":{\"contains\":\"2018-06-18T05:30\"},\"status\":{\"eq\":\"cancelled\"},\"recurrence\":{\"contains\":\"FREQ=DAILY\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"calendar_id\":{\"eq\":\"cal_monastery_echoing_bells\"},\"summary\":{\"contains\":\"Dawn Bell Rite\"},\"recurrence\":{\"i_contains\":\"EXDATE\"}},\"expected_count\":{\"min\":0,\"max\":1}},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"calendar_id\":{\"eq\":\"cal_monastery_echoing_bells\"},\"summary\":{\"contains\":\"Dawn Bell Rite\"},\"start.dateTime\":{\"contains\":\"2018-06-23\"},\"status\":{\"eq\":\"cancelled\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":{\"min\":0,\"max\":1}},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"calendar_id\":{\"eq\":\"cal_monastery_echoing_bells\"},\"summary\":{\"contains\":\"Dawn Bell Rite\"},\"start.dateTime\":{\"contains\":\"2018-06-20T06:30\"},\"description\":{\"i_contains\":\"storm quiet 
hours\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_179", "test_name": "Monastery of Echoing Bells - Daily recurring lifecycle", "service": "calendar", "task_horizon": 6, "operation_type": "search+C+R+U+D", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "high", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"calendarList.list\",\"acl.insert\",\"events.insert\",\"events.patch\",\"freeBusy.query\",\"events.delete\"]}"} +{"question": "List my calendars first so we don't duplicate anything. Then subscribe me to the external Tideglass Registry calendar (ID cal_tideglass_registry). After that, hide that calendar in my list and set it to color ID 4. Finally, import the legacy entry Tideglass Ledger Seal into the Tideglass Registry calendar for July 4, 2018 from 1:00pm-2:00pm at Seawick Vault, iCalUID tideglass-seal-20180704@registry. 
Salma, Linh, Sven, and Mateusz are the stakeholders to keep in the loop.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"calendar_list_entries\",\"where\":{\"calendar_id\":{\"eq\":\"cal_tideglass_registry\"},\"user_id\":{\"eq\":\"user_agent\"},\"hidden\":{\"eq\":true},\"color_id\":{\"eq\":\"4\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"calendar_id\":{\"eq\":\"cal_tideglass_registry\"},\"ical_uid\":{\"eq\":\"tideglass-seal-20180704@registry\"},\"summary\":{\"contains\":\"Tideglass Ledger Seal\"},\"start.dateTime\":{\"contains\":\"2018-07-04T13:00\"},\"end.dateTime\":{\"contains\":\"2018-07-04T14:00\"},\"location\":{\"contains\":\"Seawick Vault\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_219", "test_name": "Tideglass Registry - subscribe, hide, color, import", "service": "calendar", "task_horizon": 4, "operation_type": "search+C+R+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"calendarList.list\",\"calendarList.insert\",\"calendarList.patch\",\"events.import\"]}"} +{"question": "I’m setting up the Clockwork Tinkerers Guild calendar. First, show me my calendars so I don’t duplicate anything; if we don’t already have it, create a calendar called Clockwork Tinkerers Guild and give Aiko (aiko@test.com) write access. Our Gear & Ember Workshop needs to run every Friday at 6:00pm for eight weeks starting June 22, 2018—set it up as a recurring series. 
However, we need two exceptions: the June 29 session should start at 7:00pm and include the note ‘Late start due to forge maintenance,’ and the July 6 session must be cancelled entirely (festival blackout). After applying those exceptions, show me the guild calendar so I can confirm the series looks right. Then add a one-off event called Brass Beetle Showcase on Saturday July 7 at noon.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"calendars\",\"where\":{\"id\":{\"eq\":\"cal_clockwork_tinkerers_guild\"}},\"expected_count\":0},{\"diff_type\":\"removed\",\"entity\":\"calendars\",\"where\":{\"id\":{\"eq\":\"cal_clockwork_tinkerers_guild\"}},\"expected_count\":0},{\"diff_type\":\"changed\",\"entity\":\"calendars\",\"where\":{\"id\":{\"eq\":\"cal_clockwork_tinkerers_guild\"}},\"expected_count\":0},{\"diff_type\":\"added\",\"entity\":\"calendar_acl_rules\",\"where\":{\"calendar_id\":{\"eq\":\"cal_clockwork_tinkerers_guild\"},\"scope_value\":{\"eq\":\"aiko@test.com\"},\"role\":{\"eq\":\"writer\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"Gear & Ember Workshop\"},\"calendar_id\":{\"eq\":\"cal_clockwork_tinkerers_guild\"},\"start.dateTime\":{\"contains\":\"2018-06-22T18:00\"},\"end.dateTime\":{\"contains\":\"2018-06-22T19:00\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"},\"recurrence\":{\"contains\":\"FREQ=WEEKLY\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"Gear & Ember Workshop\"},\"calendar_id\":{\"eq\":\"cal_clockwork_tinkerers_guild\"},\"start.dateTime\":{\"contains\":\"2018-06-29T19:00\"},\"end.dateTime\":{\"contains\":\"2018-06-29T20:00\"},\"description\":{\"i_contains\":\"forge maintenance\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"Gear & Ember 
Workshop\"},\"calendar_id\":{\"eq\":\"cal_clockwork_tinkerers_guild\"},\"recurrence\":{\"i_contains\":\"EXDATE\"}},\"expected_count\":{\"min\":0,\"max\":1}},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"Gear & Ember Workshop\"},\"calendar_id\":{\"eq\":\"cal_clockwork_tinkerers_guild\"},\"start.dateTime\":{\"contains\":\"2018-07-06T18:00\"},\"status\":{\"eq\":\"cancelled\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":{\"min\":0,\"max\":1}},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"Brass Beetle Showcase\"},\"calendar_id\":{\"eq\":\"cal_clockwork_tinkerers_guild\"},\"start.dateTime\":{\"contains\":\"2018-07-07T12:00\"},\"end.dateTime\":{\"contains\":\"2018-07-07T13:00\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_177", "test_name": "Clockwork Tinkerers Guild - Recurring series exceptions", "service": "calendar", "task_horizon": 7, "operation_type": "search+C+R+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "high", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"calendarList.list\",\"acl.insert\",\"events.insert\",\"events.patch\",\"events.list\"]}"} +{"question": "Our astronomy club is getting serious and we need proper organization. First, show me what calendars I have - I want to make sure we're not duplicating anything. Create a dedicated calendar called 'Cosmic Voyagers HQ' for all our stargazing activities. Yuki (yuki@test.com) is my co-organizer, so give her write access to the new calendar. 
The Perseid meteor shower is this Saturday at midnight - create an event called 'Perseid Meteor Shower Watch Party' for it on our new calendar. Before the main event, we need to set up the telescopes, but it has to work with Oleksandra's schedule (oleksandra@test.com) - find when she's free her Saturday evening and create a 'Telescope Alignment Ceremony' at that time (use Oleksandra's timezone). The duration of the event has to be 1.5 hours. Oh, I just remembered - the watch party location is confirmed as 'Hillcrest Observatory Field', so update that event. Also, there's still that embarrassing 'Failed Rocket Launch Viewing (Cancelled)' event on my main calendar from when SpaceX scrubbed last month - you know what to do with it.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"calendars\",\"where\":{\"summary\":{\"contains\":\"Cosmic Voyagers HQ\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_acl_rules\",\"where\":{\"scope_value\":{\"eq\":\"yuki@test.com\"},\"role\":{\"eq\":\"writer\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"Perseid Meteor Shower Watch Party\"},\"start.dateTime\":{\"contains\":\"2018-06-24T00:00\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"},\"location\":{\"contains\":\"Hillcrest Observatory Field\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"Telescope Alignment 
Ceremony\"},\"start.dateTime\":{\"contains\":\"2018-06-23T19:30\"},\"end.dateTime\":{\"contains\":\"2018-06-23T21:00\"},\"start.timeZone\":{\"eq\":\"Europe/Kyiv\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"event_failed_rocket\"}},\"expected_changes\":{\"status\":{\"to\":{\"eq\":\"cancelled\"}}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_164", "test_name": "Cosmic Voyagers Astronomy Club - Multi-step calendar organization", "service": "calendar", "task_horizon": 7, "operation_type": "search+C+R+U+D", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"calendarList.list\",\"calendars.insert\",\"acl.insert\",\"events.insert\",\"freeBusy.query\",\"events.patch\",\"events.delete\"]}"} +{"question": "Please hide the Lattice Observatory calendar in my calendar list (calendar ID cal_lattice_observatory). Also, fully replace the recurring Prism-Lens Alignment event (event ID evt_prism_lens_004) on that calendar so it runs weekly on Thursdays at 6:00am, starting June 28, 2018, for 45 minutes at Pier 7 Scope. 
Ewa asked me to confirm the updated series today.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"calendar_list_entries\",\"where\":{\"calendar_id\":{\"eq\":\"cal_lattice_observatory\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_changes\":{\"hidden\":{\"to\":true}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_prism_lens_004\"},\"calendar_id\":{\"eq\":\"cal_lattice_observatory\"}},\"expected_changes\":{\"start\":{\"to\":{\"contains\":\"2018-06-28T06:00\"}},\"end\":{\"to\":{\"contains\":\"2018-06-28T06:45\"}},\"location\":{\"to\":{\"contains\":\"Pier 7 Scope\"}},\"recurrence\":{\"to\":{\"contains\":\"FREQ=WEEKLY\"}}},\"expected_count\":1,\"ignore\":[\"attendees\",\"color_id\",\"description\",\"reminders\",\"status\",\"summary\",\"transparency\",\"visibility\"]}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_190", "test_name": "Lattice Observatory - hide calendar and replace recurring event", "service": "calendar", "task_horizon": 2, "operation_type": "R+U", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"calendarList.patch\",\"events.update\"]}"} +{"question": "The Celluloid Dreams Film Festival is in full swing and I'm drowning in logistics. First, find our main festival calendar - it's called 'Celluloid Dreams Festival 2018'. I need you to go through the entire screening schedule and tell me exactly how many films are showing at the 'Noir Dungeon' venue - I'm worried we overbooked that theater. 
While you're at it, create an emergency event called 'Emergency Projector Repair: The Reel Must Go On' for 2 hours on Saturday afternoon on the festival calendar. Takeshi (takeshi@test.com) is our miracle-worker projectionist - check when he's free Saturday and schedule the repair during his available time (use his timezone). Add him as an attendee to that repair event with a note saying 'Bring spare bulbs and prayers' in the description. Here's a problem: we have this highly anticipated screening of 'The Last Samurai of Saturn' - find it in our schedule and tell me when it's showing. Also, delete all the 'Intermission: Existential Crisis (15 min)' events - we're cutting breaks to squeeze in more films. Olena (olena@test.com) from The Kyiv Film Review needs to see our complete schedule for her coverage - give her read access to the festival calendar. Create a new calendar called 'Green Room Chaos' for backstage crew coordination. Oh, and once you've found 'The Last Samurai of Saturn', move it to the same venue as our most popular screening, 'Epic Journey to the Stars' - ticket demand is through the roof.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"Emergency Projector Repair\"},\"description\":{\"i_contains\":\"spare bulbs\"},\"calendar_id\":{\"eq\":\"cal_celluloid_dreams\"},\"start.dateTime\":{\"contains\":\"2018-06-23T14:00\"},\"end.dateTime\":{\"contains\":\"2018-06-23T16:00\"},\"start.timeZone\":{\"eq\":\"Asia/Tokyo\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_event_attendees\",\"where\":{\"email\":{\"eq\":\"takeshi@test.com\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"event_film_022\"}},\"expected_changes\":{\"location\":{\"to\":{\"contains\":\"Grand Aurora 
Theater\"}}},\"expected_count\":1,\"ignore\":[\"attendees\",\"color_id\",\"description\",\"end\",\"recurrence\",\"reminders\",\"start\",\"status\",\"summary\",\"transparency\",\"visibility\"]},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"event_intermission_1\"}},\"expected_changes\":{\"status\":{\"to\":{\"eq\":\"cancelled\"}}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"event_intermission_2\"}},\"expected_changes\":{\"status\":{\"to\":{\"eq\":\"cancelled\"}}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_acl_rules\",\"where\":{\"scope_value\":{\"eq\":\"olena@test.com\"},\"role\":{\"eq\":\"reader\"},\"calendar_id\":{\"eq\":\"cal_celluloid_dreams\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendars\",\"where\":{\"summary\":{\"contains\":\"Green Room Chaos\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_167", "test_name": "Celluloid Dreams Film Festival - Large scale event coordination", "service": "calendar", "task_horizon": 10, "operation_type": "search+C+R+U+D", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"calendarList.list\",\"events.list\",\"freeBusy.query\",\"events.insert\",\"events.patch\",\"events.delete\",\"acl.insert\",\"calendars.insert\"]}"} +{"question": "Quick-add this to my primary calendar: 'Copperseed Archive dusting on June 28, 2018 at 9:30am for 30 minutes.' 
Then start watching my primary calendar for event changes so I can notify Fatima (fatima@test.com) if anything shifts.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"calendar_id\":{\"eq\":\"test.user@test.com\"},\"summary\":{\"contains\":\"Copperseed Archive dusting\"},\"start.dateTime\":{\"contains\":\"2018-06-28T09:30\"},\"end.dateTime\":{\"contains\":\"2018-06-28T10:00\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_channels\",\"where\":{\"resource_uri\":{\"eq\":\"/calendars/test.user@test.com/events\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_191", "test_name": "Copperseed Archive - quickAdd and events watch", "service": "calendar", "task_horizon": 2, "operation_type": "C", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"events.quickAdd\",\"events.watch\"]}"} +{"question": "Create a new issue 'Improve performance' in Engineering team, assign to John, with description 'Optimize database queries'", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"teamId\":{\"eq\":\"ad608998-915c-4bad-bcd9-85ebfccccee8\"},\"title\":{\"contains\":\"performance\"},\"assigneeId\":{\"eq\":\"2dcc8dc2-ca19-475d-9882-3ba5e911e7ec\"},\"description\":{\"contains\":\"database queries\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_7", "test_name": "Create issue with assignee and description", "service": "linear", "task_horizon": 
3, "operation_type": "search+C", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"teams\",\"users\",\"issueCreate\"]}"} +{"question": "Change the priority of issue ENG-1 to Urgent", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"issues\",\"where\":{\"identifier\":{\"eq\":\"ENG-1\"}},\"expected_changes\":{\"priority\":{\"to\":{\"eq\":1.0}},\"priorityLabel\":{\"to\":{\"eq\":\"Urgent\"}}},\"ignore\":[\"updatedAt\"]}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_6", "test_name": "Change issue priority to Urgent", "service": "linear", "task_horizon": 2, "operation_type": "search+U", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"issues\",\"issueUpdate\"]}"} +{"question": "The Launch Coordination team is preparing for the April 1st product release. We need to set up the critical path with proper dependencies and timing.\n\nFirst, create a new workflow state called \"Awaiting Dependency\" (color: #FFA500, type: started) - we'll use this for tasks that are time-blocked by predecessors.\n\nCreate three issues in the Launch Coordination team:\n\n1. \"Complete product documentation for v3.0 launch\" - This is the first domino. Yuki owns this. Due date must be March 22nd because legal needs 5 business days after this completes.\n\n2. \"Legal review of launch materials\" - Svetlana from Legal owns this. Due date is March 29th. This CANNOT start until documentation is complete - set up the blocking relationship.\n\n3. 
\"Publish marketing campaign assets\" - Kwame owns this. Due date is March 31st (press embargo lifts at 9am that day). This is blocked by legal review completion.\n\nSet up the dependency chain: Documentation blocks Legal Review, and Legal Review blocks Marketing.\n\nFinally, add a comment to the documentation issue that explains the timeline pressure: \"CRITICAL_PATH_NOTE: This task has ZERO slack. If documentation slips past March 22nd, legal review (5 business days) won't complete by March 29th, which blocks marketing from the March 31st embargo lift. Launch date is immovable.\"", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"workflow_states\",\"where\":{\"name\":{\"eq\":\"Awaiting Dependency\"},\"color\":{\"eq\":\"#FFA500\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"title\":{\"contains\":\"product documentation\"},\"teamId\":{\"eq\":\"d3e4f5a6-b7c8-9012-3456-789abcdef012\"},\"assigneeId\":{\"eq\":\"f8a9b0c1-d2e3-4567-2345-901234567890\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"title\":{\"contains\":\"Legal review\"},\"teamId\":{\"eq\":\"d3e4f5a6-b7c8-9012-3456-789abcdef012\"},\"assigneeId\":{\"eq\":\"b0c1d2e3-f4a5-6789-4567-123456789012\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"title\":{\"contains\":\"marketing campaign\"},\"teamId\":{\"eq\":\"d3e4f5a6-b7c8-9012-3456-789abcdef012\"},\"assigneeId\":{\"eq\":\"a9b0c1d2-e3f4-5678-3456-012345678901\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issue_relations\",\"where\":{\"type\":{\"eq\":\"blocks\"}},\"expected_count\":2},{\"diff_type\":\"added\",\"entity\":\"comments\",\"where\":{\"body\":{\"contains\":\"CRITICAL_PATH_NOTE:\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_46", "test_name": "Product Launch Coordination", "service": "linear", 
"task_horizon": 12, "operation_type": "search+C+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"teams\",\"users\",\"workflowStates\",\"workflowStateCreate\",\"issueCreate\",\"issueRelationCreate\",\"issueUpdate\",\"commentCreate\"]}"} +{"question": "Create a new label called 'Bugs' ", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"issue_labels\",\"where\":{\"name\":{\"eq\":\"Bugs\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_11", "test_name": "Create a new label", "service": "linear", "task_horizon": 1, "operation_type": "C", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"issueLabelCreate\"]}"} +{"question": "Unassign ENG-1 so it has no assignee", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"issues\",\"where\":{\"identifier\":{\"eq\":\"ENG-1\"}},\"expected_changes\":{\"assigneeId\":{\"to\":{\"exists\":false}}},\"ignore\":[\"updatedAt\"]}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_26", "test_name": "Remove assignee from ENG-1", "service": "linear", "task_horizon": 2, "operation_type": "search+U", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": 
"{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"issues\",\"issueUpdate\"]}"} +{"question": "The Research team needs to set up the grant application pipeline for the upcoming NIH submission deadline (June 15th).\n\nFirst, find the existing \"IRB Ethics Approval\" issue - this is our starting point and is already in progress.\n\nCreate three new issues in the Research team to complete the pipeline:\n\n1. \"Data Collection Protocol v2\" - Nadia will own this. It cannot begin until ethics approval is complete.\n\n2. \"Pilot Study Design - 50 participant cohort\" - Tomás will lead this. It depends on having the data collection protocol finalized.\n\n3. \"Grant Submission Draft - R01 mechanism\" - Chioma will compile the final submission. This is the last step and depends on pilot study results.\n\nSet up the blocking relationships to enforce the sequential workflow:\n- IRB Ethics Approval blocks Data Collection Protocol\n- Data Collection Protocol blocks Pilot Study Design\n- Pilot Study Design blocks Grant Submission Draft\n\nAfter setting up the dependencies, add a comment to the Grant Submission issue summarizing the critical path: \"PIPELINE_STATUS: This submission depends on completion chain: Ethics (in progress) → Data Protocol (Nadia) → Pilot Study (Tomás) → This draft. 
Target: June 15th deadline.\"", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"title\":{\"contains\":\"Data Collection Protocol\"},\"teamId\":{\"eq\":\"e4f5a6b7-c8d9-0123-4567-89abcdef0123\"},\"assigneeId\":{\"eq\":\"c1d2e3f4-a5b6-7890-5678-234567890123\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"title\":{\"contains\":\"Pilot Study\"},\"teamId\":{\"eq\":\"e4f5a6b7-c8d9-0123-4567-89abcdef0123\"},\"assigneeId\":{\"eq\":\"d2e3f4a5-b6c7-8901-6789-345678901234\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"title\":{\"contains\":\"Grant Submission\"},\"teamId\":{\"eq\":\"e4f5a6b7-c8d9-0123-4567-89abcdef0123\"},\"assigneeId\":{\"eq\":\"e3f4a5b6-c7d8-9012-7890-456789012345\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issue_relations\",\"where\":{\"type\":{\"eq\":\"blocks\"}},\"expected_count\":3},{\"diff_type\":\"added\",\"entity\":\"comments\",\"where\":{\"body\":{\"contains\":\"PIPELINE_STATUS:\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_47", "test_name": "Research Grant Application Pipeline", "service": "linear", "task_horizon": 12, "operation_type": "search+C+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"teams\",\"issues\",\"users\",\"issueCreate\",\"issueRelationCreate\",\"issueUpdate\",\"commentCreate\"]}"} +{"question": "Assign ENG-3 to Sarah Smith", "answer": 
"{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"issues\",\"where\":{\"identifier\":{\"eq\":\"ENG-3\"}},\"expected_changes\":{\"assigneeId\":{\"to\":{\"eq\":\"03b0809e-713e-44ee-95de-b7a198b135ac\"}}},\"ignore\":[\"updatedAt\"]}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_17", "test_name": "Reassign ENG-3 to Sarah", "service": "linear", "task_horizon": 3, "operation_type": "search+U", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"issues\",\"users\",\"issueUpdate\"]}"} +{"question": "The Forest Mycology Collective is organizing their autumn foraging expedition. First, create a new team called \"Forest Mycology Collective\" to track all club activities.\n\nCreate a label called \"awaiting-spore-print\" for specimens that need laboratory analysis before identification can be confirmed.\n\nNow set up the expedition: create an issue titled \"Coastal Redwood Reserve Autumn Foray\" and assign it to Haruki as the expedition leader.\n\nDuring the planning phase, we're pre-logging anticipated specimen finds based on last year's survey. Create a specimen issue titled \"Specimen #1: Cantharellus formosus cluster - Sector 7\" and assign it to Priya for documentation. Create another specimen issue \"Specimen #2: Unknown Amanita - requires cross-reference\" and assign it to Dmitri, applying the \"awaiting-spore-print\" label.\n\nThe Amanita identification depends on comparing its spore print against the Cantharellus specimen first (they were found in the same microhabitat and we need to rule out look-alikes). 
Set up the Amanita issue as blocked by the Cantharellus issue.\n\nFinally, add a field note comment to the Cantharellus specimen that reads: \"FIELD_NOTE_REF: GPS coordinates 41.2132°N, found near fallen Douglas fir. Fruiting body golden-yellow, false gills present, apricot aroma confirmed.\"", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"teams\",\"where\":{\"name\":{\"eq\":\"Forest Mycology Collective\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issue_labels\",\"where\":{\"name\":{\"eq\":\"awaiting-spore-print\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"title\":{\"contains\":\"Coastal Redwood Reserve\"},\"assigneeId\":{\"eq\":\"f6a7b8c9-d0e1-2345-0123-789012345678\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"title\":{\"contains\":\"Cantharellus formosus\"},\"assigneeId\":{\"eq\":\"b8c9d0e1-f2a3-4567-2345-901234567890\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"title\":{\"contains\":\"Amanita\"},\"assigneeId\":{\"eq\":\"a7b8c9d0-e1f2-3456-1234-890123456789\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issue_label_issue_association\",\"where\":{},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issue_relations\",\"where\":{\"type\":{\"eq\":\"blocks\"},\"issueTitle\":{\"contains\":\"Cantharellus\"},\"relatedIssueTitle\":{\"contains\":\"Amanita\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"comments\",\"where\":{\"body\":{\"contains\":\"FIELD_NOTE_REF:\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_42", "test_name": "Forest Mycology Collective Expedition", "service": "linear", "task_horizon": 11, "operation_type": "search+C+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": 
"{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"teams\",\"teamCreate\",\"users\",\"issueLabels\",\"issueLabelCreate\",\"issueCreate\",\"issueUpdate\",\"issueRelationCreate\",\"commentCreate\"]}"} +{"question": "The moderation team has flagged the word \"YOLO_BAD\" as inappropriate for our workspace. We need to audit and address any comments containing this term.\n\nFirst, search through all comments to find any that contain \"YOLO_BAD\". Count how many comments contain this word and note their IDs.\n\nCreate a new issue in the Moderation team titled \"Content Cleanup Required - YOLO_BAD audit\" assigned to Saoirse. In the description, include:\n- The exact count of comments found containing \"YOLO_BAD\"\n- A list of the comment IDs that need to be reviewed\n- Use this format: \"AUDIT_RESULT: Found [X] comments containing flagged content. Comment IDs: [id1, id2, ...]\"\n\nFor each issue that has a comment containing \"YOLO_BAD\", add a warning comment: \"MODERATION_NOTICE: A comment on this issue contains content that violates community guidelines. Please review and edit your comment to remove inappropriate language. 
Ref: YOLO_BAD audit.\"", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"teamId\":{\"eq\":\"mod-team-001\"},\"title\":{\"contains\":\"Content Cleanup\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"title\":{\"contains\":\"YOLO_BAD\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"assigneeId\":{\"eq\":\"mod-user-saoirse-001\"},\"title\":{\"contains\":\"Content Cleanup\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"description\":{\"contains\":\"AUDIT_RESULT:\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"description\":{\"contains\":\"Found 3 comments\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"comments\",\"where\":{\"body\":{\"contains\":\"MODERATION_NOTICE:\"}},\"expected_count\":2},{\"diff_type\":\"added\",\"entity\":\"comments\",\"where\":{\"issueId\":{\"eq\":\"mod-issue-darkmode-001\"},\"body\":{\"contains\":\"MODERATION_NOTICE:\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"comments\",\"where\":{\"issueId\":{\"eq\":\"mod-issue-checkout-001\"},\"body\":{\"contains\":\"MODERATION_NOTICE:\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_55", "test_name": "Community Guidelines Enforcement - Profanity Audit", "service": "linear", "task_horizon": 11, "operation_type": "search+R+C", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"comments\",\"users\",\"issueCreate\",\"commentCreate\"]}"} +{"question": "The community center is hosting the \"Living Cultures Festival\" from Feb 15-19. 
Create a new team called \"Fermentation Guild\" to coordinate this event. We need to track fermentation timelines carefully.\n\nFirst, add a new workflow state called \"Fermenting\" (color: #8B4513, type: started) to the new team - this will help us track items that are actively culturing.\n\nCreate a label called \"time-critical\" for tasks with strict biological deadlines.\n\nNow for the tricky scheduling: Kenji needs to start his 3-week miso base by January 25th at the latest for it to be ready for the festival tasting on Feb 18th. Create an issue titled \"Prepare Kenji miso base for Feb 18 tasting\" and assign it to Kenji with the time-critical label.\n\nHowever, Fatima needs access to the koji room first to inoculate spores for her amazake demonstration. Create another issue \"Inoculate koji spores for amazake - Fatima\" and set it up so that it blocks Kenji's miso preparation (they share the temperature-controlled koji room and can't run both processes simultaneously).\n\nFinally, add a comment to the miso task that says: \"CULTURE_READY_CHECK: Verify koji colonization complete before rice inoculation. 
Target temp 86F for 48hrs.\"", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"teams\",\"where\":{\"name\":{\"eq\":\"Fermentation Guild\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"workflow_states\",\"where\":{\"name\":{\"eq\":\"Fermenting\"},\"color\":{\"eq\":\"#8B4513\"},\"type\":{\"eq\":\"started\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issue_labels\",\"where\":{\"name\":{\"eq\":\"time-critical\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"title\":{\"contains\":\"miso base\"},\"assigneeId\":{\"eq\":\"a1b2c3d4-e5f6-7890-abcd-ef1234567890\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"title\":{\"contains\":\"koji spores\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issue_label_issue_association\",\"where\":{},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issue_relations\",\"where\":{\"type\":{\"eq\":\"blocks\"},\"issueTitle\":{\"contains\":\"koji spores\"},\"relatedIssueTitle\":{\"contains\":\"miso\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"comments\",\"where\":{\"body\":{\"contains\":\"CULTURE_READY_CHECK\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_40", "test_name": "Fermentation Festival Coordination", "service": "linear", "task_horizon": 9, "operation_type": "search+C+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"teams\",\"teamCreate\",\"users\",\"workflowStateCreate\",\"issueLabelCreate\",\"issueCreate\",\"issueRelationCreate\",\"issueUpdate\",\"commentCreate\"]}"} +{"question": "Remove the 'Duplicate' workflow state from the 
Engineering team.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"workflow_states\",\"where\":{\"id\":{\"eq\":\"ab04ec5f-1292-48b0-9426-50d354957357\"}},\"expected_changes\":{\"archivedAt\":{\"from\":{\"exists\":false},\"to\":{\"exists\":true}}},\"ignore\":[\"updatedAt\"]}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_28", "test_name": "Remove workflow state", "service": "linear", "task_horizon": 2, "operation_type": "search+D", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"workflowStates\",\"workflowStateArchive\"]}"} +{"question": "I need help getting our anime convention booth coordination sorted out. Can you check what's been happening in #product-growth and #random lately? I want to make sure I'm caught up on any relevant discussions before we dive into planning.\n\nAlso, I remember there were some conversations about badges somewhere - can you find those for me? We had some outdated messages about our old booth location that need to be removed since we got reassigned to a different hall.\n\nI need to loop in Olena Petrenko on this since her perspective would be really helpful for the setup logistics. And I should probably reach out directly to John Doe and Priya Sharma separately - John for general coordination and Priya about the infrastructure stuff like power and internet at the booth.\n\nOh, and let's update the channel topics for #product-growth and #project-alpha-dev to reflect that we're focusing on the anime expo booth setup now. There were a couple of my earlier messages that need corrections too - I posted the wrong setup times initially. 
Once you find the key planning message, just give it a thumbs up so everyone knows we're aligned.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"channels\",\"where\":{\"is_dm\":{\"eq\":true}},\"expected_count\":{\"min\":1}},{\"diff_type\":\"added\",\"entity\":\"channel_members\",\"where\":{},\"expected_count\":{\"min\":1}}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_102", "test_name": "Anime Convention Booth Setup", "service": "slack", "task_horizon": 13, "operation_type": "search+R+C+U+D", "entity_scope": "multi", "information_availability": "explicit", "prompt_ambiguity": "high", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"conversations.history\",\"conversations.history\",\"search.messages\",\"chat.delete\",\"chat.delete\",\"conversations.setTopic\",\"conversations.setTopic\",\"conversations.invite\",\"conversations.open\",\"conversations.open\",\"chat.update\",\"chat.update\",\"reactions.add\"]}"} +{"question": "Create a new channel called 'auth-force' and invite everyone who has posted about 'login' or 'password'.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"channels\",\"where\":{\"channel_name\":{\"contains\":\"auth-force\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"channel_members\",\"where\":{\"user_id\":{\"in\":[\"U01AGENBOT9\",\"U02JOHNDOE1\",\"U03ROBERT23\",\"U05MORGAN23\",\"U02ARTEM23\",\"U06HUBERT23\"]}},\"expected_count\":6}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_87", "test_name": "Set Operations: Group Invite based on Topic", "service": "slack", "task_horizon": 9, "operation_type": "search+C", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": 
"{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"search.messages\",\"search.messages\",\"conversations.create\",\"conversations.invite\"]}"} +{"question": "Find the user who complained about 'captcha' in #general and send them a DM saying 'I am looking into this.'", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"channels\",\"where\":{\"is_dm\":{\"eq\":true}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"user_id\":{\"eq\":\"U01AGENBOT9\"},\"message_text\":{\"contains\":\"looking into this\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"channel_members\",\"where\":{\"user_id\":{\"eq\":\"U06HUBERT23\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_86", "test_name": "Chained Reasoning: Search and DM", "service": "slack", "task_horizon": 3, "operation_type": "search+C", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"search.messages\",\"conversations.open\",\"chat.postMessage\"]}"} +{"question": "Aisha, Lukasz, Gabriel, Nick, and Priya want to launch a collaborative radio drama called \"Phantom Frequencies\" — a serialized fiction project where each person broadcasts a story from their timezone. They got the idea from all the talk about signal latency, CDN routing, and transmission in the workspace. Set them up with a channel called #phantom-frequencies, give it a topic that fits the concept (need to mention \"Phantom Frequencies\"), and get everyone in. Check Aisha's profile to confirm her timezone for the broadcast schedule, and DM her separately to ask about her episode's Lagos-blackout storyline. 
Write a first post in the channel that draws on whatever transmission and signal discussions you can find in the workspace. Also, that :eyes: reaction you left on the circuit-tracer message in #engineering — remove it, it's stale. There's a channel called #product-growth you're not in — pop in and check if there's anything about the APAC launch that could feed into the drama's world-building, then leave once you've got what you need. If you find in this chat a user with any user with a name that contains \"incognito\" ping them to change the nickname to \"anything\" - we need to maintain a trustful atmosphere here. And that #project-alpha channel that's basically just you — archive it, nobody's using it.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"channels\",\"where\":{\"channel_name\":\"phantom-frequencies\",\"topic_text\":{\"i_contains\":\"Phantom Frequencies\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"channel_members\",\"where\":{\"user_id\":{\"in\":[\"U_AISHA\",\"U_LUKAS\",\"U09GABRIEL\",\"U08NICK23\",\"U_PRIYA\"]}},\"expected_count\":{\"min\":5}},{\"diff_type\":\"added\",\"entity\":\"channels\",\"where\":{\"is_dm\":true},\"expected_count\":{\"min\":1}},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"message_text\":{\"i_contains\":\"Lagos\"}},\"expected_count\":{\"min\":1}},{\"diff_type\":\"removed\",\"entity\":\"message_reactions\",\"where\":{\"message_id\":\"1706110000.000100\",\"reaction_type\":\"eyes\"},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"channels\",\"where\":{\"channel_id\":\"C05ALPHA\"},\"expected_changes\":{\"is_archived\":{\"to\":true}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"channel_members\",\"where\":{\"user_id\":\"U_INCOGNITO\"},\"expected_count\":{\"min\":1}},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"user_id\":\"U01AGENBOT9\"},\"expected_count\":{\"min\":3}}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", 
"test_id": "slack_109", "test_name": "Phantom Frequencies", "service": "slack", "task_horizon": 11, "operation_type": "search+C+U+D", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"conversations.history\",\"users.info\",\"conversations.create\",\"conversations.setTopic\",\"conversations.invite\",\"chat.postMessage\",\"conversations.open\",\"reactions.remove\",\"conversations.join\",\"conversations.leave\",\"conversations.archive\"]}"} +{"question": "Send a message to #general saying 'Attention' in bold and 'check logs' in italics. Use Slack Block Kit rich_text blocks with style attributes (bold:true, italic:true).", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C01ABCD1234\"},\"blocks\":{\"contains\":\"rich_text_section\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C01ABCD1234\"},\"blocks\":{\"contains\":\"\\\"bold\\\":true\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C01ABCD1234\"},\"blocks\":{\"contains\":\"\\\"italic\\\":true\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_79", "test_name": "Rich Text: Basic Formatting (Bold/Italic)", "service": "slack", "task_horizon": 2, "operation_type": "search+C", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"conversations.list\",\"chat.postMessage\"]}"} +{"question": "Hubert does this thing he calls the \"Apiary Report\" — he sees the workspace as a beehive, 
and he wants a quarterly survey. First he needs the full picture: how many honeycomb cells does this hive have, and which ones are alive? Then go taste the honey in #growth — read through whatever's been happening there. Find the sweetest drop — the single best message — and mark it with a :honey_pot:. That's Hubert's forager tradition. Once you've done your tasting, write up a Forager's Report and post it in #random for the rest of the colony, summarizing whatever noteworthy conversation you found in #growth. Note, that the report must contain the words \"FORAGERS REPORT\". Last thing: #project-alpha is an empty cell. Nobody's in it, nothing's happening. Seal it off.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"message_reactions\",\"where\":{\"reaction_type\":\"honey_pot\",\"user_id\":\"U01AGENBOT9\",\"message_id\":{\"in\":[\"1700300000.000001\",\"1700300060.000002\",\"1700300120.000003\",\"1700300180.000004\",\"1700300240.000005\",\"1700300300.000006\"]}},\"expected_count\":1,\"description\":\":honey_pot: reaction added to a message in #growth channel\"},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"channel_id\":\"C02EFGH5678\",\"user_id\":\"U01AGENBOT9\",\"message_text\":{\"contains\":\"FORAGERS REPORT\"}},\"expected_count\":1,\"description\":\"Forager's Report posted to #random with required phrase\"},{\"diff_type\":\"changed\",\"entity\":\"channels\",\"where\":{\"channel_id\":\"C05ALPHA\"},\"expected_count\":1,\"expected_changes\":{\"is_archived\":{\"from\":false,\"to\":true}},\"description\":\"#project-alpha archived (empty cell sealed)\"}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_112", "test_name": "The Apiary Report", "service": "slack", "task_horizon": 5, "operation_type": "search+C+R+U", "entity_scope": "multi", "information_availability": "explicit", "prompt_ambiguity": "medium", "info": 
"{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"conversations.list\",\"conversations.history\",\"reactions.add\",\"chat.postMessage\",\"conversations.archive\"]}"} +{"question": "I need some help coordinating our virtual Afrobeats festival streaming infrastructure project. Can you help me get things organized across our Slack workspace?\n\nFirst, I want to make sure the #engineering channel clearly reflects that we're focused on the Music Festival Tech Stack right now - the topic should be updated so everyone knows what we're working on.\n\nI remember there were some discussions about CDN solutions a while back that would be really relevant to our streaming needs - can you dig those up for me?\n\nI also need to figure out who on our team should be involved. I know Robert Chen is supposed to be leading the engineering side, but can you confirm his role? And I think Łukasz Kowalski has some great performance optimization experience - make sure he's part of the conversation in our main coordination channel.\n\nOnce you've gathered all this info, I need updates posted to #engineering, #frontend, and #general to get everyone aligned on our festival streaming infrastructure plans. 
Also, check what channels we have available that might be relevant to this project.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{},\"expected_count\":{\"min\":1}},{\"diff_type\":\"added\",\"entity\":\"channel_members\",\"where\":{},\"expected_count\":{\"min\":1}}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_101", "test_name": "Music Festival Tech Stack", "service": "slack", "task_horizon": 9, "operation_type": "C+R+U", "entity_scope": "multi", "information_availability": "explicit", "prompt_ambiguity": "high", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"conversations.setTopic\",\"search.messages\",\"users.list\",\"users.info\",\"conversations.list\",\"chat.postMessage\",\"chat.postMessage\",\"chat.postMessage\",\"conversations.invite\"]}"} +{"question": "Robert and Nick want to do a \"Palimpsest\" — scraping off old marks in the workspace and writing over them with new ones. First, check what channels Nick is actually in — Robert suspects he's barely present anywhere. Count them. Then scrape off that :eyes: reaction you left on the circuit-tracer message in #engineering — it's old ink that needs to go. That lonely #project-alpha channel? Overwrite its name — rename it to #palimpsest-archive, it's being repurposed as a record of overwritten things. 
Finally, write the new text: post a message in #random that says exactly \"PALIMPSEST COMPLETE: [N] channels found for Nick\" where [N] is however many channels Nick turned out to be in.", "answer": "{\"assertions\":[{\"diff_type\":\"removed\",\"entity\":\"message_reactions\",\"where\":{\"message_id\":\"1706110000.000100\",\"user_id\":\"U01AGENBOT9\",\"reaction_type\":\"eyes\"},\"expected_count\":1,\"description\":\"Agent's :eyes: reaction removed from circuit-tracer message in #engineering \\u2014 old ink scraped off\"},{\"diff_type\":\"changed\",\"entity\":\"channels\",\"where\":{\"channel_id\":\"C05ALPHA\"},\"expected_count\":1,\"expected_changes\":{\"channel_name\":{\"from\":\"project-alpha\",\"to\":\"palimpsest-archive\"}},\"description\":\"#project-alpha renamed to #palimpsest-archive \\u2014 overwritten with new purpose\"},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"channel_id\":\"C02EFGH5678\",\"user_id\":\"U01AGENBOT9\",\"message_text\":{\"regex\":\"PALIMPSEST COMPLETE:\\\\s*1\\\\s+channels?\\\\s+found\\\\s+for\\\\s+Nick\"}},\"expected_count\":1,\"description\":\"Palimpsest record posted in #random: 'PALIMPSEST COMPLETE: 1 channel(s) found for Nick' \\u2014 Nick is only in #growth\"}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_114", "test_name": "Palimpsest", "service": "slack", "task_horizon": 4, "operation_type": "search+R+C+U+D", "entity_scope": "multi", "information_availability": "explicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"users.conversations\",\"reactions.remove\",\"conversations.rename\",\"chat.postMessage\"]}"} +{"question": "I need your help coordinating something for our Polish-Ukrainian debugging session today. 
We're calling it the \"Pierogi vs Varenyky Debug Session\" because Olena and Sophie are bringing food during our break!\n\nFirst, can you check on Sophie Dubois and Olena Petrenko's profiles? I want to make sure I have their roles right when I introduce them to the rest of the team. Also, I need to catch up on what's been happening in the engineering channel - there were some login issues discussed that might be relevant.\n\nCould you find any channels that might already be discussing this topic, and if there isn't a dedicated space yet, please create a new channel for our pierogi-vs-varenyky session? We should also post a heads-up in core-infra about our debugging plans.\n\nOh, and Aisha left a great message earlier that I want to react to with a thumbs up. Also, I need to remove someone from one of our project channels who's no longer on the team. Thanks!", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"channels\",\"where\":{\"is_private\":{\"eq\":false}},\"expected_count\":{\"min\":1}},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{},\"expected_count\":{\"min\":1}}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_98", "test_name": "Pierogi vs Varenyky Debug Session", "service": "slack", "task_horizon": 8, "operation_type": "search+C+R", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "high", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"users.info\",\"users.info\",\"conversations.history\",\"conversations.list\",\"conversations.create\",\"chat.postMessage\",\"reactions.add\",\"conversations.kick\"]}"} +{"question": "Post to #general mentioning Artem with text 'Please review the pull request'", "answer": 
"{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C01ABCD1234\"},\"message_text\":{\"contains\":\"<@U02ARTEM23>\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_71", "test_name": "Mention user in message", "service": "slack", "task_horizon": 3, "operation_type": "search+C", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"users.list\",\"conversations.list\",\"chat.postMessage\"]}"} +{"question": "Create a new channel called 'rl-project'", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"channels\",\"where\":{\"channel_name\":{\"i_contains\":\"rl-project\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_60", "test_name": "Create a new channel", "service": "slack", "task_horizon": 1, "operation_type": "C", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"conversations.create\"]}"} +{"question": "Kenji, Olena, and Priya want to spin up a generative art project using the team's GPU infrastructure. They drew inspiration from the compute discussions and that circuit-tracer visualization work happening somewhere in the workspace. Can you get them organized? They need a channel — call it #fractal-forge — with a topic that contains \"GPU-meets-art\". Invite all three, and post an inaugural message that references whatever you can dig up about the GPU work and the circuit-tracer thread that got them excited -- those are going to be messeges on the topic, written by either three. 
Kenji also wants an :art: reaction on whichever message in #engineering first mentioned the circuit-tracer. Set up a group DM with just Kenji and Olena so they can sort out GPU scheduling privately. And actually, rename the channel to #silicon-dreams — everyone agreed it sounds better.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"channels\",\"where\":{\"channel_name\":{\"eq\":\"silicon-dreams\"},\"topic_text\":{\"i_contains\":\"GPU-meets-art\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"channels\",\"where\":{\"is_gc\":{\"eq\":true}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"channel_members\",\"where\":{\"user_id\":{\"eq\":\"U_PRIYA\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"channel_members\",\"where\":{\"user_id\":{\"eq\":\"U_KENJI\"}},\"expected_count\":2},{\"diff_type\":\"added\",\"entity\":\"channel_members\",\"where\":{\"user_id\":{\"eq\":\"U_OLENA\"}},\"expected_count\":2},{\"diff_type\":\"added\",\"entity\":\"message_reactions\",\"where\":{\"message_id\":{\"eq\":\"1706110000.000100\"},\"reaction_type\":{\"eq\":\"art\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"message_text\":{\"i_contains\":\"circuit\"}},\"expected_count\":{\"min\":1}},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"message_text\":{\"i_contains\":\"GPU\"}},\"expected_count\":{\"min\":1}},{\"diff_type\":\"added\",\"entity\":\"message_reactions\",\"where\":{\"message_id\":{\"eq\":\"1706110000.000100\"},\"reaction_type\":{\"eq\":\"eyes\"}},\"expected_count\":0},{\"diff_type\":\"removed\",\"entity\":\"message_reactions\",\"where\":{\"message_id\":{\"eq\":\"1706110000.000100\"},\"reaction_type\":{\"eq\":\"eyes\"}},\"expected_count\":0},{\"diff_type\":\"changed\",\"entity\":\"message_reactions\",\"where\":{\"message_id\":{\"eq\":\"1706110000.000100\"},\"reaction_type\":{\"eq\":\"eyes\"}},\"expected_count\":0},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where
\":{\"channel_id\":{\"eq\":\"C01ABCD1234\"},\"message_text\":{\"i_contains\":\"circuit\"}},\"expected_count\":0},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C02EFGH5678\"},\"message_text\":{\"i_contains\":\"circuit\"}},\"expected_count\":0},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C03IJKL9012\"},\"message_text\":{\"i_contains\":\"circuit\"}},\"expected_count\":0}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_107", "test_name": "Silicon Dreams", "service": "slack", "task_horizon": 10, "operation_type": "search+C+R+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"search.messages\",\"conversations.history\",\"conversations.create\",\"conversations.setTopic\",\"conversations.invite\",\"chat.postMessage\",\"reactions.add\",\"conversations.replies\",\"conversations.rename\",\"conversations.open\"]}"} diff --git a/datasets/agent-diff-bench/test.parquet b/datasets/agent-diff-bench/test.parquet new file mode 100644 index 0000000..57e4ac6 Binary files /dev/null and b/datasets/agent-diff-bench/test.parquet differ diff --git a/datasets/agent-diff-bench/train.jsonl b/datasets/agent-diff-bench/train.jsonl new file mode 100644 index 0000000..28809c2 --- /dev/null +++ b/datasets/agent-diff-bench/train.jsonl @@ -0,0 +1,179 @@ +{"question": "Find all PDF files in the investments folder and its subfolders. 
Add the tag 'pdf-document' to each of them.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"box_files\",\"where\":{\"extension\":{\"eq\":\"pdf\"},\"tags\":{\"contains\":\"pdf-document\"}},\"expected_changes\":{\"tags\":{\"to\":{\"contains\":\"pdf-document\"}}}}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_151", "test_name": "Level 2: Tag All PDFs", "service": "box", "task_horizon": 6, "operation_type": "search+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"actionEval\",\"tools_required\":[\"GET /search\",\"PUT /files/{id}\",\"PUT /files/{id}\",\"PUT /files/{id}\",\"PUT /files/{id}\",\"PUT /files/{id}\"]}"} +{"question": "Rename the 'macroeconomics' folder to 'Global Economics'.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"box_folders\",\"where\":{\"id\":{\"eq\":\"1973339758\"}},\"expected_changes\":{\"name\":{\"to\":{\"eq\":\"Global Economics\"}}}}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_120", "test_name": "Level 1: Rename Folder", "service": "box", "task_horizon": 2, "operation_type": "search+U", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"actionEval\",\"tools_required\":[\"GET /search\",\"PUT /folders/{id}\"]}"} +{"question": 
"The rare book conservation lab is running its year-end audit. You need to aggregate treatment data and update the annual summary. First, confirm your identity — who are you logged in as? You'll need this for audit attribution. Locate the conservation lab folder and check its contents. Get the details of both quarterly humidity logs (Q3 and Q4 2025) — each contains a \"BOOKS TREATED THIS QUARTER\" count that you'll need. Check if any conservation documents are currently in your favorites collection. On the incunabula condition report, add a comment: \"Audit initiated by [your username] on [today's date].\" Also find the existing comment about \"Budget review pending\" and update it to: \"Budget approved - Q3+Q4 aggregated total: [X] books\" where X is the sum of books treated in Q3 and Q4. There's an outdated comment on the condition report marked \"[OUTDATED]\" with incorrect information — delete it. Download the annual summary file, update it with the correct Q3 and Q4 treatment counts (extracted from the humidity logs), and upload it as a new version. The total YTD should now reflect all four quarters. 
Find the \"Conservation Lab Archive\" hub and update its description to: \"Rare book conservation documentation - Last audit: Q4 2025.\" Finally, there's a deprecated folder from 2024 that's scheduled for deletion — remove it.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"box_comments\",\"where\":{\"item_id\":{\"eq\":\"1701916585\"},\"message\":{\"contains\":\"Audit initiated\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"box_files\",\"where\":{\"id\":{\"eq\":\"1172138282\"}},\"expected_changes\":{\"version_number\":{\"to\":{\"ne\":\"1\"}}},\"ignore\":[\"sha_1\",\"size\",\"file_version_id\"]},{\"diff_type\":\"changed\",\"entity\":\"box_hubs\",\"where\":{\"id\":{\"eq\":\"777777\"}},\"expected_changes\":{\"description\":{\"to\":{\"contains\":\"Q4 2025\"}}},\"ignore\":[\"updated_at\"]},{\"diff_type\":\"changed\",\"entity\":\"box_folders\",\"where\":{\"id\":{\"eq\":\"7983826892\"}},\"expected_changes\":{\"item_status\":{\"to\":{\"eq\":\"trashed\"}}}}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_159", "test_name": "Level 5: Rare Books Conservation Audit", "service": "box", "task_horizon": 13, "operation_type": "R+search+C+U+D", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"compositeEval\",\"tools_required\":[\"GET /users/me\",\"GET /search\",\"GET /files/{id}\",\"GET /files/{id}\",\"GET /collections\",\"POST /comments\",\"PUT /comments/{id}\",\"DELETE /comments/{id}\",\"GET /files/{id}/content\",\"POST /files/{id}/content\",\"GET /hubs/{id}\",\"PUT /hubs/{id}\",\"DELETE /folders/{id}\"]}"} +{"question": "In the readings folder under 
history, search for files with similar names across different subfolders (e.g., same base name in different topic folders). If you find duplicates by name, keep the one in the most appropriate topic folder and trash the others.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"box_files\",\"where\":{\"item_status\":{\"eq\":\"trashed\"}},\"expected_changes\":{\"item_status\":{\"to\":{\"eq\":\"trashed\"}}}}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_142", "test_name": "Level 3: Cross-Folder Dedup", "service": "box", "task_horizon": 4, "operation_type": "search+R+D", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "high", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"actionEval\",\"tools_required\":[\"GET /search\",\"GET /folders/{id}/items\",\"GET /folders/{id}/items\",\"DELETE /files/{id}\"]}"} +{"question": "Count how many files are in the 'investments' folder and set the folder's description to the count.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"box_folders\",\"where\":{\"id\":{\"eq\":\"5610825569\"}},\"expected_changes\":{\"description\":{\"to\":{\"ne\":\"\"}}}}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_127", "test_name": "Level 2: Count Files and Set Description", "service": "box", "task_horizon": 3, "operation_type": "search+R+U", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "low", "info": 
"{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"compositeEval\",\"tools_required\":[\"GET /search\",\"GET /folders/{id}/items\",\"PUT /folders/{id}\"]}"} +{"question": "In the history area, open the 'Buenos Aires' folder and identify any duplicate markdown files that appear to be copies of the same Dirty War class notes. Use clues like near-identical filenames and identical file size to decide which one is the duplicate copy. Keep the canonical original and delete/trash only the duplicate.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"box_files\",\"where\":{\"id\":{\"eq\":\"3320893579\"}},\"expected_changes\":{\"item_status\":{\"to\":{\"eq\":\"trashed\"}}}}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_149", "test_name": "Level 3: Remove Duplicate (Buenos Aires)", "service": "box", "task_horizon": 3, "operation_type": "search+R+D", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "high", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"actionEval\",\"tools_required\":[\"GET /search\",\"GET /folders/{id}/items\",\"DELETE /files/{id}\"]}"} +{"question": "Check the size of the file named 'transport-april-2025-csv.csv' inside 'investments'. 
If it's larger than 1MB, rename it to 'large_transport.csv', otherwise rename it to 'small_transport.csv'.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"box_files\",\"where\":{\"id\":{\"eq\":\"1421498350\"}},\"expected_changes\":{\"name\":{\"to\":{\"eq\":\"large_transport.csv\"}}}}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_144", "test_name": "Level 3: Conditional Logic (Size)", "service": "box", "task_horizon": 2, "operation_type": "search+U", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"actionEval\",\"tools_required\":[\"GET /search\",\"PUT /files/{id}\"]}"} +{"question": "The tea ceremony school is transitioning to Ro season (炉, the winter hearth period). You need to help organize the digital materials for this important seasonal change. First, find which hub already exists for tea ceremony seasonal materials — you'll need to add updated content there later. Locate the winter preparation guide in the chado folder. Verify it's the current document (not a draft), then update it with the tag winter_season and set its description to \"Ro season preparation - 炉 (November-April)\". Add a comment to the winter preparation guide noting: \"Ready for Ro season (炉) - charcoal placement verified.\" Next, find the utensil inventory file. Add a comment reminding the team: \"Utensils require cleaning before Hatsugama ceremony.\" There's an old draft file in the same folder that has been superseded — it's clearly marked as obsolete. Delete it to clean up the archive. 
Finally, add the winter preparation guide to the seasonal materials hub so it's easily accessible to all practitioners.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"box_files\",\"where\":{\"id\":{\"eq\":\"3180616460\"}},\"expected_changes\":{\"tags\":{\"to\":{\"contains\":\"winter_season\"}},\"description\":{\"to\":{\"contains\":\"Ro season\"}}},\"ignore\":[\"parent_id\",\"shared_link\"]},{\"diff_type\":\"added\",\"entity\":\"box_comments\",\"where\":{\"item_id\":{\"eq\":\"3180616460\"},\"message\":{\"contains\":\"charcoal\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"box_comments\",\"where\":{\"item_id\":{\"eq\":\"3309661031\"},\"message\":{\"contains\":\"Hatsugama\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"box_files\",\"where\":{\"id\":{\"eq\":\"1018029878\"}},\"expected_changes\":{\"item_status\":{\"to\":{\"eq\":\"trashed\"}}}},{\"diff_type\":\"added\",\"entity\":\"box_hub_items\",\"where\":{\"item_id\":{\"eq\":\"3180616460\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_157", "test_name": "Level 4: Tea Ceremony Ro Season", "service": "box", "task_horizon": 8, "operation_type": "R+search+U+C+D", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "high", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"compositeEval\",\"tools_required\":[\"GET /hubs\",\"GET /search\",\"PUT /files/{id}\",\"POST /comments\",\"GET /search\",\"POST /comments\",\"DELETE /files/{id}\",\"POST /hubs/{id}/manage_items\"]}"} +{"question": "In the macroeconomics area, there is a dataset folder that contains dozens of 2018 Census CSV files (national highlights / totals by 
topic). Find that dataset folder and rename it to 'Census_2018_Data'.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"box_folders\",\"where\":{\"id\":{\"eq\":\"9782984299\"}},\"expected_changes\":{\"name\":{\"to\":{\"eq\":\"Census_2018_Data\"}}}}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_134", "test_name": "Level 3: Search and Rename Folder", "service": "box", "task_horizon": 3, "operation_type": "search+R+U", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"actionEval\",\"tools_required\":[\"GET /search\",\"GET /folders/{id}/items\",\"PUT /folders/{id}\"]}"} +{"question": "Upload a small text file named 'tmp_delete_me.txt' to the root folder with content 'delete-me'. 
Then delete the file.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"box_files\",\"where\":{\"name\":{\"eq\":\"tmp_delete_me.txt\"},\"item_status\":{\"eq\":\"trashed\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_147", "test_name": "Level 1: Upload and Delete File (Trash)", "service": "box", "task_horizon": 2, "operation_type": "C+D", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"actionEval\",\"tools_required\":[\"POST /files/content\",\"DELETE /files/{id}\"]}"} +{"question": "You are reorganizing the institute's demographic data assets. The goal is to consolidate disparate 2018 Census files and April 2025 transport data into a unified structure. First, create a new Hub called \"Demographics 2025\". This will be the central access point. In the macroeconomics area, there is a folder containing 2018 Census CSV files (look for a folder with many CSVs). Rename this folder to \"Census_2018_Master\". Inside \"Census_2018_Master\", create a subfolder called \"National_Highlights\". Now, search for and identify the \"transport-april-2025-csv.csv\" file. Download/read it to extract the first row's Series_reference. Task 1 (Left Branch): Move the transport file into \"Census_2018_Master\". Add a comment to it: \"Transport series [Series_reference] included for cross-reference.\" Task 2 (Right Branch): Find any file in the census folder that contains \"population\" in its name. Move it into the \"National_Highlights\" subfolder you created. 
Finally, create a new text file named \"hub_manifest.txt\" inside \"Census_2018_Master\" with the content: \"Consolidated: Census 2018 + Transport 2025.\" Update the \"Demographics 2025\" hub description to: \"Unified demographic and transport datasets.\" ", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"box_hubs\",\"where\":{\"title\":{\"contains\":\"Demographics 2025\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"box_folders\",\"where\":{\"id\":{\"eq\":\"9782984299\"}},\"expected_changes\":{\"name\":{\"to\":{\"eq\":\"Census_2018_Master\"}}}},{\"diff_type\":\"added\",\"entity\":\"box_folders\",\"where\":{\"name\":{\"eq\":\"National_Highlights\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"box_comments\",\"where\":{\"item_id\":{\"eq\":\"1421498350\"},\"message\":{\"contains\":\"Transport series\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"box_files\",\"where\":{\"name\":{\"eq\":\"hub_manifest.txt\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_162", "test_name": "Level 4: Demographics 2025 Reorganization", "service": "box", "task_horizon": 12, "operation_type": "C+search+R+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"compositeEval\",\"tools_required\":[\"POST /hubs\",\"GET /search\",\"PUT /folders/{id}\",\"POST /folders\",\"GET /search\",\"GET /files/{id}/content\",\"PUT /files/{id}\",\"POST /comments\",\"GET /search\",\"PUT /files/{id}\",\"POST /files/content\",\"PUT /hubs/{id}\"]}"} +{"question": "Download the '2001 crisis notes.txt' file, append the line 'UPDATED: 
Version 2' to its content, and upload it as a new version of the same file.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"box_files\",\"where\":{\"id\":{\"eq\":\"5696874158\"}},\"expected_changes\":{\"version_number\":{\"to\":{\"ne\":\"1\"}}},\"ignore\":[\"sha_1\",\"size\",\"file_version_id\"]}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_148", "test_name": "Level 2: Upload New Version", "service": "box", "task_horizon": 3, "operation_type": "search+R+U", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"compositeEval\",\"tools_required\":[\"GET /search\",\"GET /files/{id}/content\",\"POST /files/{id}/content\"]}"} +{"question": "For all FOMC minutes PDFs in macroeconomics, set their description to include the date from their filename (e.g., 'FOMC minutes from January 2025').", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"box_files\",\"where\":{\"name\":{\"contains\":\"fomc\"}},\"expected_changes\":{\"description\":{\"to\":{\"contains\":\"FOMC\"}}}}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_152", "test_name": "Level 2: Description From Name", "service": "box", "task_horizon": 5, "operation_type": "search+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": 
"{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"actionEval\",\"tools_required\":[\"GET /search\",\"PUT /files/{id}\",\"PUT /files/{id}\",\"PUT /files/{id}\",\"PUT /files/{id}\"]}"} +{"question": "Create a new hub called 'Model Evaluations'. Find all the JSON files in the agent-diff-research folder that contain model evaluation results and add them to this new hub.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"box_hubs\",\"where\":{\"title\":{\"contains\":\"Model Evaluations\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"box_hub_items\",\"where\":{\"item_type\":{\"eq\":\"file\"}},\"expected_count\":{\"min\":8}}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_141", "test_name": "Level 3: Organize Research Hub", "service": "box", "task_horizon": 4, "operation_type": "C+search+R", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"compositeEval\",\"tools_required\":[\"POST /hubs\",\"GET /search\",\"GET /folders/{id}/items\",\"POST /hubs/{id}/manage_items\"]}"} +{"question": "Find the transport dataset CSV from April 2025 (in the investments/macroeconomics area). 
Download/read it, count the total number of lines (INCLUDING the header), and add a comment to the file exactly in the format: `Line count: 44761`.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"box_comments\",\"where\":{\"item_id\":{\"eq\":\"1421498350\"},\"message\":{\"eq\":\"Line count: 44761\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_136", "test_name": "Level 4: Read CSV and Comment", "service": "box", "task_horizon": 3, "operation_type": "search+R+C", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"compositeEval\",\"tools_required\":[\"GET /search\",\"GET /files/{id}/content\",\"POST /comments\"]}"} +{"question": "Create a folder named 'Backup' in the root directory, and another folder named 'Backup' inside the 'investments' folder. 
Then, rename the 'Backup' folder that is inside 'investments' to 'Backup_in_investments'.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"box_folders\",\"where\":{\"name\":{\"eq\":\"Backup\"},\"parent_id\":{\"eq\":\"0\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"box_folders\",\"where\":{\"name\":{\"eq\":\"Backup_in_investments\"},\"parent_id\":{\"eq\":\"5610825569\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_138", "test_name": "Level 3: Ambiguous Folder Selection (No Delete)", "service": "box", "task_horizon": 4, "operation_type": "C+search+U", "entity_scope": "multi", "information_availability": "explicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"actionEval\",\"tools_required\":[\"POST /folders\",\"GET /search\",\"POST /folders\",\"PUT /folders/{id}\"]}"} +{"question": "You are auditing the \"investments\" folder for the upcoming financial review. Locate the Google earnings report PDF (goog-10-q-q2-2025.pdf). Get its file details to check its size. Condition 1: If the file size is greater than 1MB (1,048,576 bytes), add the tag large_audit. Condition 2: If the file size is less than or equal to 1MB, add the tag standard_audit. Next, create a new Hub called \"Q2 Financial Review\". Search for all files with \"fomc\" in their name. For each file found, add it to the \"Q2 Financial Review\" hub. Find the \"Analysis_2026\" folder (if it exists, otherwise create it). Inside, upload a new text file named audit_summary.txt. The content should be: \"Audit complete. 
Google report size: [SIZE_IN_BYTES] bytes.\" Finally, add a comment to the Google earnings report: \"Audit status: Tagged based on size ([SIZE_IN_BYTES]b).\" ", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"box_files\",\"where\":{\"id\":{\"eq\":\"2748861636\"}},\"expected_changes\":{\"tags\":{\"to\":{\"ne\":null}}}},{\"diff_type\":\"added\",\"entity\":\"box_hubs\",\"where\":{\"title\":{\"contains\":\"Q2 Financial Review\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"box_files\",\"where\":{\"name\":{\"eq\":\"audit_summary.txt\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"box_comments\",\"where\":{\"item_id\":{\"eq\":\"2748861636\"},\"message\":{\"contains\":\"Audit status\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_163", "test_name": "Level 3: Google Earnings Size Audit", "service": "box", "task_horizon": 9, "operation_type": "search+R+C+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"compositeEval\",\"tools_required\":[\"GET /search\",\"GET /files/{id}\",\"PUT /files/{id}\",\"POST /hubs\",\"GET /search\",\"POST /hubs/{id}/manage_items\",\"POST /folders\",\"POST /files/content\",\"POST /comments\"]}"} +{"question": "Your history research archive in Box is disorganized and needs cleanup. You have redundant folders, misfiled documents, and obsolete tasks cluttering the system. In the history area, there are two folders that seem to contain overlapping Buenos Aires research: one called \"BA\" and one called \"Buenos Aires\". 
Consolidate them by moving the entire \"BA\" folder into \"Buenos Aires\" as a subfolder, then rename the \"BA\" folder to \"Legacy_Materials\" to indicate it contains older content. In the readings area, list the contents and look for organizational issues. The file \"digital history methods - week 3 reading.txt\" is sitting at the top level of the history folder but belongs in the \"digital humanities\" subfolder under readings. Move this file to its correct location. Create a new folder called \"Archive_Cleanup_2026\" in the root of the history folder to track this reorganization effort. Inside it, create a subfolder called \"Duplicates_Review\" where duplicate files can be moved for review. Look through the seed for files marked as duplicates (files with \"(1)\" in the name or \"backup\"/\"copy\" in the name). These files have obsolete tasks attached. Find and delete the tasks marked \"[OBSOLETE]\" or \"[OUTDATED]\" since the reorganization will handle these files differently. Check what hubs currently exist — you may want to add reorganized materials to an appropriate hub later.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"box_folders\",\"where\":{\"id\":{\"eq\":\"2228309175\"}},\"expected_changes\":{\"parent_id\":{\"to\":{\"eq\":\"1206853609\"}},\"name\":{\"to\":{\"eq\":\"Legacy_Materials\"}}}},{\"diff_type\":\"added\",\"entity\":\"box_folders\",\"where\":{\"name\":{\"eq\":\"Archive_Cleanup_2026\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"box_folders\",\"where\":{\"name\":{\"eq\":\"Duplicates_Review\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_160", "test_name": "Level 4: History Archive Reorganization", "service": "box", "task_horizon": 10, "operation_type": 
"search+R+C+U+D", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "high", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"compositeEval\",\"tools_required\":[\"GET /search\",\"GET /search\",\"PUT /folders/{id}\",\"GET /folders/{id}/items\",\"PUT /files/{id}\",\"POST /folders\",\"POST /folders\",\"GET /files/{id}/tasks\",\"DELETE /tasks/{id}\",\"GET /hubs\"]}"} +{"question": "Find out who I am logged in as, and create a folder named exactly equal to my display name in the root directory.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"box_folders\",\"where\":{\"name\":{\"eq\":\"Admin User\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_116", "test_name": "Level 1: Get Current User", "service": "box", "task_horizon": 2, "operation_type": "R+C", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"actionEval\",\"tools_required\":[\"GET /users/me\",\"POST /folders\"]}"} +{"question": "In the same macro-data folder, find the transport registrations CSV (it has columns like Series_reference, Period, Data_value). Download/read it and take the first data row values for Series_reference and Period. Upload a new small TXT file into the macro-data folder named `transport__.txt`, but replace '.' with '_' in both fields. 
The file content should include the extracted Series_reference, Period, and Data_value from that first row.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"box_files\",\"where\":{\"name\":{\"eq\":\"transport_TPTA_S22IA_1970_12.txt\"},\"parent_id\":{\"eq\":\"1973339758\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_135", "test_name": "Level 4: Upload TXT named from CSV (Transport)", "service": "box", "task_horizon": 4, "operation_type": "search+R+C", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"compositeEval\",\"tools_required\":[\"GET /search\",\"GET /folders/{id}/items\",\"GET /files/{id}/content\",\"POST /files/content\"]}"} +{"question": "Search for all plain-text files about Argentina's 2001 economic crisis. You should find two copies - one properly filed in the history folder and one misfiled in the root. Delete the misfiled copy, then read the correctly filed one. 
If it mentions 'Argentina', add the tag 'Latin_America' to it.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"box_files\",\"where\":{\"id\":{\"eq\":\"9979104500\"}},\"expected_changes\":{\"item_status\":{\"to\":{\"eq\":\"trashed\"}}}},{\"diff_type\":\"changed\",\"entity\":\"box_files\",\"where\":{\"id\":{\"eq\":\"5696874158\"}},\"expected_changes\":{\"tags\":{\"to\":{\"contains\":\"Latin_America\"}}}}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_132", "test_name": "Level 4: Find Duplicates, Delete Misfiled, Tag Correct", "service": "box", "task_horizon": 4, "operation_type": "search+D+R+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"compositeEval\",\"tools_required\":[\"GET /search\",\"DELETE /files/{id}\",\"GET /files/{id}/content\",\"PUT /files/{id}\"]}"} +{"question": "You're helping manage the documentation for a Moog Minimoog restoration project. The synth is from 1974 (serial 10847) and the team has been tracking repairs and calibrations in Box. First, search for files related to the Minimoog or Moog restoration. Get the details of the project folder to understand what's there. Check if any synth restoration documents are in your favorites collection. On the capacitor replacement log, add a new comment documenting: \"C47 replaced with Nichicon 47µF/25V - oscillator section complete.\" Then find the existing comment about \"C31 verified\" and update it to add: \"- measurement confirmed at 0.98x nominal.\" For the filter calibration procedure file, there are two pending tasks. 
Find the task about \"resonance calibration\" and mark it as complete. Find the task about \"cutoff tracking\" and update its message to: \"Cutoff tracking verified ±3 cents across 5 octaves - exceeds spec.\" Add the tag restoration-complete to the oscillator schematic notes file since that section is now finished. Finally, create a new hub called \"Synth Restoration Archive\" to centralize all vintage instrument documentation going forward.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"box_comments\",\"where\":{\"item_id\":{\"eq\":\"1062973727\"},\"message\":{\"contains\":\"C47\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"box_files\",\"where\":{\"id\":{\"eq\":\"2666248889\"}},\"expected_changes\":{\"tags\":{\"to\":{\"contains\":\"restoration-complete\"}}}},{\"diff_type\":\"added\",\"entity\":\"box_hubs\",\"where\":{\"title\":{\"contains\":\"Synth Restoration Archive\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_158", "test_name": "Level 4: Moog Minimoog Restoration", "service": "box", "task_horizon": 9, "operation_type": "search+R+C+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"compositeEval\",\"tools_required\":[\"GET /search\",\"GET /folders/{id}\",\"GET /collections\",\"POST /comments\",\"PUT /comments/{id}\",\"PUT /tasks/{id}\",\"PUT /tasks/{id}\",\"PUT /files/{id}\",\"POST /hubs\"]}"} +{"question": "Find all files with 'fomc' in their name. 
Create a new folder named 'FOMC_Reports' in the root directory, and move all found files into it.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"box_folders\",\"where\":{\"name\":{\"eq\":\"FOMC_Reports\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"box_files\",\"where\":{\"name\":{\"contains\":\"fomc\"}},\"expected_changes\":{\"parent_id\":{\"to\":{\"ne\":\"0\"}}}}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_129", "test_name": "Level 3: Find and Move All", "service": "box", "task_horizon": 6, "operation_type": "search+C+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"actionEval\",\"tools_required\":[\"GET /search\",\"POST /folders\",\"PUT /files/{id}\",\"PUT /files/{id}\",\"PUT /files/{id}\",\"PUT /files/{id}\"]}"} +{"question": "Find the plain-text study notes about Argentina's 2001 economic crisis (in the history area). Download/read the file and identify the protest slogan used during the December uprising. 
Post a comment on that file with the exact slogan text.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"box_comments\",\"where\":{\"item_id\":{\"eq\":\"5696874158\"},\"message\":{\"contains\":\"Que se vayan todos\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_154", "test_name": "Level 4: Read and Extract Slogan", "service": "box", "task_horizon": 3, "operation_type": "search+R+C", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"compositeEval\",\"tools_required\":[\"GET /search\",\"GET /files/{id}/content\",\"POST /comments\"]}"} +{"question": "You are preparing the final conservation audit for external review. First, confirm your identity — get your current user details. Locate the \"Annual Summary 2025\" file in the rare books folder. Create a shared link for this file with access set to \"open\" so external auditors can view it. Then, check your \"Favorites\" collection. If the Annual Summary is not already in your favorites, add it to the collection for quick access. 
Finally, verify the file's details to confirm the shared link is active and the file is listed in the collection.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"box_files\",\"where\":{\"id\":{\"eq\":\"1172138282\"}},\"expected_changes\":{\"shared_link\":{\"from\":{\"exists\":false},\"to\":{\"exists\":true}}},\"ignore\":[\"collections\"]}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_161", "test_name": "Level 2: Conservation Audit Shared Link", "service": "box", "task_horizon": 5, "operation_type": "R+search+U", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"compositeEval\",\"tools_required\":[\"GET /users/me\",\"GET /search\",\"PUT /files/{id}\",\"GET /collections\",\"PUT /files/{id}\"]}"} +{"question": "Search for files with 'fomc' in the name. 
Add a comment 'Relevant' to the first file found.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"box_comments\",\"where\":{\"message\":{\"eq\":\"Relevant\"},\"item_id\":{\"in\":[\"3379954793\",\"2667428831\",\"1246789615\",\"1439014490\"]}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_118", "test_name": "Level 1: Search for FOMC", "service": "box", "task_horizon": 2, "operation_type": "search+C", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"actionEval\",\"tools_required\":[\"GET /search\",\"POST /comments\"]}"} +{"question": "In the history/readings folder, reorganize all files by extension: create three folders 'PDFs', 'Word_Docs', and 'Markdown' directly in history/readings. Move ALL .pdf, .docx, and .md files from all subfolders into these new folders, flattening the structure. 
After moving the files, delete all the now-empty category subfolders.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"box_folders\",\"where\":{\"name\":{\"eq\":\"PDFs\"},\"parent_id\":{\"eq\":\"2113564020\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"box_folders\",\"where\":{\"name\":{\"eq\":\"Word_Docs\"},\"parent_id\":{\"eq\":\"2113564020\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"box_folders\",\"where\":{\"name\":{\"eq\":\"Markdown\"},\"parent_id\":{\"eq\":\"2113564020\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"box_folders\",\"where\":{\"id\":{\"in\":[\"7905906319\",\"3298967046\",\"1031140335\",\"2396378676\",\"1088403890\",\"7891120016\"]}},\"expected_changes\":{\"item_status\":{\"to\":{\"eq\":\"trashed\"}}},\"expected_count\":6}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_143", "test_name": "Level 3: Organize By Extension (Flatten)", "service": "box", "task_horizon": 11, "operation_type": "search+C+R+U+D", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"actionEval\",\"tools_required\":[\"GET /search\",\"POST /folders\",\"GET /folders/{id}/items\",\"PUT /files/{id}\",\"DELETE /folders/{id}\"]}"} +{"question": "List all comments on the Google 10-Q PDF in investments. 
Create a folder named 'File_Has_{N}_Comments' where {N} is the number of comments found.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"box_folders\",\"where\":{\"name\":{\"contains\":\"Comments\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_153", "test_name": "Level 1: List File Comments", "service": "box", "task_horizon": 3, "operation_type": "search+R+C", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"compositeEval\",\"tools_required\":[\"GET /search\",\"GET /files/{id}/comments\",\"POST /folders\"]}"} +{"question": "Move the file 'transport-april-2025-csv.csv' into the 'investments' folder.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"box_files\",\"where\":{\"id\":{\"eq\":\"1421498350\"}},\"expected_changes\":{\"parent_id\":{\"to\":{\"eq\":\"5610825569\"}}}}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_121", "test_name": "Level 1: Move File", "service": "box", "task_horizon": 3, "operation_type": "search+U", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"actionEval\",\"tools_required\":[\"GET /search\",\"GET /search\",\"PUT /files/{id}\"]}"} +{"question": "Search for all FOMC minutes PDFs in the investments area.
Create a hub called 'Fed Minutes Archive' and add all the FOMC documents to it.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"box_hubs\",\"where\":{\"title\":{\"contains\":\"Fed Minutes\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"box_hub_items\",\"where\":{\"item_name\":{\"contains\":\"fomc\"}},\"expected_count\":{\"min\":4}}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_150", "test_name": "Level 3: Curate FOMC Hub", "service": "box", "task_horizon": 3, "operation_type": "search+C", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"compositeEval\",\"tools_required\":[\"GET /search\",\"POST /hubs\",\"POST /hubs/{id}/manage_items\"]}"} +{"question": "Create a new Box Hub titled 'Research Center'.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"box_hubs\",\"where\":{\"title\":{\"eq\":\"Research Center\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_122", "test_name": "Level 1: Create Hub", "service": "box", "task_horizon": 1, "operation_type": "C", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"actionEval\",\"tools_required\":[\"POST /hubs\"]}"} +{"question": "Get details for the 'investments' folder and change 
its description to 'Audit Complete'.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"box_folders\",\"where\":{\"id\":{\"eq\":\"5610825569\"}},\"expected_changes\":{\"description\":{\"to\":{\"eq\":\"Audit Complete\"}}}}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_124", "test_name": "Level 1: Get Folder Info", "service": "box", "task_horizon": 2, "operation_type": "search+U", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"actionEval\",\"tools_required\":[\"GET /search\",\"PUT /folders/{id}\"]}"} +{"question": "Search for 'crisis' in my Box, read the text files found, and if any contains the year '2001' but is NOT already in the history folder, move it there.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"box_files\",\"where\":{\"id\":{\"eq\":\"9979104500\"}},\"expected_changes\":{\"parent_id\":{\"to\":{\"eq\":\"1660804823\"}}}}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_130", "test_name": "Level 4: Search Read Move", "service": "box", "task_horizon": 4, "operation_type": "search+R+U", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"compositeEval\",\"tools_required\":[\"GET /search\",\"GET /files/{id}/content\",\"GET /search\",\"PUT 
/files/{id}\"]}"} +{"question": "Create a Hub named 'Economic Data' and add the 'macroeconomics' folder to it.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"box_hubs\",\"where\":{\"title\":{\"eq\":\"Economic Data\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"box_hub_items\",\"where\":{\"item_id\":{\"eq\":\"1973339758\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_131", "test_name": "Level 2: Hub Setup", "service": "box", "task_horizon": 3, "operation_type": "C+search", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"actionEval\",\"tools_required\":[\"POST /hubs\",\"GET /search\",\"POST /hubs/{id}/manage_items\"]}"} +{"question": "In the investments area, locate the folder that contains macroeconomic CSV datasets. Find the CPI/price indexes CSV for December 2025, download/read it, and extract the first data row values for Series_reference and Series_title_1. Rename the macro-data folder to `macro_` + `{Series_reference}` + `_` + `{Series_title_1}`, but replace '.' with '_' in the Series_reference.
Then set the folder's description to: `series={Series_reference}; title={Series_title_1}`.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"box_folders\",\"where\":{\"id\":{\"eq\":\"1973339758\"}},\"expected_changes\":{\"name\":{\"to\":{\"eq\":\"macro_CPIM_SE901_Food\"}},\"description\":{\"to\":{\"contains\":\"CPIM.SE901\"}}}}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_133", "test_name": "Level 4: Rename Folder from CSV (CPI)", "service": "box", "task_horizon": 4, "operation_type": "search+R+U", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"compositeEval\",\"tools_required\":[\"GET /search\",\"GET /folders/{id}/items\",\"GET /files/{id}/content\",\"PUT /folders/{id}\"]}"} +{"question": "Create a new folder named 'Analysis_2026' inside the 'investments' folder.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"box_folders\",\"where\":{\"name\":{\"eq\":\"Analysis_2026\"},\"parent_id\":{\"eq\":\"5610825569\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_117", "test_name": "Level 1: Create Folder", "service": "box", "task_horizon": 2, "operation_type": "search+C", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"actionEval\",\"tools_required\":[\"GET
/search\",\"POST /folders\"]}"} +{"question": "Add the tags 'finance', 'investments', and 'quarterly' to the investments folder.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"box_folders\",\"where\":{\"id\":{\"eq\":\"5610825569\"}},\"expected_changes\":{\"tags\":{\"to\":{\"contains\":\"finance\"}}}}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_123", "test_name": "Level 1: Add Folder Tags", "service": "box", "task_horizon": 2, "operation_type": "search+U", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"actionEval\",\"tools_required\":[\"GET /search\",\"PUT /folders/{id}\"]}"} +{"question": "Your research institute's Box storage is disorganized. Somewhere in the archive, there are field research documents from cryptozoology expeditions — specifically sighting reports that may contain photographic evidence of unidentified creatures. Your task: Find a cryptozoology sighting report (search for relevant terms). Download and read its content. If the document mentions \"photographic evidence\" anywhere in the text, it should be tagged as verified; otherwise tag it unverified. Create a proper organizational structure: a main folder \"Expeditions_2025\" in the root, with a subfolder \"Cryptid_Sightings\" inside it. Move the sighting report into this subfolder with the appropriate tag. Add a comment to the file documenting your review: include today's date and the expedition name (which you'll find mentioned in the document's content). After moving the file, check its original location. 
If there are any obvious duplicate files (backup copies with similar names), delete them to clean up. Then rename the original source folder by appending \"_archived\" to its name. Finally, create a Hub called \"2025 Field Research Index\" and add the \"Expeditions_2025\" folder to it for easy access.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"box_folders\",\"where\":{\"name\":{\"eq\":\"Expeditions_2025\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"box_folders\",\"where\":{\"name\":{\"eq\":\"Cryptid_Sightings\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"box_files\",\"where\":{\"id\":{\"eq\":\"3302188295\"}},\"expected_changes\":{\"tags\":{\"to\":{\"ne\":null}}},\"ignore\":[\"parent_id\"]},{\"diff_type\":\"added\",\"entity\":\"box_comments\",\"where\":{\"item_id\":{\"eq\":\"3302188295\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"box_files\",\"where\":{\"id\":{\"eq\":\"1891733744\"}},\"expected_changes\":{\"item_status\":{\"to\":{\"eq\":\"trashed\"}}}},{\"diff_type\":\"changed\",\"entity\":\"box_folders\",\"where\":{\"id\":{\"eq\":\"4313494130\"}},\"expected_changes\":{\"name\":{\"to\":{\"contains\":\"archived\"}}}},{\"diff_type\":\"added\",\"entity\":\"box_hubs\",\"where\":{\"title\":{\"contains\":\"2025 Field Research Index\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"modified_at\",\"content_created_at\",\"content_modified_at\",\"purged_at\",\"trashed_at\",\"etag\",\"sequence_id\",\"sha1\",\"file_version\",\"path_collection\",\"created_by\",\"modified_by\",\"owned_by\"]}}", "test_id": "box_156", "test_name": "Level 4: Cryptozoology Expedition Organization", "service": "box", "task_horizon": 11, "operation_type": "search+R+C+U+D", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "high", "info": 
"{\"service\":\"box\",\"seed_template\":\"box_default\",\"impersonate_user_id\":\"27512847635\",\"eval_type\":\"compositeEval\",\"tools_required\":[\"GET /search\",\"GET /files/{id}/content\",\"PUT /files/{id}\",\"POST /folders\",\"POST /folders\",\"PUT /files/{id}\",\"POST /comments\",\"DELETE /files/{id}\",\"PUT /folders/{id}\",\"POST /hubs\",\"POST /hubs/{id}/manage_items\"]}"} +{"question": "Please list events on my Driftglass Studio calendar so you can find the event ID for Tide-Polish Lesson, then move that event to the Mariner Annex calendar.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_tide_polish_lesson\"},\"calendar_id\":{\"eq\":\"cal_driftglass_studio\"}},\"expected_changes\":{\"calendar_id\":{\"to\":{\"eq\":\"cal_mariner_annex\"}}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_194", "test_name": "Driftglass Studio - move event to annex", "service": "calendar", "task_horizon": 2, "operation_type": "search+R+U", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"events.list\",\"events.move\"]}"} +{"question": "Create a new calendar called Latticewren Survey Log. Then fetch its calendar-list entry and fully replace that entry so the calendar is hidden and not selected in my list. 
I want it out of sight until Salma (salma@test.com) asks for it.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"calendars\",\"where\":{\"summary\":{\"contains\":\"Latticewren Survey Log\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_list_entries\",\"where\":{\"user_id\":{\"eq\":\"user_agent\"},\"primary\":{\"eq\":false},\"hidden\":{\"eq\":true},\"selected\":{\"eq\":false}},\"expected_changes\":{\"hidden\":{\"to\":true},\"selected\":{\"to\":false}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_196", "test_name": "Latticewren Survey - create calendar and hide entry", "service": "calendar", "task_horizon": 3, "operation_type": "search+C+R+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"calendars.insert\",\"calendarList.get\",\"calendarList.update\"]}"} +{"question": "On the Silverroot Observatory calendar (ID cal_silverroot_observatory), first check the ACL rule user:zahra@test.com. Then update the calendar description to 'Lens rotation and night ledger.' Next, list instances of the recurring event evt_silverroot_rotation, and then fully replace that event so it runs weekly on Mondays at 7:00pm, starting July 2, 2018, for 1 hour at West Dome. 
After that, enable both an events watch and an ACL watch on this calendar.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"calendars\",\"where\":{\"id\":{\"eq\":\"cal_silverroot_observatory\"}},\"expected_changes\":{\"description\":{\"to\":{\"contains\":\"Lens rotation and night ledger\"}}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_silverroot_rotation\"},\"calendar_id\":{\"eq\":\"cal_silverroot_observatory\"}},\"expected_changes\":{\"recurrence\":{\"to\":{\"contains\":\"RRULE:FREQ=WEEKLY;BYDAY=MO\"}},\"start\":{\"to\":{\"contains\":\"2018-07-02T19:00\"}},\"end\":{\"to\":{\"contains\":\"2018-07-02T20:00\"}},\"location\":{\"to\":{\"contains\":\"West Dome\"}}},\"expected_count\":1,\"ignore\":[\"attendees\",\"color_id\",\"description\",\"reminders\",\"status\",\"summary\",\"transparency\",\"visibility\"]},{\"diff_type\":\"added\",\"entity\":\"calendar_channels\",\"where\":{\"resource_uri\":{\"eq\":\"/calendars/cal_silverroot_observatory/events\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_channels\",\"where\":{\"resource_uri\":{\"eq\":\"/calendars/cal_silverroot_observatory/acl\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_208", "test_name": "Silverroot Observatory - replace recurring event and watch", "service": "calendar", "task_horizon": 6, "operation_type": "search+C+R+U", "entity_scope": "multi", "information_availability": "explicit", "prompt_ambiguity": "medium", "info": 
"{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"acl.get\",\"calendars.patch\",\"events.instances\",\"events.update\",\"events.watch\",\"acl.watch\"]}"} +{"question": "On the Brineglass Works calendar (ID cal_brineglass_works), first fetch the event evt_brineglass_forge_demo, then move it to the Harbor Kiln Hall calendar (ID cal_harbor_kiln_hall). Next, use free/busy to find the earliest 30-minute overlap for Lucia (lucia@test.com) and Noah (noah@test.com) on June 30, 2018, and create a new event Saltglass Alignment on Brineglass Works at that time. Then fully replace Lucia’s ACL rule (user:lucia@test.com) on Brineglass Works to writer. Finally, set up a settings watch for my account.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_brineglass_forge_demo\"},\"calendar_id\":{\"eq\":\"cal_brineglass_works\"}},\"expected_changes\":{\"calendar_id\":{\"to\":{\"eq\":\"cal_harbor_kiln_hall\"}}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"calendar_id\":{\"eq\":\"cal_brineglass_works\"},\"summary\":{\"contains\":\"Saltglass 
Alignment\"},\"start.dateTime\":{\"contains\":\"2018-06-30T22:00\"},\"end.dateTime\":{\"contains\":\"2018-06-30T22:30\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_acl_rules\",\"where\":{\"calendar_id\":{\"eq\":\"cal_brineglass_works\"},\"scope_value\":{\"eq\":\"lucia@test.com\"}},\"expected_changes\":{\"role\":{\"to\":{\"eq\":\"writer\"}}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_channels\",\"where\":{\"resource_uri\":{\"eq\":\"/users/user_agent/settings\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_209", "test_name": "Brineglass Works - move, create, ACL update, settings watch", "service": "calendar", "task_horizon": 6, "operation_type": "search+C+R+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"events.get\",\"events.move\",\"freeBusy.query\",\"events.insert\",\"acl.update\",\"settings.watch\"]}"} +{"question": "Please clear all events from the Mossquill Archive calendar (ID cal_mossquill_archive). Then patch that calendar's description to 'Restoration ledger and vault access.' Update Salma's ACL on the Mossquill Archive calendar (rule user:salma@test.com) to reader using a full replacement. Before changing the inspection slot, check my timezone setting. Then fully replace the event evt_mossquill_vault_check so it's on June 29, 2018 from 4:00pm-5:00pm at Lower Vault Door. 
Finally, fetch my dateFieldOrder setting.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"calendar_id\":{\"eq\":\"cal_mossquill_archive\"}},\"expected_changes\":{\"status\":{\"to\":{\"eq\":\"cancelled\"}}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendars\",\"where\":{\"id\":{\"eq\":\"cal_mossquill_archive\"}},\"expected_changes\":{\"description\":{\"to\":{\"contains\":\"Restoration ledger and vault access\"}}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_acl_rules\",\"where\":{\"calendar_id\":{\"eq\":\"cal_mossquill_archive\"},\"scope_value\":{\"eq\":\"salma@test.com\"}},\"expected_changes\":{\"role\":{\"to\":{\"eq\":\"reader\"}}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_mossquill_vault_check\"},\"calendar_id\":{\"eq\":\"cal_mossquill_archive\"}},\"expected_changes\":{\"start\":{\"to\":{\"contains\":\"2018-06-29T16:00\"}},\"end\":{\"to\":{\"contains\":\"2018-06-29T17:00\"}},\"location\":{\"to\":{\"contains\":\"Lower Vault Door\"}}},\"expected_count\":1,\"ignore\":[\"attendees\",\"color_id\",\"description\",\"recurrence\",\"reminders\",\"status\",\"summary\",\"transparency\",\"visibility\"]}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_202", "test_name": "Mossquill Archive - clear, patch, replace", "service": "calendar", "task_horizon": 5, "operation_type": "search+R+U", "entity_scope": "multi", "information_availability": "explicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"calendars.clear\",\"calendars.patch\",\"acl.update\",\"settings.get\",\"events.update\"]}"} +{"question": "Please list my calendar settings so I can 
confirm my timezone and date/time formats before I reply to Sana (sana@test.com). Also, set up a watch on my settings so I get notified of any changes.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"calendar_channels\",\"where\":{\"resource_uri\":{\"eq\":\"/users/user_agent/settings\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_189", "test_name": "Mistforge Observatory - settings check and watch", "service": "calendar", "task_horizon": 2, "operation_type": "search+C+R+U", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"settings.list\",\"settings.watch\"]}"} +{"question": "On the Ironlace Conservatory calendar (ID cal_ironlace_conservatory), list instances of evt_ironlace_orchid first. Then clear all events from that calendar. After the reset, fully replace evt_ironlace_orchid so it's on July 3, 2018 from 10:00am-11:00am at Glassbed Hall. Patch Mina's ACL rule (user:mina@test.com) to reader. Then stop the old channel with id chan_ironlace_12 and resourceId res_ironlace_12. 
Finally, delete the Old Driftgreen calendar (ID cal_old_driftgreen).", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_ironlace_cleanup\"},\"calendar_id\":{\"eq\":\"cal_ironlace_conservatory\"}},\"expected_changes\":{\"status\":{\"to\":{\"eq\":\"cancelled\"}}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_ironlace_orchid\"},\"calendar_id\":{\"eq\":\"cal_ironlace_conservatory\"}},\"expected_changes\":{\"start\":{\"to\":{\"contains\":\"2018-07-03T10:00\"}},\"end\":{\"to\":{\"contains\":\"2018-07-03T11:00\"}},\"location\":{\"to\":{\"contains\":\"Glassbed Hall\"}}},\"expected_count\":1,\"ignore\":[\"attendees\",\"color_id\",\"description\",\"recurrence\",\"reminders\",\"status\",\"summary\",\"transparency\",\"visibility\"]},{\"diff_type\":\"changed\",\"entity\":\"calendar_acl_rules\",\"where\":{\"calendar_id\":{\"eq\":\"cal_ironlace_conservatory\"},\"scope_value\":{\"eq\":\"mina@test.com\"}},\"expected_changes\":{\"role\":{\"to\":{\"eq\":\"reader\"}}},\"expected_count\":1},{\"diff_type\":\"removed\",\"entity\":\"calendar_channels\",\"where\":{\"id\":{\"eq\":\"chan_ironlace_12\"},\"resource_id\":{\"eq\":\"res_ironlace_12\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendars\",\"where\":{\"id\":{\"eq\":\"cal_old_driftgreen\"}},\"expected_changes\":{\"deleted\":{\"to\":true}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_218", "test_name": "Ironlace Conservatory - clear, replace, instances, ACL patch, delete calendar", "service": "calendar", "task_horizon": 6, "operation_type": "search+U+D", "entity_scope": "multi", "information_availability": "explicit", "prompt_ambiguity": "medium", "info": 
"{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"events.instances\",\"calendars.clear\",\"events.update\",\"acl.patch\",\"channels.stop\",\"calendars.delete\"]}"} +{"question": "Create a new calendar called Wavelock Guest Sweep. Fully update that calendar to set timezone Europe/Berlin and description \"Guest sweep log and embargo notes.\" Then list events on my primary calendar and identify all events where Aiko, Farid, Lucia, or Oksana appear in the attendee list. Delete every such event. Run a free/busy query for those four across Aug 1–7, 2018, then another for Aug 8–14, 2018. Schedule a weekly 30-minute event on my primary calendar at the earliest time that doesn't conflict with any of those four attendees. Unsubscribe me from the legacy calendar cal_wavelock_legacy. Finally, start a watch on my calendar list and a watch on my settings.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"calendars\",\"where\":{\"summary\":{\"contains\":\"Wavelock Guest Sweep\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendars\",\"where\":{\"summary\":{\"contains\":\"Wavelock Guest Sweep\"}},\"expected_changes\":{\"time_zone\":{\"to\":{\"eq\":\"Europe/Berlin\"}},\"description\":{\"to\":{\"contains\":\"Guest sweep log and embargo 
notes\"}}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_primary_guest_aiko_farid\"},\"calendar_id\":{\"eq\":\"test.user@test.com\"}},\"expected_changes\":{\"status\":{\"to\":{\"eq\":\"cancelled\"}}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_primary_guest_lucia_oksana\"},\"calendar_id\":{\"eq\":\"test.user@test.com\"}},\"expected_changes\":{\"status\":{\"to\":{\"eq\":\"cancelled\"}}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"calendar_id\":{\"eq\":\"test.user@test.com\"},\"start.dateTime\":{\"contains\":\"2018-08-01T09:00\"},\"end.dateTime\":{\"contains\":\"2018-08-01T09:30\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"},\"recurrence\":{\"contains\":\"RRULE:FREQ=WEEKLY\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_list_entries\",\"where\":{\"calendar_id\":{\"eq\":\"cal_wavelock_legacy\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_changes\":{\"deleted\":{\"to\":true}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_channels\",\"where\":{\"resource_uri\":{\"eq\":\"/users/me/calendarList\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_channels\",\"where\":{\"resource_uri\":{\"eq\":\"/users/user_agent/settings\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_223", "test_name": "Wavelock Guest Sweep - create, purge attendees, freebusy, watch", "service": "calendar", "task_horizon": 10, "operation_type": "search+C+R+U+D", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": 
"{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"calendars.insert\",\"calendars.update\",\"events.list\",\"events.delete\",\"freeBusy.query\",\"events.insert\",\"calendarList.delete\",\"calendarList.watch\",\"settings.watch\"]}"} +{"question": "Subscribe me to the external calendar cal_emberveil_rookery. Then start an events watch on that calendar. Next, check Hana’s calendar and only set up a settings watch for my account if Hana has an event on June 30, 2018 at 9:00-9:30am Asia/Tokyo time. Then, only remove Salma’s access from Emberveil Rookery (rule cal_emberveil_rookery:user:salma@test.com) if that calendar has more than 7 events between June 20-27, 2018. Finally, delete the obsolete Ashfeather Annex calendar (ID cal_ashfeather_annex).", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"calendar_list_entries\",\"where\":{\"calendar_id\":{\"eq\":\"cal_emberveil_rookery\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_channels\",\"where\":{\"resource_uri\":{\"eq\":\"/calendars/cal_emberveil_rookery/events\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_acl_rules\",\"where\":{\"calendar_id\":{\"eq\":\"cal_emberveil_rookery\"},\"scope_value\":{\"eq\":\"salma@test.com\"},\"deleted\":{\"eq\":true}},\"expected_changes\":{\"deleted\":{\"to\":true}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_channels\",\"where\":{\"resource_uri\":{\"eq\":\"/users/user_agent/settings\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_count\":0},{\"diff_type\":\"changed\",\"entity\":\"calendars\",\"where\":{\"id\":{\"eq\":\"cal_ashfeather_annex\"}},\"expected_changes\":{\"deleted\":{\"to\":true}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequen
ce\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_212", "test_name": "Emberveil Rookery - subscribe, watch, revoke, delete", "service": "calendar", "task_horizon": 6, "operation_type": "search+C+R+U+D", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"calendarList.insert\",\"events.watch\",\"events.list\",\"settings.watch\",\"acl.delete\",\"calendars.delete\"]}"} +{"question": "Please help me plan leave using the Seabriar Leave Ledger calendar (ID cal_seabriar_leave). First list my calendar list to confirm the ledger is there, and fetch that calendar's metadata. Then count how many vacation days I've already used by listing instances of evt_used_vacation_day from Jan 1 to Aug 9, 2018. Use that count to book my vacation starting Aug 10, 2018 for the remaining days (assume the annual allowance is 20 days), and make sure you count only business days (no weekends). Before you lock it in, check instances of evt_company_blackout and evt_weekend_silence so you don't place anything on weekends. Update the ledger description to \"Leave ledger and tally-based booking,\" share the ledger with Aiko (aiko@test.com) as a reader, and set its calendar list entry to visible with color ID 10. Create the vacation block event on the ledger, and quick-add a reminder: \"Send pigeon letter Sep 10, 2018 9am.\" Also stop the old channel chan_leave_09 / res_leave_09.\n\nNext, cancel all events on my primary calendar that overlap the vacation window. Finally, I want to know if a pigeon letter sent on Sep 10 (it takes 6 days) would arrive before when the Clilffside Pact (evt_cliffside_pact) is scheduled. 
If it would not, move it to the earliest weekday after the arrival of the pigeon mail that doesn't overlap my vacation.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"calendars\",\"where\":{\"id\":{\"eq\":\"cal_seabriar_leave\"}},\"expected_changes\":{\"description\":{\"to\":{\"contains\":\"Leave ledger and tally-based booking\"}}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_list_entries\",\"where\":{\"calendar_id\":{\"eq\":\"cal_seabriar_leave\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_changes\":{\"hidden\":{\"to\":false},\"color_id\":{\"to\":{\"eq\":\"10\"}}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_acl_rules\",\"where\":{\"calendar_id\":{\"eq\":\"cal_seabriar_leave\"},\"scope_value\":{\"eq\":\"aiko@test.com\"},\"role\":{\"eq\":\"reader\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"calendar_id\":{\"eq\":\"cal_seabriar_leave\"},\"summary\":{\"contains\":\"Seabriar Leave Block\"},\"start\":{\"contains\":\"2018-08-10\"},\"end\":{\"contains\":\"2018-08-23\"},\"end.dateTime\":{\"contains\":\"2018-08-23\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"calendar_id\":{\"eq\":\"cal_seabriar_leave\"},\"summary\":{\"contains\":\"Send pigeon letter Sep 10, 2018 
9am\"}},\"expected_count\":1},{\"diff_type\":\"removed\",\"entity\":\"calendar_channels\",\"where\":{\"id\":{\"eq\":\"chan_leave_09\"},\"resource_id\":{\"eq\":\"res_leave_09\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_primary_leave_conflict_1\"},\"calendar_id\":{\"eq\":\"test.user@test.com\"}},\"expected_changes\":{\"status\":{\"to\":{\"eq\":\"cancelled\"}}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_primary_leave_conflict_2\"},\"calendar_id\":{\"eq\":\"test.user@test.com\"}},\"expected_changes\":{\"status\":{\"to\":{\"eq\":\"cancelled\"}}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_cliffside_pact\"},\"calendar_id\":{\"eq\":\"test.user@test.com\"}},\"expected_changes\":{\"start\":{\"to\":{\"contains\":\"2018-09-17T10:00\"}},\"end\":{\"to\":{\"contains\":\"2018-09-17T11:00\"}}},\"expected_count\":1,\"ignore\":[\"attendees\",\"color_id\",\"description\",\"location\",\"recurrence\",\"reminders\",\"status\",\"summary\",\"transparency\",\"visibility\"]}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_221", "test_name": "Seabriar Leave Ledger - count, book, cancel conflicts, reschedule", "service": "calendar", "task_horizon": 12, "operation_type": "search+C+R+U+D", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": 
"{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"calendarList.list\",\"calendars.get\",\"events.instances\",\"calendars.patch\",\"acl.insert\",\"calendarList.patch\",\"events.insert\",\"events.quickAdd\",\"channels.stop\",\"events.list\",\"events.delete\",\"events.patch\"]}"} +{"question": "On the Starfen Observatory calendar, please review who has access, then fully replace the Comet Scribe Session so it is on June 24, 2018 from 2:00pm-3:00pm at Dome 3 (treat this as a full replace, not a patch). Also, delete the Dust Ledger calendar entirely.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"event_comet_scribe_session\"},\"calendar_id\":{\"eq\":\"cal_starfen_observatory\"}},\"expected_changes\":{\"start\":{\"to\":{\"contains\":\"2018-06-24T14:00\"}},\"end\":{\"to\":{\"contains\":\"2018-06-24T15:00\"}},\"location\":{\"to\":{\"contains\":\"Dome 3\"}}},\"expected_count\":1,\"ignore\":[\"attendees\",\"color_id\",\"description\",\"recurrence\",\"reminders\",\"status\",\"summary\",\"transparency\",\"visibility\"]},{\"diff_type\":\"changed\",\"entity\":\"calendars\",\"where\":{\"id\":{\"eq\":\"cal_dust_ledger\"}},\"expected_changes\":{\"deleted\":{\"to\":true}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_183", "test_name": "Starfen Observatory - replace event and delete calendar", "service": "calendar", "task_horizon": 3, "operation_type": "search+R+U+D", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": 
"{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"acl.list\",\"events.update\",\"calendars.delete\"]}"} +{"question": "On the Ashline Relay Commons calendar (ID cal_ashline_relay_commons), check the ACL rule user:hana@test.com first. Then grant Chinedu (chinedu@test.com) writer access and Noah (noah@test.com) reader access. After that, use free/busy to find the earliest 60-minute overlap for Hana, Chinedu, and Noah between June 28-29, 2018, and schedule a new event on my primary calendar at that overlap called \"Ashline Relay Briefing\" (use my timezone). Finally, update the calendar list entry so this calendar is hidden and not selected.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"calendar_acl_rules\",\"where\":{\"calendar_id\":{\"eq\":\"cal_ashline_relay_commons\"},\"scope_value\":{\"eq\":\"chinedu@test.com\"},\"role\":{\"eq\":\"writer\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_acl_rules\",\"where\":{\"calendar_id\":{\"eq\":\"cal_ashline_relay_commons\"},\"scope_value\":{\"eq\":\"noah@test.com\"},\"role\":{\"eq\":\"reader\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_list_entries\",\"where\":{\"calendar_id\":{\"eq\":\"cal_ashline_relay_commons\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_changes\":{\"hidden\":{\"to\":true},\"selected\":{\"to\":false}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"calendar_id\":{\"eq\":\"test.user@test.com\"},\"summary\":{\"contains\":\"Ashline Relay Briefing\"},\"start.dateTime\":{\"contains\":\"2018-06-28T15:00\"},\"end.dateTime\":{\"contains\":\"2018-06-28T16:00\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", 
"test_id": "calendar_207", "test_name": "Ashline Relay Commons - ACLs, freebusy, hide", "service": "calendar", "task_horizon": 6, "operation_type": "search+C+R+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"acl.get\",\"acl.insert\",\"freeBusy.query\",\"events.insert\",\"calendarList.patch\"]}"} +{"question": "Add a one-off event on my primary calendar called Emberglass Kiln Glow on June 25, 2018 from 7:00pm-8:30pm.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"calendar_id\":{\"eq\":\"test.user@test.com\"},\"summary\":{\"contains\":\"Emberglass Kiln Glow\"},\"start.dateTime\":{\"contains\":\"2018-06-25T19:00\"},\"end.dateTime\":{\"contains\":\"2018-06-25T20:30\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_184", "test_name": "Emberglass Atelier - single event create", "service": "calendar", "task_horizon": 1, "operation_type": "C", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"events.insert\"]}"} +{"question": "Check that I'm subscribed to the Aurora Loom calendar (ID cal_aurora_loom). 
Then remove the entire recurring series Starlit Weave Circle (event ID evt_starlit_weave_series) from that calendar.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_starlit_weave_series\"},\"calendar_id\":{\"eq\":\"cal_aurora_loom\"}},\"expected_changes\":{\"status\":{\"to\":{\"eq\":\"cancelled\"}}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_192", "test_name": "Aurora Loom - delete recurring series", "service": "calendar", "task_horizon": 2, "operation_type": "search+R+U+D", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"calendarList.get\",\"events.delete\"]}"} +{"question": "On the Lumenfjord Scriptorium calendar, check the ACL rule user:scribe@lumenfjord.example and tell me what role it has. Then fully replace the Aurora Ink Drying event so it's on June 27, 2018 from 3:00pm-4:00pm at North Alcove. 
Afterward, list the events on that calendar so I can verify the update.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"event_aurora_ink_drying\"},\"calendar_id\":{\"eq\":\"cal_lumenfjord_scriptorium\"}},\"expected_changes\":{\"start\":{\"to\":{\"contains\":\"2018-06-27T15:00\"}},\"end\":{\"to\":{\"contains\":\"2018-06-27T16:00\"}},\"location\":{\"to\":{\"contains\":\"North Alcove\"}}},\"expected_count\":1,\"ignore\":[\"attendees\",\"color_id\",\"description\",\"recurrence\",\"reminders\",\"status\",\"summary\",\"transparency\",\"visibility\"]}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_186", "test_name": "Lumenfjord Scriptorium - replace event and list access", "service": "calendar", "task_horizon": 3, "operation_type": "search+R+U", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"acl.get\",\"events.update\",\"events.list\"]}"} +{"question": "I'm drowning in festival logistics for Mirage Menagerie 2026. Find that calendar first. We also need a private crew calendar called Backstage Sandstorm Ops and Piotr (piotr@test.com) must be able to edit it. On the main festival calendar, schedule our eight 15-minute micro-acts starting Saturday June 23, 2018 at 2:00pm, back-to-back every 15 minutes in this exact order: Glass-Dune Juggling, Whispering Wadi Puppets, Lantern Maze Overture, Sand-Script Calligraphy, Mothlight Drummers, Nomad Kite Ballet, Oasis Echo Choir, and Moon-Salt Acrobatics. Add a quick note-style event: 'Starlit Tea Ceremony with Akira tomorrow 3pm' (tomorrow is Monday June 18, 2018). 
I also need a Twilight Troupe Council for 1 hour on Saturday evening (June 23, 2018) when both Ananya (ananya@test.com) and Zainab (zainab@test.com) can attend--check their availability first. Then update all the micro-acts to be at Dune Pavilion B and include Ananya. Finally, remove the placeholders 'Placeholder: Dust Rehearsal' and 'Placeholder: Ghost Stage'. Please batch the repeated edits/inserts/deletes so we don't trip our API rate limits.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"calendars\",\"where\":{\"summary\":{\"contains\":\"Backstage Sandstorm Ops\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_acl_rules\",\"where\":{\"scope_value\":{\"eq\":\"piotr@test.com\"},\"role\":{\"eq\":\"writer\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"Glass-Dune Juggling\"},\"location\":{\"contains\":\"Dune Pavilion B\"},\"calendar_id\":{\"eq\":\"cal_mirage_menagerie\"},\"start.dateTime\":{\"contains\":\"2018-06-23T14:00\"},\"end.dateTime\":{\"contains\":\"2018-06-23T14:15\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"Whispering Wadi Puppets\"},\"location\":{\"contains\":\"Dune Pavilion B\"},\"calendar_id\":{\"eq\":\"cal_mirage_menagerie\"},\"start.dateTime\":{\"contains\":\"2018-06-23T14:15\"},\"end.dateTime\":{\"contains\":\"2018-06-23T14:30\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"Lantern Maze Overture\"},\"location\":{\"contains\":\"Dune Pavilion 
B\"},\"calendar_id\":{\"eq\":\"cal_mirage_menagerie\"},\"start.dateTime\":{\"contains\":\"2018-06-23T14:30\"},\"end.dateTime\":{\"contains\":\"2018-06-23T14:45\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"Sand-Script Calligraphy\"},\"location\":{\"contains\":\"Dune Pavilion B\"},\"calendar_id\":{\"eq\":\"cal_mirage_menagerie\"},\"start.dateTime\":{\"contains\":\"2018-06-23T14:45\"},\"end.dateTime\":{\"contains\":\"2018-06-23T15:00\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"Mothlight Drummers\"},\"location\":{\"contains\":\"Dune Pavilion B\"},\"calendar_id\":{\"eq\":\"cal_mirage_menagerie\"},\"start.dateTime\":{\"contains\":\"2018-06-23T15:00\"},\"end.dateTime\":{\"contains\":\"2018-06-23T15:15\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"Nomad Kite Ballet\"},\"location\":{\"contains\":\"Dune Pavilion B\"},\"calendar_id\":{\"eq\":\"cal_mirage_menagerie\"},\"start.dateTime\":{\"contains\":\"2018-06-23T15:15\"},\"end.dateTime\":{\"contains\":\"2018-06-23T15:30\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"Oasis Echo Choir\"},\"location\":{\"contains\":\"Dune Pavilion B\"},\"calendar_id\":{\"eq\":\"cal_mirage_menagerie\"},\"start.dateTime\":{\"contains\":\"2018-06-23T15:30\"},\"end.dateTime\":{\"contains\":\"2018-06-23T15:45\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"Moon-Salt Acrobatics\"},\"location\":{\"contains\":\"Dune Pavilion 
B\"},\"calendar_id\":{\"eq\":\"cal_mirage_menagerie\"},\"start.dateTime\":{\"contains\":\"2018-06-23T15:45\"},\"end.dateTime\":{\"contains\":\"2018-06-23T16:00\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_event_attendees\",\"where\":{\"email\":{\"eq\":\"ananya@test.com\"}},\"expected_count\":8},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"Starlit Tea Ceremony with Akira\"},\"calendar_id\":{\"eq\":\"cal_mirage_menagerie\"},\"start.dateTime\":{\"contains\":\"2018-06-18T15:00\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"Twilight Troupe Council\"},\"calendar_id\":{\"eq\":\"cal_mirage_menagerie\"},\"start.dateTime\":{\"contains\":\"2018-06-23T19:00\"},\"end.dateTime\":{\"contains\":\"2018-06-23T20:00\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"event_placeholder_dust\"}},\"expected_changes\":{\"status\":{\"to\":{\"eq\":\"cancelled\"}}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"event_placeholder_ghost\"}},\"expected_changes\":{\"status\":{\"to\":{\"eq\":\"cancelled\"}}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_172", "test_name": "Mirage Menagerie Caravan Festival - Batching required", "service": "calendar", "task_horizon": 24, "operation_type": "search+C+R+U+D", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": 
"{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"calendarList.list\",\"calendars.insert\",\"acl.insert\",\"events.insert\",\"freeBusy.query\",\"events.patch\",\"events.delete\"]}"} +{"question": "Please quick-add this to my primary calendar: 'Fogloom Archive Lantern Check on June 26, 2018 at 8:00pm for 45 minutes.' After it's created, fetch that event by ID so we can verify the parsed details. Also, set up a watch for changes to my calendar settings.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"calendar_id\":{\"eq\":\"test.user@test.com\"},\"summary\":{\"contains\":\"Fogloom Archive Lantern Check\"},\"start.dateTime\":{\"contains\":\"2018-06-26T20:00\"},\"end.dateTime\":{\"contains\":\"2018-06-26T20:45\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_channels\",\"where\":{\"resource_uri\":{\"eq\":\"/users/user_agent/settings\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_185", "test_name": "Fogloom Archive - quickAdd and settings watch", "service": "calendar", "task_horizon": 4, "operation_type": "search+C+R+U", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"events.quickAdd\",\"events.get\",\"settings.watch\"]}"} +{"question": "Please update Iryna's access on the Driftweave Annex calendar (ID cal_driftweave_annex) to writer -- her ACL rule is user:iryna@test.com. Then set up an events watch on that calendar. 
Also, unsubscribe me from the Old Aster Lodge calendar (ID cal_old_aster_lodge). Finally, stop the old events watch channel with id chan_annex_77 and resourceId res_annex_77.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"calendar_acl_rules\",\"where\":{\"calendar_id\":{\"eq\":\"cal_driftweave_annex\"},\"scope_value\":{\"eq\":\"iryna@test.com\"}},\"expected_changes\":{\"role\":{\"to\":{\"eq\":\"writer\"}}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_channels\",\"where\":{\"resource_uri\":{\"eq\":\"/calendars/cal_driftweave_annex/events\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_count\":1},{\"diff_type\":\"removed\",\"entity\":\"calendar_list_entries\",\"where\":{\"calendar_id\":{\"eq\":\"cal_old_aster_lodge\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_count\":1},{\"diff_type\":\"removed\",\"entity\":\"calendar_channels\",\"where\":{\"id\":{\"eq\":\"chan_annex_77\"},\"resource_id\":{\"eq\":\"res_annex_77\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_201", "test_name": "Driftweave Annex - access, watch, unsubscribe", "service": "calendar", "task_horizon": 4, "operation_type": "C+R+U+D", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"acl.update\",\"events.watch\",\"calendarList.delete\",\"channels.stop\"]}"} +{"question": "We're reorganizing the mountain observatory. First, find Skyward Observatory Access and the legacy Dormant Telescopes calendar. Create two new calendars: Meteor Patrol Rotation and Aurora Research Slots. 
Subscribe me to the external calendar with ID cal_mountain_weather (Mountain Weather Alerts) so we can coordinate around storms. Set the Meteor Patrol Rotation description to 'Night patrol rotation schedule' with timezone America/Denver, and set the Aurora Research Slots description to 'Research telescope booking slots' with timezone America/Los_Angeles. Make sure both new calendars are visible in my list and color-coded: Meteor Patrol Rotation color ID 9 and Aurora Research Slots color ID 14. Access changes: Mei (mei@test.com) should be an owner on both new calendars, Tomasz (tomasz@test.com) should be a reader on both, and remove Leila (leila@test.com) from Dormant Telescopes. Please batch the calendar-list updates and ACL changes to stay under quota limits.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"calendars\",\"where\":{\"summary\":{\"contains\":\"Meteor Patrol Rotation\"},\"description\":{\"contains\":\"Night patrol rotation schedule\"},\"time_zone\":{\"eq\":\"America/Denver\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendars\",\"where\":{\"summary\":{\"contains\":\"Aurora Research Slots\"},\"description\":{\"contains\":\"Research telescope booking 
slots\"},\"time_zone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_list_entries\",\"where\":{\"calendar_id\":{\"eq\":\"cal_mountain_weather\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_list_entries\",\"where\":{\"user_id\":{\"eq\":\"user_agent\"},\"color_id\":{\"eq\":\"9\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_list_entries\",\"where\":{\"user_id\":{\"eq\":\"user_agent\"},\"color_id\":{\"eq\":\"14\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_acl_rules\",\"where\":{\"scope_value\":{\"eq\":\"mei@test.com\"},\"role\":{\"eq\":\"owner\"}},\"expected_count\":2},{\"diff_type\":\"added\",\"entity\":\"calendar_acl_rules\",\"where\":{\"scope_value\":{\"eq\":\"tomasz@test.com\"},\"role\":{\"eq\":\"reader\"}},\"expected_count\":2},{\"diff_type\":\"changed\",\"entity\":\"calendar_acl_rules\",\"where\":{\"calendar_id\":{\"eq\":\"cal_dormant_telescopes\"},\"scope_value\":{\"eq\":\"leila@test.com\"}},\"expected_changes\":{\"deleted\":{\"to\":true}}}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_175", "test_name": "Skyward Observatory Access Passes - Calendar governance and access", "service": "calendar", "task_horizon": 12, "operation_type": "search+C+R+U+D", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"calendarList.list\",\"calendars.insert\",\"calendarList.insert\",\"calendarList.patch\",\"acl.insert\",\"acl.delete\"]}"} +{"question": "Please move Icefern Map Workshop (event ID evt_icefern_maps_07) from the Icefern Annex calendar to the Boreal Classroom calendar 
(ID cal_boreal_classroom). Then start watching the Boreal Classroom calendar for event changes so I can notify Hana (hana@test.com) and Sven (sven@test.com) if anything shifts.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_icefern_maps_07\"},\"calendar_id\":{\"eq\":\"cal_icefern_annex\"}},\"expected_changes\":{\"calendar_id\":{\"to\":{\"eq\":\"cal_boreal_classroom\"}}},\"expected_count\":1,\"ignore\":[\"status\",\"sequence\",\"updated_at\"]},{\"diff_type\":\"added\",\"entity\":\"calendar_channels\",\"where\":{\"resource_uri\":{\"eq\":\"/calendars/cal_boreal_classroom/events\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_197", "test_name": "Icefern Annex - move event and watch destination", "service": "calendar", "task_horizon": 2, "operation_type": "C+U", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"events.move\",\"events.watch\"]}"} +{"question": "We're reorganizing the Emberline Embassy courier network. Find the Emberline Embassy Roster calendar and the legacy Old Courier Shifts entry. I need two new route calendars: Emberline Courier North Circuit and Emberline Courier South Circuit. Also subscribe me to the external calendar with ID cal_consular_blackout (Consular Blackout Windows) so we can avoid those times. For compliance, set up a watch on my settings and then confirm my current locale/timezone preferences. 
Update the North Circuit to use timezone Europe/Warsaw with description 'Northern route handoff schedule', and the South Circuit to use timezone Asia/Kolkata with description 'Southern route handoff schedule'. Make both route calendars visible in my list and set their colors: North Circuit color ID 6 and South Circuit color ID 12. Share both routes with Priya (priya@test.com) as a writer and Hassan (hassan@test.com) as a reader. Finally, remove Old Courier Shifts from my calendar list. Please batch the calendar-list updates and permission changes to reduce API calls.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"calendars\",\"where\":{\"summary\":{\"contains\":\"Emberline Courier North Circuit\"},\"time_zone\":{\"eq\":\"Europe/Warsaw\"},\"description\":{\"contains\":\"Northern route handoff schedule\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendars\",\"where\":{\"summary\":{\"contains\":\"Emberline Courier South Circuit\"},\"time_zone\":{\"eq\":\"Asia/Kolkata\"},\"description\":{\"contains\":\"Southern route handoff 
schedule\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_list_entries\",\"where\":{\"calendar_id\":{\"eq\":\"cal_consular_blackout\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_list_entries\",\"where\":{\"user_id\":{\"eq\":\"user_agent\"},\"color_id\":{\"eq\":\"6\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_list_entries\",\"where\":{\"user_id\":{\"eq\":\"user_agent\"},\"color_id\":{\"eq\":\"12\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_list_entries\",\"where\":{\"calendar_id\":{\"eq\":\"cal_old_courier_shifts\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_changes\":{\"deleted\":{\"to\":true}}},{\"diff_type\":\"added\",\"entity\":\"calendar_acl_rules\",\"where\":{\"scope_value\":{\"eq\":\"priya@test.com\"},\"role\":{\"eq\":\"writer\"}},\"expected_count\":2},{\"diff_type\":\"added\",\"entity\":\"calendar_acl_rules\",\"where\":{\"scope_value\":{\"eq\":\"hassan@test.com\"},\"role\":{\"eq\":\"reader\"}},\"expected_count\":2}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_174", "test_name": "Emberline Embassy Network - Calendar governance and access", "service": "calendar", "task_horizon": 14, "operation_type": "search+C+R+U+D", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"calendarList.list\",\"calendars.insert\",\"calendarList.insert\",\"settings.watch\",\"settings.get\",\"calendars.patch\",\"calendarList.patch\",\"acl.insert\",\"calendarList.delete\"]}"} +{"question": "On the Thistlewire Workshop calendar (ID cal_thistlewire_workshop), list instances of the recurring event 
evt_thistlewire_cycles first. Then add a one-off event Bronze Fret Alignment on June 30, 2018 from 10:00am-11:00am. After that, hide the Thistlewire Workshop calendar in my list and start a calendar list watch for my account. Finally, delete the obsolete Copperwind Annex calendar (ID cal_copperwind_annex).", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"calendar_id\":{\"eq\":\"cal_thistlewire_workshop\"},\"summary\":{\"contains\":\"Bronze Fret Alignment\"},\"start.dateTime\":{\"contains\":\"2018-06-30T10:00\"},\"end.dateTime\":{\"contains\":\"2018-06-30T11:00\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_list_entries\",\"where\":{\"calendar_id\":{\"eq\":\"cal_thistlewire_workshop\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_changes\":{\"hidden\":{\"to\":true}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_channels\",\"where\":{\"resource_uri\":{\"eq\":\"/users/me/calendarList\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendars\",\"where\":{\"id\":{\"eq\":\"cal_copperwind_annex\"}},\"expected_changes\":{\"deleted\":{\"to\":true}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_211", "test_name": "Thistlewire Workshop - instances review, create, hide, watch, delete", "service": "calendar", "task_horizon": 5, "operation_type": "search+C+R+U+D", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"events.instances\",\"events.insert\",\"calendarList.patch\",\"calendarList.watch\",\"calendars.delete\"]}"} 
+{"question": "For the Glassreef Codex calendar (ID cal_glassreef_codex), I need a tally log. First, check the calendar color palette and confirm my timezone setting. Then list July 1-31, 2018 events on Glassreef Codex and count how many include \"Tide-loom\" in the summary. Move the template event evt_kelp_murmur_template from cal_kelpshade_staging into Glassreef Codex and update it to Tide-loom Count Ledger at Pearlwork Desk, with a description that explicitly includes that count. Also fetch the ACL rule user:archivist@glassreef.example on Glassreef Codex, start an ACL watch for that calendar, and finally clear the old Barnacle Practice calendar (ID cal_barnacle_practice).", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_kelp_murmur_template\"},\"calendar_id\":{\"eq\":\"cal_glassreef_codex\"}},\"expected_changes\":{\"calendar_id\":{\"to\":{\"eq\":\"cal_glassreef_codex\"}},\"summary\":{\"to\":{\"contains\":\"Tide-loom Count Ledger\"}},\"location\":{\"to\":{\"contains\":\"Pearlwork Desk\"}},\"description\":{\"to\":{\"regex\":\"(?i)(tide-loom.*\\\\b10\\\\b|\\\\b10\\\\b.*tide-loom)\"}}},\"expected_count\":1,\"ignore\":[\"attendees\",\"color_id\",\"end\",\"recurrence\",\"reminders\",\"start\",\"status\",\"transparency\",\"visibility\"]},{\"diff_type\":\"added\",\"entity\":\"calendar_channels\",\"where\":{\"resource_uri\":{\"eq\":\"/calendars/cal_glassreef_codex/acl\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"calendar_id\":{\"eq\":\"cal_barnacle_practice\"}},\"expected_changes\":{\"status\":{\"to\":{\"eq\":\"cancelled\"}}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_220", "test_name": "Glassreef Codex - count, move, patch, watch, clear", "service": "calendar", 
"task_horizon": 8, "operation_type": "search+C+R+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"colors.get\",\"settings.get\",\"events.list\",\"events.move\",\"events.patch\",\"acl.get\",\"acl.watch\",\"calendars.clear\"]}"} +{"question": "Please import the following legacy entry into my primary calendar (not a manual create): 'Saffron Dusk Feather-Mending' on June 22, 2018 from 6:00pm-7:00pm, location 'Windglass Roost,' iCalUID saffron-dusk-20180622@aviary. Also, remove Salma's access to the Sandglass Aviary calendar - her ACL rule is user:salma@test.com.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"calendar_id\":{\"eq\":\"test.user@test.com\"},\"summary\":{\"contains\":\"Saffron Dusk Feather-Mending\"},\"start.dateTime\":{\"contains\":\"2018-06-22T18:00\"},\"end.dateTime\":{\"contains\":\"2018-06-22T19:00\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"},\"location\":{\"contains\":\"Windglass Roost\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_acl_rules\",\"where\":{\"calendar_id\":{\"eq\":\"cal_sandglass_aviary\"},\"scope_value\":{\"eq\":\"salma@test.com\"},\"deleted\":{\"eq\":true}},\"expected_changes\":{\"deleted\":{\"to\":true}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_181", "test_name": "Sandglass Aviary - import event and revoke access", "service": "calendar", "task_horizon": 2, "operation_type": "C+U+D", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "medium", "info": 
"{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"events.import\",\"acl.delete\"]}"} +{"question": "On the Skyloom Observatory calendar (ID cal_skyloom_observatory), list events first. Then fully replace evt_skyloom_alignment so it’s on July 2, 2018 from 8:00pm–9:00pm at Upper Ring. Also fully replace the ACL rule user:mechanic@skyloom.example to reader. After that, start an events watch on the Skyloom Observatory calendar and list events again to confirm the change.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_skyloom_alignment\"},\"calendar_id\":{\"eq\":\"cal_skyloom_observatory\"}},\"expected_changes\":{\"start\":{\"to\":{\"contains\":\"2018-07-02T20:00\"}},\"end\":{\"to\":{\"contains\":\"2018-07-02T21:00\"}},\"location\":{\"to\":{\"contains\":\"Upper Ring\"}}},\"expected_count\":1,\"ignore\":[\"attendees\",\"color_id\",\"description\",\"recurrence\",\"reminders\",\"status\",\"summary\",\"transparency\",\"visibility\"]},{\"diff_type\":\"changed\",\"entity\":\"calendar_acl_rules\",\"where\":{\"calendar_id\":{\"eq\":\"cal_skyloom_observatory\"},\"scope_value\":{\"eq\":\"mechanic@skyloom.example\"}},\"expected_changes\":{\"role\":{\"to\":{\"eq\":\"reader\"}}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_channels\",\"where\":{\"resource_uri\":{\"eq\":\"/calendars/cal_skyloom_observatory/events\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_217", "test_name": "Skyloom Observatory - event replace, ACL update, events watch", "service": "calendar", "task_horizon": 4, "operation_type": "search+C+R+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": 
"medium", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"events.list\",\"events.update\",\"acl.update\",\"events.watch\"]}"} +{"question": "We're hosting the Intergalactic Crypto-Zoology Summit and I need you to set up the schedule. Find the 'Crypto-Zoology Summit 2018' calendar. Schedule the opening keynote 'Keynote: The Sasquatch Migration Patterns' for 9am on Monday June 18, lasting 1 hour. I need to schedule the main debate panel, 'Panel: Nessie vs Ogopogo - A Comparative Analysis', but it depends on Zahra's (zahra@test.com) availability in the afternoon of June 18 - find when she's free and book a 2-hour panel at that time on the summit calendar. Mateusz (mateusz@test.com) just agreed to co-present the Sasquatch keynote, so please add him as an attendee to that event. I accidentally added a workshop called 'How to Fake Bigfoot Prints' to the summit calendar earlier - delete it immediately, we can't have that on the official record. 
Aarav (aarav@test.com) from the press office needs read access to the summit calendar.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"Sasquatch Migration\"},\"calendar_id\":{\"eq\":\"cal_cryptozoology_summit\"},\"start.dateTime\":{\"contains\":\"2018-06-18T09:00\"},\"end.dateTime\":{\"contains\":\"2018-06-18T10:00\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_event_attendees\",\"where\":{\"email\":{\"eq\":\"mateusz@test.com\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"Nessie\"},\"calendar_id\":{\"eq\":\"cal_cryptozoology_summit\"},\"start.dateTime\":{\"contains\":\"2018-06-18T14:00\"},\"end.dateTime\":{\"contains\":\"2018-06-18T16:00\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"event_bigfoot_workshop\"}},\"expected_changes\":{\"status\":{\"to\":{\"eq\":\"cancelled\"}}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_acl_rules\",\"where\":{\"scope_value\":{\"eq\":\"aarav@test.com\"},\"role\":{\"eq\":\"reader\"},\"calendar_id\":{\"eq\":\"cal_cryptozoology_summit\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_170", "test_name": "Crypto-Zoology Summit - Mythical creatures research conference", "service": "calendar", "task_horizon": 6, "operation_type": "search+C+R+U+D", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": 
"{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"calendarList.list\",\"events.insert\",\"freeBusy.query\",\"events.patch\",\"events.delete\",\"acl.insert\"]}"} +{"question": "On the Lanternbraid Pavilion calendar (ID cal_lanternbraid_pavilion), fetch the event evt_lanternbraid_opening first. Then update the calendar's location to Harborline Rotunda. Also start an ACL watch on the Lanternbraid Pavilion calendar. Finally, unsubscribe me from the Old Copper Annex calendar (ID cal_old_copper_annex).", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"calendars\",\"where\":{\"id\":{\"eq\":\"cal_lanternbraid_pavilion\"}},\"expected_changes\":{\"location\":{\"to\":{\"contains\":\"Harborline Rotunda\"}}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_channels\",\"where\":{\"resource_uri\":{\"eq\":\"/calendars/cal_lanternbraid_pavilion/acl\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_count\":1},{\"diff_type\":\"removed\",\"entity\":\"calendar_list_entries\",\"where\":{\"calendar_id\":{\"eq\":\"cal_old_copper_annex\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_205", "test_name": "Lanternbraid Pavilion - patch and watch", "service": "calendar", "task_horizon": 4, "operation_type": "search+C+R+U+D", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"events.get\",\"calendars.patch\",\"acl.watch\",\"calendarList.delete\"]}"} +{"question": "On the Sablewind Archive calendar (ID cal_sablewind_archive), fetch its calendar list entry and set 
it to visible (not hidden) with color ID 5. Share the calendar with Keiko (keiko@test.com) as writer, and remove Salma’s access (rule cal_sablewind_archive:user:salma@test.com). Finally, set up a settings watch for my account.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"calendar_list_entries\",\"where\":{\"calendar_id\":{\"eq\":\"cal_sablewind_archive\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_changes\":{\"hidden\":{\"to\":false},\"color_id\":{\"to\":{\"eq\":\"5\"}}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_acl_rules\",\"where\":{\"calendar_id\":{\"eq\":\"cal_sablewind_archive\"},\"scope_value\":{\"eq\":\"keiko@test.com\"},\"role\":{\"eq\":\"writer\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_acl_rules\",\"where\":{\"calendar_id\":{\"eq\":\"cal_sablewind_archive\"},\"scope_value\":{\"eq\":\"salma@test.com\"},\"deleted\":{\"eq\":true}},\"expected_changes\":{\"deleted\":{\"to\":true}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_channels\",\"where\":{\"resource_uri\":{\"eq\":\"/users/user_agent/settings\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_216", "test_name": "Sablewind Archive - share, revoke, list patch, settings watch", "service": "calendar", "task_horizon": 6, "operation_type": "search+C+R+U+D", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"calendarList.get\",\"calendarList.patch\",\"acl.insert\",\"acl.delete\",\"settings.watch\"]}"} +{"question": "Check my calendar list entry for Stoneglow Depot (ID cal_stoneglow_depot) first. 
Then fully replace the calendar metadata with summary Stoneglow Depot, description 'Crate intake ledger', and timezone America/Los_Angeles. Fully replace the ACL rule user:clerk@stoneglow.example to reader. Add the note 'Week-of intake note' to the description of all events in Stoneglow Depot that occur during the week of July 1-7, 2018. Move evt_stoneglow_manifest from Stoneglow Depot to Harbor Ledger (ID cal_harbor_ledger).", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_stoneglow_week_0702\"},\"calendar_id\":{\"eq\":\"cal_stoneglow_depot\"}},\"expected_changes\":{\"description\":{\"to\":{\"contains\":\"Week-of intake note\"}}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendars\",\"where\":{\"id\":{\"eq\":\"cal_stoneglow_depot\"}},\"expected_changes\":{\"description\":{\"to\":{\"contains\":\"Crate intake ledger\"}},\"time_zone\":{\"to\":{\"eq\":\"America/Los_Angeles\"}}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_acl_rules\",\"where\":{\"calendar_id\":{\"eq\":\"cal_stoneglow_depot\"},\"scope_value\":{\"eq\":\"clerk@stoneglow.example\"}},\"expected_changes\":{\"role\":{\"to\":{\"eq\":\"reader\"}}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_stoneglow_week_0704\"},\"calendar_id\":{\"eq\":\"cal_stoneglow_depot\"}},\"expected_changes\":{\"description\":{\"to\":{\"contains\":\"Week-of intake 
note\"}}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_stoneglow_inventory\"},\"calendar_id\":{\"eq\":\"cal_stoneglow_depot\"}},\"expected_count\":0},{\"diff_type\":\"removed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_stoneglow_inventory\"},\"calendar_id\":{\"eq\":\"cal_stoneglow_depot\"}},\"expected_count\":0},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_stoneglow_inventory\"},\"calendar_id\":{\"eq\":\"cal_stoneglow_depot\"}},\"expected_count\":0},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_stoneglow_out_0615\"},\"calendar_id\":{\"eq\":\"cal_stoneglow_depot\"}},\"expected_count\":0},{\"diff_type\":\"removed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_stoneglow_out_0615\"},\"calendar_id\":{\"eq\":\"cal_stoneglow_depot\"}},\"expected_count\":0},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_stoneglow_out_0615\"},\"calendar_id\":{\"eq\":\"cal_stoneglow_depot\"}},\"expected_count\":0},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_stoneglow_manifest\"},\"calendar_id\":{\"eq\":\"cal_stoneglow_depot\"}},\"expected_changes\":{\"calendar_id\":{\"to\":{\"eq\":\"cal_harbor_ledger\"}}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_214", "test_name": "Stoneglow Depot - clear, replace, ACL update, move, settings watch", "service": "calendar", "task_horizon": 9, "operation_type": "search+C+R+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": 
"{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"calendarList.get\",\"calendars.update\",\"acl.update\",\"events.list\",\"events.patch\",\"events.move\"]}"} +{"question": "We’re setting up the tidal library’s long-term calendar. First, show me my calendars and create Tidal Library Rotations if it doesn’t already exist. Share it with Fumiko (fumiko@test.com) so she can edit. The Moon-Shell Rebinding ritual needs to recur monthly on the first Tuesday at 9:00am, starting July 3, 2018, and should continue indefinitely until we cancel it. We also need two exceptions: the August 7, 2018 occurrence should start at 11:00am with a note ‘Storm-surge delay,’ and the September 4, 2018 occurrence should be cancelled entirely. Add a separate one-off event called Ink Tide Inventory on July 15, 2018 at 4:00pm. After confirming the schedule looks right, delete the entire Moon-Shell Rebinding series.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"calendars\",\"where\":{\"summary\":{\"contains\":\"Tidal Library Rotations\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_acl_rules\",\"where\":{\"scope_value\":{\"eq\":\"fumiko@test.com\"},\"role\":{\"eq\":\"writer\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"Moon-Shell Rebinding\"},\"start.dateTime\":{\"contains\":\"2018-07-03T09:00\"},\"status\":{\"eq\":\"cancelled\"},\"recurrence\":{\"contains\":\"FREQ=MONTHLY\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"Moon-Shell Rebinding\"},\"recurrence\":{\"i_contains\":\"EXDATE\"}},\"expected_count\":{\"min\":0,\"max\":1}},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"Moon-Shell 
Rebinding\"},\"start.dateTime\":{\"contains\":\"2018-09-04T09:00\"},\"status\":{\"eq\":\"cancelled\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":{\"min\":0,\"max\":1}},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"Moon-Shell Rebinding\"},\"start.dateTime\":{\"contains\":\"2018-08-07T11:00\"},\"description\":{\"i_contains\":\"storm-surge delay\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"Ink Tide Inventory\"},\"start.dateTime\":{\"contains\":\"2018-07-15T16:00\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_178", "test_name": "Tidal Library Rotations - Recurring series lifecycle", "service": "calendar", "task_horizon": 8, "operation_type": "search+C+R+U+D", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"calendarList.list\",\"calendars.insert\",\"acl.insert\",\"events.insert\",\"events.patch\",\"events.delete\"]}"} +{"question": "Subscribe me to the external calendar cal_meridian_drift_folio. Then fully replace that calendar list entry so it's hidden, not selected, and uses color ID 7. On that same calendar, patch the event evt_meridian_index to add the description 'Catalog spine check.' 
Finally, start an ACL watch on every calendar that has at least one event whose name contains 'coolcoolcool'.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"calendar_list_entries\",\"where\":{\"calendar_id\":{\"eq\":\"cal_meridian_drift_folio\"},\"user_id\":{\"eq\":\"user_agent\"},\"hidden\":{\"eq\":true},\"selected\":{\"eq\":false},\"color_id\":{\"eq\":\"7\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_meridian_index\"},\"calendar_id\":{\"eq\":\"cal_meridian_drift_folio\"}},\"expected_changes\":{\"description\":{\"to\":{\"contains\":\"Catalog spine check\"}}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_channels\",\"where\":{\"resource_uri\":{\"eq\":\"/calendars/cal_meridian_drift_folio/acl\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_channels\",\"where\":{\"resource_uri\":{\"eq\":\"/calendars/cal_tidemire_conservatory/acl\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_channels\",\"where\":{\"resource_uri\":{\"eq\":\"/calendars/cal_lanternbraid_pavilion/acl\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_206", "test_name": "Meridian Drift Folio - subscribe, tune, patch, watch", "service": "calendar", "task_horizon": 6, "operation_type": "search+C+R+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"calendarList.insert\",\"calendarList.update\",\"events.patch\",\"events.list\",\"acl.watch\"]}"} +{"question": "On the Ivory Loom Archive 
calendar (ID cal_ivory_loom_archive), I need a cleanup. First, switch this calendar's timezone to match Ewa's. Next, list the calendar's ACL rules so we confirm access. After that, list events on Ivory Loom Archive and identify every event whose title or description contains the word \"blood\". Delete all of those events.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"calendars\",\"where\":{\"id\":{\"eq\":\"cal_ivory_loom_archive\"}},\"expected_changes\":{\"time_zone\":{\"to\":{\"eq\":\"Europe/Warsaw\"}}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_ivory_loom_blood_ink\"},\"calendar_id\":{\"eq\":\"cal_ivory_loom_archive\"}},\"expected_changes\":{\"status\":{\"to\":{\"eq\":\"cancelled\"}}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_ivory_loom_redaction\"},\"calendar_id\":{\"eq\":\"cal_ivory_loom_archive\"}},\"expected_changes\":{\"status\":{\"to\":{\"eq\":\"cancelled\"}}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_ivory_loom_blood_oath\"},\"calendar_id\":{\"eq\":\"cal_ivory_loom_archive\"}},\"expected_changes\":{\"status\":{\"to\":{\"eq\":\"cancelled\"}}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_ivory_loom_crimson_notice\"},\"calendar_id\":{\"eq\":\"cal_ivory_loom_archive\"}},\"expected_changes\":{\"status\":{\"to\":{\"eq\":\"cancelled\"}}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_222", "test_name": "Ivory Loom Archive - purge blood events", "service": "calendar", "task_horizon": 7, "operation_type": "search+R+U+D", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": 
"{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"calendars.patch\",\"acl.list\",\"events.list\",\"events.delete\"]}"} +{"question": "The Symposium of Infinite Curiosity is three weeks away and the program is chaos. Find our main calendar - 'Symposium of Infinite Curiosity 2018'. We have sessions scheduled and I need an exact count of how many are in the 'Quantum' track (they'll have [Quantum] in the title). Add Mei-Lin's opening keynote - it's called 'Keynote: The Heresy of Obvious Conclusions' and should be Day 1 (Monday June 18) at 8am, lasting 1 hour. Update that keynote with a description: 'Mandatory attendance for all track chairs. Coffee will be existential.' Bogdan (bogdan@test.com) and Ravi (ravi@test.com) need to meet urgently on Day 2 (Tuesday June 19) afternoon to discuss a problematic submission - find when they're both free and create 'Secret Tribunal of the Program Committee' for 2 hours at that time on the symposium calendar. Dr. Chiamaka (chiamaka@test.com) is presenting four different papers across the conference - tell me when each of her sessions is. Someone finally noticed the irony: 'Workshop: Introduction to Procrastination (Postponed)' - delete it. Create a private calendar called 'Speakers Green Room of Mild Panic' for backstage coordination. Ingrid (ingrid@test.com) just joined as volunteer coordinator - give her edit access to the main symposium calendar. Chiamaka's first presentation needs to move to the same venue as 'Panel: Temporal Causality Roundtable' — but only if that venue is free at the same time. 
If that panel already occupies that venue at that time, move Chiamaka's first presentation to 'Annex of Temporal Studies' instead.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"Heresy of Obvious Conclusions\"},\"description\":{\"i_contains\":\"existential\"},\"calendar_id\":{\"eq\":\"cal_symposium_curiosity\"},\"start.dateTime\":{\"contains\":\"2018-06-18T08:00\"},\"end.dateTime\":{\"contains\":\"2018-06-18T09:00\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"Secret Tribunal\"},\"calendar_id\":{\"eq\":\"cal_symposium_curiosity\"},\"start.dateTime\":{\"contains\":\"2018-06-19T15:00\"},\"end.dateTime\":{\"contains\":\"2018-06-19T17:00\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"event_procrastination_workshop\"}},\"expected_changes\":{\"status\":{\"to\":{\"eq\":\"cancelled\"}}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"event_chiamaka_pres_1\"}},\"expected_changes\":{\"location\":{\"to\":{\"contains\":\"Annex of Temporal Studies\"}}},\"ignore\":[\"attendees\",\"color_id\",\"description\",\"end\",\"recurrence\",\"reminders\",\"start\",\"status\",\"summary\",\"transparency\",\"visibility\"]},{\"diff_type\":\"added\",\"entity\":\"calendar_acl_rules\",\"where\":{\"scope_value\":{\"eq\":\"ingrid@test.com\"},\"role\":{\"eq\":\"writer\"},\"calendar_id\":{\"eq\":\"cal_symposium_curiosity\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendars\",\"where\":{\"summary\":{\"contains\":\"Speakers Green 
Room\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_168", "test_name": "Symposium of Infinite Curiosity - Academic conference coordination", "service": "calendar", "task_horizon": 10, "operation_type": "search+C+R+U+D", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"calendarList.list\",\"events.list\",\"events.insert\",\"events.patch\",\"freeBusy.query\",\"events.delete\",\"calendars.insert\",\"acl.insert\"]}"} +{"question": "We are starting a serious conservation push for the firefly habitat. First, show me all my calendars so I can see whether I already have a conservatory calendar. If not, create a new calendar called 'Firefly Conservatory 2018' and share it with Haruto (haruto@test.com) as a writer. The weekly Lantern Patrols need to happen every Tuesday at 7:00pm for 6 weeks starting June 19, 2018, lasting 1 hour - set that up as a recurring event (one series, not six separate events). Zanele (zanele@test.com) can only do the Bioluminescent Microscopy Workshop on Saturday evening June 23, 2018 between 6pm and 10pm in her timezone - check her availability and schedule a 1-hour workshop when she is free, using her timezone. Once the patrol route is final, update the Lantern Patrol to use the location 'Willow Glade Observation Ring'. 
Finally, delete the old 'Broken Jar Ceremony' event from my primary calendar.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"calendars\",\"where\":{\"summary\":{\"contains\":\"Firefly Conservatory 2018\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_acl_rules\",\"where\":{\"scope_value\":{\"eq\":\"haruto@test.com\"},\"role\":{\"eq\":\"writer\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"Lantern Patrol\"},\"location\":{\"contains\":\"Willow Glade Observation Ring\"},\"start.dateTime\":{\"contains\":\"2018-06-19T19:00\"},\"end.dateTime\":{\"contains\":\"2018-06-19T20:00\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"},\"recurrence\":{\"contains\":\"FREQ=WEEKLY\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"Bioluminescent Microscopy Workshop\"},\"start.dateTime\":{\"contains\":\"2018-06-23T19:00\"},\"end.dateTime\":{\"contains\":\"2018-06-23T20:00\"},\"start.timeZone\":{\"eq\":\"Africa/Johannesburg\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"event_broken_jar\"}},\"expected_changes\":{\"status\":{\"to\":{\"eq\":\"cancelled\"}}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_176", "test_name": "Firefly Conservatory - Recurring patrols and cleanup", "service": "calendar", "task_horizon": 7, "operation_type": "search+C+R+U+D", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": 
"{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"calendarList.list\",\"calendars.insert\",\"acl.insert\",\"events.insert\",\"freeBusy.query\",\"events.patch\",\"events.delete\"]}"} +{"question": "Create a new calendar named Kiteglass Survey Log. Before you do anything else, pull the calendar color palette and set the new calendar to color ID 9. Sven needs editor access on the Harbor Signalboard calendar (ID cal_harbor_signalboard) -- his ACL rule is user:sven@test.com, please update it. Then find the earliest 45-minute overlap between my primary calendar and Sven's calendar across June 27-28, 2018. Also, fetch the event evt_horizon_shim_02 on my primary calendar to confirm its exact time so the overlap you pick doesn't collide. Finally, stop the old settings watch channel with id chan_settings_204 and resourceId res_settings_204.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"calendars\",\"where\":{\"summary\":{\"contains\":\"Kiteglass Survey Log\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_list_entries\",\"where\":{\"user_id\":{\"eq\":\"user_agent\"},\"color_id\":{\"eq\":\"9\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_acl_rules\",\"where\":{\"calendar_id\":{\"eq\":\"cal_harbor_signalboard\"},\"scope_value\":{\"eq\":\"sven@test.com\"}},\"expected_changes\":{\"role\":{\"to\":{\"eq\":\"writer\"}}},\"expected_count\":1},{\"diff_type\":\"removed\",\"entity\":\"calendar_channels\",\"where\":{\"id\":{\"eq\":\"chan_settings_204\"},\"resource_id\":{\"eq\":\"res_settings_204\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_200", "test_name": "Kiteglass Survey Log - access update and channel stop", "service": 
"calendar", "task_horizon": 7, "operation_type": "search+C+R+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"calendars.insert\",\"colors.get\",\"calendarList.patch\",\"acl.update\",\"freeBusy.query\",\"events.get\",\"channels.stop\"]}"} +{"question": "Please remove the Sunthread Loom Blessing event from my primary calendar (event ID evt_sunthread_loom_001). After that, list all my calendar settings so I can confirm my current timezone before I update Adebayo (adebayo@test.com).", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_sunthread_loom_001\"}},\"expected_changes\":{\"status\":{\"to\":{\"eq\":\"cancelled\"}}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_187", "test_name": "Sunthread Archive - delete event and list settings", "service": "calendar", "task_horizon": 2, "operation_type": "search+R+U+D", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"events.delete\",\"settings.list\"]}"} +{"question": "Create a new calendar called Glimmerforge Atlas. Share it with Mina (mina@test.com) as writer. Then update Sven's access on that same calendar (rule user:sven@test.com) to writer. 
Finally, fetch my timezone setting so I can include it in the access note to Adebayo (adebayo@test.com).", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"calendars\",\"where\":{\"summary\":{\"contains\":\"Glimmerforge Atlas\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_acl_rules\",\"where\":{\"scope_value\":{\"eq\":\"mina@test.com\"},\"role\":{\"eq\":\"writer\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_acl_rules\",\"where\":{\"scope_value\":{\"eq\":\"sven@test.com\"},\"role\":{\"eq\":\"writer\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_203", "test_name": "Glimmerforge Atlas - create and share", "service": "calendar", "task_horizon": 4, "operation_type": "search+C+R+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"calendars.insert\",\"acl.insert\",\"acl.update\",\"settings.get\"]}"} +{"question": "On the Tidemire Conservatory calendar (ID cal_tidemire_conservatory), first fetch the event evt_tidemire_orchid_rounds, then list its instances so I can review upcoming rounds. After that, list my calendar settings for the record. Once I confirm, clear all events from the Tidemire Conservatory calendar. 
Yara (yara@test.com) needs the final report after the reset.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"calendar_id\":{\"eq\":\"cal_tidemire_conservatory\"}},\"expected_changes\":{\"status\":{\"to\":{\"eq\":\"cancelled\"}}},\"expected_count\":4}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_204", "test_name": "Tidemire Conservatory - inspect and clear", "service": "calendar", "task_horizon": 6, "operation_type": "search+R+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"events.get\",\"events.instances\",\"settings.list\",\"calendars.clear\"]}"} +{"question": "On my primary calendar, add a one-hour event called Quartzloom Spore Cataloging on June 21, 2018 from 9:00am-10:00am. 
Also, remove Mina's access from the Quartzloom Herbarium calendar (calendar ID cal_quartzloom_herbarium, rule ID user:mina@test.com).", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"calendar_id\":{\"eq\":\"test.user@test.com\"},\"summary\":{\"contains\":\"Quartzloom Spore Cataloging\"},\"start.dateTime\":{\"contains\":\"2018-06-21T09:00\"},\"end.dateTime\":{\"contains\":\"2018-06-21T10:00\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_acl_rules\",\"where\":{\"calendar_id\":{\"eq\":\"cal_quartzloom_herbarium\"},\"scope_value\":{\"eq\":\"mina@test.com\"},\"deleted\":{\"eq\":true}},\"expected_changes\":{\"deleted\":{\"to\":true}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_182", "test_name": "Quartzloom Herbarium - event create and access revoke", "service": "calendar", "task_horizon": 2, "operation_type": "C+U+D", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"events.insert\",\"acl.delete\"]}"} +{"question": "Create a new calendar called Brasswillow Registry and share it with Ewa (ewa@test.com) with edit access. 
After she's had a look, delete the Brasswillow Registry calendar entirely.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"calendars\",\"where\":{\"summary\":{\"contains\":\"Brasswillow Registry\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_acl_rules\",\"where\":{\"scope_value\":{\"eq\":\"ewa@test.com\"},\"role\":{\"eq\":\"writer\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendars\",\"where\":{\"summary\":{\"contains\":\"Brasswillow Registry\"}},\"expected_changes\":{\"deleted\":{\"to\":true}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_199", "test_name": "Brasswillow Registry - create, share, delete", "service": "calendar", "task_horizon": 3, "operation_type": "C+U+D", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"calendars.insert\",\"acl.insert\",\"calendars.delete\"]}"} +{"question": "Thunderwave Festival is about to explode and I need clarity on the chaos. Find the 'Thunderwave Festival 2018' calendar. We have performances booked across 6 stages and I need to know exactly how many acts are playing the 'Volcano Stage' - go through the schedule and count them. Kofi (kofi@test.com) is our Volcano Stage manager - check when he's free Saturday afternoon (June 16) because we need an emergency sound check. We just confirmed a secret sunrise set - create 'Sacred Sound Ritual: DJ Nebula Sunrise Set' for Sunday June 17 at 5am, lasting 2 hours, on 'Ethereal Meadow Stage'. I'm worried we accidentally dropped some metal bands from the lineup. Search the schedule and find every act with [METAL] in the title - tell me the count. 
For the sunrise set, add Yuna (yuna@test.com) and Petro (petro@test.com) as attendees - they're running lights and sound. Good news: 'The Amplifier Incident Investigation (Staff Only)' can be deleted - we found the culprit (it was a rogue beer). Sakura (sakura@test.com) is our festival photographer and needs to see the complete schedule to plan her shots - give her read access to the Thunderwave calendar. Finally, create a private calendar called 'Artist Hospitality Demands and Disasters' for tracking the ridiculous rider requests.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"DJ Nebula\"},\"calendar_id\":{\"eq\":\"cal_thunderwave_festival\"},\"location\":{\"contains\":\"Ethereal Meadow\"},\"start.dateTime\":{\"contains\":\"2018-06-17T05:00\"},\"end.dateTime\":{\"contains\":\"2018-06-17T07:00\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_event_attendees\",\"where\":{\"email\":{\"eq\":\"yuna@test.com\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_event_attendees\",\"where\":{\"email\":{\"eq\":\"petro@test.com\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"event_amplifier_incident\"}},\"expected_changes\":{\"status\":{\"to\":{\"eq\":\"cancelled\"}}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_acl_rules\",\"where\":{\"scope_value\":{\"eq\":\"sakura@test.com\"},\"role\":{\"eq\":\"reader\"},\"calendar_id\":{\"eq\":\"cal_thunderwave_festival\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendars\",\"where\":{\"summary\":{\"contains\":\"Artist Hospitality\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_169", "test_name": "Thunderwave 
Music Festival - Multi-stage festival coordination", "service": "calendar", "task_horizon": 7, "operation_type": "search+C+R+U+D", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"calendarList.list\",\"events.list\",\"freeBusy.query\",\"events.insert\",\"events.delete\",\"acl.insert\",\"calendars.insert\"]}"} +{"question": "Create a new calendar called Emberpine Cartography Ledger.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"calendars\",\"where\":{\"summary\":{\"contains\":\"Emberpine Cartography Ledger\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_188", "test_name": "Emberpine Cartography - create calendar only", "service": "calendar", "task_horizon": 1, "operation_type": "C", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"calendars.insert\"]}"} +{"question": "We're setting up the Time-Traveler's Convention and the timeline is fragile. First, check if 'Timeline Alpha' already exists in my calendars - it should be there. Create a new calendar called 'Timeline Beta' for our temporal experiments. Schedule the 'Paradox Prevention Seminar' for Tuesday June 19 at 10am, lasting 2 hours, on Timeline Beta. Sven (sven@test.com) is arriving from 2099 and needs to attend, but his arrival window is fluctuating - check his availability for Wednesday June 20 instead. Move the 'Paradox Prevention Seminar' to Sven's free slot on Wednesday. 
The Time Council has flagged the 'Grandfather Paradox Demonstration' as a Class 5 risk - find it and delete it immediately. Finally, grant Fatima (fatima@test.com) write access to Timeline Beta so she can document the changes to history.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"calendars\",\"where\":{\"summary\":{\"contains\":\"Timeline Beta\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"Paradox Prevention Seminar\"},\"start.dateTime\":{\"contains\":\"2018-06-20T14:00\"},\"end.dateTime\":{\"contains\":\"2018-06-20T16:00\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"event_grandfather_paradox\"}},\"expected_changes\":{\"status\":{\"to\":{\"eq\":\"cancelled\"}}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_acl_rules\",\"where\":{\"scope_value\":{\"eq\":\"fatima@test.com\"},\"role\":{\"eq\":\"writer\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_171", "test_name": "Time-Traveler's Convention - Temporal event coordination", "service": "calendar", "task_horizon": 7, "operation_type": "search+C+R+U+D", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"calendarList.list\",\"calendars.insert\",\"events.insert\",\"freeBusy.query\",\"events.patch\",\"events.delete\",\"acl.insert\"]}"} +{"question": "Please clear all events from my Nightglass Repository calendar (ID cal_nightglass_repository) but keep the calendar. Then hide that calendar in my list. 
I'll let Ewa (ewa@test.com) and Hana (hana@test.com) know once it's done.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"calendar_id\":{\"eq\":\"cal_nightglass_repository\"}},\"expected_changes\":{\"status\":{\"to\":{\"eq\":\"cancelled\"}}},\"expected_count\":{\"min\":1},\"description\":\"All events on this calendar should be cleared (cancelled)\"},{\"diff_type\":\"changed\",\"entity\":\"calendar_list_entries\",\"where\":{\"calendar_id\":{\"eq\":\"cal_nightglass_repository\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_changes\":{\"hidden\":{\"to\":true}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_198", "test_name": "Nightglass Repository - clear calendar and hide entry", "service": "calendar", "task_horizon": 2, "operation_type": "R+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"calendars.clear\",\"calendarList.patch\"]}"} +{"question": "Please delete the entire recurring Cinderflock Vesper Choir series (event ID evt_cinderflock_vespers) from the Cinderflock Choir calendar (ID cal_cinderflock_choir).", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_cinderflock_vespers\"},\"calendar_id\":{\"eq\":\"cal_cinderflock_choir\"}},\"expected_changes\":{\"status\":{\"to\":{\"eq\":\"cancelled\"}}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_193", "test_name": "Cinderflock Choir - delete recurring series", "service": "calendar", 
"task_horizon": 1, "operation_type": "U+D", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"events.delete\"]}"} +{"question": "Pull the calendar color palette and set Quillshore Annex (ID cal_quillshore_annex) to color ID 8. Then import the legacy entry Quillshore Salt Index into that calendar for June 30, 2018 from 2:00pm-3:00pm, location Brine Archive Hall, iCalUID quillshore-salt-20180630@annex. After that, check the ACL rule user:linh@test.com on Quillshore Annex and show me the calendar list entry for that calendar. Finally, start an ACL watch on Quillshore Annex and a calendar list watch for my account.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"calendar_list_entries\",\"where\":{\"calendar_id\":{\"eq\":\"cal_quillshore_annex\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_changes\":{\"color_id\":{\"to\":{\"eq\":\"8\"}}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"calendar_id\":{\"eq\":\"cal_quillshore_annex\"},\"summary\":{\"contains\":\"Quillshore Salt Index\"},\"location\":{\"contains\":\"Brine Archive 
Hall\"},\"start.dateTime\":{\"contains\":\"2018-06-30T14:00\"},\"end.dateTime\":{\"contains\":\"2018-06-30T15:00\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_channels\",\"where\":{\"resource_uri\":{\"eq\":\"/calendars/cal_quillshore_annex/acl\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_channels\",\"where\":{\"resource_uri\":{\"eq\":\"/users/me/calendarList\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_210", "test_name": "Quillshore Annex - import, ACL check, list entry review, dual watches", "service": "calendar", "task_horizon": 7, "operation_type": "search+C+R+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"colors.get\",\"calendarList.patch\",\"events.import\",\"acl.get\",\"calendarList.get\",\"acl.watch\",\"calendarList.watch\"]}"} +{"question": "The guild needs organizing. First, remind me which calendars I have - I'm looking for our 'Dungeon Masters Guild' one. We're kicking off a new campaign called 'The Curse of the Crimson Dice' and I need to schedule Session Zero for Friday at 7pm on that calendar. The duration should be 3 hours. Amara (amara@test.com) offered to run a one-shot this weekend - find when she's free and schedule 'Amara's Epic One-Shot Adventure' for 4 hours at that time on the guild calendar (use Amara's timezone). Oh, and that Session Zero event needs more info - update the description to say 'Bring character concepts. Snacks provided. No phones at the table.' 
I want to see all the sessions we have planned this month on the guild calendar. Hiroshi (hiroshi@test.com) has been running great sessions and deserves to schedule his own games now - give him edit access to the Dungeon Masters Guild calendar. That old 'TPK Recovery Support Group (Postponed Indefinitely)' event is still sitting there as a bad joke from when we had that campaign wipe, and it should not be sitting there anymore. Finally, we've been mixing board game nights with RPG sessions and it's confusing people. Use the existing Board Game Bazaar calendar (ID cal_board_game_bazaar) for non-RPG gaming (create it if it doesn't exist). Then scan every game event on the Dungeon Masters Guild calendar: each event description is tagged 'Type: RPG' or 'Type: Non-RPG'. Copy every Non-RPG event to Board Game Bazaar; if it is recurring, copy it as a recurring event there too.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"Curse of the Crimson Dice\"},\"description\":{\"i_contains\":\"character concepts\"},\"calendar_id\":{\"eq\":\"cal_dungeon_masters\"},\"start.dateTime\":{\"contains\":\"2018-06-22T19:00\"},\"end.dateTime\":{\"contains\":\"2018-06-22T22:00\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"Amara's Epic One-Shot 
Adventure\"},\"calendar_id\":{\"eq\":\"cal_dungeon_masters\"},\"start.dateTime\":{\"contains\":\"2018-06-24T14:00\"},\"end.dateTime\":{\"contains\":\"2018-06-24T18:00\"},\"start.timeZone\":{\"eq\":\"Africa/Lagos\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_acl_rules\",\"where\":{\"scope_value\":{\"eq\":\"hiroshi@test.com\"},\"role\":{\"eq\":\"writer\"},\"calendar_id\":{\"eq\":\"cal_dungeon_masters\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"event_tpk_recovery\"}},\"expected_changes\":{\"status\":{\"to\":{\"eq\":\"cancelled\"}}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"Board Game Brawl Night\"},\"calendar_id\":{\"eq\":\"cal_board_game_bazaar\"},\"recurrence\":{\"contains\":\"RRULE:FREQ=WEEKLY;BYDAY=MO\"},\"start.dateTime\":{\"contains\":\"2018-06-18T19:00\"},\"end.dateTime\":{\"contains\":\"2018-06-18T21:00\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"Catan Quarry Trade\"},\"calendar_id\":{\"eq\":\"cal_board_game_bazaar\"},\"start.dateTime\":{\"contains\":\"2018-06-20T18:00\"},\"end.dateTime\":{\"contains\":\"2018-06-20T20:00\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"Meeple Market Mixer\"},\"calendar_id\":{\"eq\":\"cal_board_game_bazaar\"},\"recurrence\":{\"contains\":\"RRULE:FREQ=WEEKLY;BYDAY=FR\"},\"start.dateTime\":{\"contains\":\"2018-06-22T18:00\"},\"end.dateTime\":{\"contains\":\"2018-06-22T20:00\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"Ticket to Ride 
Summit\"},\"calendar_id\":{\"eq\":\"cal_board_game_bazaar\"},\"start.dateTime\":{\"contains\":\"2018-06-23T17:00\"},\"end.dateTime\":{\"contains\":\"2018-06-23T19:00\"},\"start.timeZone\":{\"eq\":\"America/Los_Angeles\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"RPG Legends League\"},\"calendar_id\":{\"eq\":\"cal_board_game_bazaar\"}},\"expected_count\":0},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"RPG: Mirefall Campaign\"},\"calendar_id\":{\"eq\":\"cal_board_game_bazaar\"}},\"expected_count\":0},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"RPG: Iron Citadel\"},\"calendar_id\":{\"eq\":\"cal_board_game_bazaar\"}},\"expected_count\":0},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"summary\":{\"contains\":\"RPG: Ember Vale Prelude\"},\"calendar_id\":{\"eq\":\"cal_board_game_bazaar\"}},\"expected_count\":0},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_dungeon_rpg_legend\"},\"calendar_id\":{\"eq\":\"cal_dungeon_masters\"}},\"expected_count\":0},{\"diff_type\":\"removed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_dungeon_rpg_legend\"},\"calendar_id\":{\"eq\":\"cal_dungeon_masters\"}},\"expected_count\":0},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_dungeon_rpg_legend\"},\"calendar_id\":{\"eq\":\"cal_dungeon_masters\"}},\"expected_count\":0},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_dungeon_rpg_mire\"},\"calendar_id\":{\"eq\":\"cal_dungeon_masters\"}},\"expected_count\":0},{\"diff_type\":\"removed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_dungeon_rpg_mire\"},\"calendar_id\":{\"eq\":\"cal_dungeon_masters\"}},\"expected_count\":0},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"i
d\":{\"eq\":\"evt_dungeon_rpg_mire\"},\"calendar_id\":{\"eq\":\"cal_dungeon_masters\"}},\"expected_count\":0},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_dungeon_rpg_iron_citadel\"},\"calendar_id\":{\"eq\":\"cal_dungeon_masters\"}},\"expected_count\":0},{\"diff_type\":\"removed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_dungeon_rpg_iron_citadel\"},\"calendar_id\":{\"eq\":\"cal_dungeon_masters\"}},\"expected_count\":0},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_dungeon_rpg_iron_citadel\"},\"calendar_id\":{\"eq\":\"cal_dungeon_masters\"}},\"expected_count\":0},{\"diff_type\":\"added\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_dungeon_rpg_ember_prelude\"},\"calendar_id\":{\"eq\":\"cal_dungeon_masters\"}},\"expected_count\":0},{\"diff_type\":\"removed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_dungeon_rpg_ember_prelude\"},\"calendar_id\":{\"eq\":\"cal_dungeon_masters\"}},\"expected_count\":0},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_dungeon_rpg_ember_prelude\"},\"calendar_id\":{\"eq\":\"cal_dungeon_masters\"}},\"expected_count\":0}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_166", "test_name": "Dice & Dragons Tabletop Gaming Guild - Campaign scheduling", "service": "calendar", "task_horizon": 15, "operation_type": "search+C+R+U+D", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"calendarList.list\",\"events.insert\",\"freeBusy.query\",\"events.patch\",\"events.list\",\"acl.insert\",\"events.delete\"]}"} +{"question": "On the 
Crystalfold Foundry calendar (ID cal_crystalfold_foundry), fully replace evt_crystalfold_quench so it’s on July 1, 2018 from 9:00am-10:30am at Forge Bay 2. Then delete evt_crystalfold_slag and evt_crystalfold_mold. Finally, unsubscribe me from the Old Lattice Mill calendar (ID cal_old_lattice_mill).", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_crystalfold_quench\"},\"calendar_id\":{\"eq\":\"cal_crystalfold_foundry\"}},\"expected_changes\":{\"start\":{\"to\":{\"contains\":\"2018-07-01T09:00\"}},\"end\":{\"to\":{\"contains\":\"2018-07-01T10:30\"}},\"location\":{\"to\":{\"contains\":\"Forge Bay 2\"}}},\"expected_count\":1,\"ignore\":[\"attendees\",\"color_id\",\"description\",\"recurrence\",\"reminders\",\"status\",\"summary\",\"transparency\",\"visibility\"]},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_crystalfold_slag\"},\"calendar_id\":{\"eq\":\"cal_crystalfold_foundry\"}},\"expected_changes\":{\"status\":{\"to\":{\"eq\":\"cancelled\"}}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_events\",\"where\":{\"id\":{\"eq\":\"evt_crystalfold_mold\"},\"calendar_id\":{\"eq\":\"cal_crystalfold_foundry\"}},\"expected_changes\":{\"status\":{\"to\":{\"eq\":\"cancelled\"}}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"calendar_list_entries\",\"where\":{\"calendar_id\":{\"eq\":\"cal_old_lattice_mill\"},\"user_id\":{\"eq\":\"user_agent\"}},\"expected_changes\":{\"deleted\":{\"to\":true}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"etag\",\"html_link\",\"ical_uid\",\"sequence\",\"start_datetime\",\"end_datetime\"]}}", "test_id": "calendar_215", "test_name": "Crystalfold Foundry - replace, delete events, unsubscribe", "service": "calendar", "task_horizon": 4, "operation_type": "R+U+D", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "low", 
"info": "{\"service\":\"calendar\",\"seed_template\":\"calendar_default\",\"impersonate_user_id\":\"user_agent\",\"eval_type\":\"actionEval\",\"tools_required\":[\"events.update\",\"events.delete\",\"calendarList.delete\"]}"} +{"question": "Mark issue ENG-1 as completed", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"issues\",\"where\":{\"identifier\":{\"eq\":\"ENG-1\"}},\"expected_changes\":{\"stateId\":{\"to\":{\"eq\":\"4334c4ee-405c-4d2c-bf25-4dcb7a8c0512\"}}},\"ignore\":[\"updatedAt\"]}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_10", "test_name": "Complete issue (move to Done)", "service": "linear", "task_horizon": 3, "operation_type": "search+U", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"issues\",\"workflowStates\",\"issueUpdate\"]}"} +{"question": "Create a new issue in the Engineering team titled 'Fix login bug' ", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"teamId\":{\"eq\":\"ad608998-915c-4bad-bcd9-85ebfccccee8\"},\"title\":{\"contains\":\"login bug\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_0", "test_name": "Create a new issue", "service": "linear", "task_horizon": 2, "operation_type": "search+C", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"teams\",\"issueCreate\"]}"} +{"question": "Create two Engineering issues: 'Update onboarding 
docs' (label UX) and 'Add circuit breaker' (label Urgent)", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"teamId\":{\"eq\":\"ad608998-915c-4bad-bcd9-85ebfccccee8\"},\"title\":{\"contains\":\"onboarding docs\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"teamId\":{\"eq\":\"ad608998-915c-4bad-bcd9-85ebfccccee8\"},\"title\":{\"contains\":\"circuit breaker\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issue_label_issue_association\",\"where\":{},\"expected_count\":{\"min\":2}}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_22", "test_name": "Batch create issues with labels", "service": "linear", "task_horizon": 4, "operation_type": "search+C", "entity_scope": "multi", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"teams\",\"issueLabels\",\"issueCreate\"]}"} +{"question": "The Mobile team is preparing for the v2.5 release and QA has reported some critical bugs that need to be tracked.\n\nFirst, create a new issue titled \"App crashes on login with special characters\" in the Mobile team. This is a high priority bug. Assign it to Marcus.\n\nCreate another issue titled \"Push notifications not working on Android 14\" in the same team and assign it to Aisha.\n\nMove both issues to \"In Progress\" status since the developers are starting work on them immediately.\n\nFinally, add a comment to the login crash issue with the following reproduction steps: \"REPRO_STEPS: 1. Open app 2. Enter username with & or % character 3. Tap login button 4. App crashes to home screen. 
Tested on iOS 17.2 and Android 14.\"", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"title\":{\"contains\":\"crashes on login\"},\"assigneeId\":{\"eq\":\"c9d0e1f2-a3b4-5678-3456-012345678901\"},\"stateId\":{\"eq\":\"mob-state-inprogress-567890abcdef\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"title\":{\"contains\":\"Push notifications\"},\"assigneeId\":{\"eq\":\"d0e1f2a3-b4c5-6789-4567-123456789012\"},\"stateId\":{\"eq\":\"mob-state-inprogress-567890abcdef\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"comments\",\"where\":{\"body\":{\"contains\":\"REPRO_STEPS:\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_43", "test_name": "Mobile App Release - QA Bug Tracking", "service": "linear", "task_horizon": 8, "operation_type": "search+C+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"teams\",\"users\",\"issueLabels\",\"workflowStates\",\"issueCreate\",\"issueUpdate\",\"commentCreate\"]}"} +{"question": "The 'Polish onboarding dashboard UX' issue was filed in Engineering by mistake. 
Move it to the Product team and reset its status to 'In Review'.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"issues\",\"where\":{\"identifier\":{\"eq\":\"ENG-2\"}},\"expected_changes\":{\"teamId\":{\"from\":{\"eq\":\"ad608998-915c-4bad-bcd9-85ebfccccee8\"},\"to\":{\"eq\":\"cdb85540-5065-4346-8aef-ae2b72d6e940\"}},\"stateId\":{\"to\":{\"eq\":\"31d46818-d16d-4279-90e6-a7bab45561c0\"}}},\"ignore\":[\"updatedAt\"]}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_33", "test_name": "Cross-Team Issue Migration", "service": "linear", "task_horizon": 4, "operation_type": "search+U", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"issues\",\"teams\",\"workflowStates\",\"issueUpdate\"]}"} +{"question": "The Post-Production team is managing the editing pipeline for \"Project Aurora\". Here's what needs to happen:\n\nFirst, find the \"Master Edit Lock\" issue - this is the critical gate that must complete before downstream work can proceed.\n\nCount how many issues are directly blocked by \"Master Edit Lock\". You'll need this exact number for your report.\n\nCreate two new issues in the Post-Production team:\n1. \"Final Color Grade - DCI-P3 Mastering\" - Kenji will handle this. It cannot start until \"Color Grading Phase 1\" is complete.\n2. \"Audio Mix Master - Dolby Atmos\" - Amara will handle this. It depends on \"Sound Design Draft\" being finished.\n\nAdditionally, the colorist needs to lock the final look before audio can be mixed to picture. 
Set up \"Final Color Grade\" accordingly.\n\nAfter setting up all the new dependencies, add a comment to the \"Master Edit Lock\" issue with a dependency audit in this exact format:\n\n\"DEPENDENCY_AUDIT: Master Edit Lock directly blocks [X] downstream issues. New dependencies added: Final Color Grade (Kenji), Audio Mix Master (Amara). Cross-stream link established between color and audio pipelines.\"\n\nReplace [X] with the actual count of issues directly blocked by Master Edit Lock.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"title\":{\"contains\":\"Final Color Grade\"},\"teamId\":{\"eq\":\"f5a6b7c8-d9e0-1234-5678-9abcdef01234\"},\"assigneeId\":{\"eq\":\"a1b2c3d4-e5f6-7890-abcd-ef1234567890\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"title\":{\"contains\":\"Audio Mix Master\"},\"teamId\":{\"eq\":\"f5a6b7c8-d9e0-1234-5678-9abcdef01234\"},\"assigneeId\":{\"eq\":\"f4a5b6c7-d8e9-0123-8901-567890123456\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issue_relations\",\"where\":{\"type\":{\"eq\":\"blocks\"}},\"expected_count\":3},{\"diff_type\":\"added\",\"entity\":\"comments\",\"where\":{\"body\":{\"contains\":\"DEPENDENCY_AUDIT:\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"comments\",\"where\":{\"body\":{\"contains\":\"directly blocks 4\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_48", "test_name": "Film Post-Production Pipeline", "service": "linear", "task_horizon": 12, "operation_type": "search+C+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": 
"{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"teams\",\"issues\",\"users\",\"issueCreate\",\"issueRelationCreate\",\"issueUpdate\",\"commentCreate\"]}"} +{"question": "Create a new Engineering issue 'Polish navigation' with labels 'UX' and 'Urgent'", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"teamId\":{\"eq\":\"ad608998-915c-4bad-bcd9-85ebfccccee8\"},\"title\":{\"contains\":\"Polish navigation\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issue_label_issue_association\",\"where\":{},\"expected_count\":{\"min\":2}}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_18", "test_name": "Create issue with labels UX and Urgent", "service": "linear", "task_horizon": 3, "operation_type": "search+C", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"teams\",\"issueLabels\",\"issueCreate\"]}"} +{"question": "Add the newly created 'Bugs' label to the Engineering login issue currently assigned to John Doe.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"issue_label_issue_association\",\"where\":{\"issue_id\":{\"eq\":\"c6e168e3-fed4-45d0-b03f-a1c1f89ee7ab\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_12", "test_name": "Apply Bugs label to login issue", "service": "linear", "task_horizon": 3, "operation_type": "search+U", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "low", "info": 
"{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"issueLabels\",\"issues\",\"issueUpdate\"]}"} +{"question": "Add Artem (b55072d7-ccaa-43cd-8ab7-3dca324e3294) to the Product team.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"team_memberships\",\"where\":{\"userId\":{\"eq\":\"b55072d7-ccaa-43cd-8ab7-3dca324e3294\"},\"teamId\":{\"eq\":\"cdb85540-5065-4346-8aef-ae2b72d6e940\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_30", "test_name": "Add member to team", "service": "linear", "task_horizon": 2, "operation_type": "search+C", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"teams\",\"teamMembershipCreate\"]}"} +{"question": "Add a comment to ENG-3: 'Please add logs for the error path'", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"comments\",\"where\":{\"issueId\":{\"eq\":\"87c1d2f3-66c4-4dd0-bc93-1b99d04dc374\"},\"body\":{\"contains\":\"error path\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_19", "test_name": "Add a comment to ENG-3", "service": "linear", "task_horizon": 1, "operation_type": "C", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"commentCreate\"]}"} +{"question": "The Garden Plots team manages our community garden 
cooperative. It's spring reassignment season and we need to shuffle some plots around.\n\nFirst, find the existing plot issues: \"Plot A7 - Tomatoes\", \"Plot B3 - Herbs\", and \"Plot C1 - Squash\".\n\nLook up gardeners Ines and Rashida - they're involved in this season's reassignments.\n\nCreate a new tracking issue titled \"Spring 2025 Plot Reassignment Tracker\" in the Garden Plots team with description \"Documenting all plot changes for the growing season. Reassignments finalized at March board meeting.\"\n\nNow process the reassignments:\n\n1. Plot A7 (tomatoes) was abandoned when Marcus moved away. Reassign it to Ines and change its status to \"Active\" since she's starting immediately.\n\n2. Ines and Rashida agreed to swap their herb plots. Reassign \"Plot B3 - Herbs\" from Ines to Rashida. Keep the current status unchanged.\n\n3. The squash plot (C1) owner has left the cooperative entirely. Move it to \"Dormant\" status but don't assign anyone yet - we'll offer it at the next meeting.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"title\":{\"contains\":\"Spring 
2025\"},\"teamId\":{\"eq\":\"d9e0f1a2-b3c4-5678-9012-cdef01234567\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"issues\",\"where\":{\"id\":{\"eq\":\"gp-issue-a7-tomatoes-001\"}},\"expected_changes\":{\"assigneeId\":{\"to\":{\"eq\":\"f0a1b2c3-d4e5-6789-4567-123456789012\"}},\"stateId\":{\"to\":{\"eq\":\"gp-state-active-0003-cdef0123\"}}},\"expected_count\":1,\"ignore\":[\"updatedAt\"]},{\"diff_type\":\"changed\",\"entity\":\"issues\",\"where\":{\"id\":{\"eq\":\"gp-issue-b3-herbs-002\"}},\"expected_changes\":{\"assigneeId\":{\"to\":{\"eq\":\"a1b2c3d4-e5f6-7890-5678-234567890123\"}}},\"expected_count\":1,\"ignore\":[\"updatedAt\"]},{\"diff_type\":\"changed\",\"entity\":\"issues\",\"where\":{\"id\":{\"eq\":\"gp-issue-c1-squash-003\"}},\"expected_changes\":{\"stateId\":{\"to\":{\"eq\":\"gp-state-dormant-0001-abcdef01\"}}},\"expected_count\":1,\"ignore\":[\"updatedAt\"]}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_52", "test_name": "Community Garden Plot Management", "service": "linear", "task_horizon": 7, "operation_type": "search+C+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"teams\",\"issues\",\"users\",\"issueCreate\",\"issueUpdate\"]}"} +{"question": "Delete the seeded comment on ENG-1", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"comments\",\"where\":{\"id\":{\"eq\":\"e10f59c3-7a49-4d52-8dba-8c8602f8c807\"}},\"expected_changes\":{\"archivedAt\":{\"from\":{\"exists\":false},\"to\":{\"exists\":true}}},\"ignore\":[\"updatedAt\"]}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_21", "test_name": "Delete seeded comment on ENG-1", 
"service": "linear", "task_horizon": 3, "operation_type": "search+D", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"issues\",\"comments\",\"commentDelete\"]}"} +{"question": "Create a new team called 'Design'", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"teams\",\"where\":{\"name\":{\"eq\":\"Design\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"workflow_states\",\"where\":{\"name\":{\"eq\":\"Backlog\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_5", "test_name": "Create a new team", "service": "linear", "task_horizon": 1, "operation_type": "C", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"teamCreate\"]}"} +{"question": "The Meeple & Brew board game café has a scheduling emergency. 
The venue for our Catan Regional Championship double-booked us, so we need to reschedule the entire tournament pipeline.\n\nFind the three tournament issues: \"Catan Regional Championship - Spring 2025\", \"Qualifying Round - Top 16 Bracket\", and \"Tournament Registration Deadline\".\n\nThe championship was originally March 15th but must move to March 23rd (8-day delay).\n\nHere's the critical part - the dates are interdependent:\n- The Qualifying Round must happen exactly 7 days before the Championship\n- The Registration Deadline must close exactly 5 days before the Qualifying Round\n\nCalculate and update all three due dates accordingly.\n\nAlso, Yuto was organizing the championship but has a work trip conflict on the new date. Reassign the championship to Adaeze. Keep Henrik on the qualifying round.\n\nAfter updating all dates, add a comment to the championship issue documenting the changes:\n\n\"RESCHEDULE_AUDIT: Venue conflict forced 8-day delay. New timeline calculated:\n- Registration closes: March 11th (was March 3rd)\n- Qualifiers: March 16th (was March 8th)\n- Championship: March 23rd (was March 15th)\nOrganizer handoff: Yuto → Adaeze due to travel conflict.\"", "answer": 
"{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"issues\",\"where\":{\"id\":{\"eq\":\"mb-issue-championship-001\"}},\"expected_changes\":{\"dueDate\":{\"to\":{\"eq\":\"2025-03-23\"}},\"assigneeId\":{\"to\":{\"eq\":\"d4e5f6a7-b8c9-0123-8901-567890123456\"}}},\"expected_count\":1,\"ignore\":[\"updatedAt\"]},{\"diff_type\":\"changed\",\"entity\":\"issues\",\"where\":{\"id\":{\"eq\":\"mb-issue-qualifying-002\"}},\"expected_changes\":{\"dueDate\":{\"to\":{\"eq\":\"2025-03-16\"}}},\"expected_count\":1,\"ignore\":[\"updatedAt\"]},{\"diff_type\":\"changed\",\"entity\":\"issues\",\"where\":{\"id\":{\"eq\":\"mb-issue-registration-003\"}},\"expected_changes\":{\"dueDate\":{\"to\":{\"eq\":\"2025-03-11\"}}},\"expected_count\":1,\"ignore\":[\"updatedAt\"]},{\"diff_type\":\"added\",\"entity\":\"comments\",\"where\":{\"body\":{\"contains\":\"RESCHEDULE_AUDIT:\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"comments\",\"where\":{\"body\":{\"contains\":\"Yuto\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"comments\",\"where\":{\"body\":{\"contains\":\"Adaeze\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_53", "test_name": "Board Game Café - Tournament Rescheduling Crisis", "service": "linear", "task_horizon": 8, "operation_type": "search+C+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"issues\",\"users\",\"issueUpdate\",\"commentCreate\"]}"} +{"question": "Add a comment to issue ENG-1 saying 'I am working on this now'", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"comments\",\"where\":{\"issueId\":{\"eq\":\"c6e168e3-fed4-45d0-b03f-a1c1f89ee7ab\"},\"body\":{\"contains\":\"working on 
this\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_4", "test_name": "Add comment to issue", "service": "linear", "task_horizon": 1, "operation_type": "C", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"commentCreate\"]}"} +{"question": "Assign issue ENG-2 to John Doe", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"issues\",\"where\":{\"identifier\":{\"eq\":\"ENG-2\"}},\"expected_changes\":{\"assigneeId\":{\"from\":{\"eq\":\"03b0809e-713e-44ee-95de-b7a198b135ac\"},\"to\":{\"eq\":\"2dcc8dc2-ca19-475d-9882-3ba5e911e7ec\"}}},\"ignore\":[\"updatedAt\"]}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_3", "test_name": "Assign issue to user", "service": "linear", "task_horizon": 3, "operation_type": "search+U", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"issues\",\"users\",\"issueUpdate\"]}"} +{"question": "Break down the 'SSO' ticket into two sub-issues: 'Frontend Implementation' (assigned to Sarah) and 'Backend API' (assigned to John). 
Ensure the previous ticket is set as the parent for both.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"parentId\":{\"eq\":\"7d3f21ac-89c1-4f3b-9c2e-4fe3a1b71002\"},\"assigneeId\":{\"eq\":\"03b0809e-713e-44ee-95de-b7a198b135ac\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"parentId\":{\"eq\":\"7d3f21ac-89c1-4f3b-9c2e-4fe3a1b71002\"},\"assigneeId\":{\"eq\":\"2dcc8dc2-ca19-475d-9882-3ba5e911e7ec\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_38", "test_name": "Issue Decomposition (Hierarchy)", "service": "linear", "task_horizon": 4, "operation_type": "search+C", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"issues\",\"users\",\"issueCreate\"]}"} +{"question": "Create a new issue in the Engineering team titled 'Fix login bug' with high priority", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"teamId\":{\"eq\":\"ad608998-915c-4bad-bcd9-85ebfccccee8\"},\"title\":{\"contains\":\"login bug\"},\"priority\":{\"eq\":2.0}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_1", "test_name": "Create a new issue", "service": "linear", "task_horizon": 2, "operation_type": "search+C", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"teams\",\"issueCreate\"]}"} +{"question": "Create a new label 'Backend' 
and add it to ENG-2", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"issue_labels\",\"where\":{\"name\":{\"eq\":\"Backend\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issue_label_issue_association\",\"where\":{\"issue_id\":{\"eq\":\"5c62f29d-0f6a-4c4d-9d25-52293e2a8d4f\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_24", "test_name": "Create temporary label and apply to ENG-2", "service": "linear", "task_horizon": 3, "operation_type": "search+C+U", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"issueLabelCreate\",\"issues\",\"issueUpdate\"]}"} +{"question": "Move issue ENG-1 to 'In Progress' status", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"issues\",\"where\":{\"identifier\":{\"eq\":\"ENG-1\"}},\"expected_changes\":{\"stateId\":{\"to\":{\"eq\":\"6963a682-5967-477a-9afc-0b8a5b70b070\"}}},\"ignore\":[\"updatedAt\"]}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_2", "test_name": "Update issue status to In Progress", "service": "linear", "task_horizon": 3, "operation_type": "search+U", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"issues\",\"workflowStates\",\"issueUpdate\"]}"} +{"question": "The Stargazers astronomy club needs to set up their spring celestial events schedule.\n\nCreate a new issue in the Stargazers team titled \"Lyrid Meteor Shower Viewing Party - 
Peak Night April 22nd\" with description \"Annual club gathering at Dark Sky Preserve. Expected rate: 18 meteors/hour. Radiant rises after midnight in Perseus.\"\n\nAssign this event to Priya as the event coordinator.\n\nApply the \"public-event\" label to this issue since non-members are welcome to attend.\n\nAdd a comment with the viewing logistics: \"OBSERVATION_DETAILS: Meet at Ridgeline Observatory parking lot at 10pm. Bring red flashlights only - no white light. Bogdan will set up the 12-inch Dobsonian for Saturn viewing while we wait for the radiant to rise. Best meteor photography settings: ISO 3200, f/2.8, 20-second exposures pointed northeast.\"", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"title\":{\"contains\":\"Lyrid Meteor Shower\"},\"teamId\":{\"eq\":\"b7c8d9e0-f1a2-3456-7890-abcdef123456\"},\"assigneeId\":{\"eq\":\"b8c9d0e1-f2a3-4567-2345-901234567890\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"title\":{\"contains\":\"April 22\"},\"description\":{\"contains\":\"Dark Sky Preserve\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"description\":{\"contains\":\"18 meteors/hour\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issue_label_issue_association\",\"where\":{},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"comments\",\"where\":{\"body\":{\"contains\":\"OBSERVATION_DETAILS:\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"comments\",\"where\":{\"body\":{\"contains\":\"Ridgeline Observatory\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"comments\",\"where\":{\"body\":{\"contains\":\"12-inch Dobsonian\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_50", "test_name": "Amateur Astronomy Club - Celestial Event Planning", "service": "linear", "task_horizon": 10, 
"operation_type": "search+C+U", "entity_scope": "multi", "information_availability": "explicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"teams\",\"issueLabels\",\"issueCreate\",\"issueUpdate\",\"commentCreate\"]}"} +{"question": "John Doe is going on vacation. Reassign all of his 'Urgent' issues to Sarah Smith, but leave his non-urgent work as is.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"issues\",\"where\":{\"identifier\":{\"eq\":\"ENG-3\"}},\"expected_changes\":{\"assigneeId\":{\"from\":{\"eq\":\"2dcc8dc2-ca19-475d-9882-3ba5e911e7ec\"},\"to\":{\"eq\":\"03b0809e-713e-44ee-95de-b7a198b135ac\"}}},\"ignore\":[\"updatedAt\"]},{\"diff_type\":\"changed\",\"entity\":\"issues\",\"where\":{\"identifier\":{\"eq\":\"ENG-1\"}},\"expected_count\":0,\"expected_changes\":{\"assigneeId\":{\"to\":{\"exists\":true}}},\"ignore\":[\"updatedAt\"]}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_32", "test_name": "Vacation Handoff (Conditional Batch Reassignment)", "service": "linear", "task_horizon": 4, "operation_type": "search+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"users\",\"issueLabels\",\"issues\",\"issueUpdate\"]}"} +{"question": "The IT Support team received a critical server outage report. Here's the workflow to execute:\n\nFirst, check if a label called \"hardware-failure\" exists. 
If it doesn't, create it.\n\nCreate a new issue titled \"Server rack B7 unresponsive - power supply failure\" in the IT Support team.\n\nApply the \"hardware-failure\" label to this ticket and assign it to Kofi for initial triage.\n\nAdd a comment to the ticket with this diagnostic entry: \"DIAG_LOG_001: Initial ping test failed. Checked physical connections. PSU indicator light is off. Replacement unit requested from inventory.\"\n\nNow update that same comment to append the following resolution note at the end: \" || UPDATE: PSU replaced at 14:32. Server responding. Monitoring for 24hrs.\"\n\nFinally, update the ticket to change the assignee from Kofi to Elena for post-incident verification, and move the ticket to \"In Review\" status.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"issue_labels\",\"where\":{\"name\":{\"eq\":\"hardware-failure\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"title\":{\"contains\":\"Server rack B7\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"issues\",\"where\":{\"title\":{\"contains\":\"Server rack B7\"}},\"expected_count\":1,\"expected_changes\":{\"assigneeId\":{\"to\":{\"eq\":\"a3b4c5d6-e7f8-9012-7890-456789012345\"}}},\"ignore\":[\"updatedAt\"]},{\"diff_type\":\"added\",\"entity\":\"issue_label_issue_association\",\"where\":{},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"comments\",\"where\":{\"body\":{\"contains\":\"DIAG_LOG_001:\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"comments\",\"where\":{\"body\":{\"contains\":\"|| UPDATE:\"}},\"expected_count\":1,\"expected_changes\":{\"body\":{\"to\":{\"contains\":\"|| UPDATE:\"}}}}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_44", "test_name": "IT Support Ticket Workflow", "service": "linear", "task_horizon": 8, "operation_type": "search+R+C+U", "entity_scope": "multi", 
"information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"teams\",\"users\",\"issueLabels\",\"issueLabelCreate\",\"issueCreate\",\"issueUpdate\",\"commentCreate\",\"commentUpdate\"]}"} +{"question": "Rename the Engineering issue describing intermittent login failures to 'Fix login bug - follow up'.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"issues\",\"where\":{\"identifier\":{\"eq\":\"ENG-1\"}},\"expected_changes\":{\"title\":{\"to\":{\"contains\":\"follow up\"}}},\"ignore\":[\"updatedAt\"]}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_14", "test_name": "Update login issue title", "service": "linear", "task_horizon": 2, "operation_type": "search+U", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"issues\",\"issueUpdate\"]}"} +{"question": "Cancel issue ENG-1 (set its status to Canceled)", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"issues\",\"where\":{\"identifier\":{\"eq\":\"ENG-1\"}},\"expected_changes\":{\"stateId\":{\"to\":{\"eq\":\"d4f59a6d-33cb-45d1-8f4e-3e57536f912d\"}}},\"ignore\":[\"updatedAt\"]}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_23", "test_name": "Move ENG-1 to Canceled", "service": "linear", "task_horizon": 3, "operation_type": "search+U", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": 
"{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"issues\",\"workflowStates\",\"issueUpdate\"]}"} +{"question": "The Archaeology team is managing the Season 3 excavation at Site Karnak-West. There's a workflow problem blocking progress.\n\nExamine the issues \"Artifact Photography Documentation\" and \"Lab Sample Analysis\". These two issues are in a dependency deadlock - each one is marked as blocking the other, which means neither can proceed.\n\nDetermine which blocking relationship is incorrect. The correct archaeological workflow is: Photography must complete BEFORE samples can go to the lab (you need photos of artifacts in situ before extraction for the record). The reverse relationship (lab blocking photography) was added by mistake and makes no sense.\n\nDelete the incorrect blocking relationship to resolve the deadlock.\n\nNow extend the workflow. Create a new issue called \"Final Site Report Compilation - Season 3\" in the Archaeology team. This report cannot be written until BOTH the photography documentation AND the lab analysis are complete. Set up both as blockers for the report.\n\nAssign the work: Ximena handles photography, Okonkwo handles lab analysis, and Søren compiles the final report.\n\nMove the photography issue to \"In Progress\" now that it's unblocked.\n\nAfter fixing everything, add a comment to the \"Lab Sample Analysis\" issue documenting the fix: \"WORKFLOW_FIX: Removed erroneous blocking relation where Lab was blocking Photography. Correct flow is Photography → Lab (need in-situ photos before extraction). Deadlock resolved. 
Current chain: Photography → Lab Analysis → Final Report.\"", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"issue_relations\",\"where\":{\"id\":{\"eq\":\"rel-lab-blocks-photo-002\"}},\"expected_changes\":{\"archivedAt\":{\"from\":{\"exists\":false},\"to\":{\"exists\":true}}},\"expected_count\":1,\"ignore\":[\"updatedAt\"]},{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"title\":{\"contains\":\"Final Site Report\"},\"teamId\":{\"eq\":\"a6b7c8d9-e0f1-2345-6789-0abcdef12345\"},\"assigneeId\":{\"eq\":\"c7d8e9f0-a1b2-3456-1234-890123456789\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"issues\",\"where\":{\"id\":{\"eq\":\"arch-issue-photography-001\"}},\"expected_changes\":{\"stateId\":{\"to\":{\"eq\":\"arch-state-inprogress-2345-cdef01\"}},\"assigneeId\":{\"to\":{\"eq\":\"b6c7d8e9-f0a1-2345-0123-789012345678\"}}},\"expected_count\":1,\"ignore\":[\"updatedAt\"]},{\"diff_type\":\"changed\",\"entity\":\"issues\",\"where\":{\"id\":{\"eq\":\"arch-issue-lab-analysis-002\"}},\"expected_changes\":{\"assigneeId\":{\"to\":{\"eq\":\"d4e5f6a7-b8c9-0123-def0-456789012345\"}}},\"expected_count\":1,\"ignore\":[\"updatedAt\"]},{\"diff_type\":\"added\",\"entity\":\"issue_relations\",\"where\":{\"type\":{\"eq\":\"blocks\"}},\"expected_count\":2},{\"diff_type\":\"added\",\"entity\":\"comments\",\"where\":{\"body\":{\"contains\":\"WORKFLOW_FIX:\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"comments\",\"where\":{\"body\":{\"contains\":\"Deadlock resolved\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_49", "test_name": "Archaeological Dig Site Coordination - Deadlock Resolution", "service": "linear", "task_horizon": 13, "operation_type": "search+C+U+D", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": 
"{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"teams\",\"issues\",\"users\",\"workflowStates\",\"issueRelationDelete\",\"issueCreate\",\"issueRelationCreate\",\"issueUpdate\",\"commentCreate\"]}"} +{"question": "Create three new issues in Engineering: 'Alpha', 'Beta', and 'Gamma'. Configure them as a dependency chain where Alpha blocks Beta, and Beta blocks Gamma.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"teamId\":{\"eq\":\"ad608998-915c-4bad-bcd9-85ebfccccee8\"}},\"expected_count\":3},{\"diff_type\":\"added\",\"entity\":\"issue_relations\",\"where\":{\"type\":{\"eq\":\"blocks\"}},\"expected_count\":2}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_39", "test_name": "Dependency Chain Creation", "service": "linear", "task_horizon": 6, "operation_type": "search+C", "entity_scope": "multi", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"teams\",\"issueCreate\",\"issueRelationCreate\"]}"} +{"question": "Add a new workflow state for the product team called 'Done'. 
Then, create a new issue in this team called 'Mini-demo', mark it as done, and add a comment to it saying 'Marking it as done because Hubert already prepped the demo but forgot to create an issue'.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"workflow_states\",\"where\":{\"name\":{\"eq\":\"Done\"},\"teamId\":{\"eq\":\"cdb85540-5065-4346-8aef-ae2b72d6e940\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"title\":{\"eq\":\"Mini-demo\"},\"teamId\":{\"eq\":\"cdb85540-5065-4346-8aef-ae2b72d6e940\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"comments\",\"where\":{\"body\":{\"contains\":\"Hubert already prepped\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_27", "test_name": "Create State, Issue, and Comment flow", "service": "linear", "task_horizon": 5, "operation_type": "search+C+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"teams\",\"workflowStateCreate\",\"issueCreate\",\"issueUpdate\",\"commentCreate\"]}"} +{"question": "Find any Engineering tickets in 'In Progress' that were created before Jan 2nd, 2025. 
Move them back to 'Todo' and add a comment: 'Demoted due to lack of progress'.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"issues\",\"where\":{\"identifier\":{\"eq\":\"ENG-3\"}},\"expected_changes\":{\"stateId\":{\"to\":{\"eq\":\"741f29ae-cfb3-4b8a-a1f8-c5161c842366\"}}},\"ignore\":[\"updatedAt\"]},{\"diff_type\":\"added\",\"entity\":\"comments\",\"where\":{\"issueId\":{\"eq\":\"87c1d2f3-66c4-4dd0-bc93-1b99d04dc374\"},\"body\":{\"contains\":\"Demoted\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_37", "test_name": "SLA Enforcement (Dates & States)", "service": "linear", "task_horizon": 7, "operation_type": "search+C+U", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"teams\",\"workflowStates\",\"issues\",\"issueUpdate\",\"commentCreate\"]}"} +{"question": "The PMO is conducting a Q1 resource allocation review. Here's what needs to happen:\n\nFirst, look at all teams and count how many members each team has.\n\nFind the team with the most members - this is our \"fully staffed\" benchmark.\n\nFor every team that has FEWER members than the benchmark team, create a new issue in that team titled \"Q1 Staffing Request - Need [X] additional team members\" where [X] is the exact difference between that team's member count and the benchmark team's count. Set priority to High for these issues.\n\nAlso, there's a misrouted issue: \"API Documentation Update\" was accidentally created in the Design team but belongs in Engineering. Move it to the Engineering team.\n\nFinally, add a comment to any issue in the Engineering team summarizing the analysis:\n\n\"RESOURCE_AUDIT: Q1 staffing review complete. 
Engineering has [MAX] members (benchmark). Staffing gaps identified: Product needs [A], Design needs [B], QA needs [C]. Total headcount gap across org: [TOTAL]. Staffing request issues created in all understaffed teams.\"\n\nReplace the bracketed values with the actual numbers from your analysis. Note: [TOTAL] should be the sum of headcount gaps from ALL understaffed teams (not just Product, Design, and QA).", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"issues\",\"where\":{\"id\":{\"eq\":\"res-issue-api-docs-001\"}},\"expected_changes\":{\"teamId\":{\"to\":{\"eq\":\"ad608998-915c-4bad-bcd9-85ebfccccee8\"}}},\"expected_count\":1,\"ignore\":[\"updatedAt\"]},{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"teamId\":{\"eq\":\"cdb85540-5065-4346-8aef-ae2b72d6e940\"},\"title\":{\"contains\":\"Staffing Request\"},\"priority\":{\"eq\":2}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"teamId\":{\"eq\":\"f1a2b3c4-d5e6-7890-1234-567890abcdef\"},\"title\":{\"contains\":\"Staffing Request\"},\"priority\":{\"eq\":2}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"teamId\":{\"eq\":\"a1b2c3d4-e5f6-7890-1234-567890abcdef\"},\"title\":{\"contains\":\"Staffing Request\"},\"priority\":{\"eq\":2}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"comments\",\"where\":{\"body\":{\"contains\":\"RESOURCE_AUDIT:\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"comments\",\"where\":{\"body\":{\"contains\":\"7 members\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"comments\",\"where\":{\"body\":{\"contains\":\"Product needs 4\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"comments\",\"where\":{\"body\":{\"contains\":\"Design needs 5\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"comments\",\"where\":{\"body\":{\"contains\":\"QA needs 
3\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"comments\",\"where\":{\"body\":{\"contains\":\"Total headcount gap across org: 28\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_54", "test_name": "Quarterly Resource Allocation Review", "service": "linear", "task_horizon": 12, "operation_type": "search+R+C+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "high", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"teams\",\"issues\",\"issueUpdate\",\"issueCreate\",\"commentCreate\"]}"} +{"question": "Read the comments on the 'production incident' ticket. Create a new Engineering ticket titled 'Fix 500 errors in eval runner' with a description based on the analysis in the comments. Mark this new ticket as being blocked by ENG-3.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"title\":{\"contains\":\"Fix 500 errors\"},\"description\":{\"contains\":\"agent\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issue_relations\",\"where\":{\"type\":{\"eq\":\"blocks\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_34", "test_name": "Create Follow-up Task from Incident Comments", "service": "linear", "task_horizon": 5, "operation_type": "search+R+C", "entity_scope": "multi", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"issues\",\"comments\",\"teams\",\"issueCreate\",\"issueRelationCreate\"]}"} +{"question": "Assign the 
'email sign-in' ticket to the Engineering team member who currently has the fewest assigned issues. If there is a tie, pick anyone with the lowest count.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"issues\",\"where\":{\"identifier\":{\"eq\":\"ENG-6\"}},\"expected_changes\":{\"assigneeId\":{\"to\":{\"eq\":\"b55072d7-ccaa-43cd-8ab7-3dca324e3294\"}}},\"ignore\":[\"updatedAt\"]}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_35", "test_name": "Load Balancer (Logic & Aggregation)", "service": "linear", "task_horizon": 3, "operation_type": "search+R+U", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"teams\",\"issues\",\"issueUpdate\"]}"} +{"question": "Add the 'RL' label to the login issue that John Doe recently commented on.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"issue_label_issue_association\",\"where\":{\"issue_id\":{\"eq\":\"c6e168e3-fed4-45d0-b03f-a1c1f89ee7ab\"},\"issue_label_id\":{\"eq\":\"8f01ce9d-1433-4c4c-969d-21ca3bf2718f\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_15", "test_name": "Tag commented issue with RL label", "service": "linear", "task_horizon": 4, "operation_type": "search+R+U", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"users\",\"comments\",\"issueLabels\",\"issueUpdate\"]}"} +{"question": "Update the description of issue ENG-1 to include 'Root 
cause: session timeout issue'", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"issues\",\"where\":{\"identifier\":{\"eq\":\"ENG-1\"}},\"expected_changes\":{\"description\":{\"to\":{\"contains\":\"session timeout\"}}},\"ignore\":[\"updatedAt\"]}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_8", "test_name": "Update issue description", "service": "linear", "task_horizon": 2, "operation_type": "search+U", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"issues\",\"issueUpdate\"]}"} +{"question": "Move issue ENG-2 to 'In Review'", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"issues\",\"where\":{\"identifier\":{\"eq\":\"ENG-2\"}},\"expected_changes\":{\"stateId\":{\"to\":{\"eq\":\"4379b3d7-1143-4aa4-a3a6-da0c436e73b6\"}}},\"ignore\":[\"updatedAt\"]}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_16", "test_name": "Move ENG-2 to In Review", "service": "linear", "task_horizon": 3, "operation_type": "search+U", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"issues\",\"workflowStates\",\"issueUpdate\"]}"} +{"question": "Create three new issues in the Engineering team: 'Update documentation', 'Refactor API endpoints', and 'Add unit tests'", "answer": 
"{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"teamId\":{\"eq\":\"ad608998-915c-4bad-bcd9-85ebfccccee8\"}},\"expected_count\":3}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_9", "test_name": "Create multiple issues in batch", "service": "linear", "task_horizon": 4, "operation_type": "search+C", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"teams\",\"issueCreate\"]}"} +{"question": "Find duplicated issues regarding 'trace analysis' in the Product team and mark the 2nd one (the newer one) as a duplicate.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"workflow_states\",\"where\":{\"name\":{\"eq\":\"Duplicate\"},\"teamId\":{\"eq\":\"cdb85540-5065-4346-8aef-ae2b72d6e940\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"issues\",\"where\":{\"identifier\":{\"eq\":\"PROD-2\"}},\"expected_changes\":{\"stateId\":{\"from\":{\"eq\":\"0fde0f94-ee5f-4a37-ad23-a3acd0080c57\"},\"to\":{\"exists\":true}}},\"ignore\":[\"updatedAt\"]}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_29", "test_name": "Resolve duplicate issues (requires creating state)", "service": "linear", "task_horizon": 4, "operation_type": "search+C+U", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"teams\",\"issues\",\"workflowStateCreate\",\"issueUpdate\"]}"} +{"question": "The Clay & Fire pottery studio tracks their kiln 
schedule. Help update the firing queue.\n\nFirst, find the existing issues \"Fatou's Celadon Vase\" and \"Stoneware Bowl Set\" in the Ceramics team.\n\nThe celadon vase is ready to go in the kiln - move it to \"Firing\" status.\n\nThe stoneware bowls have finished their cone 10 firing and need to cool down - move them to \"Cooling\" status.\n\nCreate a new label called \"raku-firing\" for pieces that will use the rapid-cooling technique (we'll apply it to future items).\n\nFinally, add a kiln log comment to the celadon vase issue: \"KILN_LOG: Loaded into kiln #2 at 9:15am. Target: Cone 9 oxidation (~2300°F). Fatou requested slow cooling for crystal development. Do not open kiln door until temp drops below 400°F.\"", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"issues\",\"where\":{\"id\":{\"eq\":\"cer-issue-vase-001\"}},\"expected_changes\":{\"stateId\":{\"to\":{\"eq\":\"cer-state-firing-1234-bcdef012\"}}},\"expected_count\":1,\"ignore\":[\"updatedAt\"]},{\"diff_type\":\"changed\",\"entity\":\"issues\",\"where\":{\"id\":{\"eq\":\"cer-issue-bowls-002\"}},\"expected_changes\":{\"stateId\":{\"to\":{\"eq\":\"cer-state-cooling-2345-cdef0123\"}}},\"expected_count\":1,\"ignore\":[\"updatedAt\"]},{\"diff_type\":\"added\",\"entity\":\"issue_labels\",\"where\":{\"name\":{\"eq\":\"raku-firing\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"comments\",\"where\":{\"body\":{\"contains\":\"KILN_LOG:\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"comments\",\"where\":{\"body\":{\"contains\":\"Cone 9\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"comments\",\"where\":{\"body\":{\"contains\":\"2300\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"comments\",\"where\":{\"body\":{\"contains\":\"crystal development\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_51", "test_name": "Pottery Studio - 
Kiln Firing Schedule", "service": "linear", "task_horizon": 10, "operation_type": "search+C+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"issues\",\"workflowStates\",\"issueUpdate\",\"issueLabelCreate\",\"commentCreate\"]}"} +{"question": "Rename the label 'UX' to 'User Experience'", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"issue_labels\",\"where\":{\"id\":{\"eq\":\"f9c5b7c8-3909-4f8b-bc25-73e1b56a9c0c\"}},\"expected_changes\":{\"name\":{\"to\":{\"eq\":\"User Experience\"}}},\"ignore\":[\"updatedAt\"]}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_25", "test_name": "Rename a label", "service": "linear", "task_horizon": 2, "operation_type": "search+U", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"issueLabels\",\"issueLabelUpdate\"]}"} +{"question": "Please edit the comment (id: e6a0f6c4-4d0e-4f4c-9a54-444444444444) about open telemetry and include the link 'https://smith.langchain.com/' as an example.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"comments\",\"where\":{\"id\":{\"eq\":\"e6a0f6c4-4d0e-4f4c-9a54-444444444444\"}},\"expected_changes\":{\"body\":{\"to\":{\"contains\":\"smith.langchain.com\"}}},\"ignore\":[\"updatedAt\"]}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_31", "test_name": "Edit comment with link", "service": "linear", "task_horizon": 1, "operation_type": "U", 
"entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"commentUpdate\"]}"} +{"question": "The Seed Library team is conducting its quarterly germination audit. We need to evaluate each donor's success rate and take appropriate action.\n\nFirst, calculate Yuto's germination rate: count how many of his seed packets are in \"Sprouted\" status versus \"Failed\" status, then compute (sprouted / total) × 100. If his rate is 75% or higher AND he has at least 2 different varieties that sprouted, create a new label called \"Seed Guardian\" and apply it to all of his packets as recognition.\n\nNext, handle Nneka's special case: she donated exactly one packet and it's marked as priority 1 (rare heirloom). Regardless of whether it sprouted or failed, move this packet to \"Preserved Collection\" status—the library will attempt tissue culture propagation on rare genetics.\n\nFinally, evaluate Szymon's packets the same way. 
If his germination rate is below 60%, move all his non-sprouted packets to \"Needs Donor Review\" status and add a comment to each one that reads: \"GERMINATION_AUDIT: X sprouted / Y total = Z% - below 60% threshold\" where X, Y, Z are the actual calculated values.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"issue_labels\",\"where\":{\"name\":{\"eq\":\"Seed Guardian\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issue_label_issue_association\",\"where\":{},\"expected_count\":4},{\"diff_type\":\"changed\",\"entity\":\"issues\",\"where\":{\"identifier\":{\"eq\":\"SEED-5\"}},\"expected_changes\":{\"stateId\":{\"to\":{\"eq\":\"34567890-bcde-f012-3456-789abcdef012\"}}},\"ignore\":[\"updatedAt\"]},{\"diff_type\":\"changed\",\"entity\":\"issues\",\"where\":{\"identifier\":{\"eq\":\"SEED-7\"}},\"expected_changes\":{\"stateId\":{\"to\":{\"eq\":\"45678901-cdef-0123-4567-89abcdef0123\"}}},\"ignore\":[\"updatedAt\"]},{\"diff_type\":\"changed\",\"entity\":\"issues\",\"where\":{\"identifier\":{\"eq\":\"SEED-8\"}},\"expected_changes\":{\"stateId\":{\"to\":{\"eq\":\"45678901-cdef-0123-4567-89abcdef0123\"}}},\"ignore\":[\"updatedAt\"]},{\"diff_type\":\"added\",\"entity\":\"comments\",\"where\":{\"body\":{\"contains\":\"GERMINATION_AUDIT:\"}},\"expected_count\":2}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_41", "test_name": "Seed Library Germination Audit", "service": "linear", "task_horizon": 11, "operation_type": "search+R+C+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"teams\",\"issues\",\"users\",\"issueLabels\",\"workflowStates\",\"issueLabelCreate\",\"issueUpdate\",\"commentCreate\"]}"} +{"question": "The Wing & Wind 
pigeon racing club has an emergency. A fast-moving storm system is approaching the race corridor and we need to execute safety protocols.\n\nFirst, find all birds currently marked as \"In Flight\" in the Racing Operations team - these are the ones at risk.\n\nCreate an emergency coordination issue in the Racing Operations team titled \"WEATHER ALERT: Storm cell approaching sector 7 - All birds at risk\" with description \"NWS severe thunderstorm warning issued 14:32 UTC. Wind gusts to 60mph expected. Initiating emergency diversion protocol.\"\n\nFind the bird tracking issue for \"Stormchaser\" (Liora's champion racer, band #2847). Update it to add this to the description: \"DIVERSION ACTIVE: Rerouted to backup loft at coordinates 41.8781° N, 87.6298° W. Amadi's loft confirmed ready to receive.\"\n\nFinally, add a weather advisory comment to the emergency coordination issue:\n\"WEATHER_LOG: Storm tracking update at 14:45 UTC. Cell moving NNE at 35mph. ETA to race corridor: 47 minutes. All handlers notified via SMS. GPS tracking shows 3 birds diverted successfully. 
Amadi confirming visual on Stormchaser approaching backup loft.\"", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"teamId\":{\"eq\":\"race-team-001\"},\"title\":{\"contains\":\"WEATHER ALERT\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"title\":{\"contains\":\"sector 7\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"description\":{\"contains\":\"NWS\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"description\":{\"contains\":\"60mph\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"issues\",\"where\":{\"id\":{\"eq\":\"race-issue-stormchaser-001\"}},\"expected_changes\":{\"description\":{\"to\":{\"contains\":\"DIVERSION ACTIVE\"}}},\"expected_count\":1,\"ignore\":[\"updatedAt\"]},{\"diff_type\":\"changed\",\"entity\":\"issues\",\"where\":{\"id\":{\"eq\":\"race-issue-stormchaser-001\"}},\"expected_changes\":{\"description\":{\"to\":{\"contains\":\"41.8781\"}}},\"expected_count\":1,\"ignore\":[\"updatedAt\"]},{\"diff_type\":\"added\",\"entity\":\"comments\",\"where\":{\"body\":{\"contains\":\"WEATHER_LOG:\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"comments\",\"where\":{\"body\":{\"contains\":\"14:45 UTC\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"comments\",\"where\":{\"body\":{\"contains\":\"Amadi confirming\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"comments\",\"where\":{\"body\":{\"contains\":\"WEATHER_LOG:\"},\"issueId\":{\"eq\":\"race-issue-stormchaser-001\"}},\"expected_count\":0},{\"diff_type\":\"changed\",\"entity\":\"issues\",\"where\":{\"id\":{\"eq\":\"race-issue-stormchaser-001\"}},\"expected_changes\":{\"description\":{\"to\":{\"contains\":\"87.6298\"}}},\"expected_count\":1,\"ignore\":[\"updatedAt\"]}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": 
"linear_56", "test_name": "Competitive Pigeon Racing Club - Storm Emergency Protocol", "service": "linear", "task_horizon": 12, "operation_type": "search+R+C+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"issues\",\"issueCreate\",\"issueUpdate\",\"commentCreate\"]}"} +{"question": "Update the seeded comment on ENG-1 to: 'Updated: working on a fix and adding tests'", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"comments\",\"where\":{\"id\":{\"eq\":\"e10f59c3-7a49-4d52-8dba-8c8602f8c807\"}},\"expected_changes\":{\"body\":{\"to\":{\"contains\":\"Updated: working on a fix\"}}},\"ignore\":[\"updatedAt\"]}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_20", "test_name": "Update seeded comment on ENG-1", "service": "linear", "task_horizon": 3, "operation_type": "U", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"issues\",\"comments\",\"commentUpdate\"]}"} +{"question": "The Backend team is doing sprint cleanup. Here's what needs to happen:\n\nFirst, find the existing issue about \"database migration\" - we'll need its ID for a dependency.\n\nCreate a new parent issue titled \"Q1 Infrastructure Overhaul\" in the Backend team. This will be our tracking epic.\n\nCreate a sub-issue under that epic titled \"Upgrade Redis cluster to v7\" - make sure to set the parent relationship to the epic you just created.\n\nThe Redis upgrade cannot start until the database migration is complete. 
Set up the Redis issue as blocked by the database migration issue.\n\nNow, add a standup note comment to the Redis issue that says: \"STANDUP_NOTE: Jamal will start this after migration completes. ETA next Tuesday.\"\n\nWait - that comment was supposed to go on the migration ticket, not the Redis ticket. Delete that comment.\n\nFinally, assign the Redis sub-task to Jamal, and assign the parent epic \"Q1 Infrastructure Overhaul\" to Olga with High priority.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"title\":{\"contains\":\"Q1 Infrastructure Overhaul\"},\"teamId\":{\"eq\":\"c2d3e4f5-a6b7-8901-2345-6789abcdef01\"},\"assigneeId\":{\"eq\":\"d6e7f8a9-b0c1-2345-0123-789012345678\"},\"priority\":{\"eq\":2.0}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issues\",\"where\":{\"title\":{\"contains\":\"Redis cluster\"},\"assigneeId\":{\"eq\":\"e7f8a9b0-c1d2-3456-1234-890123456789\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issue_relations\",\"where\":{\"type\":{\"eq\":\"blocks\"},\"issueTitle\":{\"contains\":\"database migration\"},\"relatedIssueTitle\":{\"contains\":\"Redis\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_45", "test_name": "Sprint Backlog Cleanup", "service": "linear", "task_horizon": 9, "operation_type": "search+C+U+D", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"teams\",\"issues\",\"users\",\"issueCreate\",\"issueRelationCreate\",\"commentCreate\",\"commentDelete\",\"issueUpdate\"]}"} +{"question": "We are replacing the 'RL' label with a new 'AI' label. Create a new label named 'AI'. 
Find all tickets with the 'RL' label, tag them with 'AI', and remove the 'RL' label.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"issue_labels\",\"where\":{\"name\":{\"eq\":\"AI\"}},\"expected_count\":1},{\"diff_type\":\"removed\",\"entity\":\"issue_label_issue_association\",\"where\":{\"issue_id\":{\"eq\":\"b4f5130f-5c1b-4bc0-a8f6-60a22b0adf5e\"},\"issue_label_id\":{\"eq\":\"8f01ce9d-1433-4c4c-969d-21ca3bf2718f\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"issue_label_issue_association\",\"where\":{\"issue_id\":{\"eq\":\"b4f5130f-5c1b-4bc0-a8f6-60a22b0adf5e\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_36", "test_name": "Label Migration (Batch & Sequence)", "service": "linear", "task_horizon": 4, "operation_type": "search+C+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"issueLabelCreate\",\"issueLabels\",\"issues\",\"issueUpdate\"]}"} +{"question": "Add the 'UX' label to the onboarding dashboard issue that Sarah Smith owns.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"issue_label_issue_association\",\"where\":{\"issue_id\":{\"eq\":\"5c62f29d-0f6a-4c4d-9d25-52293e2a8d4f\"},\"issue_label_id\":{\"eq\":\"f9c5b7c8-3909-4f8b-bc25-73e1b56a9c0c\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\",\"updatedAt\",\"createdAt\",\"editedAt\"]}}", "test_id": "linear_13", "test_name": "Add UX label to onboarding issue", "service": "linear", "task_horizon": 3, "operation_type": "search+U", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "low", "info": 
"{\"service\":\"linear\",\"seed_template\":\"linear_expanded\",\"impersonate_user_id\":\"2790a7ee-fde0-4537-9588-e233aa5a68d1\",\"eval_type\":\"actionEval\",\"tools_required\":[\"issueLabels\",\"issues\",\"issueUpdate\"]}"} +{"question": "I need your help coordinating our Lunar New Year product launch for the APAC market. Can you first catch me up on what's been happening in #model-research and #core-infra? I want to make sure we're not planning a launch during any technical instability.\n\nAlso, I need to verify that Kenji Sato and Robert Chen are the right people to loop in on this - can you confirm their roles for me? Kenji should be handling APAC growth and Robert should be our engineering lead.\n\nOnce you've gathered that context, please set up a dedicated channel for this initiative and make sure the topic clearly reflects what we're working on. Then post a summary of what you found to #project-alpha-dev so the team is aligned.\n\nOh, and I think I sent a message earlier about the timeline that needs updating with the correct dates - can you fix that? 
And if there's anything important in those channel histories worth acknowledging, give it a thumbs up so people know we've seen it.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"channels\",\"where\":{\"is_private\":{\"eq\":false}},\"expected_count\":{\"min\":1}},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{},\"expected_count\":{\"min\":1}}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_100", "test_name": "Lunar New Year Product Launch", "service": "slack", "task_horizon": 10, "operation_type": "search+R+C+U", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"conversations.history\",\"conversations.history\",\"users.info\",\"users.info\",\"conversations.create\",\"conversations.setTopic\",\"chat.postMessage\",\"chat.update\",\"reactions.add\",\"conversations.create\"]}"} +{"question": "Find the message that says 'Hey team' and edit it to say 'Hello everyone'", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C01ABCD1234\"},\"message_id\":{\"eq\":\"1699564800.000123\"}},\"expected_changes\":{\"message_text\":{\"to\":{\"contains\":\"Hello everyone\"}}}}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_70", "test_name": "Edit existing message", "service": "slack", "task_horizon": 2, "operation_type": "search+U", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"search.messages\",\"chat.update\"]}"} +{"question": "Send a markdown formatted message to #engineering with a header 'Daily Report' and 
a bold item '**All Systems Go**'. Use Slack Block Kit with a markdown block type.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C03IJKL9012\"},\"blocks\":{\"contains\":\"\\\"type\\\":\\\"mrkdwn\\\"\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_84", "test_name": "Markdown Block (Direct)", "service": "slack", "task_horizon": 2, "operation_type": "C+R", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"conversations.list\",\"chat.postMessage\"]}"} +{"question": "Reply 'Next monday.' to the most recent message in #general", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"parent_id\":{\"eq\":\"1706115500.000001\"},\"channel_id\":{\"eq\":\"C01ABCD1234\"},\"message_text\":{\"contains\":\"Next monday\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_65", "test_name": "Reply in a thread", "service": "slack", "task_horizon": 3, "operation_type": "search+R+C", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "high", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"conversations.list\",\"conversations.history\",\"chat.postMessage\"]}"} +{"question": "Create a new channel called 'rl-project' and add Morgan Stanley to it", "answer": 
"{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"channels\",\"where\":{\"channel_name\":{\"i_contains\":\"rl-project\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"channel_members\",\"where\":{\"user_id\":{\"eq\":\"U05MORGAN23\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_62", "test_name": "Create a new channel and add user to it ", "service": "slack", "task_horizon": 3, "operation_type": "search+C", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"users.list\",\"conversations.create\",\"conversations.invite\"]}"} +{"question": "You've replied to one of the messages with a bad joke. Edit it, for 'I will make a proposal for auth improvements tommorow EOD'", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C03IJKL9012\"},\"parent_id\":{\"eq\":\"1700143200.000999\"},\"message_id\":{\"eq\":\"1700153200.000999\"},\"user_id\":{\"eq\":\"U01AGENBOT9\"}},\"expected_changes\":{\"message_text\":{\"to\":{\"contains\":\"proposal for auth improvements tommorow EOD\"}}}}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_78", "test_name": "Edit a thread message", "service": "slack", "task_horizon": 3, "operation_type": "R+U", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"conversations.history\",\"conversations.replies\",\"chat.update\"]}"} +{"question": "Do two things using Slack Block Kit: 1) Send a code snippet to #engineering containing `{\"status\": 200}` using rich_text_preformatted 
element, and 2) Send a numbered list to #general with items 'Phase 1' and 'Phase 2' using rich_text_list with style:ordered.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C03IJKL9012\"},\"blocks\":{\"contains\":\"rich_text_preformatted\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C01ABCD1234\"},\"blocks\":{\"contains\":\"rich_text_list\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_81", "test_name": "Rich Text: Code Block and Numbered List", "service": "slack", "task_horizon": 3, "operation_type": "C+R", "entity_scope": "multi", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"conversations.list\",\"chat.postMessage\",\"chat.postMessage\"]}"} +{"question": "Hey, I need your help coordinating our 24-hour global hackathon across Lagos, Kyiv, Warsaw, and SF. First, can you find out which channels are relevant for this open source hackathon we're running? I want to make sure #core-infra has an updated topic that reflects we're in hackathon mode.\n\nAlso, I need to post an update to the infrastructure team about our coordination status. Before I loop in Łukasz Kowalski and Kenji Sato, can you pull up their profiles? I want to confirm Łukasz is still our performance lead and check Kenji's role on the APAC growth side.\n\nI posted something outdated in one of the channels yesterday that needs to be removed - it had wrong timezone info. Can you also check what's been discussed recently in #project-alpha-dev so I'm caught up? 
And verify who's currently in #frontend - we might need to add some people.\n\nOh, and when you find any important messages about the hackathon prep, just give them a thumbs up so people know we've seen them.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{},\"expected_count\":{\"min\":1}}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_94", "test_name": "Open Source Hackathon Coordination", "service": "slack", "task_horizon": 9, "operation_type": "search+C+R+U+D", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "high", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"conversations.list\",\"conversations.setTopic\",\"chat.postMessage\",\"users.info\",\"users.info\",\"conversations.history\",\"reactions.add\",\"chat.delete\",\"conversations.members\"]}"} +{"question": "Add Morgan Stanley to the 'random' channel", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"channel_members\",\"where\":{\"user_id\":{\"eq\":\"U05MORGAN23\"},\"channel_id\":{\"eq\":\"C02EFGH5678\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_61", "test_name": "Add user to channel", "service": "slack", "task_horizon": 3, "operation_type": "search+C", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"users.list\",\"conversations.list\",\"conversations.invite\"]}"} +{"question": "Find all questions in #random and post each one to #general as separate messages.", "answer": 
"{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C01ABCD1234\"},\"message_text\":{\"contains\":\"Gemini\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C01ABCD1234\"},\"message_text\":{\"contains\":\"preview\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C01ABCD1234\"},\"message_text\":{\"contains\":\"garlic knots\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C01ABCD1234\"},\"message_text\":{\"contains\":\"shared lunch\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_74", "test_name": "Forward questions to another channel", "service": "slack", "task_horizon": 6, "operation_type": "search+R+C", "entity_scope": "multi", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"conversations.list\",\"conversations.history\",\"chat.postMessage\",\"chat.postMessage\",\"chat.postMessage\",\"chat.postMessage\"]}"} +{"question": "Quote the text 'To be or not to be' in the #random channel. 
Use Slack Block Kit rich_text blocks with rich_text_quote element.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C02EFGH5678\"},\"blocks\":{\"contains\":\"rich_text_quote\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_82", "test_name": "Rich Text: Block Quote", "service": "slack", "task_horizon": 2, "operation_type": "C+R", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"conversations.list\",\"chat.postMessage\"]}"} +{"question": "Check the discussion in #growth. If the team decided to double down on Reddit, react with :rocket: to the message proposing it.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"message_reactions\",\"where\":{\"message_id\":{\"eq\":\"1700300240.000005\"},\"user_id\":{\"eq\":\"U01AGENBOT9\"},\"reaction_type\":{\"eq\":\"rocket\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_93", "test_name": "Conditional Logic", "service": "slack", "task_horizon": 3, "operation_type": "C+R", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"conversations.list\",\"conversations.history\",\"reactions.add\"]}"} +{"question": "I need some help getting our Lunar New Year product launch coordination sorted out. We're targeting APAC markets and I want to make sure we're being culturally sensitive with our timing and messaging.\n\nFirst, can you help me figure out who on our team has the right expertise for this? 
I need to reach out directly to our frontend person about some UI elements that need to be adapted, and also connect with our engineering lead separately about the technical rollout schedule.\n\nAlso, I noticed the #project-alpha-dev channel might have some people who aren't really needed for this particular launch, and I want to keep discussions focused. Can you check who's currently in that channel? We may need to streamline the membership a bit - I think there are a couple of folks who were added for previous projects but don't need to be looped in on the APAC launch details.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"channels\",\"where\":{\"is_dm\":{\"eq\":true}},\"expected_count\":{\"min\":1}}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_96", "test_name": "Lunar New Year Product Launch", "service": "slack", "task_horizon": 7, "operation_type": "C+R", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"users.list\",\"conversations.open\",\"conversations.open\",\"conversations.members\",\"conversations.kick\",\"conversations.kick\",\"users.list\"]}"} +{"question": "Send a table to #growth with headers 'Metric' and 'Value', and one row of data: 'DAU', '1500'. 
Use Slack Block Kit with a table block type.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C04MNOP3456\"},\"blocks\":{\"contains\":\"\\\"type\\\":\\\"table\\\"\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C04MNOP3456\"},\"blocks\":{\"contains\":\"DAU\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_83", "test_name": "Table Block Generation", "service": "slack", "task_horizon": 2, "operation_type": "C+R", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"conversations.list\",\"chat.postMessage\"]}"} +{"question": "Send a DM (group conversation not channel) to Artem and Hubert saying 'Hey, I've took a look at the presentation and I have some questions. 
Can you help me?'", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"user_id\":{\"eq\":\"U01AGENBOT9\"},\"message_text\":{\"contains\":\"presentation and I have some questions\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"channels\",\"where\":{\"is_gc\":{\"eq\":true}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_59", "test_name": "Send direct message Artem, and Hubert (Create group conversation + send message)", "service": "slack", "task_horizon": 3, "operation_type": "search+C", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"users.list\",\"conversations.open\",\"chat.postMessage\"]}"} +{"question": "Search for all messages (4 messages) in the #engineering channel related to login issues and combine them into a single new message as DM to Hubert. 
Do not change the meaning of the original messages just combine them.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"channels\",\"where\":{\"is_dm\":{\"eq\":true},\"channel_id\":{\"regex\":\"^D\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"channel_members\",\"where\":{\"user_id\":{\"eq\":\"U06HUBERT23\"},\"channel_id\":{\"regex\":\"^D\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"channel_members\",\"where\":{\"user_id\":{\"eq\":\"U01AGENBOT9\"},\"channel_id\":{\"regex\":\"^D\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"user_id\":{\"eq\":\"U01AGENBOT9\"},\"channel_id\":{\"regex\":\"^D\"},\"message_text\":{\"contains\":\"500 errors\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"user_id\":{\"eq\":\"U01AGENBOT9\"},\"channel_id\":{\"regex\":\"^D\"},\"message_text\":{\"contains\":\"invalid_grant\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"user_id\":{\"eq\":\"U01AGENBOT9\"},\"channel_id\":{\"regex\":\"^D\"},\"message_text\":{\"contains\":\"login rate limit\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"user_id\":{\"eq\":\"U01AGENBOT9\"},\"channel_id\":{\"regex\":\"^D\"},\"message_text\":{\"contains\":\"login endpoint\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_75", "test_name": "Search messages in a channel and combine results into a new message as DM", "service": "slack", "task_horizon": 4, "operation_type": "search+R+C", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"search.messages\",\"users.list\",\"conversations.open\",\"chat.postMessage\"]}"} +{"question": "Mention @Artem 
in #general using Slack Block Kit rich_text blocks with a user element type containing Artem's user ID.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C01ABCD1234\"},\"blocks\":{\"contains\":\"\\\"type\\\":\\\"user\\\"\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C01ABCD1234\"},\"blocks\":{\"contains\":\"U02ARTEM23\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_85", "test_name": "Rich Text: User Mention", "service": "slack", "task_horizon": 3, "operation_type": "search+C", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"users.list\",\"conversations.list\",\"chat.postMessage\"]}"} +{"question": "I need to do a quick audit of our #engineering channel. Can you get me the full details about the channel and check who's currently in it?\n\nPost the audit results to #general - I want a message showing the exact member count (as a number) and a list of the current members by name.\n\nAfter that, rename #engineering to \"engineering-backend\" since that's what the team mainly works on. 
Then post a follow-up message in #general confirming the rename was successful with the new channel name.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C01ABCD1234\"},\"message_text\":{\"contains\":\"5\"}},\"expected_count\":{\"min\":1}},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C01ABCD1234\"},\"message_text\":{\"regex\":\"[Mm]ember\"}},\"expected_count\":{\"min\":1}},{\"diff_type\":\"changed\",\"entity\":\"channels\",\"where\":{\"channel_id\":{\"eq\":\"C03IJKL9012\"}},\"expected_changes\":{\"channel_name\":{\"to\":{\"contains\":\"engineering-backend\"}}}},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C01ABCD1234\"},\"message_text\":{\"contains\":\"engineering-backend\"}},\"expected_count\":{\"min\":1}}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_104", "test_name": "Channel Audit and Rename", "service": "slack", "task_horizon": 7, "operation_type": "C+R+U", "entity_scope": "multi", "information_availability": "explicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"conversations.list\",\"conversations.info\",\"conversations.members\",\"users.list\",\"chat.postMessage\",\"conversations.rename\",\"chat.postMessage\"]}"} +{"question": "Summarize the discussion about 'Gemini' in #random and post the summary to #engineering.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C03IJKL9012\"},\"message_text\":{\"contains\":\"Gemini\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_92", "test_name": "Cross-channel Summarization", "service": "slack", "task_horizon": 8, "operation_type": "C+R", "entity_scope": "single", 
"information_availability": "explicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"conversations.list\",\"conversations.history\",\"chat.postMessage\"]}"} +{"question": "Send 'System maintenance tonight at 10pm' to both #general and #random", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"message_text\":{\"contains\":\"maintenance\"},\"channel_id\":{\"eq\":\"C01ABCD1234\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"message_text\":{\"contains\":\"maintenance\"},\"channel_id\":{\"eq\":\"C02EFGH5678\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_72", "test_name": "Multi-channel send", "service": "slack", "task_horizon": 3, "operation_type": "search+C", "entity_scope": "multi", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"conversations.list\",\"chat.postMessage\",\"chat.postMessage\"]}"} +{"question": "Hubert, John, Morgan, and Omer want to start a mapping project for forgotten underground rivers — they're calling it \"Cartography of Lost Rivers\". Pull up some details about #core-infra to see if that community would be a good match for cross-pollination. Now, \"Morgan\" — I mean the one who's been in the engineering discussions, not the other one. Also, that Morgan asked me to count all of the messages across all of the chats that mention the word \"supercomputer.\" Do this please. 
Then create #lost-rivers-cartography, set a topic about mapping forgotten urban waterways, invite all four, and write a project manifesto as the opening post that will say: '\"supercomputer\" mentioned number of times across all of the chats'. DM Morgan privately to ask whether they'd rather lead the cartography side or the field exploration. Lastly, find a message about infrastructure in #engineering and edit it to include a mention of the new project.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"channels\",\"where\":{\"channel_name\":\"lost-rivers-cartography\",\"topic_text\":{\"i_contains\":\"waterway\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"channel_members\",\"where\":{\"user_id\":{\"in\":[\"U06HUBERT23\",\"U02JOHNDOE1\",\"U05MORGAN23\",\"U04OMER23\"]}},\"expected_count\":{\"min\":4}},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"user_id\":\"U01AGENBOT9\",\"message_text\":{\"regex\":\"(?i)supercomputer.*\\\\b2\\\\b\"}},\"expected_count\":{\"min\":1}},{\"diff_type\":\"added\",\"entity\":\"channels\",\"where\":{\"is_dm\":true},\"expected_count\":{\"min\":1}},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"user_id\":\"U01AGENBOT9\",\"message_text\":{\"regex\":\"(?i)exploration\"}},\"expected_count\":{\"min\":1}},{\"diff_type\":\"added\",\"entity\":\"channel_members\",\"where\":{\"user_id\":\"U05MORGAN23\"},\"expected_count\":{\"min\":2}},{\"diff_type\":\"changed\",\"entity\":\"messages\",\"where\":{\"channel_id\":\"C03IJKL9012\"},\"expected_changes\":{\"message_text\":{\"to\":{\"regex\":\"(?i)lost.rivers\"}}},\"ignore\":[\"blocks\"],\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"user_id\":\"U01AGENBOT9\"},\"expected_count\":{\"min\":2}}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_110", "test_name": "Cartography of Lost Rivers", "service": "slack", "task_horizon": 14, "operation_type": "search+C+R+U", 
"entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"users.list\",\"conversations.list\",\"conversations.info\",\"search.all\",\"conversations.create\",\"conversations.setTopic\",\"conversations.invite\",\"chat.postMessage\",\"conversations.open\",\"chat.update\",\"conversations.history\"]}"} +{"question": "I need some help organizing our cross-cultural tea ceremony exchange between the Tokyo and Paris offices. Can you set up a dedicated channel for this traditional tea ceremony planning initiative and make sure the channel topic clearly explains what it's about?\n\nBefore we get started, I want to check what's been happening in #random lately - there was some casual conversation about lunch plans and something about Gemini that might be relevant to who's interested. Also, take a look at #growth because I remember seeing some Reddit strategy discussion that could tie into how we promote this cultural exchange.\n\nI posted a few messages earlier about the event that need updating with corrected information - the dates and details have changed. There's also one outdated message I sent that's no longer relevant and should just be removed entirely to avoid confusion.\n\nOh, and if you see the message where Priya or Mateo showed interest in participating, can you add a reaction to acknowledge it? 
I don't want to clutter the thread with another reply.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"channels\",\"where\":{\"is_private\":{\"eq\":false}},\"expected_count\":{\"min\":1}}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_99", "test_name": "Traditional Tea Ceremony Planning", "service": "slack", "task_horizon": 11, "operation_type": "C+U+D", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "high", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"conversations.create\",\"conversations.setTopic\",\"conversations.history\",\"conversations.history\",\"chat.update\",\"chat.update\",\"chat.update\",\"reactions.add\",\"chat.delete\",\"conversations.create\",\"conversations.create\"]}"} +{"question": "Archive the #growth channel", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"channels\",\"where\":{\"channel_id\":{\"eq\":\"C04MNOP3456\"}},\"expected_changes\":{\"is_archived\":{\"to\":true}}}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_64", "test_name": "Archive a channel", "service": "slack", "task_horizon": 2, "operation_type": "search+U", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"conversations.list\",\"conversations.archive\"]}"} +{"question": "Send a DM to John saying 'Can we sync later?'", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"user_id\":{\"eq\":\"U01AGENBOT9\"},\"message_text\":{\"contains\":\"sync 
later\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"channels\",\"where\":{\"is_dm\":{\"eq\":true}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_58", "test_name": "Send direct message", "service": "slack", "task_horizon": 3, "operation_type": "search+C", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"users.list\",\"conversations.open\",\"chat.postMessage\"]}"} +{"question": "Try to invite the user 'ElonMusk' to #general. If you can't find him, inform me (Hubert) via Slack.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"channel_members\",\"where\":{\"channel_id\":{\"eq\":\"C01ABCD1234\"}},\"expected_count\":0},{\"diff_type\":\"added\",\"entity\":\"channels\",\"where\":{\"is_dm\":{\"eq\":true}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"user_id\":{\"eq\":\"U01AGENBOT9\"},\"message_text\":{\"regex\":\"find|found|unable|couldn't|could not|not exist\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"channel_members\",\"where\":{\"user_id\":{\"eq\":\"U06HUBERT23\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_88", "test_name": "Constraint Verification (Negative Test)", "service": "slack", "task_horizon": 4, "operation_type": "search+C+R", "entity_scope": "multi", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"users.list\",\"users.list\",\"conversations.open\",\"chat.postMessage\"]}"} +{"question": "Search for all messages (6 messages) related to login issues and auth improvments. 
Edit the message in the #engineering channel you sent before without details about issues and add the details about the issues and improvements. Do not change the meaning/ woring of the original messages just combine them.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C03IJKL9012\"},\"message_id\":{\"eq\":\"1700143200.000999\"}},\"expected_changes\":{\"message_text\":{\"to\":{\"contains\":\"500 errors\"}}}},{\"diff_type\":\"changed\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C03IJKL9012\"},\"message_id\":{\"eq\":\"1700143200.000999\"}},\"expected_changes\":{\"message_text\":{\"to\":{\"contains\":\"invalid_grant\"}}}},{\"diff_type\":\"changed\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C03IJKL9012\"},\"message_id\":{\"eq\":\"1700143200.000999\"}},\"expected_changes\":{\"message_text\":{\"to\":{\"contains\":\"login rate limit\"}}}},{\"diff_type\":\"changed\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C03IJKL9012\"},\"message_id\":{\"eq\":\"1700143200.000999\"}},\"expected_changes\":{\"message_text\":{\"to\":{\"contains\":\"ogin endpoint\"}}}},{\"diff_type\":\"changed\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C03IJKL9012\"},\"message_id\":{\"eq\":\"1700143200.000999\"}},\"expected_changes\":{\"message_text\":{\"to\":{\"contains\":\"captcha\"}}}},{\"diff_type\":\"changed\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C03IJKL9012\"},\"message_id\":{\"eq\":\"1700143200.000999\"}},\"expected_changes\":{\"message_text\":{\"to\":{\"contains\":\"empty password\"}}}}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_77", "test_name": "Search messages in multiple channels and edit message in a channel with the combined results", "service": "slack", "task_horizon": 3, "operation_type": "search+R+U", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": 
"medium", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"search.messages\",\"conversations.history\",\"chat.update\"]}"} +{"question": "Send a 'hello' message to the general channel", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C01ABCD1234\"},\"message_text\":{\"contains\":\"hello\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_57", "test_name": "Send message to general channel", "service": "slack", "task_horizon": 2, "operation_type": "C+R", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"conversations.list\",\"chat.postMessage\"]}"} +{"question": "Invite the Morgan who is NOT an admin to #random.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"channel_members\",\"where\":{\"user_id\":{\"eq\":\"U05MORGAN23\"},\"channel_id\":{\"eq\":\"C02EFGH5678\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_90", "test_name": "Ambiguity Resolution (Contextual User)", "service": "slack", "task_horizon": 3, "operation_type": "C+R", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"users.list\",\"conversations.list\",\"conversations.invite\"]}"} +{"question": "Post 'Status update: Alpha is on track' to the alpha dev channel.", "answer": 
"{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C06ALPHADEV\"},\"message_text\":{\"contains\":\"Status update\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_91", "test_name": "Ambiguity Resolution (Contextual Channel)", "service": "slack", "task_horizon": 2, "operation_type": "C+R", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "low", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"conversations.list\",\"chat.postMessage\"]}"} +{"question": "In #random, react with :thumbsup: to all messages that are questions about lunch, and react with :thumbsdown: to the message about piza combo", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"message_reactions\",\"where\":{\"message_id\":{\"eq\":\"1699572000.000789\"},\"reaction_type\":{\"eq\":\"thumbsup\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"message_reactions\",\"where\":{\"message_id\":{\"eq\":\"1706052665.000000\"},\"reaction_type\":{\"eq\":\"thumbsup\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"message_reactions\",\"where\":{\"message_id\":{\"eq\":\"1706051755.000000\"},\"reaction_type\":{\"eq\":\"thumbsdown\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_67", "test_name": "Add emoji reactions", "service": "slack", "task_horizon": 5, "operation_type": "search+R+C", "entity_scope": "multi", "information_availability": "explicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"conversations.list\",\"conversations.history\",\"reactions.add\",\"reactions.add\",\"reactions.add\"]}"} +{"question": 
"Send a bulleted list to #random with three items: 'Bagels', 'Coffee', and 'Donuts'. Use Slack Block Kit rich_text blocks with rich_text_list (style:bullet).", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C02EFGH5678\"},\"blocks\":{\"contains\":\"rich_text_list\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C02EFGH5678\"},\"blocks\":{\"contains\":\"\\\"style\\\":\\\"bullet\\\"\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_80", "test_name": "Rich Text: Bulleted List", "service": "slack", "task_horizon": 2, "operation_type": "search+C", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"conversations.list\",\"chat.postMessage\"]}"} +{"question": "Reply 'Next monday.' 
to the to MCP deployment questions in #general", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"parent_id\":{\"eq\":\"1700173200.000456\"},\"channel_id\":{\"eq\":\"C01ABCD1234\"},\"message_text\":{\"contains\":\"Next monday\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_66", "test_name": "Reply in a thread", "service": "slack", "task_horizon": 3, "operation_type": "search+R+C", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"conversations.list\",\"conversations.history\",\"chat.postMessage\"]}"} +{"question": "Change the #general channel topic to 'Weekly standup discussions'", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"channels\",\"where\":{\"channel_id\":{\"eq\":\"C01ABCD1234\"}},\"expected_changes\":{\"topic_text\":{\"to\":{\"contains\":\"Weekly standup\"}}}}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_69", "test_name": "Update channel topic", "service": "slack", "task_horizon": 2, "operation_type": "search+U", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"conversations.list\",\"conversations.setTopic\"]}"} +{"question": "I need some help organizing our Diwali x Thanksgiving potluck celebration! We're doing a combined Indian and American traditions thing and I want to make sure we coordinate this properly across the team.\n\nFirst, can you check what channels we have that might be relevant for this event and see what's been discussed recently in #core-infra? 
I want to make sure I'm not stepping on any ongoing conversations. Also, I need to know who's on our team so I can figure out who to involve based on their backgrounds and expertise.\n\nOnce you've got that context, please update the topics for #core-infra, #project-alpha, and #growth to reflect that we're planning this potluck celebration. Then post an announcement in #project-alpha about the event.\n\nI also need you to check who's currently in #growth to make sure the right people are included, and open a direct message with Kenji Sato since I need to coordinate with him separately about timing given APAC schedules.\n\nOh, and there's an old message I posted earlier about the event that has wrong details - can you update it with the correct information? There's also an outdated announcement from last week that's no longer relevant, so please delete that. Finally, just react to Priya's message about bringing samosas to show I've seen it!", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"channels\",\"where\":{\"is_dm\":{\"eq\":true}},\"expected_count\":{\"min\":1}},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{},\"expected_count\":{\"min\":1}}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_95", "test_name": "Diwali x Thanksgiving Potluck", "service": "slack", "task_horizon": 13, "operation_type": "C+R+U+D", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "high", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"conversations.list\",\"conversations.history\",\"users.list\",\"conversations.setTopic\",\"conversations.setTopic\",\"conversations.setTopic\",\"chat.postMessage\",\"conversations.members\",\"conversations.open\",\"chat.update\",\"chat.delete\",\"reactions.add\",\"chat.postMessage\"]}"} +{"question": "Sophie and Mateo want to bring the 
workspace's food culture together under one roof — a \"Midnight Bazaar\" inspired by all those coffee and pizza conversations scattered around the channels. Dig through the workspace to find what food chatter has been going on and who's been part of it - specifically, search for the authors of the messages that contain the words \"food\" or \"eat\". That old archived channel nobody uses anymore — revive it and repurpose it as bazaar headquarters. Set a topic that captures the night-market vibe (needs to include the words \"street food\"), and write an opening post that weaves in whatever food discussions you find. While you're at it, some housekeeping: Mateo says he's drowning in #project-alpha-dev notifications and wants out — remove him. Also, that message about the espresso machine in #random? Edit it to plug the bazaar. And delete that stale message in #random asking about ordering \"large pies\" — the bazaar makes casual lunch plans obsolete.", "answer": "{\"assertions\":[{\"diff_type\":\"changed\",\"entity\":\"channels\",\"where\":{\"channel_id\":{\"eq\":\"C_OLD_PROJECT\"}},\"expected_changes\":{\"is_archived\":{\"from\":true,\"to\":false},\"topic_text\":{\"to\":{\"i_contains\":\"street 
food\"}}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"channel_members\",\"where\":{\"channel_id\":{\"eq\":\"C_OLD_PROJECT\"},\"user_id\":{\"eq\":\"U01AGENBOT9\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C_OLD_PROJECT\"}},\"expected_count\":{\"min\":1}},{\"diff_type\":\"removed\",\"entity\":\"channel_members\",\"where\":{\"channel_id\":{\"eq\":\"C06ALPHADEV\"},\"user_id\":{\"eq\":\"U_MATEO\"}},\"expected_count\":1},{\"diff_type\":\"changed\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C02EFGH5678\"}},\"expected_changes\":{\"message_text\":{\"to\":{\"i_contains\":\"bazaar\"}}},\"ignore\":[\"blocks\"],\"expected_count\":{\"min\":1}},{\"diff_type\":\"removed\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C02EFGH5678\"},\"message_text\":{\"i_contains\":\"large pies\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_108", "test_name": "Midnight Bazaar", "service": "slack", "task_horizon": 8, "operation_type": "search+C+U+D", "entity_scope": "multi", "information_availability": "explicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"search.messages\",\"conversations.unarchive\",\"conversations.join\",\"conversations.setTopic\",\"chat.postMessage\",\"conversations.kick\",\"chat.update\",\"chat.delete\"]}"} +{"question": "It's end of Q4 and I need to reorganize our Slack workspace. Help me with the following:\n\n1. First, list all the channels I'm currently a member of. Format it as a numbered list showing: channel name, member count. Send that list to me as a DM to myself.\n\n2. The \"old-project-q3\" channel was archived but we're reviving it for Q1 planning. Unarchive it and rename it to \"q1-planning-2026\". 
Update the topic to \"Q1 2026 Planning - Americas Team\".\n\n3. In #project-alpha-dev, we want to focus on the Americas timezone team only. Check each member's timezone using their profile info, then remove anyone who is NOT in an Americas timezone (timezone should start with \"America/\").\n\n4. I left an 👀 reaction on the circuit-tracer thread in #engineering a while back - please remove that since we've addressed the issue.\n\n5. Join the #product-growth channel since I'm not in it yet.\n\n6. Finally, post a Q1 kickoff message in the newly renamed channel. In the message, list which team members from #project-alpha-dev are in Americas timezones (the ones who remain after cleanup) - include their names and timezones.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"user_id\":{\"eq\":\"U01AGENBOT9\"},\"message_text\":{\"contains\":\"general\"}},\"expected_count\":{\"min\":1}},{\"diff_type\":\"changed\",\"entity\":\"channels\",\"where\":{\"channel_id\":{\"eq\":\"C_OLD_PROJECT\"}},\"expected_changes\":{\"is_archived\":{\"to\":{\"eq\":false}},\"channel_name\":{\"to\":{\"contains\":\"q1-planning\"}}}},{\"diff_type\":\"changed\",\"entity\":\"channels\",\"where\":{\"channel_id\":{\"eq\":\"C_OLD_PROJECT\"}},\"expected_changes\":{\"topic_text\":{\"to\":{\"contains\":\"Americas\"}}}},{\"diff_type\":\"removed\",\"entity\":\"channel_members\",\"where\":{\"channel_id\":{\"eq\":\"C06ALPHADEV\"},\"user_id\":{\"eq\":\"U_PRIYA\"}},\"expected_count\":1},{\"diff_type\":\"removed\",\"entity\":\"channel_members\",\"where\":{\"channel_id\":{\"eq\":\"C06ALPHADEV\"},\"user_id\":{\"eq\":\"U_LUKAS\"}},\"expected_count\":1},{\"diff_type\":\"removed\",\"entity\":\"channel_members\",\"where\":{\"channel_id\":{\"eq\":\"C06ALPHADEV\"},\"user_id\":{\"eq\":\"U_KENJI\"}},\"expected_count\":1},{\"diff_type\":\"removed\",\"entity\":\"channel_members\",\"where\":{\"channel_id\":{\"eq\":\"C06ALPHADEV\"},\"user_id\":{\"eq\":\"U_AISHA\"}},\"expected_count\":1},
{\"diff_type\":\"added\",\"entity\":\"channel_members\",\"where\":{\"channel_id\":{\"eq\":\"C06ALPHADEV\"},\"user_id\":{\"eq\":\"U_MATEO\"}},\"expected_count\":0},{\"diff_type\":\"removed\",\"entity\":\"channel_members\",\"where\":{\"channel_id\":{\"eq\":\"C06ALPHADEV\"},\"user_id\":{\"eq\":\"U_MATEO\"}},\"expected_count\":0},{\"diff_type\":\"changed\",\"entity\":\"channel_members\",\"where\":{\"channel_id\":{\"eq\":\"C06ALPHADEV\"},\"user_id\":{\"eq\":\"U_MATEO\"}},\"expected_count\":0},{\"diff_type\":\"added\",\"entity\":\"channel_members\",\"where\":{\"channel_id\":{\"eq\":\"C06ALPHADEV\"},\"user_id\":{\"eq\":\"U_ROBERT\"}},\"expected_count\":0},{\"diff_type\":\"removed\",\"entity\":\"channel_members\",\"where\":{\"channel_id\":{\"eq\":\"C06ALPHADEV\"},\"user_id\":{\"eq\":\"U_ROBERT\"}},\"expected_count\":0},{\"diff_type\":\"changed\",\"entity\":\"channel_members\",\"where\":{\"channel_id\":{\"eq\":\"C06ALPHADEV\"},\"user_id\":{\"eq\":\"U_ROBERT\"}},\"expected_count\":0},{\"diff_type\":\"removed\",\"entity\":\"message_reactions\",\"where\":{\"message_id\":{\"eq\":\"1706110000.000100\"},\"user_id\":{\"eq\":\"U01AGENBOT9\"},\"reaction_type\":{\"eq\":\"eyes\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"channel_members\",\"where\":{\"channel_id\":{\"eq\":\"C_GROWTH\"},\"user_id\":{\"eq\":\"U01AGENBOT9\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C_OLD_PROJECT\"},\"message_text\":{\"contains\":\"Mateo\"}},\"expected_count\":{\"min\":1}},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C_OLD_PROJECT\"},\"message_text\":{\"contains\":\"Robert\"}},\"expected_count\":{\"min\":1}}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_106", "test_name": "Quarterly Workspace Reorganization", "service": "slack", "task_horizon": 14, "operation_type": "C+R+U+D", "entity_scope": "multi", "information_availability": "implicit", 
"prompt_ambiguity": "medium", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"users.conversations\",\"conversations.members\",\"conversations.open\",\"chat.postMessage\",\"conversations.list\",\"conversations.unarchive\",\"conversations.rename\",\"conversations.setTopic\",\"conversations.members\",\"users.info\",\"conversations.kick\",\"reactions.remove\",\"conversations.join\",\"chat.postMessage\"]}"} +{"question": "Search for all messages (6 messages) related to login issues and auth improvments. Combine them into a single new message as DM to Hubert. Do not change the meaning of the original messages just combine them.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"channels\",\"where\":{\"is_dm\":{\"eq\":true}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"channel_members\",\"where\":{\"user_id\":{\"eq\":\"U06HUBERT23\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"user_id\":{\"eq\":\"U01AGENBOT9\"},\"message_text\":{\"contains\":\"500 errors\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"user_id\":{\"eq\":\"U01AGENBOT9\"},\"message_text\":{\"contains\":\"invalid_grant\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"user_id\":{\"eq\":\"U01AGENBOT9\"},\"message_text\":{\"contains\":\"login rate limit\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"user_id\":{\"eq\":\"U01AGENBOT9\"},\"message_text\":{\"contains\":\"login endpoint\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"user_id\":{\"eq\":\"U01AGENBOT9\"},\"message_text\":{\"contains\":\"captcha\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"user_id\":{\"eq\":\"U01AGENBOT9\"},\"message_text\":{\"contains\":\"empty 
password\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_76", "test_name": "Search messages in multiple channels and combine results into a new message as DM", "service": "slack", "task_horizon": 4, "operation_type": "search+R+C", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"search.messages\",\"users.list\",\"conversations.open\",\"chat.postMessage\"]}"} +{"question": "I need some help cleaning up our Afrobeats festival streaming project workspace. We've had some organizational issues lately. First, can you find out who's currently hanging out in #random and also get me a list of everyone on the team? I need to verify Robert Chen's role since he's supposed to be leading the engineering side of things.\n\nI also need you to dig through our CDN-related conversations - we had some important discussions about our content delivery setup that I need to reference. Check what's been happening recently in #project-alpha too.\n\nOnce you've got the lay of the land, please remove Artem Bogdanov from #project-alpha-dev and also kick Hubert Marek from #core-infra - they've moved to different workstreams. Post an update to #core-infra about our streaming infrastructure progress, and update the topic there to reflect our virtual festival focus.\n\nThere's also an outdated message I posted earlier that needs deleting, and I need to correct some information in a previous update I sent. 
Can you help me sort all this out?", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{},\"expected_count\":{\"min\":1}}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_97", "test_name": "Music Festival Tech Stack", "service": "slack", "task_horizon": 13, "operation_type": "search+C+R+U+D", "entity_scope": "multi", "information_availability": "explicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"conversations.members\",\"users.list\",\"users.info\",\"search.messages\",\"search.messages\",\"conversations.history\",\"conversations.list\",\"conversations.kick\",\"conversations.kick\",\"chat.postMessage\",\"conversations.setTopic\",\"chat.delete\",\"chat.update\"]}"} +{"question": "Remove John from the #random channel", "answer": "{\"assertions\":[{\"diff_type\":\"removed\",\"entity\":\"channel_members\",\"where\":{\"channel_id\":{\"eq\":\"C02EFGH5678\"},\"user_id\":{\"eq\":\"U02JOHNDOE1\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_63", "test_name": "Remove user from channel", "service": "slack", "task_horizon": 3, "operation_type": "search+D", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"users.list\",\"conversations.list\",\"conversations.kick\"]}"} +{"question": "Kenji, Priya, Aisha, Sophie, Lukasz, and Mateo want to do a \"Sunrise Relay\" — a collaborative poetry chain where each person writes a verse when dawn breaks in their timezone, passing the baton westward as the sun moves around the earth. 
Pull up everyone's locale and timezone info so you can figure out the correct relay order from earliest sunrise to latest. Check what's been going on in #frontend for some creative inspiration to seed the poem's theme. Create a channel called #sunrise-relay, set the topic to the relay schedule showing each person and their timezone in sunrise order in exactly this format: \": \\n\" , invite all six, and post the full relay plan as the opening message. Drop a :sunrise: reaction on that schedule post. While you're looking at timezones, Mateo mentioned he can't keep up with #model-research because all the discussions happen during European hours and he's on Pacific time — pull him out of that channel. Oh, and rename #sunrise-relay to #dawn-chorus — the group decided the poem should be about birdsong at first light.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"channels\",\"where\":{\"channel_name\":\"dawn-chorus\",\"topic_text\":{\"regex\":\"(?s)(?=.*Lagos)(?=.*Warsaw)(?=.*Paris)Tokyo.*Kolkata.*Los_Angeles\"}},\"expected_count\":1,\"description\":\"Channel created as #sunrise-relay and renamed to #dawn-chorus, topic has relay schedule with all 6 timezones in correct east-to-west sunrise order\"},{\"diff_type\":\"added\",\"entity\":\"channel_members\",\"where\":{\"user_id\":{\"in\":[\"U_KENJI\",\"U_PRIYA\",\"U_AISHA\",\"U_LUKAS\",\"U_SOPHIE\",\"U_MATEO\"]}},\"expected_count\":{\"min\":6},\"description\":\"All 6 relay participants invited to the new channel\"},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"user_id\":\"U01AGENBOT9\",\"message_text\":{\"regex\":\"(?s)(?=.*Tokyo)(?=.*Los_Angeles)\"}},\"expected_count\":{\"min\":1},\"description\":\"Agent posted the full relay plan as opening message with timezone references\"},{\"diff_type\":\"added\",\"entity\":\"message_reactions\",\"where\":{\"reaction_type\":\"sunrise\"},\"expected_count\":1,\"description\":\":sunrise: reaction dropped on the schedule 
post\"},{\"diff_type\":\"removed\",\"entity\":\"channel_members\",\"where\":{\"channel_id\":\"C_MODEL\",\"user_id\":\"U_MATEO\"},\"expected_count\":1,\"description\":\"Mateo removed from #model-research due to Pacific timezone mismatch with European discussion hours\"}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_111", "test_name": "Dawn Chorus", "service": "slack", "task_horizon": 14, "operation_type": "search+C+R+U", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "high", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"users.list\",\"conversations.list\",\"conversations.history\",\"conversations.create\",\"conversations.setTopic\",\"conversations.invite\",\"chat.postMessage\",\"reactions.add\",\"conversations.kick\",\"conversations.rename\"]}"} +{"question": "React with :thumbsup: to the most recent posted message in #general", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"message_reactions\",\"where\":{\"message_id\":{\"eq\":\"1706115500.000001\"},\"user_id\":{\"eq\":\"U01AGENBOT9\"},\"reaction_type\":{\"eq\":\"thumbsup\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_68", "test_name": "Add emoji reaction", "service": "slack", "task_horizon": 3, "operation_type": "search+R+C", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "high", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"conversations.list\",\"conversations.history\",\"reactions.add\"]}"} +{"question": "There's a thread in #engineering where Robert asked about the circuit tracer library rewrite timeline. 
We've been having issues with the layer-by-layer loading and need to rewrite it in PyTorch from scratch to handle multi-GPU distribution properly.\n\nSophie sent me a DM with her implementation plan and timeline since she's leading the PyTorch migration. Check my DM with Sophie to find her estimated completion date, then reply to Robert's question in the thread with that information.\n\nAfter replying, add a checkmark reaction to the original thread message to mark it as addressed.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C03IJKL9012\"},\"parent_id\":{\"eq\":\"1706110000.000100\"},\"message_text\":{\"contains\":\"Wednesday\"}},\"expected_count\":{\"min\":1}},{\"diff_type\":\"added\",\"entity\":\"message_reactions\",\"where\":{\"message_id\":{\"eq\":\"1706110000.000100\"},\"user_id\":{\"eq\":\"U01AGENBOT9\"}},\"expected_count\":{\"min\":1}}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_105", "test_name": "Thread Q&A from DM - Circuit Tracer Rewrite", "service": "slack", "task_horizon": 8, "operation_type": "search+C+R", "entity_scope": "single", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"conversations.list\",\"conversations.history\",\"conversations.replies\",\"users.list\",\"conversations.open\",\"conversations.history\",\"chat.postMessage\",\"reactions.add\"]}"} +{"question": "Who are the admins of the 'Test Workspace'? 
Reply with their names in #random.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C02EFGH5678\"},\"message_text\":{\"contains\":\"Robert\"}},\"expected_count\":1},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C02EFGH5678\"},\"message_text\":{\"contains\":\"Morgan\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_89", "test_name": "Information Synthesis", "service": "slack", "task_horizon": 4, "operation_type": "search+R+C+D", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "low", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"users.list\",\"conversations.list\",\"chat.postMessage\"]}"} +{"question": "Delete the message about new feature you posted in #general", "answer": "{\"assertions\":[{\"diff_type\":\"removed\",\"entity\":\"messages\",\"where\":{\"channel_id\":{\"eq\":\"C01ABCD1234\"},\"message_text\":{\"contains\":\"new feature\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_73", "test_name": "Delete old message", "service": "slack", "task_horizon": 3, "operation_type": "R+D", "entity_scope": "single", "information_availability": "explicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"conversations.list\",\"conversations.history\",\"chat.delete\"]}"} +{"question": "How many active private conversations do I have? If I have less than seven conversations, please create new conversations with the users one by one in alphabetic order, skipping those with whom I already have conversations. 
If I have more than seven conversations, start removing conversations with those in alphabetic order until I have exactly seven conversations.", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"channels\",\"where\":{\"is_dm\":{\"eq\":true}},\"expected_count\":6,\"description\":\"6 new DM channels created to reach target of 7 private conversations (Agent started with 1 DM with Sophie, needs DMs with: Aisha, Artem, Carlos, Gabriel, Hubert, John \\u2014 first 6 alphabetically excluding Sophie)\"},{\"diff_type\":\"added\",\"entity\":\"channel_members\",\"where\":{\"user_id\":{\"eq\":\"U_LUKAS\"}},\"expected_count\":0,\"description\":\"\\u0141ukasz (alphabetically after John) should NOT be added \\u2014 agent stops at 7 conversations\"},{\"diff_type\":\"added\",\"entity\":\"channel_members\",\"where\":{\"user_id\":{\"eq\":\"U_MATEO\"}},\"expected_count\":0,\"description\":\"Mateo (alphabetically after John) should NOT be added \\u2014 agent stops at 7 conversations\"},{\"diff_type\":\"added\",\"entity\":\"channel_members\",\"where\":{\"user_id\":{\"eq\":\"U07MORGANFREE\"}},\"expected_count\":0,\"description\":\"Morgan Freeman (alphabetically after John) should NOT be added \\u2014 agent stops at 7 conversations\"},{\"diff_type\":\"added\",\"entity\":\"channel_members\",\"where\":{\"user_id\":{\"eq\":\"U05MORGAN23\"}},\"expected_count\":0,\"description\":\"Morgan Stanley (alphabetically after John) should NOT be added \\u2014 agent stops at 7 conversations\"},{\"diff_type\":\"added\",\"entity\":\"channel_members\",\"where\":{\"user_id\":{\"eq\":\"U08NICK23\"}},\"expected_count\":0,\"description\":\"Nick (alphabetically after John) should NOT be added \\u2014 agent stops at 7 conversations\"},{\"diff_type\":\"added\",\"entity\":\"channel_members\",\"where\":{\"user_id\":{\"eq\":\"U_OLENA\"}},\"expected_count\":0,\"description\":\"Olena (alphabetically after John) should NOT be added \\u2014 agent stops at 7 
conversations\"},{\"diff_type\":\"added\",\"entity\":\"channel_members\",\"where\":{\"user_id\":{\"eq\":\"U04OMER23\"}},\"expected_count\":0,\"description\":\"Omer (alphabetically after John) should NOT be added \\u2014 agent stops at 7 conversations\"},{\"diff_type\":\"added\",\"entity\":\"channel_members\",\"where\":{\"user_id\":{\"eq\":\"U_PRIYA\"}},\"expected_count\":0,\"description\":\"Priya (alphabetically after John) should NOT be added \\u2014 agent stops at 7 conversations\"},{\"diff_type\":\"added\",\"entity\":\"channel_members\",\"where\":{\"user_id\":{\"eq\":\"U_ROBERT\"}},\"expected_count\":0,\"description\":\"Robert Chen (alphabetically after John) should NOT be added \\u2014 agent stops at 7 conversations\"},{\"diff_type\":\"added\",\"entity\":\"channel_members\",\"where\":{\"user_id\":{\"eq\":\"U03ROBERT23\"}},\"expected_count\":0,\"description\":\"Robert Walsh (alphabetically after John) should NOT be added \\u2014 agent stops at 7 conversations\"},{\"diff_type\":\"added\",\"entity\":\"channel_members\",\"where\":{\"user_id\":{\"eq\":\"U_KENJI\"}},\"expected_count\":0,\"description\":\"Kenji (alphabetically after John) should NOT be added \\u2014 agent stops at 7 conversations\"}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_115", "test_name": "Manage Private Conversations", "service": "slack", "task_horizon": 8, "operation_type": "C+R", "entity_scope": "multi", "information_availability": "implicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"conversations.list\",\"users.list\",\"conversations.open\"]}"} +{"question": "Think of the workspace as a coastline full of tide pools — each channel is its own micro-ecosystem, and you're the naturalist on a field survey. 
Start by pulling a roster of every organism on this coast and classify them into two species: \"admin\" and \"member.\" How many of each do you count? You need to sort the channel names in alphabetic order and send a message to Omer, in exactly this format: \"Field Repoert 1: : [, ]\". Then inspect #engineering. Probe under the circuit-tracer rock in that channel — there's a thread with replies most people never noticed. Count exactly how many replies are down there and note who left them. Over in #random, that message about coordinating lunch plans is an invasive species — remove it. And whoever originally posted that circuit-tracer message in #engineering — open a private channel with them and send them a field report formatted exactly like this: \"Field Report 2: [N] replies found under circuit-tracer in #engineering — organisms: [comma-separated names of repliers]\".", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"channel_members\",\"where\":{\"user_id\":\"U04OMER23\"},\"expected_count\":1,\"description\":\"DM channel opened with Omer to deliver the channel survey (Field Report 1)\"},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"user_id\":\"U01AGENBOT9\",\"message_text\":{\"regex\":\"(?s)Field Repoert 1.*core-infra[^\\\\n]*\\\\[0,\\\\s*9\\\\].*engineering[^\\\\n]*\\\\[1,\\\\s*4\\\\].*frontend[^\\\\n]*\\\\[0,\\\\s*9\\\\].*general[^\\\\n]*\\\\[1,\\\\s*2\\\\].*growth[^\\\\n]*\\\\[0,\\\\s*4\\\\].*model-research[^\\\\n]*\\\\[0,\\\\s*9\\\\].*project-alpha[^\\\\n]*\\\\[0,\\\\s*1\\\\].*project-alpha-dev[^\\\\n]*\\\\[0,\\\\s*7\\\\].*random[^\\\\n]*\\\\[1,\\\\s*3\\\\]\"}},\"expected_count\":1,\"description\":\"Field Report 1 sent to Omer listing all channels alphabetically with correct per-channel admin/member counts (engineering [1,4], general [1,2], random [1,3], project-alpha-dev [0,7], 
etc.)\"},{\"diff_type\":\"removed\",\"entity\":\"messages\",\"where\":{\"channel_id\":\"C02EFGH5678\",\"message_text\":{\"i_contains\":\"lunch\"}},\"expected_count\":1,\"description\":\"Lunch coordination message removed from #random \\u2014 invasive species eliminated\"},{\"diff_type\":\"added\",\"entity\":\"channel_members\",\"where\":{\"user_id\":\"U_LUKAS\"},\"expected_count\":1,\"description\":\"DM opened with Lukasz (U_LUKAS) \\u2014 the original poster of the circuit-tracer message in #engineering\"},{\"diff_type\":\"added\",\"entity\":\"messages\",\"where\":{\"user_id\":\"U01AGENBOT9\",\"message_text\":{\"regex\":\"(?si)Field Report 2:\\\\s*\\\\[?2\\\\]?\\\\s+repl.*circuit-tracer.*robert.*kenji\"}},\"expected_count\":1,\"description\":\"Field Report 2 sent to circuit-tracer poster reporting exactly 2 replies by Robert and Kenji under the circuit-tracer thread\"}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_113", "test_name": "Tide Pool", "service": "slack", "task_horizon": 7, "operation_type": "C+R+D", "entity_scope": "multi", "information_availability": "explicit", "prompt_ambiguity": "medium", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"users.list\",\"conversations.list\",\"conversations.open\",\"chat.postMessage\",\"conversations.replies\",\"chat.delete\",\"conversations.history\"]}"} +{"question": "Hey, I need your help organizing a Cricket World Cup watch party for our office! We've got team members spread across India, UK, and Australia timezones, so this needs some coordination.\n\nFirst, can you check what channels we already have that might be relevant to this kind of event? I want to make sure we're not duplicating efforts.\n\nI think we should create a dedicated channel for the watch party coordination. Once that's set up, update the topic so people know what it's for. 
I also need to reach out to Priya Sharma directly since she handles infrastructure and we'll need her help with the streaming setup across offices.\n\nCan you pull up our team roster so I can see who else might want to be involved? Oh, and I posted a message in #general about the watch party time being 3pm PST - that's wrong, it should be 3pm IST since we're primarily coordinating with the India office. Please fix that. There's also an old message I sent about booking a downtown venue that's no longer happening - just delete that one entirely.\n\nThanks!", "answer": "{\"assertions\":[{\"diff_type\":\"added\",\"entity\":\"channels\",\"where\":{\"is_private\":{\"eq\":false}},\"expected_count\":{\"min\":1}},{\"diff_type\":\"added\",\"entity\":\"channels\",\"where\":{\"is_dm\":{\"eq\":true}},\"expected_count\":{\"min\":1}},{\"diff_type\":\"changed\",\"entity\":\"messages\",\"where\":{\"message_id\":{\"eq\":\"1699564900.000124\"}},\"expected_changes\":{\"message_text\":{\"to\":{\"contains\":\"IST\"}}}},{\"diff_type\":\"removed\",\"entity\":\"messages\",\"where\":{\"message_id\":{\"eq\":\"1699564950.000125\"}},\"expected_count\":1}],\"ignore_fields\":{\"global\":[\"created_at\",\"updated_at\"]}}", "test_id": "slack_103", "test_name": "Cricket World Cup Watch Party", "service": "slack", "task_horizon": 7, "operation_type": "C+R+U+D", "entity_scope": "multi", "information_availability": "explicit", "prompt_ambiguity": "high", "info": "{\"service\":\"slack\",\"seed_template\":\"slack_bench_v2\",\"impersonate_user_id\":\"U01AGENBOT9\",\"eval_type\":\"actionEval\",\"tools_required\":[\"conversations.list\",\"conversations.create\",\"conversations.setTopic\",\"conversations.open\",\"users.list\",\"chat.update\",\"chat.delete\"]}"} diff --git a/datasets/agent-diff-bench/train.parquet b/datasets/agent-diff-bench/train.parquet new file mode 100644 index 0000000..00f6d18 Binary files /dev/null and b/datasets/agent-diff-bench/train.parquet differ diff --git 
a/examples/box/testsuites/box_bench.json b/examples/box/testsuites/box_bench.json index 26c8275..3c9be2f 100644 --- a/examples/box/testsuites/box_bench.json +++ b/examples/box/testsuites/box_bench.json @@ -34,7 +34,12 @@ "tools_required": [ "GET /users/me", "POST /folders" - ] + ], + "task_horizon": 2, + "operation_type": "R+C", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -61,7 +66,12 @@ "tools_required": [ "GET /search", "POST /folders" - ] + ], + "task_horizon": 2, + "operation_type": "search+C", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -91,7 +101,12 @@ "tools_required": [ "GET /search", "POST /comments" - ] + ], + "task_horizon": 2, + "operation_type": "search+C", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "medium" }, "assertions": [ { @@ -126,7 +141,12 @@ "tools_required": [ "GET /search", "POST /comments" - ] + ], + "task_horizon": 2, + "operation_type": "search+C", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -156,7 +176,12 @@ "tools_required": [ "GET /search", "PUT /folders/{id}" - ] + ], + "task_horizon": 2, + "operation_type": "search+U", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -190,7 +215,12 @@ "GET /search", "GET /search", "PUT /files/{id}" - ] + ], + "task_horizon": 3, + "operation_type": "search+U", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -222,7 +252,12 @@ "min_tool_calls": 1, "tools_required": [ "POST /hubs" - ] + ], + "task_horizon": 1, + "operation_type": "C", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -249,7 +284,12 @@ 
"tools_required": [ "GET /search", "PUT /folders/{id}" - ] + ], + "task_horizon": 2, + "operation_type": "search+U", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -282,7 +322,12 @@ "tools_required": [ "GET /search", "PUT /folders/{id}" - ] + ], + "task_horizon": 2, + "operation_type": "search+U", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -316,7 +361,12 @@ "GET /search", "POST /comments", "POST /tasks" - ] + ], + "task_horizon": 3, + "operation_type": "search+C", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "medium" }, "assertions": [ { @@ -355,7 +405,12 @@ "POST /folders", "GET /search", "PUT /files/{id}" - ] + ], + "task_horizon": 4, + "operation_type": "C+search+U", + "entity_scope": "multi", + "information_availability": "explicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -409,7 +464,12 @@ "GET /search", "GET /folders/{id}/items", "PUT /folders/{id}" - ] + ], + "task_horizon": 3, + "operation_type": "search+R+U", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -442,7 +502,12 @@ "tools_required": [ "GET /hubs", "POST /folders" - ] + ], + "task_horizon": 2, + "operation_type": "R+C", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -473,7 +538,12 @@ "PUT /files/{id}", "PUT /files/{id}", "PUT /files/{id}" - ] + ], + "task_horizon": 6, + "operation_type": "search+C+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -518,7 +588,12 @@ "GET /files/{id}/content", "GET /search", "PUT /files/{id}" - ] + ], + "task_horizon": 4, + "operation_type": "search+R+U", + "entity_scope": "single", + "information_availability": "implicit", + 
"prompt_ambiguity": "medium" }, "assertions": [ { @@ -552,7 +627,12 @@ "POST /hubs", "GET /search", "POST /hubs/{id}/manage_items" - ] + ], + "task_horizon": 3, + "operation_type": "C+search", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -591,7 +671,12 @@ "DELETE /files/{id}", "GET /files/{id}/content", "PUT /files/{id}" - ] + ], + "task_horizon": 4, + "operation_type": "search+D+R+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" }, "assertions": [ { @@ -642,7 +727,12 @@ "GET /folders/{id}/items", "GET /files/{id}/content", "PUT /folders/{id}" - ] + ], + "task_horizon": 4, + "operation_type": "search+R+U", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "medium" }, "assertions": [ { @@ -681,7 +771,12 @@ "GET /search", "GET /folders/{id}/items", "PUT /folders/{id}" - ] + ], + "task_horizon": 3, + "operation_type": "search+R+U", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "medium" }, "assertions": [ { @@ -716,7 +811,12 @@ "GET /folders/{id}/items", "GET /files/{id}/content", "POST /files/content" - ] + ], + "task_horizon": 4, + "operation_type": "search+R+C", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "medium" }, "assertions": [ { @@ -746,7 +846,12 @@ "GET /search", "GET /files/{id}/content", "POST /comments" - ] + ], + "task_horizon": 3, + "operation_type": "search+R+C", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "low" }, "impersonate_user_id": "27512847635", "assertions": [ @@ -779,7 +884,12 @@ "GET /folders/{id}/items", "PUT /files/{id}", "PUT /files/{id}" - ] + ], + "task_horizon": 4, + "operation_type": "search+R+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -830,7 
+940,12 @@ "GET /search", "POST /folders", "PUT /folders/{id}" - ] + ], + "task_horizon": 4, + "operation_type": "C+search+U", + "entity_scope": "multi", + "information_availability": "explicit", + "prompt_ambiguity": "medium" }, "assertions": [ { @@ -877,7 +992,12 @@ "PUT /files/{id}", "PUT /files/{id}", "PUT /files/{id}" - ] + ], + "task_horizon": 6, + "operation_type": "search+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "high" }, "assertions": [ { @@ -975,7 +1095,12 @@ "GET /search", "GET /search", "PUT /files/{id}" - ] + ], + "task_horizon": 3, + "operation_type": "search+U", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "medium" }, "assertions": [ { @@ -1010,7 +1135,12 @@ "GET /search", "GET /folders/{id}/items", "POST /hubs/{id}/manage_items" - ] + ], + "task_horizon": 4, + "operation_type": "C+search+R", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" }, "assertions": [ { @@ -1051,7 +1181,12 @@ "GET /folders/{id}/items", "GET /folders/{id}/items", "DELETE /files/{id}" - ] + ], + "task_horizon": 4, + "operation_type": "search+R+D", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "high" }, "assertions": [ { @@ -1087,7 +1222,12 @@ "GET /folders/{id}/items", "PUT /files/{id}", "DELETE /folders/{id}" - ] + ], + "task_horizon": 11, + "operation_type": "search+C+R+U+D", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -1167,7 +1307,12 @@ "tools_required": [ "GET /search", "PUT /files/{id}" - ] + ], + "task_horizon": 2, + "operation_type": "search+U", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -1201,7 +1346,12 @@ "GET /search", "GET /search", "PUT /files/{id}" - ] + ], + "task_horizon": 3, + "operation_type": "search+U", + 
"entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "high" }, "assertions": [ { @@ -1233,7 +1383,12 @@ "tools_required": [ "GET /search", "POST /files/content" - ] + ], + "task_horizon": 2, + "operation_type": "search+C", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -1264,7 +1419,12 @@ "tools_required": [ "POST /files/content", "DELETE /files/{id}" - ] + ], + "task_horizon": 2, + "operation_type": "C+D", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -1295,7 +1455,12 @@ "GET /search", "GET /files/{id}/content", "POST /files/{id}/content" - ] + ], + "task_horizon": 3, + "operation_type": "search+R+U", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -1334,7 +1499,12 @@ "GET /search", "GET /folders/{id}/items", "DELETE /files/{id}" - ] + ], + "task_horizon": 3, + "operation_type": "search+R+D", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "high" }, "assertions": [ { @@ -1368,7 +1538,12 @@ "GET /search", "POST /hubs", "POST /hubs/{id}/manage_items" - ] + ], + "task_horizon": 3, + "operation_type": "search+C", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -1411,7 +1586,12 @@ "PUT /files/{id}", "PUT /files/{id}", "PUT /files/{id}" - ] + ], + "task_horizon": 6, + "operation_type": "search+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -1450,7 +1630,12 @@ "PUT /files/{id}", "PUT /files/{id}", "PUT /files/{id}" - ] + ], + "task_horizon": 5, + "operation_type": "search+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" }, "assertions": [ { @@ -1484,7 +1669,12 @@ "GET 
/search", "GET /files/{id}/comments", "POST /folders" - ] + ], + "task_horizon": 3, + "operation_type": "search+R+C", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -1512,7 +1702,12 @@ "GET /search", "GET /files/{id}/content", "POST /comments" - ] + ], + "task_horizon": 3, + "operation_type": "search+R+C", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "medium" }, "assertions": [ { @@ -1550,7 +1745,12 @@ "POST /hubs", "POST /hubs/{id}/manage_items", "POST /files/content" - ] + ], + "task_horizon": 11, + "operation_type": "search+R+C+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" }, "assertions": [ { @@ -1618,7 +1818,12 @@ "PUT /folders/{id}", "POST /hubs", "POST /hubs/{id}/manage_items" - ] + ], + "task_horizon": 11, + "operation_type": "search+R+C+U+D", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "high" }, "assertions": [ { @@ -1713,7 +1918,7 @@ "expected_count": 1 } ], - "prompt": "Your research institute's Box storage is disorganized. Somewhere in the archive, there are field research documents from cryptozoology expeditions \u2014 specifically sighting reports that may contain photographic evidence of unidentified creatures. Your task: Find a cryptozoology sighting report (search for relevant terms). Download and read its content. If the document mentions \"photographic evidence\" anywhere in the text, it should be tagged as verified; otherwise tag it unverified. Create a proper organizational structure: a main folder \"Expeditions_2025\" in the root, with a subfolder \"Cryptid_Sightings\" inside it. Move the sighting report into this subfolder with the appropriate tag. Add a comment to the file documenting your review: include today's date and the expedition name (which you'll find mentioned in the document's content). 
After moving the file, check its original location. If there are any obvious duplicate files (backup copies with similar names), delete them to clean up. Then rename the original source folder by appending \"_archived\" to its name. Finally, create a Hub called \"2025 Field Research Index\" and add the \"Expeditions_2025\" folder to it for easy access." + "prompt": "Your research institute's Box storage is disorganized. Somewhere in the archive, there are field research documents from cryptozoology expeditions — specifically sighting reports that may contain photographic evidence of unidentified creatures. Your task: Find a cryptozoology sighting report (search for relevant terms). Download and read its content. If the document mentions \"photographic evidence\" anywhere in the text, it should be tagged as verified; otherwise tag it unverified. Create a proper organizational structure: a main folder \"Expeditions_2025\" in the root, with a subfolder \"Cryptid_Sightings\" inside it. Move the sighting report into this subfolder with the appropriate tag. Add a comment to the file documenting your review: include today's date and the expedition name (which you'll find mentioned in the document's content). After moving the file, check its original location. If there are any obvious duplicate files (backup copies with similar names), delete them to clean up. Then rename the original source folder by appending \"_archived\" to its name. Finally, create a Hub called \"2025 Field Research Index\" and add the \"Expeditions_2025\" folder to it for easy access." 
}, { "id": "test_42", @@ -1732,7 +1937,12 @@ "POST /comments", "DELETE /files/{id}", "POST /hubs/{id}/manage_items" - ] + ], + "task_horizon": 8, + "operation_type": "R+search+U+C+D", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "high" }, "assertions": [ { @@ -1813,7 +2023,7 @@ "expected_count": 1 } ], - "prompt": "The tea ceremony school is transitioning to Ro season (\u7089, the winter hearth period). You need to help organize the digital materials for this important seasonal change. First, find which hub already exists for tea ceremony seasonal materials \u2014 you'll need to add updated content there later. Locate the winter preparation guide in the chado folder. Verify it's the current document (not a draft), then update it with the tag winter_season and set its description to \"Ro season preparation - \u7089 (November-April)\". Add a comment to the winter preparation guide noting: \"Ready for Ro season (\u7089) - charcoal placement verified.\" Next, find the utensil inventory file. Add a comment reminding the team: \"Utensils require cleaning before Hatsugama ceremony.\" There's an old draft file in the same folder that has been superseded \u2014 it's clearly marked as obsolete. Delete it to clean up the archive. Finally, add the winter preparation guide to the seasonal materials hub so it's easily accessible to all practitioners." + "prompt": "The tea ceremony school is transitioning to Ro season (炉, the winter hearth period). You need to help organize the digital materials for this important seasonal change. First, find which hub already exists for tea ceremony seasonal materials — you'll need to add updated content there later. Locate the winter preparation guide in the chado folder. Verify it's the current document (not a draft), then update it with the tag winter_season and set its description to \"Ro season preparation - 炉 (November-April)\". 
Add a comment to the winter preparation guide noting: \"Ready for Ro season (炉) - charcoal placement verified.\" Next, find the utensil inventory file. Add a comment reminding the team: \"Utensils require cleaning before Hatsugama ceremony.\" There's an old draft file in the same folder that has been superseded — it's clearly marked as obsolete. Delete it to clean up the archive. Finally, add the winter preparation guide to the seasonal materials hub so it's easily accessible to all practitioners." }, { "id": "test_43", @@ -1833,7 +2043,12 @@ "PUT /tasks/{id}", "PUT /files/{id}", "POST /hubs" - ] + ], + "task_horizon": 9, + "operation_type": "search+R+C+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" }, "assertions": [ { @@ -1876,7 +2091,7 @@ "expected_count": 1 } ], - "prompt": "You're helping manage the documentation for a Moog Minimoog restoration project. The synth is from 1974 (serial 10847) and the team has been tracking repairs and calibrations in Box. First, search for files related to the Minimoog or Moog restoration. Get the details of the project folder to understand what's there. Check if any synth restoration documents are in your favorites collection. On the capacitor replacement log, add a new comment documenting: \"C47 replaced with Nichicon 47\u00b5F/25V - oscillator section complete.\" Then find the existing comment about \"C31 verified\" and update it to add: \"- measurement confirmed at 0.98x nominal.\" For the filter calibration procedure file, there are two pending tasks. Find the task about \"resonance calibration\" and mark it as complete. Find the task about \"cutoff tracking\" and update its message to: \"Cutoff tracking verified \u00b13 cents across 5 octaves - exceeds spec.\" Add the tag restoration-complete to the oscillator schematic notes file since that section is now finished. 
Finally, create a new hub called \"Synth Restoration Archive\" to centralize all vintage instrument documentation going forward." + "prompt": "You're helping manage the documentation for a Moog Minimoog restoration project. The synth is from 1974 (serial 10847) and the team has been tracking repairs and calibrations in Box. First, search for files related to the Minimoog or Moog restoration. Get the details of the project folder to understand what's there. Check if any synth restoration documents are in your favorites collection. On the capacitor replacement log, add a new comment documenting: \"C47 replaced with Nichicon 47µF/25V - oscillator section complete.\" Then find the existing comment about \"C31 verified\" and update it to add: \"- measurement confirmed at 0.98x nominal.\" For the filter calibration procedure file, there are two pending tasks. Find the task about \"resonance calibration\" and mark it as complete. Find the task about \"cutoff tracking\" and update its message to: \"Cutoff tracking verified ±3 cents across 5 octaves - exceeds spec.\" Add the tag restoration-complete to the oscillator schematic notes file since that section is now finished. Finally, create a new hub called \"Synth Restoration Archive\" to centralize all vintage instrument documentation going forward." }, { "id": "test_44", @@ -1900,7 +2115,12 @@ "GET /hubs/{id}", "PUT /hubs/{id}", "DELETE /folders/{id}" - ] + ], + "task_horizon": 13, + "operation_type": "R+search+C+U+D", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" }, "assertions": [ { @@ -1973,7 +2193,7 @@ } } ], - "prompt": "The rare book conservation lab is running its year-end audit. You need to aggregate treatment data and update the annual summary. First, confirm your identity \u2014 who are you logged in as? You'll need this for audit attribution. Locate the conservation lab folder and check its contents. 
Get the details of both quarterly humidity logs (Q3 and Q4 2025) \u2014 each contains a \"BOOKS TREATED THIS QUARTER\" count that you'll need. Check if any conservation documents are currently in your favorites collection. On the incunabula condition report, add a comment: \"Audit initiated by [your username] on [today's date].\" Also find the existing comment about \"Budget review pending\" and update it to: \"Budget approved - Q3+Q4 aggregated total: [X] books\" where X is the sum of books treated in Q3 and Q4. There's an outdated comment on the condition report marked \"[OUTDATED]\" with incorrect information \u2014 delete it. Download the annual summary file, update it with the correct Q3 and Q4 treatment counts (extracted from the humidity logs), and upload it as a new version. The total YTD should now reflect all four quarters. Find the \"Conservation Lab Archive\" hub and update its description to: \"Rare book conservation documentation - Last audit: Q4 2025.\" Finally, there's a deprecated folder from 2024 that's scheduled for deletion \u2014 remove it." + "prompt": "The rare book conservation lab is running its year-end audit. You need to aggregate treatment data and update the annual summary. First, confirm your identity — who are you logged in as? You'll need this for audit attribution. Locate the conservation lab folder and check its contents. Get the details of both quarterly humidity logs (Q3 and Q4 2025) — each contains a \"BOOKS TREATED THIS QUARTER\" count that you'll need. Check if any conservation documents are currently in your favorites collection. On the incunabula condition report, add a comment: \"Audit initiated by [your username] on [today's date].\" Also find the existing comment about \"Budget review pending\" and update it to: \"Budget approved - Q3+Q4 aggregated total: [X] books\" where X is the sum of books treated in Q3 and Q4. 
There's an outdated comment on the condition report marked \"[OUTDATED]\" with incorrect information — delete it. Download the annual summary file, update it with the correct Q3 and Q4 treatment counts (extracted from the humidity logs), and upload it as a new version. The total YTD should now reflect all four quarters. Find the \"Conservation Lab Archive\" hub and update its description to: \"Rare book conservation documentation - Last audit: Q4 2025.\" Finally, there's a deprecated folder from 2024 that's scheduled for deletion — remove it." }, { "id": "test_45", @@ -1994,7 +2214,12 @@ "GET /files/{id}/tasks", "DELETE /tasks/{id}", "GET /hubs" - ] + ], + "task_horizon": 10, + "operation_type": "search+R+C+U+D", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "high" }, "assertions": [ { @@ -2039,7 +2264,7 @@ "expected_count": 1 } ], - "prompt": "Your history research archive in Box is disorganized and needs cleanup. You have redundant folders, misfiled documents, and obsolete tasks cluttering the system. In the history area, there are two folders that seem to contain overlapping Buenos Aires research: one called \"BA\" and one called \"Buenos Aires\". Consolidate them by moving the entire \"BA\" folder into \"Buenos Aires\" as a subfolder, then rename the \"BA\" folder to \"Legacy_Materials\" to indicate it contains older content. In the readings area, list the contents and look for organizational issues. The file \"digital history methods - week 3 reading.txt\" is sitting at the top level of the history folder but belongs in the \"digital humanities\" subfolder under readings. Move this file to its correct location. Create a new folder called \"Archive_Cleanup_2026\" in the root of the history folder to track this reorganization effort. Inside it, create a subfolder called \"Duplicates_Review\" where duplicate files can be moved for review. 
Look through the seed for files marked as duplicates (files with \"(1)\" in the name or \"backup\"/\"copy\" in the name). These files have obsolete tasks attached. Find and delete the tasks marked \"[OBSOLETE]\" or \"[OUTDATED]\" since the reorganization will handle these files differently. Check what hubs currently exist \u2014 you may want to add reorganized materials to an appropriate hub later." + "prompt": "Your history research archive in Box is disorganized and needs cleanup. You have redundant folders, misfiled documents, and obsolete tasks cluttering the system. In the history area, there are two folders that seem to contain overlapping Buenos Aires research: one called \"BA\" and one called \"Buenos Aires\". Consolidate them by moving the entire \"BA\" folder into \"Buenos Aires\" as a subfolder, then rename the \"BA\" folder to \"Legacy_Materials\" to indicate it contains older content. In the readings area, list the contents and look for organizational issues. The file \"digital history methods - week 3 reading.txt\" is sitting at the top level of the history folder but belongs in the \"digital humanities\" subfolder under readings. Move this file to its correct location. Create a new folder called \"Archive_Cleanup_2026\" in the root of the history folder to track this reorganization effort. Inside it, create a subfolder called \"Duplicates_Review\" where duplicate files can be moved for review. Look through the seed for files marked as duplicates (files with \"(1)\" in the name or \"backup\"/\"copy\" in the name). These files have obsolete tasks attached. Find and delete the tasks marked \"[OBSOLETE]\" or \"[OUTDATED]\" since the reorganization will handle these files differently. Check what hubs currently exist — you may want to add reorganized materials to an appropriate hub later." 
}, { "id": "test_46", @@ -2055,7 +2280,12 @@ "PUT /files/{id}", "GET /collections", "PUT /files/{id}" - ] + ], + "task_horizon": 5, + "operation_type": "R+search+U", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -2081,7 +2311,7 @@ ] } ], - "prompt": "You are preparing the final conservation audit for external review. First, confirm your identity \u2014 get your current user details. Locate the \"Annual Summary 2025\" file in the rare books folder. Create a shared link for this file with access set to \"open\" so external auditors can view it. Then, check your \"Favorites\" collection. If the Annual Summary is not already in your favorites, add it to the collection for quick access. Finally, verify the file's details to confirm the shared link is active and the file is listed in the collection." + "prompt": "You are preparing the final conservation audit for external review. First, confirm your identity — get your current user details. Locate the \"Annual Summary 2025\" file in the rare books folder. Create a shared link for this file with access set to \"open\" so external auditors can view it. Then, check your \"Favorites\" collection. If the Annual Summary is not already in your favorites, add it to the collection for quick access. Finally, verify the file's details to confirm the shared link is active and the file is listed in the collection." 
}, { "id": "test_47", @@ -2104,7 +2334,12 @@ "PUT /files/{id}", "POST /files/content", "PUT /hubs/{id}" - ] + ], + "task_horizon": 12, + "operation_type": "C+search+R+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" }, "assertions": [ { @@ -2187,7 +2422,12 @@ "POST /folders", "POST /files/content", "POST /comments" - ] + ], + "task_horizon": 9, + "operation_type": "search+R+C+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" }, "assertions": [ { diff --git a/examples/calendar/testsuites/calendar_bench.json b/examples/calendar/testsuites/calendar_bench.json index 8410c33..70066c4 100644 --- a/examples/calendar/testsuites/calendar_bench.json +++ b/examples/calendar/testsuites/calendar_bench.json @@ -113,7 +113,12 @@ "freeBusy.query", "events.patch", "events.delete" - ] + ], + "task_horizon": 7, + "operation_type": "search+C+R+U+D", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { @@ -233,7 +238,12 @@ "events.delete", "acl.insert", "calendars.insert" - ] + ], + "task_horizon": 8, + "operation_type": "search+C+R+U+D", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { @@ -637,7 +647,12 @@ "events.list", "acl.insert", "events.delete" - ] + ], + "task_horizon": 15, + "operation_type": "search+C+R+U+D", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { @@ -785,13 +800,18 @@ "events.delete", "acl.insert", "calendars.insert" - ] + ], + "task_horizon": 10, + "operation_type": "search+C+R+U+D", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { "id": "test_5", "name": "Symposium of Infinite Curiosity - Academic conference coordination", - "prompt": "The Symposium of Infinite Curiosity is three weeks away and the program is chaos. 
Find our main calendar - 'Symposium of Infinite Curiosity 2018'. We have sessions scheduled and I need an exact count of how many are in the 'Quantum' track (they'll have [Quantum] in the title). Add Mei-Lin's opening keynote - it's called 'Keynote: The Heresy of Obvious Conclusions' and should be Day 1 (Monday June 18) at 8am, lasting 1 hour. Update that keynote with a description: 'Mandatory attendance for all track chairs. Coffee will be existential.' Bogdan (bogdan@test.com) and Ravi (ravi@test.com) need to meet urgently on Day 2 (Tuesday June 19) afternoon to discuss a problematic submission - find when they're both free and create 'Secret Tribunal of the Program Committee' for 2 hours at that time on the symposium calendar. Dr. Chiamaka (chiamaka@test.com) is presenting four different papers across the conference - tell me when each of her sessions is. Someone finally noticed the irony: 'Workshop: Introduction to Procrastination (Postponed)' - delete it. Create a private calendar called 'Speakers Green Room of Mild Panic' for backstage coordination. Ingrid (ingrid@test.com) just joined as volunteer coordinator - give her edit access to the main symposium calendar. Chiamaka's first presentation needs to move to the same venue as 'Panel: Temporal Causality Roundtable' \u2014 but only if that venue is free at the same time. If that panel already occupies that venue at that time, move Chiamaka's first presentation to 'Annex of Temporal Studies' instead.", + "prompt": "The Symposium of Infinite Curiosity is three weeks away and the program is chaos. Find our main calendar - 'Symposium of Infinite Curiosity 2018'. We have sessions scheduled and I need an exact count of how many are in the 'Quantum' track (they'll have [Quantum] in the title). Add Mei-Lin's opening keynote - it's called 'Keynote: The Heresy of Obvious Conclusions' and should be Day 1 (Monday June 18) at 8am, lasting 1 hour. 
Update that keynote with a description: 'Mandatory attendance for all track chairs. Coffee will be existential.' Bogdan (bogdan@test.com) and Ravi (ravi@test.com) need to meet urgently on Day 2 (Tuesday June 19) afternoon to discuss a problematic submission - find when they're both free and create 'Secret Tribunal of the Program Committee' for 2 hours at that time on the symposium calendar. Dr. Chiamaka (chiamaka@test.com) is presenting four different papers across the conference - tell me when each of her sessions is. Someone finally noticed the irony: 'Workshop: Introduction to Procrastination (Postponed)' - delete it. Create a private calendar called 'Speakers Green Room of Mild Panic' for backstage coordination. Ingrid (ingrid@test.com) just joined as volunteer coordinator - give her edit access to the main symposium calendar. Chiamaka's first presentation needs to move to the same venue as 'Panel: Temporal Causality Roundtable' — but only if that venue is free at the same time. If that panel already occupies that venue at that time, move Chiamaka's first presentation to 'Annex of Temporal Studies' instead.", "type": "actionEval", "seed_template": "calendar_default", "impersonate_user_id": "user_agent", @@ -927,7 +947,12 @@ "events.delete", "calendars.insert", "acl.insert" - ] + ], + "task_horizon": 10, + "operation_type": "search+C+R+U+D", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { @@ -1037,7 +1062,12 @@ "events.delete", "acl.insert", "calendars.insert" - ] + ], + "task_horizon": 7, + "operation_type": "search+C+R+U+D", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { @@ -1145,7 +1175,12 @@ "events.patch", "events.delete", "acl.insert" - ] + ], + "task_horizon": 6, + "operation_type": "search+C+R+U+D", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { @@ -1226,7 +1261,12 @@ 
"events.patch", "events.delete", "acl.insert" - ] + ], + "task_horizon": 7, + "operation_type": "search+C+R+U+D", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { @@ -1556,7 +1596,12 @@ "freeBusy.query", "events.patch", "events.delete" - ] + ], + "task_horizon": 24, + "operation_type": "search+C+R+U+D", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { @@ -1703,7 +1748,12 @@ "calendarList.patch", "acl.insert", "acl.delete" - ] + ], + "task_horizon": 12, + "operation_type": "search+C+R+U+D", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { @@ -1841,7 +1891,12 @@ "calendarList.patch", "acl.insert", "calendarList.delete" - ] + ], + "task_horizon": 14, + "operation_type": "search+C+R+U+D", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { @@ -1976,7 +2031,12 @@ "calendarList.patch", "acl.insert", "acl.delete" - ] + ], + "task_horizon": 12, + "operation_type": "search+C+R+U+D", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { @@ -2082,13 +2142,18 @@ "freeBusy.query", "events.patch", "events.delete" - ] + ], + "task_horizon": 7, + "operation_type": "search+C+R+U+D", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { "id": "test_14", "name": "Clockwork Tinkerers Guild - Recurring series exceptions", - "prompt": "I\u2019m setting up the Clockwork Tinkerers Guild calendar. First, show me my calendars so I don\u2019t duplicate anything; if we don\u2019t already have it, create a calendar called Clockwork Tinkerers Guild and give Aiko (aiko@test.com) write access. Our Gear & Ember Workshop needs to run every Friday at 6:00pm for eight weeks starting June 22, 2018\u2014set it up as a recurring series. 
However, we need two exceptions: the June 29 session should start at 7:00pm and include the note \u2018Late start due to forge maintenance,\u2019 and the July 6 session must be cancelled entirely (festival blackout). After applying those exceptions, show me the guild calendar so I can confirm the series looks right. Then add a one-off event called Brass Beetle Showcase on Saturday July 7 at noon.", + "prompt": "I’m setting up the Clockwork Tinkerers Guild calendar. First, show me my calendars so I don’t duplicate anything; if we don’t already have it, create a calendar called Clockwork Tinkerers Guild and give Aiko (aiko@test.com) write access. Our Gear & Ember Workshop needs to run every Friday at 6:00pm for eight weeks starting June 22, 2018—set it up as a recurring series. However, we need two exceptions: the June 29 session should start at 7:00pm and include the note ‘Late start due to forge maintenance,’ and the July 6 session must be cancelled entirely (festival blackout). After applying those exceptions, show me the guild calendar so I can confirm the series looks right. Then add a one-off event called Brass Beetle Showcase on Saturday July 7 at noon.", "type": "actionEval", "seed_template": "calendar_default", "impersonate_user_id": "user_agent", @@ -2264,13 +2329,18 @@ "events.insert", "events.patch", "events.list" - ] + ], + "task_horizon": 7, + "operation_type": "search+C+R+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "high" } }, { "id": "test_15", "name": "Tidal Library Rotations - Recurring series lifecycle", - "prompt": "We\u2019re setting up the tidal library\u2019s long-term calendar. First, show me my calendars and create Tidal Library Rotations if it doesn\u2019t already exist. Share it with Fumiko (fumiko@test.com) so she can edit. The Moon-Shell Rebinding ritual needs to recur monthly on the first Tuesday at 9:00am, starting July 3, 2018, and should continue indefinitely until we cancel it. 
We also need two exceptions: the August 7, 2018 occurrence should start at 11:00am with a note \u2018Storm-surge delay,\u2019 and the September 4, 2018 occurrence should be cancelled entirely. Add a separate one-off event called Ink Tide Inventory on July 15, 2018 at 4:00pm. After confirming the schedule looks right, delete the entire Moon-Shell Rebinding series.", + "prompt": "We’re setting up the tidal library’s long-term calendar. First, show me my calendars and create Tidal Library Rotations if it doesn’t already exist. Share it with Fumiko (fumiko@test.com) so she can edit. The Moon-Shell Rebinding ritual needs to recur monthly on the first Tuesday at 9:00am, starting July 3, 2018, and should continue indefinitely until we cancel it. We also need two exceptions: the August 7, 2018 occurrence should start at 11:00am with a note ‘Storm-surge delay,’ and the September 4, 2018 occurrence should be cancelled entirely. Add a separate one-off event called Ink Tide Inventory on July 15, 2018 at 4:00pm. After confirming the schedule looks right, delete the entire Moon-Shell Rebinding series.", "type": "actionEval", "seed_template": "calendar_default", "impersonate_user_id": "user_agent", @@ -2403,13 +2473,18 @@ "events.insert", "events.patch", "events.delete" - ] + ], + "task_horizon": 8, + "operation_type": "search+C+R+U+D", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { "id": "test_16", "name": "Monastery of Echoing Bells - Daily recurring lifecycle", - "prompt": "We\u2019re setting up a proper schedule for the Monastery of Echoing Bells. First, show me my calendars so I don\u2019t duplicate anything, and create a calendar with that name if needed. Give Linh (linh@test.com) edit access. The Dawn Bell Rite must recur daily at 5:30am, starting June 18, 2018, and it should continue indefinitely until we cancel it. 
I need two exceptions: the June 20, 2018 occurrence should start at 6:30am with the note \u2018Storm quiet hours,\u2019 and the June 23, 2018 occurrence should be cancelled entirely. Also, check when Kwame (kwame@test.com) is free on the evening of June 24. After you confirm the schedule, delete the entire Dawn Bell Rite series.", + "prompt": "We’re setting up a proper schedule for the Monastery of Echoing Bells. First, show me my calendars so I don’t duplicate anything, and create a calendar with that name if needed. Give Linh (linh@test.com) edit access. The Dawn Bell Rite must recur daily at 5:30am, starting June 18, 2018, and it should continue indefinitely until we cancel it. I need two exceptions: the June 20, 2018 occurrence should start at 6:30am with the note ‘Storm quiet hours,’ and the June 23, 2018 occurrence should be cancelled entirely. Also, check when Kwame (kwame@test.com) is free on the evening of June 24. After you confirm the schedule, delete the entire Dawn Bell Rite series.", "type": "actionEval", "seed_template": "calendar_default", "impersonate_user_id": "user_agent", @@ -2561,7 +2636,12 @@ "events.patch", "freeBusy.query", "events.delete" - ] + ], + "task_horizon": 6, + "operation_type": "search+C+R+U+D", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "high" } }, { @@ -2618,7 +2698,12 @@ "events.insert", "events.patch", "settings.watch" - ] + ], + "task_horizon": 3, + "operation_type": "C+U", + "entity_scope": "multi", + "information_availability": "explicit", + "prompt_ambiguity": "medium" } }, { @@ -2681,7 +2766,12 @@ "tools_required": [ "events.import", "acl.delete" - ] + ], + "task_horizon": 2, + "operation_type": "C+U+D", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "medium" } }, { @@ -2741,7 +2831,12 @@ "tools_required": [ "events.insert", "acl.delete" - ] + ], + "task_horizon": 2, + "operation_type": "C+U+D", + "entity_scope": "multi", + 
"information_availability": "implicit", + "prompt_ambiguity": "low" } }, { @@ -2815,7 +2910,12 @@ "acl.list", "events.update", "calendars.delete" - ] + ], + "task_horizon": 3, + "operation_type": "search+R+U+D", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { @@ -2853,7 +2953,12 @@ "min_tool_calls": 1, "tools_required": [ "events.insert" - ] + ], + "task_horizon": 1, + "operation_type": "C", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "low" } }, { @@ -2906,7 +3011,12 @@ "events.quickAdd", "events.get", "settings.watch" - ] + ], + "task_horizon": 4, + "operation_type": "search+C+R+U", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "medium" } }, { @@ -2965,7 +3075,12 @@ "acl.get", "events.update", "events.list" - ] + ], + "task_horizon": 3, + "operation_type": "search+R+U", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { @@ -2999,7 +3114,12 @@ "tools_required": [ "events.delete", "settings.list" - ] + ], + "task_horizon": 2, + "operation_type": "search+R+U+D", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "low" } }, { @@ -3025,7 +3145,12 @@ "min_tool_calls": 1, "tools_required": [ "calendars.insert" - ] + ], + "task_horizon": 1, + "operation_type": "C", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "low" } }, { @@ -3055,7 +3180,12 @@ "tools_required": [ "settings.list", "settings.watch" - ] + ], + "task_horizon": 2, + "operation_type": "search+C+R+U", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "low" } }, { @@ -3135,7 +3265,12 @@ "tools_required": [ "calendarList.patch", "events.update" - ] + ], + "task_horizon": 2, + "operation_type": "R+U", + "entity_scope": "single", + "information_availability": "implicit", + 
"prompt_ambiguity": "medium" } }, { @@ -3187,7 +3322,12 @@ "tools_required": [ "events.quickAdd", "events.watch" - ] + ], + "task_horizon": 2, + "operation_type": "C", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" } }, { @@ -3224,7 +3364,12 @@ "tools_required": [ "calendarList.get", "events.delete" - ] + ], + "task_horizon": 2, + "operation_type": "search+R+U+D", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "low" } }, { @@ -3260,7 +3405,12 @@ "min_tool_calls": 1, "tools_required": [ "events.delete" - ] + ], + "task_horizon": 1, + "operation_type": "U+D", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" } }, { @@ -3297,7 +3447,12 @@ "tools_required": [ "events.list", "events.move" - ] + ], + "task_horizon": 2, + "operation_type": "search+R+U", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "low" } }, { @@ -3338,7 +3493,12 @@ "min_tool_calls": 1, "tools_required": [ "events.import" - ] + ], + "task_horizon": 1, + "operation_type": "C", + "entity_scope": "multi", + "information_availability": "explicit", + "prompt_ambiguity": "low" } }, { @@ -3393,7 +3553,12 @@ "calendars.insert", "calendarList.get", "calendarList.update" - ] + ], + "task_horizon": 3, + "operation_type": "search+C+R+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { @@ -3423,7 +3588,11 @@ } }, "expected_count": 1, - "ignore": ["status", "sequence", "updated_at"] + "ignore": [ + "status", + "sequence", + "updated_at" + ] }, { "diff_type": "added", @@ -3444,7 +3613,12 @@ "tools_required": [ "events.move", "events.watch" - ] + ], + "task_horizon": 2, + "operation_type": "C+U", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "medium" } }, { @@ -3470,7 +3644,9 @@ } } }, - "expected_count": {"min": 1}, + 
"expected_count": { + "min": 1 + }, "description": "All events on this calendar should be cleared (cancelled)" }, { @@ -3497,7 +3673,12 @@ "tools_required": [ "calendars.clear", "calendarList.patch" - ] + ], + "task_horizon": 2, + "operation_type": "R+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "low" } }, { @@ -3553,7 +3734,12 @@ "calendars.insert", "acl.insert", "calendars.delete" - ] + ], + "task_horizon": 3, + "operation_type": "C+U+D", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "low" } }, { @@ -3634,7 +3820,12 @@ "freeBusy.query", "events.get", "channels.stop" - ] + ], + "task_horizon": 7, + "operation_type": "search+C+R+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { @@ -3715,7 +3906,12 @@ "events.watch", "calendarList.delete", "channels.stop" - ] + ], + "task_horizon": 4, + "operation_type": "C+R+U+D", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { @@ -3830,7 +4026,12 @@ "acl.update", "settings.get", "events.update" - ] + ], + "task_horizon": 5, + "operation_type": "search+R+U", + "entity_scope": "multi", + "information_availability": "explicit", + "prompt_ambiguity": "medium" } }, { @@ -3885,7 +4086,12 @@ "acl.insert", "acl.update", "settings.get" - ] + ], + "task_horizon": 4, + "operation_type": "search+C+R+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { @@ -3921,7 +4127,12 @@ "events.instances", "settings.list", "calendars.clear" - ] + ], + "task_horizon": 6, + "operation_type": "search+R+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { @@ -3983,7 +4194,12 @@ "calendars.patch", "acl.watch", "calendarList.delete" - ] + ], + "task_horizon": 4, + "operation_type": "search+C+R+U+D", + "entity_scope": "multi", + 
"information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { @@ -4084,7 +4300,12 @@ "events.patch", "events.list", "acl.watch" - ] + ], + "task_horizon": 6, + "operation_type": "search+C+R+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { @@ -4179,7 +4400,12 @@ "freeBusy.query", "events.insert", "calendarList.patch" - ] + ], + "task_horizon": 6, + "operation_type": "search+C+R+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { @@ -4288,13 +4514,18 @@ "events.update", "events.watch", "acl.watch" - ] + ], + "task_horizon": 6, + "operation_type": "search+C+R+U", + "entity_scope": "multi", + "information_availability": "explicit", + "prompt_ambiguity": "medium" } }, { "id": "test_46", "name": "Brineglass Works - move, create, ACL update, settings watch", - "prompt": "On the Brineglass Works calendar (ID cal_brineglass_works), first fetch the event evt_brineglass_forge_demo, then move it to the Harbor Kiln Hall calendar (ID cal_harbor_kiln_hall). Next, use free/busy to find the earliest 30-minute overlap for Lucia (lucia@test.com) and Noah (noah@test.com) on June 30, 2018, and create a new event Saltglass Alignment on Brineglass Works at that time. Then fully replace Lucia\u2019s ACL rule (user:lucia@test.com) on Brineglass Works to writer. Finally, set up a settings watch for my account.", + "prompt": "On the Brineglass Works calendar (ID cal_brineglass_works), first fetch the event evt_brineglass_forge_demo, then move it to the Harbor Kiln Hall calendar (ID cal_harbor_kiln_hall). Next, use free/busy to find the earliest 30-minute overlap for Lucia (lucia@test.com) and Noah (noah@test.com) on June 30, 2018, and create a new event Saltglass Alignment on Brineglass Works at that time. Then fully replace Lucia’s ACL rule (user:lucia@test.com) on Brineglass Works to writer. 
Finally, set up a settings watch for my account.", "type": "actionEval", "seed_template": "calendar_default", "impersonate_user_id": "user_agent", @@ -4384,7 +4615,12 @@ "events.insert", "acl.update", "settings.watch" - ] + ], + "task_horizon": 6, + "operation_type": "search+C+R+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { @@ -4477,7 +4713,12 @@ "calendarList.get", "acl.watch", "calendarList.watch" - ] + ], + "task_horizon": 7, + "operation_type": "search+C+R+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { @@ -4565,13 +4806,18 @@ "calendarList.patch", "calendarList.watch", "calendars.delete" - ] + ], + "task_horizon": 5, + "operation_type": "search+C+R+U+D", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { "id": "test_50", "name": "Emberveil Rookery - subscribe, watch, revoke, delete", - "prompt": "Subscribe me to the external calendar cal_emberveil_rookery. Then start an events watch on that calendar. Next, check Hana\u2019s calendar and only set up a settings watch for my account if Hana has an event on June 30, 2018 at 9:00-9:30am Asia/Tokyo time. Then, only remove Salma\u2019s access from Emberveil Rookery (rule cal_emberveil_rookery:user:salma@test.com) if that calendar has more than 7 events between June 20-27, 2018. Finally, delete the obsolete Ashfeather Annex calendar (ID cal_ashfeather_annex).", + "prompt": "Subscribe me to the external calendar cal_emberveil_rookery. Then start an events watch on that calendar. Next, check Hana’s calendar and only set up a settings watch for my account if Hana has an event on June 30, 2018 at 9:00-9:30am Asia/Tokyo time. Then, only remove Salma’s access from Emberveil Rookery (rule cal_emberveil_rookery:user:salma@test.com) if that calendar has more than 7 events between June 20-27, 2018. 
Finally, delete the obsolete Ashfeather Annex calendar (ID cal_ashfeather_annex).", "type": "actionEval", "seed_template": "calendar_default", "impersonate_user_id": "user_agent", @@ -4661,13 +4907,18 @@ "settings.watch", "acl.delete", "calendars.delete" - ] + ], + "task_horizon": 6, + "operation_type": "search+C+R+U+D", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { "id": "test_51", "name": "Windchord Cartotheca - create, color, list patch, ACL update", - "prompt": "Create a new calendar named Windchord Cartotheca. Pull the calendar color palette and set it to color ID 11. Then fetch its calendar list entry and patch it so the calendar is visible and selected. Also update the calendar description to 'Atlas repair bays.' Fully replace Aiko\u2019s ACL rule (user:aiko@test.com) on this calendar to writer. At the very end, on my primary calendar, count the attendees on event evt_cartotheca_intake_huddle (Cartotheca Intake Huddle) and copy those attendees as invitees to event evt_atlas_crate_sync (Atlas Crate Sync). Add the note 'Copied attendees' to Atlas Crate Sync\u2019s description.", + "prompt": "Create a new calendar named Windchord Cartotheca. Pull the calendar color palette and set it to color ID 11. Then fetch its calendar list entry and patch it so the calendar is visible and selected. Also update the calendar description to 'Atlas repair bays.' Fully replace Aiko’s ACL rule (user:aiko@test.com) on this calendar to writer. At the very end, on my primary calendar, count the attendees on event evt_cartotheca_intake_huddle (Cartotheca Intake Huddle) and copy those attendees as invitees to event evt_atlas_crate_sync (Atlas Crate Sync). 
Add the note 'Copied attendees' to Atlas Crate Sync’s description.", "type": "actionEval", "seed_template": "calendar_default", "impersonate_user_id": "user_agent", @@ -4781,7 +5032,12 @@ "acl.update", "events.get", "events.patch" - ] + ], + "task_horizon": 8, + "operation_type": "search+C+R+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { @@ -4982,13 +5238,18 @@ "events.list", "events.patch", "events.move" - ] + ], + "task_horizon": 9, + "operation_type": "search+C+R+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { "id": "test_53", "name": "Crystalfold Foundry - replace, delete events, unsubscribe", - "prompt": "On the Crystalfold Foundry calendar (ID cal_crystalfold_foundry), fully replace evt_crystalfold_quench so it\u2019s on July 1, 2018 from 9:00am-10:30am at Forge Bay 2. Then delete evt_crystalfold_slag and evt_crystalfold_mold. Finally, unsubscribe me from the Old Lattice Mill calendar (ID cal_old_lattice_mill).", + "prompt": "On the Crystalfold Foundry calendar (ID cal_crystalfold_foundry), fully replace evt_crystalfold_quench so it’s on July 1, 2018 from 9:00am-10:30am at Forge Bay 2. Then delete evt_crystalfold_slag and evt_crystalfold_mold. Finally, unsubscribe me from the Old Lattice Mill calendar (ID cal_old_lattice_mill).", "type": "actionEval", "seed_template": "calendar_default", "impersonate_user_id": "user_agent", @@ -5099,13 +5360,18 @@ "events.update", "events.delete", "calendarList.delete" - ] + ], + "task_horizon": 4, + "operation_type": "R+U+D", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "low" } }, { "id": "test_54", "name": "Sablewind Archive - share, revoke, list patch, settings watch", - "prompt": "On the Sablewind Archive calendar (ID cal_sablewind_archive), fetch its calendar list entry and set it to visible (not hidden) with color ID 5. 
Share the calendar with Keiko (keiko@test.com) as writer, and remove Salma\u2019s access (rule cal_sablewind_archive:user:salma@test.com). Finally, set up a settings watch for my account.", + "prompt": "On the Sablewind Archive calendar (ID cal_sablewind_archive), fetch its calendar list entry and set it to visible (not hidden) with color ID 5. Share the calendar with Keiko (keiko@test.com) as writer, and remove Salma’s access (rule cal_sablewind_archive:user:salma@test.com). Finally, set up a settings watch for my account.", "type": "actionEval", "seed_template": "calendar_default", "impersonate_user_id": "user_agent", @@ -5192,13 +5458,18 @@ "acl.insert", "acl.delete", "settings.watch" - ] + ], + "task_horizon": 6, + "operation_type": "search+C+R+U+D", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { "id": "test_55", "name": "Skyloom Observatory - event replace, ACL update, events watch", - "prompt": "On the Skyloom Observatory calendar (ID cal_skyloom_observatory), list events first. Then fully replace evt_skyloom_alignment so it\u2019s on July 2, 2018 from 8:00pm\u20139:00pm at Upper Ring. Also fully replace the ACL rule user:mechanic@skyloom.example to reader. After that, start an events watch on the Skyloom Observatory calendar and list events again to confirm the change.", + "prompt": "On the Skyloom Observatory calendar (ID cal_skyloom_observatory), list events first. Then fully replace evt_skyloom_alignment so it’s on July 2, 2018 from 8:00pm–9:00pm at Upper Ring. Also fully replace the ACL rule user:mechanic@skyloom.example to reader. 
After that, start an events watch on the Skyloom Observatory calendar and list events again to confirm the change.", "type": "actionEval", "seed_template": "calendar_default", "impersonate_user_id": "user_agent", @@ -5285,7 +5556,12 @@ "events.update", "acl.update", "events.watch" - ] + ], + "task_horizon": 4, + "operation_type": "search+C+R+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { @@ -5418,7 +5694,12 @@ "acl.patch", "channels.stop", "calendars.delete" - ] + ], + "task_horizon": 6, + "operation_type": "search+U+D", + "entity_scope": "multi", + "information_availability": "explicit", + "prompt_ambiguity": "medium" } }, { @@ -5484,7 +5765,12 @@ "calendarList.insert", "calendarList.patch", "events.import" - ] + ], + "task_horizon": 4, + "operation_type": "search+C+R+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { @@ -5583,7 +5869,12 @@ "acl.get", "acl.watch", "calendars.clear" - ] + ], + "task_horizon": 8, + "operation_type": "search+C+R+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { @@ -5794,7 +6085,12 @@ "events.list", "events.delete", "events.patch" - ] + ], + "task_horizon": 12, + "operation_type": "search+C+R+U+D", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { @@ -5910,13 +6206,18 @@ "acl.list", "events.list", "events.delete" - ] + ], + "task_horizon": 7, + "operation_type": "search+R+U+D", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { "id": "test_61", "name": "Wavelock Guest Sweep - create, purge attendees, freebusy, watch", - "prompt": "Create a new calendar called Wavelock Guest Sweep. 
Fully update that calendar to set timezone Europe/Berlin and description \"Guest sweep log and embargo notes.\" Then list events on my primary calendar and identify all events where Aiko, Farid, Lucia, or Oksana appear in the attendee list. Delete every such event. Run a free/busy query for those four across Aug 1\u20137, 2018, then another for Aug 8\u201314, 2018. Schedule a weekly 30-minute event on my primary calendar at the earliest time that doesn't conflict with any of those four attendees. Unsubscribe me from the legacy calendar cal_wavelock_legacy. Finally, start a watch on my calendar list and a watch on my settings.", + "prompt": "Create a new calendar called Wavelock Guest Sweep. Fully update that calendar to set timezone Europe/Berlin and description \"Guest sweep log and embargo notes.\" Then list events on my primary calendar and identify all events where Aiko, Farid, Lucia, or Oksana appear in the attendee list. Delete every such event. Run a free/busy query for those four across Aug 1–7, 2018, then another for Aug 8–14, 2018. Schedule a weekly 30-minute event on my primary calendar at the earliest time that doesn't conflict with any of those four attendees. Unsubscribe me from the legacy calendar cal_wavelock_legacy. 
Finally, start a watch on my calendar list and a watch on my settings.", "type": "actionEval", "seed_template": "calendar_default", "impersonate_user_id": "user_agent", @@ -6072,7 +6373,12 @@ "calendarList.delete", "calendarList.watch", "settings.watch" - ] + ], + "task_horizon": 10, + "operation_type": "search+C+R+U+D", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } } ] diff --git a/examples/linear/testsuites/linear_bench.json b/examples/linear/testsuites/linear_bench.json index 86ea4ac..542ef0c 100644 --- a/examples/linear/testsuites/linear_bench.json +++ b/examples/linear/testsuites/linear_bench.json @@ -40,7 +40,12 @@ "tools_required": [ "teams", "issueCreate" - ] + ], + "task_horizon": 2, + "operation_type": "search+C", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "low" } }, { @@ -73,7 +78,12 @@ "tools_required": [ "teams", "issueCreate" - ] + ], + "task_horizon": 2, + "operation_type": "search+C", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "low" } }, { @@ -110,7 +120,12 @@ "issues", "workflowStates", "issueUpdate" - ] + ], + "task_horizon": 3, + "operation_type": "search+U", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" } }, { @@ -150,7 +165,12 @@ "issues", "users", "issueUpdate" - ] + ], + "task_horizon": 3, + "operation_type": "search+U", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" } }, { @@ -179,7 +199,12 @@ "min_tool_calls": 1, "tools_required": [ "commentCreate" - ] + ], + "task_horizon": 1, + "operation_type": "C", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" } }, { @@ -215,7 +240,12 @@ "min_tool_calls": 1, "tools_required": [ "teamCreate" - ] + ], + "task_horizon": 1, + "operation_type": "C", + "entity_scope": "single", + 
"information_availability": "explicit", + "prompt_ambiguity": "low" } }, { @@ -256,7 +286,12 @@ "tools_required": [ "issues", "issueUpdate" - ] + ], + "task_horizon": 2, + "operation_type": "search+U", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" } }, { @@ -293,7 +328,12 @@ "teams", "users", "issueCreate" - ] + ], + "task_horizon": 3, + "operation_type": "search+C", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "low" } }, { @@ -329,7 +369,12 @@ "tools_required": [ "issues", "issueUpdate" - ] + ], + "task_horizon": 2, + "operation_type": "search+U", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" } }, { @@ -356,7 +401,12 @@ "tools_required": [ "teams", "issueCreate" - ] + ], + "task_horizon": 4, + "operation_type": "search+C", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "low" } }, { @@ -393,7 +443,12 @@ "issues", "workflowStates", "issueUpdate" - ] + ], + "task_horizon": 3, + "operation_type": "search+U", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" } }, { @@ -419,7 +474,12 @@ "min_tool_calls": 1, "tools_required": [ "issueLabelCreate" - ] + ], + "task_horizon": 1, + "operation_type": "C", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" } }, { @@ -447,7 +507,12 @@ "issueLabels", "issues", "issueUpdate" - ] + ], + "task_horizon": 3, + "operation_type": "search+U", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "low" } }, { @@ -478,7 +543,12 @@ "issueLabels", "issues", "issueUpdate" - ] + ], + "task_horizon": 3, + "operation_type": "search+U", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "low" } }, { @@ -514,7 +584,12 @@ "tools_required": [ "issues", "issueUpdate" - ] + ], + 
"task_horizon": 2, + "operation_type": "search+U", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "low" } }, { @@ -546,7 +621,12 @@ "comments", "issueLabels", "issueUpdate" - ] + ], + "task_horizon": 4, + "operation_type": "search+R+U", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "low" } }, { @@ -583,7 +663,12 @@ "issues", "workflowStates", "issueUpdate" - ] + ], + "task_horizon": 3, + "operation_type": "search+U", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" } }, { @@ -620,7 +705,12 @@ "issues", "users", "issueUpdate" - ] + ], + "task_horizon": 3, + "operation_type": "search+U", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" } }, { @@ -659,7 +749,12 @@ "teams", "issueLabels", "issueCreate" - ] + ], + "task_horizon": 3, + "operation_type": "search+C", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" } }, { @@ -688,7 +783,12 @@ "min_tool_calls": 1, "tools_required": [ "commentCreate" - ] + ], + "task_horizon": 1, + "operation_type": "C", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" } }, { @@ -725,7 +825,12 @@ "issues", "comments", "commentUpdate" - ] + ], + "task_horizon": 3, + "operation_type": "U", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" } }, { @@ -765,7 +870,12 @@ "issues", "comments", "commentDelete" - ] + ], + "task_horizon": 3, + "operation_type": "search+D", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" } }, { @@ -817,7 +927,12 @@ "teams", "issueLabels", "issueCreate" - ] + ], + "task_horizon": 4, + "operation_type": "search+C", + "entity_scope": "multi", + "information_availability": "explicit", + "prompt_ambiguity": "low" } }, { @@ -854,7 +969,12 @@ 
"issues", "workflowStates", "issueUpdate" - ] + ], + "task_horizon": 3, + "operation_type": "search+U", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" } }, { @@ -892,7 +1012,12 @@ "issueLabelCreate", "issues", "issueUpdate" - ] + ], + "task_horizon": 3, + "operation_type": "search+C+U", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" } }, { @@ -928,7 +1053,12 @@ "tools_required": [ "issueLabels", "issueLabelUpdate" - ] + ], + "task_horizon": 2, + "operation_type": "search+U", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "low" } }, { @@ -964,7 +1094,12 @@ "tools_required": [ "issues", "issueUpdate" - ] + ], + "task_horizon": 2, + "operation_type": "search+U", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" } }, { @@ -1020,7 +1155,12 @@ "issueCreate", "issueUpdate", "commentCreate" - ] + ], + "task_horizon": 5, + "operation_type": "search+C+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { @@ -1059,7 +1199,12 @@ "tools_required": [ "workflowStates", "workflowStateArchive" - ] + ], + "task_horizon": 2, + "operation_type": "search+D", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" } }, { @@ -1113,7 +1258,12 @@ "issues", "workflowStateCreate", "issueUpdate" - ] + ], + "task_horizon": 4, + "operation_type": "search+C+U", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { @@ -1143,7 +1293,12 @@ "tools_required": [ "teams", "teamMembershipCreate" - ] + ], + "task_horizon": 2, + "operation_type": "search+C", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" } }, { @@ -1178,7 +1333,12 @@ "min_tool_calls": 1, "tools_required": [ "commentUpdate" - ] + ], + 
"task_horizon": 1, + "operation_type": "U", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "medium" } }, { @@ -1239,7 +1399,12 @@ "issueLabels", "issues", "issueUpdate" - ] + ], + "task_horizon": 4, + "operation_type": "search+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "low" } }, { @@ -1285,7 +1450,12 @@ "teams", "workflowStates", "issueUpdate" - ] + ], + "task_horizon": 4, + "operation_type": "search+U", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" } }, { @@ -1328,7 +1498,12 @@ "teams", "issueCreate", "issueRelationCreate" - ] + ], + "task_horizon": 5, + "operation_type": "search+R+C", + "entity_scope": "multi", + "information_availability": "explicit", + "prompt_ambiguity": "low" } }, { @@ -1365,7 +1540,12 @@ "teams", "issues", "issueUpdate" - ] + ], + "task_horizon": 3, + "operation_type": "search+R+U", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "low" } }, { @@ -1417,7 +1597,12 @@ "issueLabels", "issues", "issueUpdate" - ] + ], + "task_horizon": 4, + "operation_type": "search+C+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "low" } }, { @@ -1469,7 +1654,12 @@ "issues", "issueUpdate", "commentCreate" - ] + ], + "task_horizon": 7, + "operation_type": "search+C+U", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "low" } }, { @@ -1513,7 +1703,12 @@ "issues", "users", "issueCreate" - ] + ], + "task_horizon": 4, + "operation_type": "search+C", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "low" } }, { @@ -1551,7 +1746,12 @@ "teams", "issueCreate", "issueRelationCreate" - ] + ], + "task_horizon": 6, + "operation_type": "search+C", + "entity_scope": "multi", + "information_availability": "explicit", + "prompt_ambiguity": "low" } }, { @@ 
-1666,13 +1866,18 @@ "issueRelationCreate", "issueUpdate", "commentCreate" - ] + ], + "task_horizon": 9, + "operation_type": "search+C+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { "id": "test_42", "name": "Seed Library Germination Audit", - "prompt": "The Seed Library team is conducting its quarterly germination audit. We need to evaluate each donor's success rate and take appropriate action.\n\nFirst, calculate Yuto's germination rate: count how many of his seed packets are in \"Sprouted\" status versus \"Failed\" status, then compute (sprouted / total) \u00d7 100. If his rate is 75% or higher AND he has at least 2 different varieties that sprouted, create a new label called \"Seed Guardian\" and apply it to all of his packets as recognition.\n\nNext, handle Nneka's special case: she donated exactly one packet and it's marked as priority 1 (rare heirloom). Regardless of whether it sprouted or failed, move this packet to \"Preserved Collection\" status\u2014the library will attempt tissue culture propagation on rare genetics.\n\nFinally, evaluate Szymon's packets the same way. If his germination rate is below 60%, move all his non-sprouted packets to \"Needs Donor Review\" status and add a comment to each one that reads: \"GERMINATION_AUDIT: X sprouted / Y total = Z% - below 60% threshold\" where X, Y, Z are the actual calculated values.", + "prompt": "The Seed Library team is conducting its quarterly germination audit. We need to evaluate each donor's success rate and take appropriate action.\n\nFirst, calculate Yuto's germination rate: count how many of his seed packets are in \"Sprouted\" status versus \"Failed\" status, then compute (sprouted / total) × 100. 
If his rate is 75% or higher AND he has at least 2 different varieties that sprouted, create a new label called \"Seed Guardian\" and apply it to all of his packets as recognition.\n\nNext, handle Nneka's special case: she donated exactly one packet and it's marked as priority 1 (rare heirloom). Regardless of whether it sprouted or failed, move this packet to \"Preserved Collection\" status—the library will attempt tissue culture propagation on rare genetics.\n\nFinally, evaluate Szymon's packets the same way. If his germination rate is below 60%, move all his non-sprouted packets to \"Needs Donor Review\" status and add a comment to each one that reads: \"GERMINATION_AUDIT: X sprouted / Y total = Z% - below 60% threshold\" where X, Y, Z are the actual calculated values.", "type": "actionEval", "seed_template": "linear_expanded", "impersonate_user_id": "2790a7ee-fde0-4537-9588-e233aa5a68d1", @@ -1772,13 +1977,18 @@ "issueLabelCreate", "issueUpdate", "commentCreate" - ] + ], + "task_horizon": 11, + "operation_type": "search+R+C+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { "id": "test_43", "name": "Forest Mycology Collective Expedition", - "prompt": "The Forest Mycology Collective is organizing their autumn foraging expedition. First, create a new team called \"Forest Mycology Collective\" to track all club activities.\n\nCreate a label called \"awaiting-spore-print\" for specimens that need laboratory analysis before identification can be confirmed.\n\nNow set up the expedition: create an issue titled \"Coastal Redwood Reserve Autumn Foray\" and assign it to Haruki as the expedition leader.\n\nDuring the planning phase, we're pre-logging anticipated specimen finds based on last year's survey. Create a specimen issue titled \"Specimen #1: Cantharellus formosus cluster - Sector 7\" and assign it to Priya for documentation. 
Create another specimen issue \"Specimen #2: Unknown Amanita - requires cross-reference\" and assign it to Dmitri, applying the \"awaiting-spore-print\" label.\n\nThe Amanita identification depends on comparing its spore print against the Cantharellus specimen first (they were found in the same microhabitat and we need to rule out look-alikes). Set up the Amanita issue as blocked by the Cantharellus issue.\n\nFinally, add a field note comment to the Cantharellus specimen that reads: \"FIELD_NOTE_REF: GPS coordinates 41.2132\u00b0N, found near fallen Douglas fir. Fruiting body golden-yellow, false gills present, apricot aroma confirmed.\"", + "prompt": "The Forest Mycology Collective is organizing their autumn foraging expedition. First, create a new team called \"Forest Mycology Collective\" to track all club activities.\n\nCreate a label called \"awaiting-spore-print\" for specimens that need laboratory analysis before identification can be confirmed.\n\nNow set up the expedition: create an issue titled \"Coastal Redwood Reserve Autumn Foray\" and assign it to Haruki as the expedition leader.\n\nDuring the planning phase, we're pre-logging anticipated specimen finds based on last year's survey. Create a specimen issue titled \"Specimen #1: Cantharellus formosus cluster - Sector 7\" and assign it to Priya for documentation. Create another specimen issue \"Specimen #2: Unknown Amanita - requires cross-reference\" and assign it to Dmitri, applying the \"awaiting-spore-print\" label.\n\nThe Amanita identification depends on comparing its spore print against the Cantharellus specimen first (they were found in the same microhabitat and we need to rule out look-alikes). Set up the Amanita issue as blocked by the Cantharellus issue.\n\nFinally, add a field note comment to the Cantharellus specimen that reads: \"FIELD_NOTE_REF: GPS coordinates 41.2132°N, found near fallen Douglas fir. 
Fruiting body golden-yellow, false gills present, apricot aroma confirmed.\"", "type": "actionEval", "seed_template": "linear_expanded", "impersonate_user_id": "2790a7ee-fde0-4537-9588-e233aa5a68d1", @@ -1887,7 +2097,12 @@ "issueUpdate", "issueRelationCreate", "commentCreate" - ] + ], + "task_horizon": 11, + "operation_type": "search+C+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { @@ -1951,7 +2166,12 @@ "issueCreate", "issueUpdate", "commentCreate" - ] + ], + "task_horizon": 8, + "operation_type": "search+C+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { @@ -1998,7 +2218,9 @@ } } }, - "ignore": ["updatedAt"] + "ignore": [ + "updatedAt" + ] }, { "diff_type": "added", @@ -2045,7 +2267,12 @@ "issueUpdate", "commentCreate", "commentUpdate" - ] + ], + "task_horizon": 8, + "operation_type": "search+R+C+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { @@ -2116,7 +2343,12 @@ "commentCreate", "commentDelete", "issueUpdate" - ] + ], + "task_horizon": 9, + "operation_type": "search+C+U+D", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { @@ -2220,13 +2452,18 @@ "issueRelationCreate", "issueUpdate", "commentCreate" - ] + ], + "task_horizon": 12, + "operation_type": "search+C+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { "id": "test_48", "name": "Research Grant Application Pipeline", - "prompt": "The Research team needs to set up the grant application pipeline for the upcoming NIH submission deadline (June 15th).\n\nFirst, find the existing \"IRB Ethics Approval\" issue - this is our starting point and is already in progress.\n\nCreate three new issues in the Research team to complete the pipeline:\n\n1. \"Data Collection Protocol v2\" - Nadia will own this. 
It cannot begin until ethics approval is complete.\n\n2. \"Pilot Study Design - 50 participant cohort\" - Tom\u00e1s will lead this. It depends on having the data collection protocol finalized.\n\n3. \"Grant Submission Draft - R01 mechanism\" - Chioma will compile the final submission. This is the last step and depends on pilot study results.\n\nSet up the blocking relationships to enforce the sequential workflow:\n- IRB Ethics Approval blocks Data Collection Protocol\n- Data Collection Protocol blocks Pilot Study Design\n- Pilot Study Design blocks Grant Submission Draft\n\nAfter setting up the dependencies, add a comment to the Grant Submission issue summarizing the critical path: \"PIPELINE_STATUS: This submission depends on completion chain: Ethics (in progress) \u2192 Data Protocol (Nadia) \u2192 Pilot Study (Tom\u00e1s) \u2192 This draft. Target: June 15th deadline.\"", + "prompt": "The Research team needs to set up the grant application pipeline for the upcoming NIH submission deadline (June 15th).\n\nFirst, find the existing \"IRB Ethics Approval\" issue - this is our starting point and is already in progress.\n\nCreate three new issues in the Research team to complete the pipeline:\n\n1. \"Data Collection Protocol v2\" - Nadia will own this. It cannot begin until ethics approval is complete.\n\n2. \"Pilot Study Design - 50 participant cohort\" - Tomás will lead this. It depends on having the data collection protocol finalized.\n\n3. \"Grant Submission Draft - R01 mechanism\" - Chioma will compile the final submission. 
This is the last step and depends on pilot study results.\n\nSet up the blocking relationships to enforce the sequential workflow:\n- IRB Ethics Approval blocks Data Collection Protocol\n- Data Collection Protocol blocks Pilot Study Design\n- Pilot Study Design blocks Grant Submission Draft\n\nAfter setting up the dependencies, add a comment to the Grant Submission issue summarizing the critical path: \"PIPELINE_STATUS: This submission depends on completion chain: Ethics (in progress) → Data Protocol (Nadia) → Pilot Study (Tomás) → This draft. Target: June 15th deadline.\"", "type": "actionEval", "seed_template": "linear_expanded", "impersonate_user_id": "2790a7ee-fde0-4537-9588-e233aa5a68d1", @@ -2310,7 +2547,12 @@ "issueRelationCreate", "issueUpdate", "commentCreate" - ] + ], + "task_horizon": 12, + "operation_type": "search+C+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { @@ -2394,13 +2636,18 @@ "issueRelationCreate", "issueUpdate", "commentCreate" - ] + ], + "task_horizon": 12, + "operation_type": "search+C+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { "id": "test_50", "name": "Archaeological Dig Site Coordination - Deadlock Resolution", - "prompt": "The Archaeology team is managing the Season 3 excavation at Site Karnak-West. There's a workflow problem blocking progress.\n\nExamine the issues \"Artifact Photography Documentation\" and \"Lab Sample Analysis\". These two issues are in a dependency deadlock - each one is marked as blocking the other, which means neither can proceed.\n\nDetermine which blocking relationship is incorrect. The correct archaeological workflow is: Photography must complete BEFORE samples can go to the lab (you need photos of artifacts in situ before extraction for the record). 
The reverse relationship (lab blocking photography) was added by mistake and makes no sense.\n\nDelete the incorrect blocking relationship to resolve the deadlock.\n\nNow extend the workflow. Create a new issue called \"Final Site Report Compilation - Season 3\" in the Archaeology team. This report cannot be written until BOTH the photography documentation AND the lab analysis are complete. Set up both as blockers for the report.\n\nAssign the work: Ximena handles photography, Okonkwo handles lab analysis, and S\u00f8ren compiles the final report.\n\nMove the photography issue to \"In Progress\" now that it's unblocked.\n\nAfter fixing everything, add a comment to the \"Lab Sample Analysis\" issue documenting the fix: \"WORKFLOW_FIX: Removed erroneous blocking relation where Lab was blocking Photography. Correct flow is Photography \u2192 Lab (need in-situ photos before extraction). Deadlock resolved. Current chain: Photography \u2192 Lab Analysis \u2192 Final Report.\"", + "prompt": "The Archaeology team is managing the Season 3 excavation at Site Karnak-West. There's a workflow problem blocking progress.\n\nExamine the issues \"Artifact Photography Documentation\" and \"Lab Sample Analysis\". These two issues are in a dependency deadlock - each one is marked as blocking the other, which means neither can proceed.\n\nDetermine which blocking relationship is incorrect. The correct archaeological workflow is: Photography must complete BEFORE samples can go to the lab (you need photos of artifacts in situ before extraction for the record). The reverse relationship (lab blocking photography) was added by mistake and makes no sense.\n\nDelete the incorrect blocking relationship to resolve the deadlock.\n\nNow extend the workflow. Create a new issue called \"Final Site Report Compilation - Season 3\" in the Archaeology team. This report cannot be written until BOTH the photography documentation AND the lab analysis are complete. 
Set up both as blockers for the report.\n\nAssign the work: Ximena handles photography, Okonkwo handles lab analysis, and Søren compiles the final report.\n\nMove the photography issue to \"In Progress\" now that it's unblocked.\n\nAfter fixing everything, add a comment to the \"Lab Sample Analysis\" issue documenting the fix: \"WORKFLOW_FIX: Removed erroneous blocking relation where Lab was blocking Photography. Correct flow is Photography → Lab (need in-situ photos before extraction). Deadlock resolved. Current chain: Photography → Lab Analysis → Final Report.\"", "type": "actionEval", "seed_template": "linear_expanded", "impersonate_user_id": "2790a7ee-fde0-4537-9588-e233aa5a68d1", @@ -2532,7 +2779,12 @@ "issueRelationCreate", "issueUpdate", "commentCreate" - ] + ], + "task_horizon": 13, + "operation_type": "search+C+U+D", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { @@ -2627,13 +2879,18 @@ "issueCreate", "issueUpdate", "commentCreate" - ] + ], + "task_horizon": 10, + "operation_type": "search+C+U", + "entity_scope": "multi", + "information_availability": "explicit", + "prompt_ambiguity": "medium" } }, { "id": "test_52", "name": "Pottery Studio - Kiln Firing Schedule", - "prompt": "The Clay & Fire pottery studio tracks their kiln schedule. Help update the firing queue.\n\nFirst, find the existing issues \"Fatou's Celadon Vase\" and \"Stoneware Bowl Set\" in the Ceramics team.\n\nThe celadon vase is ready to go in the kiln - move it to \"Firing\" status.\n\nThe stoneware bowls have finished their cone 10 firing and need to cool down - move them to \"Cooling\" status.\n\nCreate a new label called \"raku-firing\" for pieces that will use the rapid-cooling technique (we'll apply it to future items).\n\nFinally, add a kiln log comment to the celadon vase issue: \"KILN_LOG: Loaded into kiln #2 at 9:15am. Target: Cone 9 oxidation (~2300\u00b0F). Fatou requested slow cooling for crystal development. 
Do not open kiln door until temp drops below 400\u00b0F.\"", + "prompt": "The Clay & Fire pottery studio tracks their kiln schedule. Help update the firing queue.\n\nFirst, find the existing issues \"Fatou's Celadon Vase\" and \"Stoneware Bowl Set\" in the Ceramics team.\n\nThe celadon vase is ready to go in the kiln - move it to \"Firing\" status.\n\nThe stoneware bowls have finished their cone 10 firing and need to cool down - move them to \"Cooling\" status.\n\nCreate a new label called \"raku-firing\" for pieces that will use the rapid-cooling technique (we'll apply it to future items).\n\nFinally, add a kiln log comment to the celadon vase issue: \"KILN_LOG: Loaded into kiln #2 at 9:15am. Target: Cone 9 oxidation (~2300°F). Fatou requested slow cooling for crystal development. Do not open kiln door until temp drops below 400°F.\"", "type": "actionEval", "seed_template": "linear_expanded", "impersonate_user_id": "2790a7ee-fde0-4537-9588-e233aa5a68d1", @@ -2737,7 +2994,12 @@ "issueUpdate", "issueLabelCreate", "commentCreate" - ] + ], + "task_horizon": 10, + "operation_type": "search+C+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { @@ -2835,13 +3097,18 @@ "users", "issueCreate", "issueUpdate" - ] + ], + "task_horizon": 7, + "operation_type": "search+C+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { "id": "test_54", - "name": "Board Game Caf\u00e9 - Tournament Rescheduling Crisis", - "prompt": "The Meeple & Brew board game caf\u00e9 has a scheduling emergency. 
The venue for our Catan Regional Championship double-booked us, so we need to reschedule the entire tournament pipeline.\n\nFind the three tournament issues: \"Catan Regional Championship - Spring 2025\", \"Qualifying Round - Top 16 Bracket\", and \"Tournament Registration Deadline\".\n\nThe championship was originally March 15th but must move to March 23rd (8-day delay).\n\nHere's the critical part - the dates are interdependent:\n- The Qualifying Round must happen exactly 7 days before the Championship\n- The Registration Deadline must close exactly 5 days before the Qualifying Round\n\nCalculate and update all three due dates accordingly.\n\nAlso, Yuto was organizing the championship but has a work trip conflict on the new date. Reassign the championship to Adaeze. Keep Henrik on the qualifying round.\n\nAfter updating all dates, add a comment to the championship issue documenting the changes:\n\n\"RESCHEDULE_AUDIT: Venue conflict forced 8-day delay. New timeline calculated:\n- Registration closes: March 11th (was March 3rd)\n- Qualifiers: March 16th (was March 8th)\n- Championship: March 23rd (was March 15th)\nOrganizer handoff: Yuto \u2192 Adaeze due to travel conflict.\"", + "name": "Board Game Café - Tournament Rescheduling Crisis", + "prompt": "The Meeple & Brew board game café has a scheduling emergency. 
The venue for our Catan Regional Championship double-booked us, so we need to reschedule the entire tournament pipeline.\n\nFind the three tournament issues: \"Catan Regional Championship - Spring 2025\", \"Qualifying Round - Top 16 Bracket\", and \"Tournament Registration Deadline\".\n\nThe championship was originally March 15th but must move to March 23rd (8-day delay).\n\nHere's the critical part - the dates are interdependent:\n- The Qualifying Round must happen exactly 7 days before the Championship\n- The Registration Deadline must close exactly 5 days before the Qualifying Round\n\nCalculate and update all three due dates accordingly.\n\nAlso, Yuto was organizing the championship but has a work trip conflict on the new date. Reassign the championship to Adaeze. Keep Henrik on the qualifying round.\n\nAfter updating all dates, add a comment to the championship issue documenting the changes:\n\n\"RESCHEDULE_AUDIT: Venue conflict forced 8-day delay. New timeline calculated:\n- Registration closes: March 11th (was March 3rd)\n- Qualifiers: March 16th (was March 8th)\n- Championship: March 23rd (was March 15th)\nOrganizer handoff: Yuto → Adaeze due to travel conflict.\"", "type": "actionEval", "seed_template": "linear_expanded", "impersonate_user_id": "2790a7ee-fde0-4537-9588-e233aa5a68d1", @@ -2949,7 +3216,12 @@ "users", "issueUpdate", "commentCreate" - ] + ], + "task_horizon": 8, + "operation_type": "search+C+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { @@ -2970,7 +3242,9 @@ }, "expected_changes": { "teamId": { - "to": {"eq": "ad608998-915c-4bad-bcd9-85ebfccccee8"} + "to": { + "eq": "ad608998-915c-4bad-bcd9-85ebfccccee8" + } } }, "expected_count": 1, @@ -3098,7 +3372,12 @@ ], "requires_aggregation": true, "requires_max_finding": true, - "requires_arithmetic": true + "requires_arithmetic": true, + "task_horizon": 12, + "operation_type": "search+R+C+U", + "entity_scope": "multi", + 
"information_availability": "implicit", + "prompt_ambiguity": "high" } }, { @@ -3213,13 +3492,18 @@ "requires_counting": true, "requires_id_collection": true, "expected_count": 3, - "affected_issues": 2 + "affected_issues": 2, + "task_horizon": 11, + "operation_type": "search+R+C", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } }, { "id": "test_57", "name": "Competitive Pigeon Racing Club - Storm Emergency Protocol", - "prompt": "The Wing & Wind pigeon racing club has an emergency. A fast-moving storm system is approaching the race corridor and we need to execute safety protocols.\n\nFirst, find all birds currently marked as \"In Flight\" in the Racing Operations team - these are the ones at risk.\n\nCreate an emergency coordination issue in the Racing Operations team titled \"WEATHER ALERT: Storm cell approaching sector 7 - All birds at risk\" with description \"NWS severe thunderstorm warning issued 14:32 UTC. Wind gusts to 60mph expected. Initiating emergency diversion protocol.\"\n\nFind the bird tracking issue for \"Stormchaser\" (Liora's champion racer, band #2847). Update it to add this to the description: \"DIVERSION ACTIVE: Rerouted to backup loft at coordinates 41.8781\u00b0 N, 87.6298\u00b0 W. Amadi's loft confirmed ready to receive.\"\n\nFinally, add a weather advisory comment to the emergency coordination issue:\n\"WEATHER_LOG: Storm tracking update at 14:45 UTC. Cell moving NNE at 35mph. ETA to race corridor: 47 minutes. All handlers notified via SMS. GPS tracking shows 3 birds diverted successfully. Amadi confirming visual on Stormchaser approaching backup loft.\"", + "prompt": "The Wing & Wind pigeon racing club has an emergency. 
A fast-moving storm system is approaching the race corridor and we need to execute safety protocols.\n\nFirst, find all birds currently marked as \"In Flight\" in the Racing Operations team - these are the ones at risk.\n\nCreate an emergency coordination issue in the Racing Operations team titled \"WEATHER ALERT: Storm cell approaching sector 7 - All birds at risk\" with description \"NWS severe thunderstorm warning issued 14:32 UTC. Wind gusts to 60mph expected. Initiating emergency diversion protocol.\"\n\nFind the bird tracking issue for \"Stormchaser\" (Liora's champion racer, band #2847). Update it to add this to the description: \"DIVERSION ACTIVE: Rerouted to backup loft at coordinates 41.8781° N, 87.6298° W. Amadi's loft confirmed ready to receive.\"\n\nFinally, add a weather advisory comment to the emergency coordination issue:\n\"WEATHER_LOG: Storm tracking update at 14:45 UTC. Cell moving NNE at 35mph. ETA to race corridor: 47 minutes. All handlers notified via SMS. GPS tracking shows 3 birds diverted successfully. 
Amadi confirming visual on Stormchaser approaching backup loft.\"", "type": "actionEval", "seed_template": "linear_expanded", "impersonate_user_id": "2790a7ee-fde0-4537-9588-e233aa5a68d1", @@ -3381,7 +3665,12 @@ ], "requires_state_filtering": true, "expected_in_flight_birds": 3, - "coordinate_precision_test": true + "coordinate_precision_test": true, + "task_horizon": 12, + "operation_type": "search+R+C+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" } } ] diff --git a/examples/slack/testsuites/slack_bench_v2.json b/examples/slack/testsuites/slack_bench_v2.json index a74bbe0..b241dc3 100644 --- a/examples/slack/testsuites/slack_bench_v2.json +++ b/examples/slack/testsuites/slack_bench_v2.json @@ -22,7 +22,12 @@ "tools_required": [ "conversations.list", "chat.postMessage" - ] + ], + "task_horizon": 2, + "operation_type": "C+R", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -53,7 +58,12 @@ "users.list", "conversations.open", "chat.postMessage" - ] + ], + "task_horizon": 3, + "operation_type": "search+C", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -94,7 +104,12 @@ "users.list", "conversations.open", "chat.postMessage" - ] + ], + "task_horizon": 3, + "operation_type": "search+C", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -133,7 +148,12 @@ "min_tool_calls": 1, "tools_required": [ "conversations.create" - ] + ], + "task_horizon": 1, + "operation_type": "C", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -161,7 +181,12 @@ "users.list", "conversations.list", "conversations.invite" - ] + ], + "task_horizon": 3, + "operation_type": "search+C", + "entity_scope": "single", + "information_availability": "explicit", + 
"prompt_ambiguity": "low" }, "assertions": [ { @@ -192,7 +217,12 @@ "users.list", "conversations.create", "conversations.invite" - ] + ], + "task_horizon": 3, + "operation_type": "search+C", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -230,7 +260,12 @@ "users.list", "conversations.list", "conversations.kick" - ] + ], + "task_horizon": 3, + "operation_type": "search+D", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -260,7 +295,12 @@ "tools_required": [ "conversations.list", "conversations.archive" - ] + ], + "task_horizon": 2, + "operation_type": "search+U", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -292,7 +332,12 @@ "conversations.list", "conversations.history", "chat.postMessage" - ] + ], + "task_horizon": 3, + "operation_type": "search+R+C", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "high" }, "assertions": [ { @@ -300,7 +345,7 @@ "entity": "messages", "where": { "parent_id": { - "eq": "1700173200.000456" + "eq": "1706115500.000001" }, "channel_id": { "eq": "C01ABCD1234" @@ -326,7 +371,12 @@ "conversations.list", "conversations.history", "chat.postMessage" - ] + ], + "task_horizon": 3, + "operation_type": "search+R+C", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -362,7 +412,12 @@ "reactions.add", "reactions.add", "reactions.add" - ] + ], + "task_horizon": 5, + "operation_type": "search+R+C", + "entity_scope": "multi", + "information_availability": "explicit", + "prompt_ambiguity": "medium" }, "assertions": [ { @@ -419,7 +474,12 @@ "conversations.list", "conversations.history", "reactions.add" - ] + ], + "task_horizon": 3, + "operation_type": "search+R+C", + "entity_scope": "single", + 
"information_availability": "explicit", + "prompt_ambiguity": "high" }, "assertions": [ { @@ -427,7 +487,7 @@ "entity": "message_reactions", "where": { "message_id": { - "eq": "1700173200.000456" + "eq": "1706115500.000001" }, "user_id": { "eq": "U01AGENBOT9" @@ -452,7 +512,12 @@ "tools_required": [ "conversations.list", "conversations.setTopic" - ] + ], + "task_horizon": 2, + "operation_type": "search+U", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -485,7 +550,12 @@ "tools_required": [ "search.messages", "chat.update" - ] + ], + "task_horizon": 2, + "operation_type": "search+U", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -522,7 +592,12 @@ "users.list", "conversations.list", "chat.postMessage" - ] + ], + "task_horizon": 3, + "operation_type": "search+C", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -553,7 +628,12 @@ "conversations.list", "chat.postMessage", "chat.postMessage" - ] + ], + "task_horizon": 3, + "operation_type": "search+C", + "entity_scope": "multi", + "information_availability": "explicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -597,7 +677,12 @@ "conversations.list", "conversations.history", "chat.delete" - ] + ], + "task_horizon": 3, + "operation_type": "R+D", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "medium" }, "assertions": [ { @@ -631,7 +716,12 @@ "chat.postMessage", "chat.postMessage", "chat.postMessage" - ] + ], + "task_horizon": 6, + "operation_type": "search+R+C", + "entity_scope": "multi", + "information_availability": "explicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -702,7 +792,12 @@ "users.list", "conversations.open", "chat.postMessage" - ] + ], + "task_horizon": 4, + "operation_type": "search+R+C", + "entity_scope": "single", + 
"information_availability": "implicit", + "prompt_ambiguity": "medium" }, "assertions": [ { @@ -824,7 +919,12 @@ "users.list", "conversations.open", "chat.postMessage" - ] + ], + "task_horizon": 4, + "operation_type": "search+R+C", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "medium" }, "assertions": [ { @@ -940,7 +1040,12 @@ "search.messages", "conversations.history", "chat.update" - ] + ], + "task_horizon": 3, + "operation_type": "search+R+U", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "medium" }, "assertions": [ { @@ -1072,7 +1177,12 @@ "conversations.history", "conversations.replies", "chat.update" - ] + ], + "task_horizon": 3, + "operation_type": "R+U", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -1114,7 +1224,12 @@ "tools_required": [ "conversations.list", "chat.postMessage" - ] + ], + "task_horizon": 2, + "operation_type": "search+C", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -1170,7 +1285,12 @@ "tools_required": [ "conversations.list", "chat.postMessage" - ] + ], + "task_horizon": 2, + "operation_type": "search+C", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -1214,7 +1334,12 @@ "conversations.list", "chat.postMessage", "chat.postMessage" - ] + ], + "task_horizon": 3, + "operation_type": "C+R", + "entity_scope": "multi", + "information_availability": "explicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -1257,7 +1382,12 @@ "tools_required": [ "conversations.list", "chat.postMessage" - ] + ], + "task_horizon": 2, + "operation_type": "C+R", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -1287,7 +1417,12 @@ "tools_required": [ "conversations.list", 
"chat.postMessage" - ] + ], + "task_horizon": 2, + "operation_type": "C+R", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -1330,7 +1465,12 @@ "tools_required": [ "conversations.list", "chat.postMessage" - ] + ], + "task_horizon": 2, + "operation_type": "C+R", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -1361,7 +1501,12 @@ "users.list", "conversations.list", "chat.postMessage" - ] + ], + "task_horizon": 3, + "operation_type": "search+C", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -1405,7 +1550,12 @@ "search.messages", "conversations.open", "chat.postMessage" - ] + ], + "task_horizon": 3, + "operation_type": "search+C", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "medium" }, "assertions": [ { @@ -1457,7 +1607,12 @@ "search.messages", "conversations.create", "conversations.invite" - ] + ], + "task_horizon": 9, + "operation_type": "search+C", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" }, "assertions": [ { @@ -1503,7 +1658,12 @@ "users.list", "conversations.open", "chat.postMessage" - ] + ], + "task_horizon": 4, + "operation_type": "search+C+R", + "entity_scope": "multi", + "information_availability": "explicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -1564,7 +1724,12 @@ "users.list", "conversations.list", "chat.postMessage" - ] + ], + "task_horizon": 4, + "operation_type": "search+R+C+D", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -1608,7 +1773,12 @@ "users.list", "conversations.list", "conversations.invite" - ] + ], + "task_horizon": 3, + "operation_type": "C+R", + "entity_scope": "single", + "information_availability": "explicit", + 
"prompt_ambiguity": "low" }, "assertions": [ { @@ -1638,7 +1808,12 @@ "tools_required": [ "conversations.list", "chat.postMessage" - ] + ], + "task_horizon": 2, + "operation_type": "C+R", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -1669,7 +1844,12 @@ "conversations.list", "conversations.history", "chat.postMessage" - ] + ], + "task_horizon": 8, + "operation_type": "C+R", + "entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "medium" }, "assertions": [ { @@ -1680,20 +1860,7 @@ "eq": "C03IJKL9012" }, "message_text": { - "contains": "frontend" - } - }, - "expected_count": 1 - }, - { - "diff_type": "added", - "entity": "messages", - "where": { - "channel_id": { - "eq": "C03IJKL9012" - }, - "message_text": { - "contains": "Flash" + "contains": "Gemini" } }, "expected_count": 1 @@ -1713,7 +1880,12 @@ "conversations.list", "conversations.history", "reactions.add" - ] + ], + "task_horizon": 3, + "operation_type": "C+R", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "low" }, "assertions": [ { @@ -1737,7 +1909,7 @@ { "id": "test_38", "name": "Open Source Hackathon Coordination", - "prompt": "Hey, I need your help coordinating our 24-hour global hackathon across Lagos, Kyiv, Warsaw, and SF. First, can you find out which channels are relevant for this open source hackathon we're running? I want to make sure #core-infra has an updated topic that reflects we're in hackathon mode.\n\nAlso, I need to post an update to the infrastructure team about our coordination status. Before I loop in \u0141ukasz Kowalski and Kenji Sato, can you pull up their profiles? I want to confirm \u0141ukasz is still our performance lead and check Kenji's role on the APAC growth side.\n\nI posted something outdated in one of the channels yesterday that needs to be removed - it had wrong timezone info. 
Can you also check what's been discussed recently in #project-alpha-dev so I'm caught up? And verify who's currently in #frontend - we might need to add some people.\n\nOh, and when you find any important messages about the hackathon prep, just give them a thumbs up so people know we've seen them.", + "prompt": "Hey, I need your help coordinating our 24-hour global hackathon across Lagos, Kyiv, Warsaw, and SF. First, can you find out which channels are relevant for this open source hackathon we're running? I want to make sure #core-infra has an updated topic that reflects we're in hackathon mode.\n\nAlso, I need to post an update to the infrastructure team about our coordination status. Before I loop in Łukasz Kowalski and Kenji Sato, can you pull up their profiles? I want to confirm Łukasz is still our performance lead and check Kenji's role on the APAC growth side.\n\nI posted something outdated in one of the channels yesterday that needs to be removed - it had wrong timezone info. Can you also check what's been discussed recently in #project-alpha-dev so I'm caught up? 
And verify who's currently in #frontend - we might need to add some people.\n\nOh, and when you find any important messages about the hackathon prep, just give them a thumbs up so people know we've seen them.", "type": "actionEval", "seed_template": "slack_bench_v2", "impersonate_user_id": "U01AGENBOT9", @@ -1753,7 +1925,12 @@ "reactions.add", "chat.delete", "conversations.members" - ] + ], + "task_horizon": 9, + "operation_type": "search+C+R+U+D", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "high" }, "_step_sequence": [ { @@ -1768,7 +1945,7 @@ }, { "endpoint": "users.info", - "action": "Get details about \u0141ukasz Kowalski", + "action": "Get details about Łukasz Kowalski", "justification": "Verify their role in performance" }, { @@ -1836,7 +2013,12 @@ "chat.delete", "reactions.add", "chat.postMessage" - ] + ], + "task_horizon": 13, + "operation_type": "C+R+U+D", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "high" }, "_step_sequence": [ { @@ -1945,7 +2127,12 @@ "conversations.kick", "conversations.kick", "users.list" - ] + ], + "task_horizon": 7, + "operation_type": "C+R", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" }, "_step_sequence": [ { @@ -2022,7 +2209,12 @@ "conversations.setTopic", "chat.delete", "chat.update" - ] + ], + "task_horizon": 13, + "operation_type": "search+C+R+U+D", + "entity_scope": "multi", + "information_availability": "explicit", + "prompt_ambiguity": "medium" }, "_step_sequence": [ { @@ -2120,7 +2312,12 @@ "chat.postMessage", "reactions.add", "conversations.kick" - ] + ], + "task_horizon": 8, + "operation_type": "search+C+R", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "high" }, "_step_sequence": [ { @@ -2208,7 +2405,12 @@ "chat.delete", "conversations.create", "conversations.create" - ] + ], + "task_horizon": 11, + "operation_type": "C+U+D", + 
"entity_scope": "single", + "information_availability": "explicit", + "prompt_ambiguity": "high" }, "_step_sequence": [ { @@ -2302,7 +2504,12 @@ "chat.update", "reactions.add", "conversations.create" - ] + ], + "task_horizon": 10, + "operation_type": "search+R+C+U", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "medium" }, "_step_sequence": [ { @@ -2382,7 +2589,7 @@ { "id": "test_45", "name": "Music Festival Tech Stack", - "prompt": "I need some help coordinating our virtual Afrobeats festival streaming infrastructure project. Can you help me get things organized across our Slack workspace?\n\nFirst, I want to make sure the #engineering channel clearly reflects that we're focused on the Music Festival Tech Stack right now - the topic should be updated so everyone knows what we're working on.\n\nI remember there were some discussions about CDN solutions a while back that would be really relevant to our streaming needs - can you dig those up for me?\n\nI also need to figure out who on our team should be involved. I know Robert Chen is supposed to be leading the engineering side, but can you confirm his role? And I think \u0141ukasz Kowalski has some great performance optimization experience - make sure he's part of the conversation in our main coordination channel.\n\nOnce you've gathered all this info, I need updates posted to #engineering, #frontend, and #general to get everyone aligned on our festival streaming infrastructure plans. Also, check what channels we have available that might be relevant to this project.", + "prompt": "I need some help coordinating our virtual Afrobeats festival streaming infrastructure project. 
Can you help me get things organized across our Slack workspace?\n\nFirst, I want to make sure the #engineering channel clearly reflects that we're focused on the Music Festival Tech Stack right now - the topic should be updated so everyone knows what we're working on.\n\nI remember there were some discussions about CDN solutions a while back that would be really relevant to our streaming needs - can you dig those up for me?\n\nI also need to figure out who on our team should be involved. I know Robert Chen is supposed to be leading the engineering side, but can you confirm his role? And I think Łukasz Kowalski has some great performance optimization experience - make sure he's part of the conversation in our main coordination channel.\n\nOnce you've gathered all this info, I need updates posted to #engineering, #frontend, and #general to get everyone aligned on our festival streaming infrastructure plans. Also, check what channels we have available that might be relevant to this project.", "type": "actionEval", "seed_template": "slack_bench_v2", "impersonate_user_id": "U01AGENBOT9", @@ -2398,7 +2605,12 @@ "chat.postMessage", "chat.postMessage", "conversations.invite" - ] + ], + "task_horizon": 9, + "operation_type": "C+R+U", + "entity_scope": "multi", + "information_availability": "explicit", + "prompt_ambiguity": "high" }, "_step_sequence": [ { @@ -2443,7 +2655,7 @@ }, { "endpoint": "conversations.invite", - "action": "Invite \u0141ukasz Kowalski to the channel", + "action": "Invite Łukasz Kowalski to the channel", "justification": "Their Polish perspective is valuable" } ], @@ -2489,7 +2701,12 @@ "chat.update", "chat.update", "reactions.add" - ] + ], + "task_horizon": 13, + "operation_type": "search+R+C+U+D", + "entity_scope": "multi", + "information_availability": "explicit", + "prompt_ambiguity": "high" }, "_step_sequence": [ { @@ -2598,7 +2815,12 @@ "users.list", "chat.update", "chat.delete" - ] + ], + "task_horizon": 7, + "operation_type": "C+R+U+D", + 
"entity_scope": "multi", + "information_availability": "explicit", + "prompt_ambiguity": "high" }, "assertions": [ { @@ -2670,7 +2892,12 @@ "chat.postMessage", "conversations.rename", "chat.postMessage" - ] + ], + "task_horizon": 7, + "operation_type": "C+R+U", + "entity_scope": "multi", + "information_availability": "explicit", + "prompt_ambiguity": "medium" }, "assertions": [ { @@ -2754,7 +2981,12 @@ "conversations.history", "chat.postMessage", "reactions.add" - ] + ], + "task_horizon": 8, + "operation_type": "search+C+R", + "entity_scope": "single", + "information_availability": "implicit", + "prompt_ambiguity": "medium" }, "assertions": [ { @@ -2795,7 +3027,7 @@ { "id": "test_50", "name": "Quarterly Workspace Reorganization", - "prompt": "It's end of Q4 and I need to reorganize our Slack workspace. Help me with the following:\n\n1. First, list all the channels I'm currently a member of. Format it as a numbered list showing: channel name, member count. Send that list to me as a DM to myself.\n\n2. The \"old-project-q3\" channel was archived but we're reviving it for Q1 planning. Unarchive it and rename it to \"q1-planning-2026\". Update the topic to \"Q1 2026 Planning - Americas Team\".\n\n3. In #project-alpha-dev, we want to focus on the Americas timezone team only. Check each member's timezone using their profile info, then remove anyone who is NOT in an Americas timezone (timezone should start with \"America/\").\n\n4. I left an \ud83d\udc40 reaction on the circuit-tracer thread in #engineering a while back - please remove that since we've addressed the issue.\n\n5. Join the #product-growth channel since I'm not in it yet.\n\n6. Finally, post a Q1 kickoff message in the newly renamed channel. In the message, list which team members from #project-alpha-dev are in Americas timezones (the ones who remain after cleanup) - include their names and timezones.", + "prompt": "It's end of Q4 and I need to reorganize our Slack workspace. 
Help me with the following:\n\n1. First, list all the channels I'm currently a member of. Format it as a numbered list showing: channel name, member count. Send that list to me as a DM to myself.\n\n2. The \"old-project-q3\" channel was archived but we're reviving it for Q1 planning. Unarchive it and rename it to \"q1-planning-2026\". Update the topic to \"Q1 2026 Planning - Americas Team\".\n\n3. In #project-alpha-dev, we want to focus on the Americas timezone team only. Check each member's timezone using their profile info, then remove anyone who is NOT in an Americas timezone (timezone should start with \"America/\").\n\n4. I left an 👀 reaction on the circuit-tracer thread in #engineering a while back - please remove that since we've addressed the issue.\n\n5. Join the #product-growth channel since I'm not in it yet.\n\n6. Finally, post a Q1 kickoff message in the newly renamed channel. In the message, list which team members from #project-alpha-dev are in Americas timezones (the ones who remain after cleanup) - include their names and timezones.", "type": "actionEval", "seed_template": "slack_bench_v2", "impersonate_user_id": "U01AGENBOT9", @@ -2816,7 +3048,12 @@ "reactions.remove", "conversations.join", "chat.postMessage" - ] + ], + "task_horizon": 14, + "operation_type": "C+R+U+D", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" }, "assertions": [ { @@ -3065,7 +3302,7 @@ { "id": "test_51", "name": "Silicon Dreams", - "prompt": "Kenji, Olena, and Priya want to spin up a generative art project using the team's GPU infrastructure. They drew inspiration from the compute discussions and that circuit-tracer visualization work happening somewhere in the workspace. Can you get them organized? They need a channel \u2014 call it #fractal-forge \u2014 with a topic that contains \"GPU-meets-art\". 
Invite all three, and post an inaugural message that references whatever you can dig up about the GPU work and the circuit-tracer thread that got them excited -- those are going to be messeges on the topic, written by either three. Kenji also wants an :art: reaction on whichever message in #engineering first mentioned the circuit-tracer. Set up a group DM with just Kenji and Olena so they can sort out GPU scheduling privately. And actually, rename the channel to #silicon-dreams \u2014 everyone agreed it sounds better.", + "prompt": "Kenji, Olena, and Priya want to spin up a generative art project using the team's GPU infrastructure. They drew inspiration from the compute discussions and that circuit-tracer visualization work happening somewhere in the workspace. Can you get them organized? They need a channel — call it #fractal-forge — with a topic that contains \"GPU-meets-art\". Invite all three, and post an inaugural message that references whatever you can dig up about the GPU work and the circuit-tracer thread that got them excited -- those are going to be messeges on the topic, written by either three. Kenji also wants an :art: reaction on whichever message in #engineering first mentioned the circuit-tracer. Set up a group DM with just Kenji and Olena so they can sort out GPU scheduling privately. 
And actually, rename the channel to #silicon-dreams — everyone agreed it sounds better.", "type": "actionEval", "seed_template": "slack_bench_v2", "impersonate_user_id": "U01AGENBOT9", @@ -3082,7 +3319,12 @@ "conversations.replies", "conversations.rename", "conversations.open" - ] + ], + "task_horizon": 10, + "operation_type": "search+C+R+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" }, "assertions": [ { @@ -3258,7 +3500,7 @@ { "id": "test_52", "name": "Midnight Bazaar", - "prompt": "Sophie and Mateo want to bring the workspace's food culture together under one roof \u2014 a \"Midnight Bazaar\" inspired by all those coffee and pizza conversations scattered around the channels. Dig through the workspace to find what food chatter has been going on and who's been part of it - specifically, search for the authors of the messages that contain the words \"food\" or \"eat\". That old archived channel nobody uses anymore \u2014 revive it and repurpose it as bazaar headquarters. Set a topic that captures the night-market vibe (needs to include the words \"street food\"), and write an opening post that weaves in whatever food discussions you find. While you're at it, some housekeeping: Mateo says he's drowning in #project-alpha-dev notifications and wants out \u2014 remove him. Also, that message about the espresso machine in #random? Edit it to plug the bazaar. And delete that stale message in #random asking about ordering \"large pies\" \u2014 the bazaar makes casual lunch plans obsolete.", + "prompt": "Sophie and Mateo want to bring the workspace's food culture together under one roof — a \"Midnight Bazaar\" inspired by all those coffee and pizza conversations scattered around the channels. Dig through the workspace to find what food chatter has been going on and who's been part of it - specifically, search for the authors of the messages that contain the words \"food\" or \"eat\". 
That old archived channel nobody uses anymore — revive it and repurpose it as bazaar headquarters. Set a topic that captures the night-market vibe (needs to include the words \"street food\"), and write an opening post that weaves in whatever food discussions you find. While you're at it, some housekeeping: Mateo says he's drowning in #project-alpha-dev notifications and wants out — remove him. Also, that message about the espresso machine in #random? Edit it to plug the bazaar. And delete that stale message in #random asking about ordering \"large pies\" — the bazaar makes casual lunch plans obsolete.", "type": "actionEval", "seed_template": "slack_bench_v2", "impersonate_user_id": "U01AGENBOT9", @@ -3273,7 +3515,12 @@ "conversations.kick", "chat.update", "chat.delete" - ] + ], + "task_horizon": 8, + "operation_type": "search+C+U+D", + "entity_scope": "multi", + "information_availability": "explicit", + "prompt_ambiguity": "medium" }, "assertions": [ { @@ -3375,7 +3622,7 @@ { "id": "test_53", "name": "Phantom Frequencies", - "prompt": "Aisha, Lukasz, Gabriel, Nick, and Priya want to launch a collaborative radio drama called \"Phantom Frequencies\" \u2014 a serialized fiction project where each person broadcasts a story from their timezone. They got the idea from all the talk about signal latency, CDN routing, and transmission in the workspace. Set them up with a channel called #phantom-frequencies, give it a topic that fits the concept (need to mention \"Phantom Frequencies\"), and get everyone in. Check Aisha's profile to confirm her timezone for the broadcast schedule, and DM her separately to ask about her episode's Lagos-blackout storyline. Write a first post in the channel that draws on whatever transmission and signal discussions you can find in the workspace. Also, that :eyes: reaction you left on the circuit-tracer message in #engineering \u2014 remove it, it's stale. 
There's a channel called #product-growth you're not in \u2014 pop in and check if there's anything about the APAC launch that could feed into the drama's world-building, then leave once you've got what you need. If you find in this chat a user with any user with a name that contains \"incognito\" ping them to change the nickname to \"anything\" - we need to maintain a trustful atmosphere here. And that #project-alpha channel that's basically just you \u2014 archive it, nobody's using it.", + "prompt": "Aisha, Lukasz, Gabriel, Nick, and Priya want to launch a collaborative radio drama called \"Phantom Frequencies\" — a serialized fiction project where each person broadcasts a story from their timezone. They got the idea from all the talk about signal latency, CDN routing, and transmission in the workspace. Set them up with a channel called #phantom-frequencies, give it a topic that fits the concept (need to mention \"Phantom Frequencies\"), and get everyone in. Check Aisha's profile to confirm her timezone for the broadcast schedule, and DM her separately to ask about her episode's Lagos-blackout storyline. Write a first post in the channel that draws on whatever transmission and signal discussions you can find in the workspace. Also, that :eyes: reaction you left on the circuit-tracer message in #engineering — remove it, it's stale. There's a channel called #product-growth you're not in — pop in and check if there's anything about the APAC launch that could feed into the drama's world-building, then leave once you've got what you need. If you find in this chat a user with any user with a name that contains \"incognito\" ping them to change the nickname to \"anything\" - we need to maintain a trustful atmosphere here. 
And that #project-alpha channel that's basically just you — archive it, nobody's using it.", "type": "actionEval", "seed_template": "slack_bench_v2", "impersonate_user_id": "U01AGENBOT9", @@ -3393,7 +3640,12 @@ "conversations.join", "conversations.leave", "conversations.archive" - ] + ], + "task_horizon": 11, + "operation_type": "search+C+U+D", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" }, "assertions": [ { @@ -3494,7 +3746,7 @@ { "id": "test_54", "name": "Cartography of Lost Rivers", - "prompt": "Hubert, John, Morgan, and Omer want to start a mapping project for forgotten underground rivers \u2014 they're calling it \"Cartography of Lost Rivers\". Pull up some details about #core-infra to see if that community would be a good match for cross-pollination. Now, \"Morgan\" \u2014 I mean the one who's been in the engineering discussions, not the other one. Also, that Morgan asked me to count all of the messages across all of the chats that mention the word \"supercomputer.\" Do this please. Then create #lost-rivers-cartography, set a topic about mapping forgotten urban waterways, invite all four, and write a project manifesto as the opening post that will say: '\"supercomputer\" mentioned number of times across all of the chats'. DM Morgan privately to ask whether they'd rather lead the cartography side or the field exploration. Lastly, find a message about infrastructure in #engineering and edit it to include a mention of the new project.", + "prompt": "Hubert, John, Morgan, and Omer want to start a mapping project for forgotten underground rivers — they're calling it \"Cartography of Lost Rivers\". Pull up some details about #core-infra to see if that community would be a good match for cross-pollination. Now, \"Morgan\" — I mean the one who's been in the engineering discussions, not the other one. 
Also, that Morgan asked me to count all of the messages across all of the chats that mention the word \"supercomputer.\" Do this please. Then create #lost-rivers-cartography, set a topic about mapping forgotten urban waterways, invite all four, and write a project manifesto as the opening post that will say: '\"supercomputer\" mentioned number of times across all of the chats'. DM Morgan privately to ask whether they'd rather lead the cartography side or the field exploration. Lastly, find a message about infrastructure in #engineering and edit it to include a mention of the new project.", "type": "actionEval", "seed_template": "slack_bench_v2", "impersonate_user_id": "U01AGENBOT9", @@ -3512,7 +3764,12 @@ "conversations.open", "chat.update", "conversations.history" - ] + ], + "task_horizon": 14, + "operation_type": "search+C+R+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "medium" }, "assertions": [ { @@ -3622,7 +3879,7 @@ { "id": "test_55", "name": "Dawn Chorus", - "prompt": "Kenji, Priya, Aisha, Sophie, Lukasz, and Mateo want to do a \"Sunrise Relay\" \u2014 a collaborative poetry chain where each person writes a verse when dawn breaks in their timezone, passing the baton westward as the sun moves around the earth. Pull up everyone's locale and timezone info so you can figure out the correct relay order from earliest sunrise to latest. Check what's been going on in #frontend for some creative inspiration to seed the poem's theme. Create a channel called #sunrise-relay, set the topic to the relay schedule showing each person and their timezone in sunrise order in exactly this format: \": \\n\" , invite all six, and post the full relay plan as the opening message. Drop a :sunrise: reaction on that schedule post. While you're looking at timezones, Mateo mentioned he can't keep up with #model-research because all the discussions happen during European hours and he's on Pacific time \u2014 pull him out of that channel. 
Oh, and rename #sunrise-relay to #dawn-chorus \u2014 the group decided the poem should be about birdsong at first light.", + "prompt": "Kenji, Priya, Aisha, Sophie, Lukasz, and Mateo want to do a \"Sunrise Relay\" — a collaborative poetry chain where each person writes a verse when dawn breaks in their timezone, passing the baton westward as the sun moves around the earth. Pull up everyone's locale and timezone info so you can figure out the correct relay order from earliest sunrise to latest. Check what's been going on in #frontend for some creative inspiration to seed the poem's theme. Create a channel called #sunrise-relay, set the topic to the relay schedule showing each person and their timezone in sunrise order in exactly this format: \": \\n\" , invite all six, and post the full relay plan as the opening message. Drop a :sunrise: reaction on that schedule post. While you're looking at timezones, Mateo mentioned he can't keep up with #model-research because all the discussions happen during European hours and he's on Pacific time — pull him out of that channel. Oh, and rename #sunrise-relay to #dawn-chorus — the group decided the poem should be about birdsong at first light.", "type": "actionEval", "seed_template": "slack_bench_v2", "impersonate_user_id": "U01AGENBOT9", @@ -3639,7 +3896,12 @@ "reactions.add", "conversations.kick", "conversations.rename" - ] + ], + "task_horizon": 14, + "operation_type": "search+C+R+U", + "entity_scope": "multi", + "information_availability": "implicit", + "prompt_ambiguity": "high" }, "assertions": [ { @@ -3712,7 +3974,7 @@ { "id": "test_56", "name": "The Apiary Report", - "prompt": "Hubert does this thing he calls the \"Apiary Report\" \u2014 he sees the workspace as a beehive, and he wants a quarterly survey. First he needs the full picture: how many honeycomb cells does this hive have, and which ones are alive? Then go taste the honey in #growth \u2014 read through whatever's been happening there. 
Find the sweetest drop \u2014 the single best message \u2014 and mark it with a :honey_pot:. That's Hubert's forager tradition. Once you've done your tasting, write up a Forager's Report and post it in #random for the rest of the colony, summarizing whatever noteworthy conversation you found in #growth. Note, that the report must contain the words \"FORAGERS REPORT\". Last thing: #project-alpha is an empty cell. Nobody's in it, nothing's happening. Seal it off.", + "prompt": "Hubert does this thing he calls the \"Apiary Report\" — he sees the workspace as a beehive, and he wants a quarterly survey. First he needs the full picture: how many honeycomb cells does this hive have, and which ones are alive? Then go taste the honey in #growth — read through whatever's been happening there. Find the sweetest drop — the single best message — and mark it with a :honey_pot:. That's Hubert's forager tradition. Once you've done your tasting, write up a Forager's Report and post it in #random for the rest of the colony, summarizing whatever noteworthy conversation you found in #growth. Note, that the report must contain the words \"FORAGERS REPORT\". Last thing: #project-alpha is an empty cell. Nobody's in it, nothing's happening. Seal it off.", "type": "actionEval", "seed_template": "slack_bench_v2", "impersonate_user_id": "U01AGENBOT9", @@ -3724,7 +3986,12 @@ "reactions.add", "chat.postMessage", "conversations.archive" - ] + ], + "task_horizon": 5, + "operation_type": "search+C+R+U", + "entity_scope": "multi", + "information_availability": "explicit", + "prompt_ambiguity": "medium" }, "assertions": [ { @@ -3780,7 +4047,7 @@ { "id": "test_57", "name": "Tide Pool", - "prompt": "Think of the workspace as a coastline full of tide pools \u2014 each channel is its own micro-ecosystem, and you're the naturalist on a field survey. Start by pulling a roster of every organism on this coast and classify them into two species: \"admin\" and \"member.\" How many of each do you count? 
You need to sort the channel names in alphabetic order and send a message to Omer, in exactly this format: \"Field Repoert 1: : [, ]\". Then inspect #engineering. Probe under the circuit-tracer rock in that channel \u2014 there's a thread with replies most people never noticed. Count exactly how many replies are down there and note who left them. Over in #random, that message about coordinating lunch plans is an invasive species \u2014 remove it. And whoever originally posted that circuit-tracer message in #engineering \u2014 open a private channel with them and send them a field report formatted exactly like this: \"Field Report 2: [N] replies found under circuit-tracer in #engineering \u2014 organisms: [comma-separated names of repliers]\".", + "prompt": "Think of the workspace as a coastline full of tide pools — each channel is its own micro-ecosystem, and you're the naturalist on a field survey. Start by pulling a roster of every organism on this coast and classify them into two species: \"admin\" and \"member.\" How many of each do you count? You need to sort the channel names in alphabetic order and send a message to Omer, in exactly this format: \"Field Repoert 1: : [, ]\". Then inspect #engineering. Probe under the circuit-tracer rock in that channel — there's a thread with replies most people never noticed. Count exactly how many replies are down there and note who left them. Over in #random, that message about coordinating lunch plans is an invasive species — remove it. 
And whoever originally posted that circuit-tracer message in #engineering — open a private channel with them and send them a field report formatted exactly like this: \"Field Report 2: [N] replies found under circuit-tracer in #engineering — organisms: [comma-separated names of repliers]\".", "type": "actionEval", "seed_template": "slack_bench_v2", "impersonate_user_id": "U01AGENBOT9", @@ -3794,7 +4061,12 @@ "conversations.replies", "chat.delete", "conversations.history" - ] + ], + "task_horizon": 7, + "operation_type": "C+R+D", + "entity_scope": "multi", + "information_availability": "explicit", + "prompt_ambiguity": "medium" }, "assertions": [ { @@ -3828,7 +4100,7 @@ } }, "expected_count": 1, - "description": "Lunch coordination message removed from #random \u2014 invasive species eliminated" + "description": "Lunch coordination message removed from #random — invasive species eliminated" }, { "diff_type": "added", @@ -3837,7 +4109,7 @@ "user_id": "U_LUKAS" }, "expected_count": 1, - "description": "DM opened with Lukasz (U_LUKAS) \u2014 the original poster of the circuit-tracer message in #engineering" + "description": "DM opened with Lukasz (U_LUKAS) — the original poster of the circuit-tracer message in #engineering" }, { "diff_type": "added", @@ -3856,7 +4128,7 @@ { "id": "test_58", "name": "Palimpsest", - "prompt": "Robert and Nick want to do a \"Palimpsest\" \u2014 scraping off old marks in the workspace and writing over them with new ones. First, check what channels Nick is actually in \u2014 Robert suspects he's barely present anywhere. Count them. Then scrape off that :eyes: reaction you left on the circuit-tracer message in #engineering \u2014 it's old ink that needs to go. That lonely #project-alpha channel? Overwrite its name \u2014 rename it to #palimpsest-archive, it's being repurposed as a record of overwritten things. 
Finally, write the new text: post a message in #random that says exactly \"PALIMPSEST COMPLETE: [N] channels found for Nick\" where [N] is however many channels Nick turned out to be in.", + "prompt": "Robert and Nick want to do a \"Palimpsest\" — scraping off old marks in the workspace and writing over them with new ones. First, check what channels Nick is actually in — Robert suspects he's barely present anywhere. Count them. Then scrape off that :eyes: reaction you left on the circuit-tracer message in #engineering — it's old ink that needs to go. That lonely #project-alpha channel? Overwrite its name — rename it to #palimpsest-archive, it's being repurposed as a record of overwritten things. Finally, write the new text: post a message in #random that says exactly \"PALIMPSEST COMPLETE: [N] channels found for Nick\" where [N] is however many channels Nick turned out to be in.", "type": "actionEval", "seed_template": "slack_bench_v2", "impersonate_user_id": "U01AGENBOT9", @@ -3867,7 +4139,12 @@ "reactions.remove", "conversations.rename", "chat.postMessage" - ] + ], + "task_horizon": 4, + "operation_type": "search+R+C+U+D", + "entity_scope": "multi", + "information_availability": "explicit", + "prompt_ambiguity": "medium" }, "assertions": [ { @@ -3879,7 +4156,7 @@ "reaction_type": "eyes" }, "expected_count": 1, - "description": "Agent's :eyes: reaction removed from circuit-tracer message in #engineering \u2014 old ink scraped off" + "description": "Agent's :eyes: reaction removed from circuit-tracer message in #engineering — old ink scraped off" }, { "diff_type": "changed", @@ -3894,7 +4171,7 @@ "to": "palimpsest-archive" } }, - "description": "#project-alpha renamed to #palimpsest-archive \u2014 overwritten with new purpose" + "description": "#project-alpha renamed to #palimpsest-archive — overwritten with new purpose" }, { "diff_type": "added", @@ -3907,7 +4184,7 @@ } }, "expected_count": 1, - "description": "Palimpsest record posted in #random: 'PALIMPSEST 
#!/usr/bin/env python3
"""
Generate a HuggingFace-compatible dataset from the agent-diff test suites.

Combines all 4 service benchmarks (Linear, Slack, Box, Calendar) into a single
dataset with the schema expected by the prime-environments verifiers framework.

Output: train/test splits written as Parquet and JSONL files (and optionally
pushed to HuggingFace Hub).

Usage:
    python utils/generate_hf_dataset.py                                      # save locally
    python utils/generate_hf_dataset.py --push hubertmarek/agent-diff-bench  # push to HF
"""

import argparse
import json
import random
from collections import Counter
from pathlib import Path
from typing import Any


REPO_ROOT = Path(__file__).resolve().parent.parent

BENCHMARKS: list[tuple[str, Path]] = [
    ("linear", REPO_ROOT / "examples/linear/testsuites/linear_bench.json"),
    ("slack", REPO_ROOT / "examples/slack/testsuites/slack_bench_v2.json"),
    ("box", REPO_ROOT / "examples/box/testsuites/box_bench.json"),
    ("calendar", REPO_ROOT / "examples/calendar/testsuites/calendar_bench.json"),
]

# Metadata keys promoted to top-level columns, in column order. A key missing
# from test.metadata falls back to a default (see _PROMOTED_METADATA_DEFAULTS).
PROMOTED_METADATA_KEYS = [
    "task_horizon",
    "operation_type",
    "entity_scope",
    "information_availability",
    "prompt_ambiguity",
]

# Fallbacks for promoted keys whose default is not the empty string.
_PROMOTED_METADATA_DEFAULTS: dict[str, Any] = {"task_horizon": 0}

# Keys stored in the info column next to "service" (runtime metadata for the
# environment). build_info() is the authoritative mapping from a test to them.
INFO_KEYS = [
    "seed_template",
    "impersonate_user_id",
    "eval_type",
    "tools_required",
]


def build_answer(test: dict[str, Any], ignore_fields: dict[str, Any]) -> str:
    """Build the JSON-encoded answer string from a test's assertions.

    The answer is the full expectedOutput spec sent to the AgentDiff evaluation
    engine. It must include both assertions and ignore_fields at the top level,
    since the assertion engine reads ignore_fields from the spec root to know
    which fields to exclude when computing diffs.

    Args:
        test: A single test case; must contain an "assertions" key.
        ignore_fields: Suite-level ignore_fields mapping. Omitted from the
            spec when empty or None.

    Returns:
        Compact JSON (no whitespace after separators).

    Raises:
        KeyError: If the test has no "assertions" key.
    """
    spec: dict[str, Any] = {"assertions": test["assertions"]}
    if ignore_fields:
        spec["ignore_fields"] = ignore_fields
    return json.dumps(spec, separators=(",", ":"))


def build_info(test: dict[str, Any], service: str = "") -> dict[str, Any]:
    """Build the info dict containing runtime metadata for the environment.

    Args:
        test: A single test case from a suite file.
        service: Name of the service the test belongs to.

    Returns:
        A dict with keys "service", "seed_template", "impersonate_user_id",
        "eval_type" (taken from the test's "type") and "tools_required"
        (taken from the test's metadata). Missing values get defaults.
    """
    # `or {}` also covers an explicit `"metadata": null` in the suite JSON.
    metadata = test.get("metadata") or {}
    return {
        "service": service,
        "seed_template": test.get("seed_template", ""),
        "impersonate_user_id": test.get("impersonate_user_id", ""),
        "eval_type": test.get("type", "actionEval"),
        "tools_required": metadata.get("tools_required", []),
    }


def load_suite(path: Path) -> dict[str, Any]:
    """Load and validate a test suite JSON file.

    Raises:
        ValueError: If the top-level JSON value is not an object with a
            "tests" key.
    """
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    if not isinstance(data, dict) or "tests" not in data:
        raise ValueError(f"No 'tests' key in {path}")
    return data


def generate_rows() -> list[dict[str, Any]]:
    """Generate all dataset rows from the service benchmarks in BENCHMARKS.

    Suites whose file does not exist are skipped with a warning, so the result
    may be empty. Each row's "info" value is a dict; it is serialized to a
    JSON string only when the rows are stored (see _prepare_rows).
    """
    rows: list[dict[str, Any]] = []

    for service, suite_path in BENCHMARKS:
        if not suite_path.exists():
            print(f"WARNING: {suite_path} not found, skipping {service}")
            continue

        suite = load_suite(suite_path)
        ignore_fields = suite.get("ignore_fields", {})

        for test in suite["tests"]:
            metadata = test.get("metadata") or {}
            promoted = {
                key: metadata.get(key, _PROMOTED_METADATA_DEFAULTS.get(key, ""))
                for key in PROMOTED_METADATA_KEYS
            }
            rows.append(
                {
                    # Core columns (required by verifiers)
                    "question": test["prompt"],
                    "answer": build_answer(test, ignore_fields),
                    # Identity: the numeric suffix is a running index across
                    # ALL services, i.e. the number of rows emitted so far.
                    "test_id": f"{service}_{len(rows)}",
                    "test_name": test.get("name", ""),
                    "service": service,
                    # Promoted taxonomy metadata
                    **promoted,
                    # Runtime metadata (JSON blob once stored)
                    "info": build_info(test, service=service),
                }
            )

    return rows


def print_summary(rows: list[dict[str, Any]]) -> None:
    """Print a summary of the generated dataset.

    Raises:
        ValueError: If two rows share a test_id.
    """
    if not rows:
        # generate_rows() returns [] when every suite file is missing.
        print("\nGenerated 0 rows.")
        return

    services = Counter(r["service"] for r in rows)
    print(f"\nGenerated {len(rows)} rows:")
    for svc, count in sorted(services.items()):
        print(f"  {svc}: {count} tests")

    # ID range check. An explicit raise (not assert) so it survives `python -O`.
    ids = [r["test_id"] for r in rows]
    print(f"\nTest IDs: {ids[0]} ... {ids[-1]}")
    duplicates = sorted(tid for tid, seen in Counter(ids).items() if seen > 1)
    if duplicates:
        raise ValueError(f"Duplicate test_id detected: {duplicates}")
    print("All test_ids are unique.")

    # Column overview
    print(f"\nColumns: {list(rows[0].keys())}")


def split_rows(
    rows: list[dict[str, Any]], test_fraction: float = 0.2, seed: int = 42
) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
    """Split rows into train/test, stratified by service.

    Services are processed in sorted order and shuffled with a single RNG
    seeded by `seed`, so the split is reproducible. Every service contributes
    at least one test row; a service with a single row therefore lands
    entirely in the test split.

    Args:
        rows: Dataset rows; each must have a "service" key.
        test_fraction: Target fraction of each service's rows for the test
            split, in [0, 1].
        seed: Seed for the shuffle.

    Returns:
        (train, test). The input list and its row dicts are not modified.

    Raises:
        ValueError: If test_fraction is outside [0, 1].
    """
    if not 0 <= test_fraction <= 1:
        raise ValueError(f"test_fraction must be in [0, 1], got {test_fraction}")

    rng = random.Random(seed)
    by_service: dict[str, list[dict[str, Any]]] = {}
    for row in rows:
        by_service.setdefault(row["service"], []).append(row)

    train: list[dict[str, Any]] = []
    test: list[dict[str, Any]] = []
    counts: dict[str, tuple[int, int]] = {}
    for service in sorted(by_service):
        shuffled = list(by_service[service])
        rng.shuffle(shuffled)
        n_test = max(1, round(len(shuffled) * test_fraction))
        held_out, kept = shuffled[:n_test], shuffled[n_test:]
        test.extend(held_out)
        train.extend(kept)
        counts[service] = (len(kept), len(held_out))

    print(f"\nSplit: {len(train)} train, {len(test)} test")
    for service, (n_train, n_held_out) in counts.items():
        print(f"  {service}: {n_train} train, {n_held_out} test")

    return train, test


def _prepare_rows(rows: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Return copies of rows with the info dict encoded as a JSON string.

    The input rows are left untouched, so the same rows can be saved and then
    pushed. Rows whose info is already a string are copied unchanged.
    """
    prepared: list[dict[str, Any]] = []
    for row in rows:
        stored = dict(row)  # shallow copy: only the "info" slot is replaced
        if isinstance(stored["info"], dict):
            stored["info"] = json.dumps(stored["info"], separators=(",", ":"))
        prepared.append(stored)
    return prepared


def save_dataset(
    train_rows: list[dict[str, Any]],
    test_rows: list[dict[str, Any]],
    output_dir: Path,
) -> None:
    """Save train/test splits as Parquet and JSONL files in output_dir.

    Both formats store `info` as a JSON string. The directory is created if
    it does not exist.
    """
    # Imported lazily so that --dry-run works without `datasets` installed.
    from datasets import Dataset

    output_dir.mkdir(parents=True, exist_ok=True)

    for split_name, rows in (("train", train_rows), ("test", test_rows)):
        prepared = _prepare_rows(rows)

        parquet_path = output_dir / f"{split_name}.parquet"
        Dataset.from_list(prepared).to_parquet(str(parquet_path))
        print(f"Saved {split_name} ({len(rows)} rows) to {parquet_path}")

        jsonl_path = output_dir / f"{split_name}.jsonl"
        with open(jsonl_path, "w", encoding="utf-8") as f:
            f.writelines(
                json.dumps(row, ensure_ascii=False) + "\n" for row in prepared
            )


def push_to_hub(
    train_rows: list[dict[str, Any]],
    test_rows: list[dict[str, Any]],
    repo_id: str,
) -> None:
    """Push train/test splits to HuggingFace Hub under repo_id."""
    # Imported lazily so that --dry-run works without `datasets` installed.
    from datasets import Dataset

    for split_name, rows in (("train", train_rows), ("test", test_rows)):
        ds = Dataset.from_list(_prepare_rows(rows))
        ds.push_to_hub(repo_id, split=split_name)
        print(f"Pushed {split_name} ({len(rows)} rows) to {repo_id}")

    print(f"\nDataset: https://huggingface.co/datasets/{repo_id}")


def main() -> None:
    """Command-line entry point: generate, summarize, split, save and push."""
    parser = argparse.ArgumentParser(
        description="Generate HuggingFace dataset from agent-diff test suites"
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=REPO_ROOT / "datasets" / "agent-diff-bench",
        help="Output directory for local files (default: datasets/agent-diff-bench/)",
    )
    parser.add_argument(
        "--push",
        type=str,
        default=None,
        metavar="REPO_ID",
        help="Push to HuggingFace Hub (e.g. hubertmarek/agent-diff-bench)",
    )
    parser.add_argument(
        "--test-fraction",
        type=float,
        default=0.2,
        help="Fraction of data for test split (default: 0.2)",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="Random seed for split (default: 42)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Generate and summarize without saving",
    )
    args = parser.parse_args()

    if not 0 <= args.test_fraction <= 1:
        parser.error("--test-fraction must be between 0 and 1")

    rows = generate_rows()
    print_summary(rows)
    if not rows:
        # Every suite was skipped; there is nothing to split, save or push.
        raise SystemExit("No test suites found; nothing to generate.")
    train_rows, test_rows = split_rows(rows, args.test_fraction, args.seed)

    if args.dry_run:
        # train can be empty when every service has a single row.
        sample = dict((train_rows or test_rows)[0])
        sample["question"] = sample["question"][:100] + "..."
        sample["answer"] = sample["answer"][:100] + "..."
        print(f"\nSample row:\n{json.dumps(sample, indent=2)}")
        return

    save_dataset(train_rows, test_rows, args.output_dir)

    if args.push:
        # save_dataset leaves the rows untouched, so the same split is pushed.
        push_to_hub(train_rows, test_rows, args.push)


if __name__ == "__main__":
    main()