From 22a1ab5017af0435227d2f3d5df471605f289c58 Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Tue, 17 Feb 2026 12:09:44 -0800 Subject: [PATCH 01/31] Add a dataset notebook --- notebooks/build_repair_create_dataset.ipynb | 230 ++++++++++++++++++++ 1 file changed, 230 insertions(+) create mode 100644 notebooks/build_repair_create_dataset.ipynb diff --git a/notebooks/build_repair_create_dataset.ipynb b/notebooks/build_repair_create_dataset.ipynb new file mode 100644 index 0000000000..4f8604a7b4 --- /dev/null +++ b/notebooks/build_repair_create_dataset.ipynb @@ -0,0 +1,230 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "af4f2030", + "metadata": {}, + "source": "# Create Dataset for Build Repairs" + }, + { + "cell_type": "markdown", + "id": "f0f37fd7", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "id": "5f57608e", + "metadata": {}, + "source": [ + "import weave\n", + "\n", + "PROJECT_NAME = \"bugbug-build-repair-eval\"\n", + "DATASET_NAME = \"build_repair_one_commit_eval\"\n", + "\n", + "_ = weave.init(PROJECT_NAME)" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "255ccaac", + "metadata": {}, + "source": [ + "## Prepare the Data\n", + "\n", + "### Load one commit build failures" + ] + }, + { + "cell_type": "code", + "id": "5bd97ef4", + "metadata": {}, + "source": [ + "import pandas as pd\n", + "\n", + "df = pd.read_json('https://community-tc.services.mozilla.com/api/queue/v1/task/Ra5r2qSyS8G-9pjLrS6l6Q/runs/0/artifacts/public%2Fci_failures.json.zst', lines=True)\n", + "df = df[(df.failure_commits.apply(len) == 1) & (df.fix_commits.apply(len) == 1)]\n", + "print(f\"One commit fail and fix: {len(df)}\")\n", + "df = df[df.failures.apply(lambda fails: any('build' in f['task_name'] and 'test' not in f['task_name'] for f in fails))]\n", + "print(f\"Build fails only: {len(df)}\")" + ], + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "markdown", 
+   "source": "### Check that corresponding bugs are public",
+   "id": "f5a96ec84466d611"
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": [
+    "import requests\n",
+    "\n",
+    "def is_bug_public(bug_id):\n",
+    "    url = f'https://bugzilla.mozilla.org/rest/bug/{bug_id}'\n",
+    "    resp = requests.get(url)\n",
+    "    if resp.status_code == 401:\n",
+    "        return False\n",
+    "    elif resp.status_code == 200:\n",
+    "        return True\n",
+    "    else:\n",
+    "        raise ValueError(f'Unexpected Bugzilla status code: {resp.status_code}')\n",
+    "\n",
+    "df = df[df.bug_id.apply(is_bug_public)]\n",
+    "print(f\"Build fails for public bugs: {len(df)}\")"
+   ],
+   "id": "b9289c8688226f08",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2026-02-13T23:52:42.195048Z",
+     "start_time": "2026-02-13T23:52:42.191945Z"
+    }
+   },
+   "cell_type": "markdown",
+   "source": "### Get GitHub revisions",
+   "id": "3d5e260dc3245c36"
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": [
+    "def get_git_rev(hg_revs):\n",
+    "    for rev in hg_revs:\n",
+    "        convert_url = f'https://lando.moz.tools/api/hg2git/firefox/{rev}'\n",
+    "        resp = requests.get(convert_url)\n",
+    "        if resp.status_code != 200:\n",
+    "            raise ValueError(f'Unexpected HTTP status code: {resp.status_code}. 
{resp}')\n",
+    "    yield resp.json()['git_hash']\n",
+    "\n",
+    "df['gh_failure_commits'] = df.failure_commits.apply(lambda commits: list(get_git_rev(commits)))\n",
+    "df['gh_fix_commits'] = df.fix_commits.apply(lambda commits: list(get_git_rev(commits)))\n",
+    "df = df.rename(columns={'failure_commits': 'hg_failure_commits', 'fix_commits': 'hg_fix_commits'})"
+   ],
+   "id": "23e362967af3e2cf",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "### Get bugzilla comments before the fix",
+   "id": "9dc12cf3f6f9b844"
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": [
+    "from bugbug.tools.core.platforms.bugzilla import Bug\n",
+    "from libmozdata import config\n",
+    "\n",
+    "config.set_default_value(\"User-Agent\", \"name\", \"bugbug/1.0\")\n",
+    "\n",
+    "def _get_comments(bug, fail_commit):\n",
+    "    for comment in bug._metadata['comments']:\n",
+    "        if comment['creator'] == 'pulsebot@bmo.tld':\n",
+    "            if fail_commit[:12] in comment['raw_text']:\n",
+    "                # stop adding comments at failure commit push\n",
+    "                yield comment['raw_text']\n",
+    "                break\n",
+    "            else:\n",
+    "                continue\n",
+    "        if comment['raw_text']:\n",
+    "            yield comment['raw_text']\n",
+    "\n",
+    "def get_bug_info_pre_fix(build_fail):\n",
+    "    bug_id = build_fail['bug_id']\n",
+    "    fail_commit = build_fail['hg_failure_commits'][0]\n",
+    "\n",
+    "    try:\n",
+    "        bug = Bug.get(bug_id)\n",
+    "    except ValueError as ex:\n",
+    "        print(ex)\n",
+    "        return None\n",
+    "\n",
+    "    return {'title': bug.summary, 'comments': list(_get_comments(bug, fail_commit))}\n",
+    "\n",
+    "df['pre_fix_bug'] = df.apply(get_bug_info_pre_fix, axis=1)\n",
+    "df = df[df['pre_fix_bug'].notnull()]\n",
+    "print(f'Final number of bugs: {len(df)}')"
+   ],
+   "id": "cda623f631db857e",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": "df",
+   "id": "e1b6e97b2b16c638",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": 
"markdown", + "id": "3c4f93a4", + "metadata": {}, + "source": [ + "## Save the Dataset" + ] + }, + { + "cell_type": "code", + "id": "c5041136", + "metadata": {}, + "source": [ + "examples = df.to_dict(orient=\"records\")\n", + "examples[0]" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "d132783a", + "metadata": {}, + "source": [ + "dataset = weave.Dataset(\n", + " name=DATASET_NAME,\n", + " description=\"Build repair evaluation dataset with failure logs, ground truth fix commits and pre fix Bugzilla comments.\",\n", + " rows=examples,\n", + ")\n", + "\n", + "_ = weave.publish(dataset)" + ], + "outputs": [], + "execution_count": null + } + ], + "metadata": { + "kernelspec": { + "display_name": "bugbug (3.12.7)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 77aeb2ef7435dc99661bcd9555f624f088b240f8 Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Tue, 17 Feb 2026 16:57:16 -0800 Subject: [PATCH 02/31] Add JetBrains IDE --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index a00be42b73..77ee39bd33 100644 --- a/.gitignore +++ b/.gitignore @@ -49,3 +49,6 @@ node_modules/ *.log # Desktop Service Store *.DS_Store + +# JetBrains IDEs +.idea From 7a95a7fc6cdef3dc86a321f04f3d6a0a251cfa87 Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Tue, 17 Feb 2026 17:18:42 -0800 Subject: [PATCH 03/31] Initial implementation --- bugbug/tools/build_repair/__init__.py | 8 + bugbug/tools/build_repair/agent.py | 240 +++++++++++++++++++ bugbug/tools/build_repair/config.py | 64 +++++ bugbug/tools/build_repair/prompts.py | 35 +++ bugbug/tools/build_repair/scorer.py | 112 +++++++++ 
bugbug/tools/build_repair/try_server.py | 193 +++++++++++++++ bugbug/tools/build_repair/worktree.py | 47 ++++ docker/build_repair/Dockerfile | 12 + notebooks/build_repair_create_dataset.ipynb | 202 +++++++++++----- notebooks/build_repair_evaluation.ipynb | 246 ++++++++++++++++++++ requirements.txt | 1 + scripts/build_repair_eval.py | 143 ++++++++++++ 12 files changed, 1239 insertions(+), 64 deletions(-) create mode 100644 bugbug/tools/build_repair/__init__.py create mode 100644 bugbug/tools/build_repair/agent.py create mode 100644 bugbug/tools/build_repair/config.py create mode 100644 bugbug/tools/build_repair/prompts.py create mode 100644 bugbug/tools/build_repair/scorer.py create mode 100644 bugbug/tools/build_repair/try_server.py create mode 100644 bugbug/tools/build_repair/worktree.py create mode 100644 docker/build_repair/Dockerfile create mode 100644 notebooks/build_repair_evaluation.ipynb create mode 100644 scripts/build_repair_eval.py diff --git a/bugbug/tools/build_repair/__init__.py b/bugbug/tools/build_repair/__init__.py new file mode 100644 index 0000000000..2f3caac2c6 --- /dev/null +++ b/bugbug/tools/build_repair/__init__.py @@ -0,0 +1,8 @@ +# -*- coding: utf-8 -*- +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. + +from bugbug.tools.build_repair.agent import AgentResponse, BuildFailure, BuildRepairTool + +__all__ = ["AgentResponse", "BuildFailure", "BuildRepairTool"] diff --git a/bugbug/tools/build_repair/agent.py b/bugbug/tools/build_repair/agent.py new file mode 100644 index 0000000000..d1e23bb06c --- /dev/null +++ b/bugbug/tools/build_repair/agent.py @@ -0,0 +1,240 @@ +# -*- coding: utf-8 -*- +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +import json +import subprocess +from logging import getLogger +from pathlib import Path + +from claude_agent_sdk import ClaudeAgentOptions, ResultMessage, query +from pydantic import BaseModel, Field + +from bugbug.tools.base import GenerativeModelTool +from bugbug.tools.build_repair.config import ( + ANALYSIS_MODEL, + CLAUDE_PERMISSIONS_CONFIG, + DEFAULT_MAX_TURNS, + FIREFOX_MCP_URL, + FIX_MODEL, +) +from bugbug.tools.build_repair.prompts import ( + ANALYSIS_TEMPLATE, + FIX_TEMPLATE, + SYSTEM_PROMPT_TEMPLATE, +) + +logger = getLogger(__name__) + + +class BuildFailure(BaseModel): + """Input describing a build failure from the dataset.""" + + bug_id: int = Field(description="The ID of the bug in Bugzilla.") + bug_title: str | None = Field(default=None, description="Optional bug title.") + bug_comments: list[str] | None = Field( + default=None, description="Optional bug comments." + ) + git_commit: str = Field(description="Git revision to checkout.") + failure_tasks: list[dict] = Field( + description="List of {task_name, task_id, retry_id, failure_lines}." + ) + + +class AgentResponse(BaseModel): + """Output from a build repair run, including analysis, diff, cost, and build results.""" + + summary: str = Field(default="") + analysis: str = Field(default="") + diff: str = Field(default="") + error: str | None = Field(default=None) + cost_usd: float = Field(default=0.0) + num_turns: int = Field(default=0) + local_build_passed: bool | None = Field(default=None) + try_build_passed: bool | None = Field(default=None) + lando_job_id: str | None = Field(default=None) + treeherder_url: str | None = Field(default=None) + + +class BuildRepairTool(GenerativeModelTool): + """Two-stage build repair agent using Claude Agent SDK. + + Stage 1 (Opus): Analyzes the failure and produces analysis/planning/summary docs. + Stage 2 (Sonnet): Reads the analysis and implements a fix. Skipped in analysis-only mode. 
+ After Stage 2, commits the fix, runs ./mach build, and optionally submits to try. + """ + + def __init__( + self, + target_software: str = "Mozilla Firefox", + analysis_only: bool = False, + analysis_model: str = ANALYSIS_MODEL, + fix_model: str = FIX_MODEL, + max_turns: int = DEFAULT_MAX_TURNS, + ) -> None: + self.target_software = target_software + self.analysis_only = analysis_only + self.analysis_model = analysis_model + self.fix_model = fix_model + self.max_turns = max_turns + + @classmethod + def create(cls, **kwargs): + return cls(**kwargs) + + def _prepare_input_files(self, failure: BuildFailure, worktree_path: Path) -> None: + in_dir = worktree_path / "repair_agent" / "in" / str(failure.bug_id) + in_dir.mkdir(parents=True, exist_ok=True) + + (in_dir / "bug_description.md").write_text( + f"# Bug {failure.bug_id}: {failure.bug_title}\n\n" + + "\n\n---\n\n".join(failure.bug_comments or []) + ) + + logs_content = "" + for task in failure.failure_tasks: + logs_content += f"## {task['task_name']} (task_id: {task['task_id']})\n\n" + logs_content += "\n".join(task["failure_lines"]) + "\n\n" + (in_dir / "build_failure_logs.md").write_text(logs_content) + + out_dir = worktree_path / "repair_agent" / "out" / str(failure.bug_id) + out_dir.mkdir(parents=True, exist_ok=True) + + def _write_settings(self, worktree_path: Path) -> None: + settings_dir = worktree_path / ".claude" + settings_dir.mkdir(exist_ok=True) + (settings_dir / "settings.json").write_text( + json.dumps(CLAUDE_PERMISSIONS_CONFIG, indent=2) + ) + + def _read_output(self, failure: BuildFailure, worktree_path: Path, key: str) -> str: + path = ( + worktree_path / "repair_agent" / "out" / str(failure.bug_id) / f"{key}.md" + ) + if path.exists(): + return path.read_text() + return "" + + async def run( + self, + failure: BuildFailure, + worktree_path: Path, + skip_try_push: bool = False, + ) -> AgentResponse: + self._prepare_input_files(failure, worktree_path) + self._write_settings(worktree_path) + + 
system_prompt = SYSTEM_PROMPT_TEMPLATE.format( + target_software=self.target_software + ) + mcp_servers = [{"url": FIREFOX_MCP_URL, "name": "firefox"}] + disallowed = ["AskUserQuestion", "Task"] + total_cost = 0.0 + total_turns = 0 + + # Stage 1: Analysis + stage1_options = ClaudeAgentOptions( + system_prompt=system_prompt, + model=self.analysis_model, + cwd=str(worktree_path), + disallowed_tools=disallowed, + permission_mode="default", + setting_sources=["project"], + max_turns=self.max_turns, + max_thinking_tokens=16000, + mcp_servers=mcp_servers, + ) + analysis_prompt = ANALYSIS_TEMPLATE.format( + bug_id=failure.bug_id, + target_software=self.target_software, + ) + try: + async for message in query(prompt=analysis_prompt, options=stage1_options): + if isinstance(message, ResultMessage): + total_cost += message.total_cost_usd or 0 + total_turns += message.num_turns or 0 + except Exception as e: + return AgentResponse( + error=str(e), + cost_usd=total_cost, + num_turns=total_turns, + ) + + summary = self._read_output(failure, worktree_path, "summary") + analysis = self._read_output(failure, worktree_path, "analysis") + + if self.analysis_only: + return AgentResponse( + summary=summary, + analysis=analysis, + cost_usd=total_cost, + num_turns=total_turns, + ) + + # Stage 2: Fix + stage2_options = ClaudeAgentOptions( + system_prompt=system_prompt, + model=self.fix_model, + cwd=str(worktree_path), + disallowed_tools=disallowed, + permission_mode="default", + setting_sources=["project"], + max_turns=self.max_turns, + mcp_servers=mcp_servers, + ) + fix_prompt = FIX_TEMPLATE.format(bug_id=failure.bug_id) + try: + async for message in query(prompt=fix_prompt, options=stage2_options): + if isinstance(message, ResultMessage): + total_cost += message.total_cost_usd or 0 + total_turns += message.num_turns or 0 + except Exception as e: + return AgentResponse( + summary=summary, + analysis=analysis, + error=str(e), + cost_usd=total_cost, + num_turns=total_turns, + ) + + 
diff_result = subprocess.run( + ["git", "diff", "HEAD"], + cwd=worktree_path, + capture_output=True, + text=True, + ) + diff = diff_result.stdout + + if not diff.strip(): + return AgentResponse( + summary=summary, + analysis=analysis, + diff=diff, + cost_usd=total_cost, + num_turns=total_turns, + ) + + from bugbug.tools.build_repair.try_server import run_try_verification + + task_name = ( + failure.failure_tasks[0]["task_name"] if failure.failure_tasks else "" + ) + try_result = run_try_verification( + worktree_path=worktree_path, + bug_id=failure.bug_id, + task_name=task_name, + skip_try_push=skip_try_push, + ) + + return AgentResponse( + summary=summary, + analysis=analysis, + diff=diff, + cost_usd=total_cost, + num_turns=total_turns, + local_build_passed=try_result.local_build_passed, + try_build_passed=try_result.try_build_passed, + lando_job_id=try_result.lando_job_id, + treeherder_url=try_result.treeherder_url, + ) diff --git a/bugbug/tools/build_repair/config.py b/bugbug/tools/build_repair/config.py new file mode 100644 index 0000000000..1fa36523ff --- /dev/null +++ b/bugbug/tools/build_repair/config.py @@ -0,0 +1,64 @@ +# -*- coding: utf-8 -*- +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +ANALYSIS_MODEL = "claude-opus-4-6" +FIX_MODEL = "claude-sonnet-4-6" +DEFAULT_MAX_TURNS = 80 +WORKTREE_BASE_DIR = "/tmp/build_repair_worktrees" +TRY_PUSH_TIMEOUT_SECONDS = 7200 +TRY_PUSH_POLL_INTERVAL_SECONDS = 60 +TREEHERDER_BASE_URL = "https://treeherder.mozilla.org" + +FIREFOX_MCP_URL = "https://mcp-dev.moz.tools/mcp" + +CLAUDE_PERMISSIONS_CONFIG = { + "permissions": { + "allow": [ + "Edit(~/.mozbuild)", + "Edit(~/.cache/uv)", + "Bash(./mach build:*)", + "Bash(./mach clobber:*)", + "Bash(./mach configure:*)", + "Bash(./mach run:*)", + "Bash(./mach test:*)", + "Bash(./mach wpt:*)", + "Bash(./mach lint:*)", + "Bash(./mach format:*)", + "Bash(./mach clang-format:*)", + "Bash(./mach try:*)", + "Bash(./mach help:*)", + "Bash(./mach vendor:*)", + "Bash(./mach bootstrap:*)", + "Bash(./mach artifact:*)", + "Bash(clang++:*)", + "Bash(rm:*)", + "Bash(timeout:*)", + "Bash(find:*)", + "Bash(grep:*)", + "Bash(tee:*)", + "Bash(kill:*)", + "Bash(searchfox-cli:*)", + "Bash(treeherder-cli:*)", + "Bash(jj:*)", + "WebFetch(domain:firefox-source-docs.mozilla.org)", + "WebFetch(domain:treeherder.mozilla.org)", + "WebFetch(domain:searchfox.org)", + "WebFetch(o1069899.ingest.sentry.io)", + ], + "deny": [], + "additionalDirectories": [ + "~/.mozbuild", + "~/.cache/uv/", + ], + }, + "sandbox": { + "enabled": True, + "autoAllowBashIfSandboxed": True, + "allowUnsandboxedCommands": False, + "network": { + "allowLocalBinding": True, + }, + }, +} diff --git a/bugbug/tools/build_repair/prompts.py b/bugbug/tools/build_repair/prompts.py new file mode 100644 index 0000000000..6ba37636a2 --- /dev/null +++ b/bugbug/tools/build_repair/prompts.py @@ -0,0 +1,35 @@ +# -*- coding: utf-8 -*- +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +"""Prompt templates for build repair agent.""" + +SYSTEM_PROMPT_TEMPLATE = """You are an expert {target_software} engineer tasked with analyzing and fixing a build failure. """ + +ANALYSIS_TEMPLATE = """Investigate why the last commit broke {target_software} build. + +The last commit attempted to fix a bug from Bugzilla. + +Analyze the following: +1. Git diff for the last commit +2. Bugzilla bug description +3. Taskcluster build failure logs +The files with bug description and logs are located at @repair_agent/in/{bug_id} + +Create three separate documents: +1. repair_agent/out/{bug_id}/analysis.md with your detailed analysis on what caused the issues +2. repair_agent/out/{bug_id}/planning.md with a fixing plan +3. repair_agent/out/{bug_id}/summary.md with a brief one paragraph summary of analysis and planning that can point a developer in the right direction + +Do not prompt to edit those documents. + +Do not write any code yet. Work fully autonomously, do not ask any questions. Think hard. +""" + +FIX_TEMPLATE = """Read the following files and implement a fix of the failure: +1. repair_agent/out/{bug_id}/analysis.md with your detailed analysis on what caused the issues +2. repair_agent/out/{bug_id}/planning.md with a fixing plan + +Do not prompt to edit files. Work fully autonomously, do not ask any questions. Use all allowed tools without prompting. +""" diff --git a/bugbug/tools/build_repair/scorer.py b/bugbug/tools/build_repair/scorer.py new file mode 100644 index 0000000000..7cfac108a3 --- /dev/null +++ b/bugbug/tools/build_repair/scorer.py @@ -0,0 +1,112 @@ +# -*- coding: utf-8 -*- +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +from logging import getLogger + +import weave + +logger = getLogger(__name__) + + +class BasicMetricsScorer(weave.Scorer): + """Scores success rate, diff production rate, cost, and turn count.""" + + @weave.op() + def score(self, output: dict) -> dict: + return { + "successful": output.get("error") is None, + "has_diff": bool(output.get("diff", "").strip()), + "cost_usd": output.get("cost_usd", 0), + "num_turns": output.get("num_turns", 0), + } + + def summarize(self, score_rows: list[dict]) -> dict: + n = len(score_rows) + costs = [r["cost_usd"] for r in score_rows] + return { + "success_rate": sum(r["successful"] for r in score_rows) / n if n else 0, + "diff_rate": sum(r["has_diff"] for r in score_rows) / n if n else 0, + "avg_cost_usd": sum(costs) / n if n else 0, + "total_cost_usd": sum(costs), + "num_examples": n, + } + + +class BuildPassRateScorer(weave.Scorer): + """Scores local ./mach build and try push pass rates.""" + + @weave.op() + def score(self, output: dict) -> dict: + return { + "local_build_passed": output.get("local_build_passed"), + "try_build_passed": output.get("try_build_passed"), + } + + def summarize(self, score_rows: list[dict]) -> dict: + n = len(score_rows) + local_passed = sum(1 for r in score_rows if r["local_build_passed"] is True) + try_known = [r for r in score_rows if r["try_build_passed"] is not None] + try_passed = sum(1 for r in try_known if r["try_build_passed"] is True) + return { + "local_build_pass_rate": local_passed / n if n else 0, + "local_builds_passed": local_passed, + "try_build_pass_rate": try_passed / len(try_known) if try_known else 0, + "try_builds_passed": try_passed, + "try_builds_timed_out": n - len(try_known), + "num_examples": n, + } + + +class LLMFixMatchingScorer(weave.Scorer): + """Scaffold for LLM-as-a-judge comparing agent fix to ground truth. + + Implementation deferred. Will use a non-Claude LLM to semantically + compare the agent's diff against the ground truth fix commit. 
+ """ + + @weave.op() + async def score(self, output: dict, gh_fix_commits: list[str]) -> dict: + return { + "match_score": None, + "match_category": "not_implemented", + } + + def summarize(self, score_rows: list[dict]) -> dict: + return {"status": "not_implemented"} + + +def compute_pass_at_k( + trial_results: list[list[dict]], + metric: str, +) -> dict: + """Compute pass@k metrics across multiple trial runs. + + Args: + trial_results: list of k trial result lists, each with per-example scores + metric: which boolean metric to use (e.g. "local_build_passed", "successful") + + Returns: + pass@1, pass@3, pass@k and pass^k metrics + """ + k = len(trial_results) + num_examples = len(trial_results[0]) + + pass_at = {} + for n in [1, 3, k]: + if n > k: + continue + successes = sum( + any(trial_results[t][i][metric] is True for t in range(n)) + for i in range(num_examples) + ) + pass_at[f"pass@{n}"] = successes / num_examples + + all_pass = sum( + all(trial_results[t][i][metric] is True for t in range(k)) + for i in range(num_examples) + ) + pass_at[f"pass^{k}"] = all_pass / num_examples + + return pass_at diff --git a/bugbug/tools/build_repair/try_server.py b/bugbug/tools/build_repair/try_server.py new file mode 100644 index 0000000000..e09acb2e7f --- /dev/null +++ b/bugbug/tools/build_repair/try_server.py @@ -0,0 +1,193 @@ +# -*- coding: utf-8 -*- +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +import re +import subprocess +import time +from dataclasses import dataclass +from logging import getLogger +from pathlib import Path + +import requests + +from bugbug.tools.build_repair.config import ( + TREEHERDER_BASE_URL, + TRY_PUSH_POLL_INTERVAL_SECONDS, + TRY_PUSH_TIMEOUT_SECONDS, +) + +logger = getLogger(__name__) + +_HEADERS = {"User-Agent": "bugbug-build-repair-eval/1.0"} +_LANDO_JOB_ID_RE = re.compile(r"landoCommitID=([A-Za-z0-9_-]+)") + + +@dataclass +class TryPushResult: + """Result of local build verification and optional try push submission.""" + + local_build_passed: bool + try_build_passed: bool | None + lando_job_id: str | None + treeherder_url: str | None + + +def _commit_fix(worktree_path: Path, bug_id: int) -> None: + subprocess.run( + ["git", "add", "-A"], + cwd=worktree_path, + check=True, + ) + subprocess.run( + ["git", "commit", "-m", f"Build repair fix for bug {bug_id}"], + cwd=worktree_path, + check=True, + ) + + +def _run_local_build(worktree_path: Path) -> bool: + result = subprocess.run( + ["./mach", "build"], + cwd=worktree_path, + ) + return result.returncode == 0 + + +def _submit_try(worktree_path: Path, task_name: str) -> tuple[str | None, str | None]: + result = subprocess.run( + ["./mach", "try", "fuzzy", "--query", task_name], + cwd=worktree_path, + capture_output=True, + text=True, + ) + stdout = result.stdout + result.stderr + match = _LANDO_JOB_ID_RE.search(stdout) + if not match: + logger.warning("Could not parse Lando job ID from try output: %s", stdout) + return None, None + + lando_job_id = match.group(1) + treeherder_url = f"{TREEHERDER_BASE_URL}/jobs?repo=try&landoCommitID={lando_job_id}" + return lando_job_id, treeherder_url + + +def _get_push_revision(lando_job_id: str) -> str | None: + try: + resp = requests.get( + f"{TREEHERDER_BASE_URL}/api/project/try/push/", + params={"lando_commit_id": lando_job_id}, + headers=_HEADERS, + timeout=30, + ) + resp.raise_for_status() + results = resp.json().get("results", []) + if 
results: + return results[0].get("revision") + except Exception: + logger.exception("Error fetching push revision for lando job %s", lando_job_id) + return None + + +def _get_push_by_revision(revision: str) -> dict | None: + try: + resp = requests.get( + f"{TREEHERDER_BASE_URL}/api/project/try/push/", + params={"revision": revision}, + headers=_HEADERS, + timeout=30, + ) + resp.raise_for_status() + results = resp.json().get("results", []) + return results[0] if results else None + except Exception: + logger.exception("Error fetching push by revision %s", revision) + return None + + +def _get_build_job_result(push_id: int, task_name: str) -> str | None: + try: + resp = requests.get( + f"{TREEHERDER_BASE_URL}/api/project/try/jobs/", + params={"push_id": push_id, "count": 2000}, + headers=_HEADERS, + timeout=30, + ) + resp.raise_for_status() + for job in resp.json().get("results", []): + if task_name in job.get("job_type_name", ""): + if job["state"] != "completed": + return job["state"] + return job["result"] + except Exception: + logger.exception("Error fetching build job result for push %d", push_id) + return None + + +def _poll_treeherder(lando_job_id: str, task_name: str) -> bool | None: + deadline = time.monotonic() + TRY_PUSH_TIMEOUT_SECONDS + push_id: int | None = None + + while time.monotonic() < deadline: + if push_id is None: + revision = _get_push_revision(lando_job_id) + if revision: + push = _get_push_by_revision(revision) + if push: + push_id = push["id"] + + if push_id is not None: + result = _get_build_job_result(push_id, task_name) + if result == "success": + return True + if result in ("busted", "testfailed", "exception"): + return False + + time.sleep(TRY_PUSH_POLL_INTERVAL_SECONDS) + + logger.warning("Try push polling timed out for lando job %s", lando_job_id) + return None + + +def run_try_verification( + worktree_path: Path, + bug_id: int, + task_name: str, + skip_try_push: bool = False, +) -> TryPushResult: + _commit_fix(worktree_path, bug_id) 
+ + local_passed = _run_local_build(worktree_path) + if not local_passed: + return TryPushResult( + local_build_passed=False, + try_build_passed=None, + lando_job_id=None, + treeherder_url=None, + ) + + if skip_try_push: + return TryPushResult( + local_build_passed=True, + try_build_passed=None, + lando_job_id=None, + treeherder_url=None, + ) + + lando_job_id, treeherder_url = _submit_try(worktree_path, task_name) + if not lando_job_id: + return TryPushResult( + local_build_passed=True, + try_build_passed=None, + lando_job_id=None, + treeherder_url=None, + ) + + try_passed = _poll_treeherder(lando_job_id, task_name) + return TryPushResult( + local_build_passed=True, + try_build_passed=try_passed, + lando_job_id=lando_job_id, + treeherder_url=treeherder_url, + ) diff --git a/bugbug/tools/build_repair/worktree.py b/bugbug/tools/build_repair/worktree.py new file mode 100644 index 0000000000..10026ec545 --- /dev/null +++ b/bugbug/tools/build_repair/worktree.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +import subprocess +from pathlib import Path + +from bugbug.tools.build_repair.config import WORKTREE_BASE_DIR + + +class WorktreeManager: + """Manages git worktrees for parallel evaluation runs against a Firefox repo.""" + + def __init__( + self, + firefox_repo_path: str | Path, + base_dir: str = WORKTREE_BASE_DIR, + ): + self.repo = Path(firefox_repo_path) + self.base_dir = Path(base_dir) + self.base_dir.mkdir(parents=True, exist_ok=True) + + def create(self, commit_hash: str, name: str) -> Path: + worktree_path = self.base_dir / name + subprocess.run( + ["git", "worktree", "add", str(worktree_path), commit_hash], + cwd=self.repo, + check=True, + ) + return worktree_path + + def cleanup(self, name: str) -> None: + subprocess.run( + ["git", "worktree", "remove", str(self.base_dir / name), "--force"], + cwd=self.repo, + check=True, + ) + + def cleanup_all(self) -> None: + for entry in self.base_dir.iterdir(): + if entry.is_dir(): + subprocess.run( + ["git", "worktree", "remove", str(entry), "--force"], + cwd=self.repo, + check=False, + ) diff --git a/docker/build_repair/Dockerfile b/docker/build_repair/Dockerfile new file mode 100644 index 0000000000..6aaff90873 --- /dev/null +++ b/docker/build_repair/Dockerfile @@ -0,0 +1,12 @@ +FROM python:3.12-slim + +RUN apt-get update && apt-get install -y git nodejs npm && rm -rf /var/lib/apt/lists/* + +WORKDIR /app +COPY . /app +RUN pip install -e . 
+RUN pip install claude-agent-sdk jupyter + +ENV FIREFOX_GIT_REPO=/workspace/firefox + +ENTRYPOINT ["python", "scripts/build_repair_eval.py"] diff --git a/notebooks/build_repair_create_dataset.ipynb b/notebooks/build_repair_create_dataset.ipynb index 4f8604a7b4..97e6f3c4be 100644 --- a/notebooks/build_repair_create_dataset.ipynb +++ b/notebooks/build_repair_create_dataset.ipynb @@ -16,8 +16,10 @@ }, { "cell_type": "code", + "execution_count": null, "id": "5f57608e", "metadata": {}, + "outputs": [], "source": [ "import weave\n", "\n", @@ -25,9 +27,7 @@ "DATASET_NAME = \"build_repair_one_commit_eval\"\n", "\n", "_ = weave.init(PROJECT_NAME)" - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "markdown", @@ -41,110 +41,184 @@ }, { "cell_type": "code", + "execution_count": 1, "id": "5bd97ef4", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-17T22:51:22.099537Z", + "start_time": "2026-02-17T22:51:19.783734Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "One commit fail and fix: 388\n", + "Build fails only: 89\n" + ] + } + ], "source": [ "import pandas as pd\n", "\n", - "df = pd.read_json('https://community-tc.services.mozilla.com/api/queue/v1/task/Ra5r2qSyS8G-9pjLrS6l6Q/runs/0/artifacts/public%2Fci_failures.json.zst', lines=True)\n", + "df = pd.read_json(\n", + " \"https://community-tc.services.mozilla.com/api/queue/v1/task/Ra5r2qSyS8G-9pjLrS6l6Q/runs/0/artifacts/public%2Fci_failures.json.zst\",\n", + " lines=True,\n", + ")\n", "df = df[(df.failure_commits.apply(len) == 1) & (df.fix_commits.apply(len) == 1)]\n", "print(f\"One commit fail and fix: {len(df)}\")\n", - "df = df[df.failures.apply(lambda fails: any('build' in f['task_name'] and 'test' not in f['task_name'] for f in fails))]\n", + "df = df[\n", + " df.failures.apply(\n", + " lambda fails: any(\n", + " \"build\" in f[\"task_name\"] and \"test\" not in f[\"task_name\"] for f in fails\n", + " )\n", + " )\n", + "]\n", 
"print(f\"Build fails only: {len(df)}\")" - ], - "outputs": [], - "execution_count": null + ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "### Check that corresponding bugs are public", - "id": "f5a96ec84466d611" + "id": "f5a96ec84466d611", + "metadata": {}, + "source": "### Check that corresponding bugs are public" }, { - "metadata": {}, "cell_type": "code", + "execution_count": 4, + "id": "b9289c8688226f08", + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-17T22:56:51.134471Z", + "start_time": "2026-02-17T22:53:49.935290Z" + } + }, + "outputs": [ + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mKeyboardInterrupt\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[4]\u001b[39m\u001b[32m, line 13\u001b[39m\n\u001b[32m 10\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 11\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m'\u001b[39m\u001b[33mUnexpected Bugzilla status code: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresp.status_code\u001b[38;5;250m \u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m'\u001b[39m)\n\u001b[32m---> \u001b[39m\u001b[32m13\u001b[39m df = df[\u001b[43mdf\u001b[49m\u001b[43m.\u001b[49m\u001b[43mbug_id\u001b[49m\u001b[43m.\u001b[49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[43mis_bug_public\u001b[49m\u001b[43m)\u001b[49m]\n\u001b[32m 14\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mBuild fails for public bugs: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(df)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n", + "\u001b[36mFile 
\u001b[39m\u001b[32m~/miniconda3/envs/bugbug/lib/python3.13/site-packages/pandas/core/series.py:4943\u001b[39m, in \u001b[36mSeries.apply\u001b[39m\u001b[34m(self, func, convert_dtype, args, by_row, **kwargs)\u001b[39m\n\u001b[32m 4808\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mapply\u001b[39m(\n\u001b[32m 4809\u001b[39m \u001b[38;5;28mself\u001b[39m,\n\u001b[32m 4810\u001b[39m func: AggFuncType,\n\u001b[32m (...)\u001b[39m\u001b[32m 4815\u001b[39m **kwargs,\n\u001b[32m 4816\u001b[39m ) -> DataFrame | Series:\n\u001b[32m 4817\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 4818\u001b[39m \u001b[33;03m Invoke function on values of Series.\u001b[39;00m\n\u001b[32m 4819\u001b[39m \n\u001b[32m (...)\u001b[39m\u001b[32m 4934\u001b[39m \u001b[33;03m dtype: float64\u001b[39;00m\n\u001b[32m 4935\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m 4936\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mSeriesApply\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 4937\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 4938\u001b[39m \u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4939\u001b[39m \u001b[43m \u001b[49m\u001b[43mconvert_dtype\u001b[49m\u001b[43m=\u001b[49m\u001b[43mconvert_dtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4940\u001b[39m \u001b[43m \u001b[49m\u001b[43mby_row\u001b[49m\u001b[43m=\u001b[49m\u001b[43mby_row\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4941\u001b[39m \u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[43m=\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4942\u001b[39m \u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m=\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m-> \u001b[39m\u001b[32m4943\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[43m.\u001b[49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + 
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/bugbug/lib/python3.13/site-packages/pandas/core/apply.py:1422\u001b[39m, in \u001b[36mSeriesApply.apply\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 1419\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m.apply_compat()\n\u001b[32m 1421\u001b[39m \u001b[38;5;66;03m# self.func is Callable\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m1422\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mapply_standard\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/bugbug/lib/python3.13/site-packages/pandas/core/apply.py:1502\u001b[39m, in \u001b[36mSeriesApply.apply_standard\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 1496\u001b[39m \u001b[38;5;66;03m# row-wise access\u001b[39;00m\n\u001b[32m 1497\u001b[39m \u001b[38;5;66;03m# apply doesn't have a `na_action` keyword and for backward compat reasons\u001b[39;00m\n\u001b[32m 1498\u001b[39m \u001b[38;5;66;03m# we need to give `na_action=\"ignore\"` for categorical data.\u001b[39;00m\n\u001b[32m 1499\u001b[39m \u001b[38;5;66;03m# TODO: remove the `na_action=\"ignore\"` when that default has been changed in\u001b[39;00m\n\u001b[32m 1500\u001b[39m \u001b[38;5;66;03m# Categorical (GH51645).\u001b[39;00m\n\u001b[32m 1501\u001b[39m action = \u001b[33m\"\u001b[39m\u001b[33mignore\u001b[39m\u001b[33m\"\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(obj.dtype, CategoricalDtype) \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m1502\u001b[39m mapped = \u001b[43mobj\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_map_values\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1503\u001b[39m \u001b[43m \u001b[49m\u001b[43mmapper\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcurried\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mna_action\u001b[49m\u001b[43m=\u001b[49m\u001b[43maction\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconvert\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mconvert_dtype\u001b[49m\n\u001b[32m 1504\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1506\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(mapped) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(mapped[\u001b[32m0\u001b[39m], ABCSeries):\n\u001b[32m 1507\u001b[39m \u001b[38;5;66;03m# GH#43986 Need to do list(mapped) in order to get treated as nested\u001b[39;00m\n\u001b[32m 1508\u001b[39m \u001b[38;5;66;03m# See also GH#25959 regarding EA support\u001b[39;00m\n\u001b[32m 1509\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m obj._constructor_expanddim(\u001b[38;5;28mlist\u001b[39m(mapped), index=obj.index)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/bugbug/lib/python3.13/site-packages/pandas/core/base.py:925\u001b[39m, in \u001b[36mIndexOpsMixin._map_values\u001b[39m\u001b[34m(self, mapper, na_action, convert)\u001b[39m\n\u001b[32m 922\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(arr, ExtensionArray):\n\u001b[32m 923\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m arr.map(mapper, na_action=na_action)\n\u001b[32m--> \u001b[39m\u001b[32m925\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43malgorithms\u001b[49m\u001b[43m.\u001b[49m\u001b[43mmap_array\u001b[49m\u001b[43m(\u001b[49m\u001b[43marr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmapper\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mna_action\u001b[49m\u001b[43m=\u001b[49m\u001b[43mna_action\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconvert\u001b[49m\u001b[43m=\u001b[49m\u001b[43mconvert\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile 
\u001b[39m\u001b[32m~/miniconda3/envs/bugbug/lib/python3.13/site-packages/pandas/core/algorithms.py:1743\u001b[39m, in \u001b[36mmap_array\u001b[39m\u001b[34m(arr, mapper, na_action, convert)\u001b[39m\n\u001b[32m 1741\u001b[39m values = arr.astype(\u001b[38;5;28mobject\u001b[39m, copy=\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[32m 1742\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m na_action \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1743\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mlib\u001b[49m\u001b[43m.\u001b[49m\u001b[43mmap_infer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmapper\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconvert\u001b[49m\u001b[43m=\u001b[49m\u001b[43mconvert\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1744\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 1745\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m lib.map_infer_mask(\n\u001b[32m 1746\u001b[39m values, mapper, mask=isna(values).view(np.uint8), convert=convert\n\u001b[32m 1747\u001b[39m )\n", + "\u001b[36mFile \u001b[39m\u001b[32mpandas/_libs/lib.pyx:2999\u001b[39m, in \u001b[36mpandas._libs.lib.map_infer\u001b[39m\u001b[34m()\u001b[39m\n", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[4]\u001b[39m\u001b[32m, line 5\u001b[39m, in \u001b[36mis_bug_public\u001b[39m\u001b[34m(bug_id)\u001b[39m\n\u001b[32m 3\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mis_bug_public\u001b[39m(bug_id):\n\u001b[32m 4\u001b[39m url = \u001b[33mf\u001b[39m\u001b[33m'\u001b[39m\u001b[33mhttps://bugzilla.mozilla.org/rest/bug/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mbug_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m'\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m5\u001b[39m resp = 
\u001b[43mrequests\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[43murl\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 6\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m resp.status_code == \u001b[32m401\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m resp.status_code == \u001b[32m504\u001b[39m:\n\u001b[32m 7\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/bugbug/lib/python3.13/site-packages/requests/api.py:73\u001b[39m, in \u001b[36mget\u001b[39m\u001b[34m(url, params, **kwargs)\u001b[39m\n\u001b[32m 62\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mget\u001b[39m(url, params=\u001b[38;5;28;01mNone\u001b[39;00m, **kwargs):\n\u001b[32m 63\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33mr\u001b[39m\u001b[33;03m\"\"\"Sends a GET request.\u001b[39;00m\n\u001b[32m 64\u001b[39m \n\u001b[32m 65\u001b[39m \u001b[33;03m :param url: URL for the new :class:`Request` object.\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 70\u001b[39m \u001b[33;03m :rtype: requests.Response\u001b[39;00m\n\u001b[32m 71\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m73\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mrequest\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mget\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[43m=\u001b[49m\u001b[43mparams\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/bugbug/lib/python3.13/site-packages/requests/api.py:59\u001b[39m, in \u001b[36mrequest\u001b[39m\u001b[34m(method, url, **kwargs)\u001b[39m\n\u001b[32m 55\u001b[39m \u001b[38;5;66;03m# By using the 
'with' statement we are sure the session is closed, thus we\u001b[39;00m\n\u001b[32m 56\u001b[39m \u001b[38;5;66;03m# avoid leaving sockets open which can trigger a ResourceWarning in some\u001b[39;00m\n\u001b[32m 57\u001b[39m \u001b[38;5;66;03m# cases, and look like a memory leak in others.\u001b[39;00m\n\u001b[32m 58\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m sessions.Session() \u001b[38;5;28;01mas\u001b[39;00m session:\n\u001b[32m---> \u001b[39m\u001b[32m59\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43msession\u001b[49m\u001b[43m.\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m=\u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m=\u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/bugbug/lib/python3.13/site-packages/requests/sessions.py:589\u001b[39m, in \u001b[36mSession.request\u001b[39m\u001b[34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[39m\n\u001b[32m 584\u001b[39m send_kwargs = {\n\u001b[32m 585\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mtimeout\u001b[39m\u001b[33m\"\u001b[39m: timeout,\n\u001b[32m 586\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mallow_redirects\u001b[39m\u001b[33m\"\u001b[39m: allow_redirects,\n\u001b[32m 587\u001b[39m }\n\u001b[32m 588\u001b[39m send_kwargs.update(settings)\n\u001b[32m--> \u001b[39m\u001b[32m589\u001b[39m resp = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43msend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprep\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43msend_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 591\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m resp\n", 
+ "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/bugbug/lib/python3.13/site-packages/requests/sessions.py:703\u001b[39m, in \u001b[36mSession.send\u001b[39m\u001b[34m(self, request, **kwargs)\u001b[39m\n\u001b[32m 700\u001b[39m start = preferred_clock()\n\u001b[32m 702\u001b[39m \u001b[38;5;66;03m# Send the request\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m703\u001b[39m r = \u001b[43madapter\u001b[49m\u001b[43m.\u001b[49m\u001b[43msend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 705\u001b[39m \u001b[38;5;66;03m# Total elapsed time of the request (approximately)\u001b[39;00m\n\u001b[32m 706\u001b[39m elapsed = preferred_clock() - start\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/bugbug/lib/python3.13/site-packages/requests/adapters.py:644\u001b[39m, in \u001b[36mHTTPAdapter.send\u001b[39m\u001b[34m(self, request, stream, timeout, verify, cert, proxies)\u001b[39m\n\u001b[32m 641\u001b[39m timeout = TimeoutSauce(connect=timeout, read=timeout)\n\u001b[32m 643\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m644\u001b[39m resp = \u001b[43mconn\u001b[49m\u001b[43m.\u001b[49m\u001b[43murlopen\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 645\u001b[39m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m.\u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 646\u001b[39m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m=\u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 647\u001b[39m \u001b[43m \u001b[49m\u001b[43mbody\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m.\u001b[49m\u001b[43mbody\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 648\u001b[39m \u001b[43m 
\u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m.\u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 649\u001b[39m \u001b[43m \u001b[49m\u001b[43mredirect\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 650\u001b[39m \u001b[43m \u001b[49m\u001b[43massert_same_host\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 651\u001b[39m \u001b[43m \u001b[49m\u001b[43mpreload_content\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 652\u001b[39m \u001b[43m \u001b[49m\u001b[43mdecode_content\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 653\u001b[39m \u001b[43m \u001b[49m\u001b[43mretries\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mmax_retries\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 654\u001b[39m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 655\u001b[39m \u001b[43m \u001b[49m\u001b[43mchunked\u001b[49m\u001b[43m=\u001b[49m\u001b[43mchunked\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 656\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 658\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m (ProtocolError, \u001b[38;5;167;01mOSError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m err:\n\u001b[32m 659\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mConnectionError\u001b[39;00m(err, request=request)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/bugbug/lib/python3.13/site-packages/urllib3/connectionpool.py:787\u001b[39m, in \u001b[36mHTTPConnectionPool.urlopen\u001b[39m\u001b[34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, 
preload_content, decode_content, **response_kw)\u001b[39m\n\u001b[32m 784\u001b[39m response_conn = conn \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m release_conn \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 786\u001b[39m \u001b[38;5;66;03m# Make the request on the HTTPConnection object\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m787\u001b[39m response = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_make_request\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 788\u001b[39m \u001b[43m \u001b[49m\u001b[43mconn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 789\u001b[39m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 790\u001b[39m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 791\u001b[39m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtimeout_obj\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 792\u001b[39m \u001b[43m \u001b[49m\u001b[43mbody\u001b[49m\u001b[43m=\u001b[49m\u001b[43mbody\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 793\u001b[39m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m=\u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 794\u001b[39m \u001b[43m \u001b[49m\u001b[43mchunked\u001b[49m\u001b[43m=\u001b[49m\u001b[43mchunked\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 795\u001b[39m \u001b[43m \u001b[49m\u001b[43mretries\u001b[49m\u001b[43m=\u001b[49m\u001b[43mretries\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 796\u001b[39m \u001b[43m \u001b[49m\u001b[43mresponse_conn\u001b[49m\u001b[43m=\u001b[49m\u001b[43mresponse_conn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 797\u001b[39m \u001b[43m \u001b[49m\u001b[43mpreload_content\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpreload_content\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 798\u001b[39m \u001b[43m 
\u001b[49m\u001b[43mdecode_content\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdecode_content\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 799\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mresponse_kw\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 800\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 802\u001b[39m \u001b[38;5;66;03m# Everything went great!\u001b[39;00m\n\u001b[32m 803\u001b[39m clean_exit = \u001b[38;5;28;01mTrue\u001b[39;00m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/bugbug/lib/python3.13/site-packages/urllib3/connectionpool.py:534\u001b[39m, in \u001b[36mHTTPConnectionPool._make_request\u001b[39m\u001b[34m(self, conn, method, url, body, headers, retries, timeout, chunked, response_conn, preload_content, decode_content, enforce_content_length)\u001b[39m\n\u001b[32m 532\u001b[39m \u001b[38;5;66;03m# Receive the response from the server\u001b[39;00m\n\u001b[32m 533\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m534\u001b[39m response = \u001b[43mconn\u001b[49m\u001b[43m.\u001b[49m\u001b[43mgetresponse\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 535\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m (BaseSSLError, \u001b[38;5;167;01mOSError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[32m 536\u001b[39m \u001b[38;5;28mself\u001b[39m._raise_timeout(err=e, url=url, timeout_value=read_timeout)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/bugbug/lib/python3.13/site-packages/urllib3/connection.py:571\u001b[39m, in \u001b[36mHTTPConnection.getresponse\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 568\u001b[39m _shutdown = \u001b[38;5;28mgetattr\u001b[39m(\u001b[38;5;28mself\u001b[39m.sock, \u001b[33m\"\u001b[39m\u001b[33mshutdown\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[32m 570\u001b[39m \u001b[38;5;66;03m# Get the response from http.client.HTTPConnection\u001b[39;00m\n\u001b[32m--> 
\u001b[39m\u001b[32m571\u001b[39m httplib_response = \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m.\u001b[49m\u001b[43mgetresponse\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 573\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m 574\u001b[39m assert_header_parsing(httplib_response.msg)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/bugbug/lib/python3.13/http/client.py:1450\u001b[39m, in \u001b[36mHTTPConnection.getresponse\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 1448\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m 1449\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1450\u001b[39m \u001b[43mresponse\u001b[49m\u001b[43m.\u001b[49m\u001b[43mbegin\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1451\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mConnectionError\u001b[39;00m:\n\u001b[32m 1452\u001b[39m \u001b[38;5;28mself\u001b[39m.close()\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/bugbug/lib/python3.13/http/client.py:336\u001b[39m, in \u001b[36mHTTPResponse.begin\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 334\u001b[39m \u001b[38;5;66;03m# read until we get a non-100 response\u001b[39;00m\n\u001b[32m 335\u001b[39m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m336\u001b[39m version, status, reason = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_read_status\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 337\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m status != CONTINUE:\n\u001b[32m 338\u001b[39m \u001b[38;5;28;01mbreak\u001b[39;00m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/bugbug/lib/python3.13/http/client.py:297\u001b[39m, in \u001b[36mHTTPResponse._read_status\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 296\u001b[39m 
\u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m_read_status\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[32m--> \u001b[39m\u001b[32m297\u001b[39m line = \u001b[38;5;28mstr\u001b[39m(\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mfp\u001b[49m\u001b[43m.\u001b[49m\u001b[43mreadline\u001b[49m\u001b[43m(\u001b[49m\u001b[43m_MAXLINE\u001b[49m\u001b[43m \u001b[49m\u001b[43m+\u001b[49m\u001b[43m \u001b[49m\u001b[32;43m1\u001b[39;49m\u001b[43m)\u001b[49m, \u001b[33m\"\u001b[39m\u001b[33miso-8859-1\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 298\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(line) > _MAXLINE:\n\u001b[32m 299\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m LineTooLong(\u001b[33m\"\u001b[39m\u001b[33mstatus line\u001b[39m\u001b[33m\"\u001b[39m)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/bugbug/lib/python3.13/socket.py:719\u001b[39m, in \u001b[36mSocketIO.readinto\u001b[39m\u001b[34m(self, b)\u001b[39m\n\u001b[32m 717\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m(\u001b[33m\"\u001b[39m\u001b[33mcannot read from timed out object\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 718\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m719\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_sock\u001b[49m\u001b[43m.\u001b[49m\u001b[43mrecv_into\u001b[49m\u001b[43m(\u001b[49m\u001b[43mb\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 720\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m timeout:\n\u001b[32m 721\u001b[39m \u001b[38;5;28mself\u001b[39m._timeout_occurred = \u001b[38;5;28;01mTrue\u001b[39;00m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/bugbug/lib/python3.13/ssl.py:1304\u001b[39m, in \u001b[36mSSLSocket.recv_into\u001b[39m\u001b[34m(self, buffer, nbytes, flags)\u001b[39m\n\u001b[32m 1300\u001b[39m 
\u001b[38;5;28;01mif\u001b[39;00m flags != \u001b[32m0\u001b[39m:\n\u001b[32m 1301\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[32m 1302\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mnon-zero flags not allowed in calls to recv_into() on \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[33m\"\u001b[39m %\n\u001b[32m 1303\u001b[39m \u001b[38;5;28mself\u001b[39m.\u001b[34m__class__\u001b[39m)\n\u001b[32m-> \u001b[39m\u001b[32m1304\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnbytes\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuffer\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1305\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 1306\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m().recv_into(buffer, nbytes, flags)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/bugbug/lib/python3.13/ssl.py:1138\u001b[39m, in \u001b[36mSSLSocket.read\u001b[39m\u001b[34m(self, len, buffer)\u001b[39m\n\u001b[32m 1136\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m 1137\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m buffer \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1138\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_sslobj\u001b[49m\u001b[43m.\u001b[49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuffer\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1139\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 1140\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._sslobj.read(\u001b[38;5;28mlen\u001b[39m)\n", + "\u001b[31mKeyboardInterrupt\u001b[39m: " + ] + } + ], "source": [ 
"import requests\n", "\n", + "\n", "def is_bug_public(bug_id):\n", - " url = f'https://bugzilla.mozilla.org/rest/bug/{bug_id}'\n", + " url = f\"https://bugzilla.mozilla.org/rest/bug/{bug_id}\"\n", " resp = requests.get(url)\n", - " if resp.status_code == 401:\n", + " if resp.status_code == 401 or resp.status_code == 504:\n", " return False\n", " elif resp.status_code == 200:\n", " return True\n", " else:\n", - " raise ValueError('Unexpected Bugzilla status code: {resp.status_code }')\n", + " raise ValueError(f\"Unexpected Bugzilla status code: {resp.status_code}\")\n", + "\n", "\n", "df = df[df.bug_id.apply(is_bug_public)]\n", "print(f\"Build fails for public bugs: {len(df)}\")" - ], - "id": "b9289c8688226f08", - "outputs": [], - "execution_count": null + ] }, { + "cell_type": "markdown", + "id": "3d5e260dc3245c36", "metadata": { "ExecuteTime": { "end_time": "2026-02-13T23:52:42.195048Z", "start_time": "2026-02-13T23:52:42.191945Z" } }, - "cell_type": "markdown", - "source": "### Get GitHub revisions", - "id": "3d5e260dc3245c36" + "source": "### Get GitHub revisions" }, { - "metadata": {}, "cell_type": "code", + "execution_count": null, + "id": "23e362967af3e2cf", + "metadata": {}, + "outputs": [], "source": [ "def get_git_rev(hg_revs):\n", " for rev in hg_revs:\n", - " convert_url = f'https://lando.moz.tools/api/hg2git/firefox/{rev}'\n", + " convert_url = f\"https://lando.moz.tools/api/hg2git/firefox/{rev}\"\n", " resp = requests.get(convert_url)\n", " if resp.status_code != 200:\n", - " raise ValueError(f'Unexpected HTTP status code: {resp.status_code}. {resp}')\n", - " yield resp.json()['git_hash']\n", + " raise ValueError(f\"Unexpected HTTP status code: {resp.status_code}. 
{resp}\")\n", + " yield resp.json()[\"git_hash\"]\n", "\n", - "df['gh_failure_commits'] = df.failure_commits.apply(lambda commits: list(get_git_rev(commits)))\n", - "df['gh_fix_commits'] = df.fix_commits.apply(lambda commits: list(get_git_rev(commits)))\n", - "df = df.rename(columns={'failure_commits': 'hg_failure_commits', 'fix_commits': 'hg_fix_commits'})" - ], - "id": "23e362967af3e2cf", - "outputs": [], - "execution_count": null + "\n", + "df[\"gh_failure_commits\"] = df.failure_commits.apply(\n", + " lambda commits: list(get_git_rev(commits))\n", + ")\n", + "df[\"gh_fix_commits\"] = df.fix_commits.apply(lambda commits: list(get_git_rev(commits)))\n", + "df = df.rename(\n", + " columns={\"failure_commits\": \"hg_failure_commits\", \"fix_commits\": \"hg_fix_commits\"}\n", + ")" + ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "### Ger bugzilla comments before the fix", - "id": "9dc12cf3f6f9b844" + "id": "9dc12cf3f6f9b844", + "metadata": {}, + "source": "### Ger bugzilla comments before the fix" }, { - "metadata": {}, "cell_type": "code", + "execution_count": null, + "id": "cda623f631db857e", + "metadata": {}, + "outputs": [], "source": [ - "from bugbug.tools.core.platforms.bugzilla import Bug\n", "from libmozdata import config\n", "\n", + "from bugbug.tools.core.platforms.bugzilla import Bug\n", + "\n", "config.set_default_value(\"User-Agent\", \"name\", \"bugbug/1.0\")\n", "\n", + "\n", "def _get_comments(bug, fail_commit):\n", - " for comment in bug._metadata['comments']:\n", - " if comment['creator'] == 'pulsebot@bmo.tld':\n", - " if fail_commit[:12] in comment['raw_text']:\n", + " for comment in bug._metadata[\"comments\"]:\n", + " if comment[\"creator\"] == \"pulsebot@bmo.tld\":\n", + " if fail_commit[:12] in comment[\"raw_text\"]:\n", " # stop adding comments at failure commit push\n", - " yield comment['raw_text']\n", + " yield comment[\"raw_text\"]\n", " break\n", " else:\n", " continue\n", - " if comment['raw_text']:\n", - " yield 
comment['raw_text']\n", + " if comment[\"raw_text\"]:\n", + " yield comment[\"raw_text\"]\n", + "\n", "\n", "def get_bug_info_pre_fix(build_fail):\n", - " bug_id = build_fail['bug_id']\n", - " fail_commit = build_fail['hg_failure_commits'][0]\n", + " bug_id = build_fail[\"bug_id\"]\n", + " fail_commit = build_fail[\"hg_failure_commits\"][0]\n", "\n", " try:\n", " bug = Bug.get(bug_id)\n", @@ -152,23 +226,23 @@ " print(ex)\n", " return None\n", "\n", - " return {'title': bug.summary, 'comments': list(_get_comments(bug, fail_commit))}\n", + " return {\"title\": bug.summary, \"comments\": list(_get_comments(bug, fail_commit))}\n", "\n", - "df['pre_fix_bug'] = df.apply(get_bug_info_pre_fix, axis=1)\n", - "df = df[df['pre_fix_bug'].notnull()]\n", - "print(f'Final number of bugs: {len(df)}')" - ], - "id": "cda623f631db857e", - "outputs": [], - "execution_count": null + "\n", + "df[\"pre_fix_bug\"] = df.apply(get_bug_info_pre_fix, axis=1)\n", + "df = df[df[\"pre_fix_bug\"].notnull()]\n", + "print(f\"Final number of bugs: {len(df)}\")" + ] }, { - "metadata": {}, "cell_type": "code", - "source": "df", + "execution_count": null, "id": "e1b6e97b2b16c638", + "metadata": {}, "outputs": [], - "execution_count": null + "source": [ + "df" + ] }, { "cell_type": "markdown", @@ -180,19 +254,21 @@ }, { "cell_type": "code", + "execution_count": null, "id": "c5041136", "metadata": {}, + "outputs": [], "source": [ "examples = df.to_dict(orient=\"records\")\n", "examples[0]" - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "code", + "execution_count": null, "id": "d132783a", "metadata": {}, + "outputs": [], "source": [ "dataset = weave.Dataset(\n", " name=DATASET_NAME,\n", @@ -201,9 +277,7 @@ ")\n", "\n", "_ = weave.publish(dataset)" - ], - "outputs": [], - "execution_count": null + ] } ], "metadata": { diff --git a/notebooks/build_repair_evaluation.ipynb b/notebooks/build_repair_evaluation.ipynb new file mode 100644 index 0000000000..5ffbad5179 --- /dev/null +++ 
b/notebooks/build_repair_evaluation.ipynb @@ -0,0 +1,246 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Build Repair Agent Evaluation\n", + "\n", + "This notebook runs W&B Weave evaluations for the automatic build repair agent." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-14T00:50:08.220718Z", + "start_time": "2026-02-14T00:50:06.650538Z" + } + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "import weave\n", + "\n", + "os.environ[\"WEAVE_PARALLELISM\"] = \"8\"\n", + "\n", + "PROJECT_NAME = \"bugbug-build-repair-eval\"\n", + "_ = weave.init(PROJECT_NAME)\n", + "\n", + "FIREFOX_REPO = os.environ[\"FIREFOX_GIT_REPO\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-14T00:50:16.665581Z", + "start_time": "2026-02-14T00:50:16.092754Z" + } + }, + "outputs": [], + "source": [ + "dataset = weave.ref(\"build_repair_one_commit_eval\").get()\n", + "print(f\"Dataset has {len(dataset.rows)} examples\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configure Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from functools import cached_property\n", + "\n", + "from bugbug.tools.build_repair.agent import AgentResponse, BuildFailure, BuildRepairTool\n", + "from bugbug.tools.build_repair.worktree import WorktreeManager\n", + "\n", + "\n", + "class BuildRepairModel(weave.Model):\n", + " firefox_repo: str\n", + "\n", + " @cached_property\n", + " def tool(self) -> BuildRepairTool:\n", + " return BuildRepairTool.create()\n", + "\n", + " @cached_property\n", + " def worktree_mgr(self) -> WorktreeManager:\n", + 
" return WorktreeManager(self.firefox_repo)\n", + "\n", + " @weave.op()\n", + " async def invoke(\n", + " self,\n", + " bug_id: int,\n", + " pre_fix_bug: dict,\n", + " gh_failure_commits: list[str],\n", + " failures: list[dict],\n", + " **kwargs,\n", + " ) -> dict:\n", + " wt_name = f\"bug-{bug_id}\"\n", + " worktree_path = self.worktree_mgr.create(gh_failure_commits[0], wt_name)\n", + " try:\n", + " failure = BuildFailure(\n", + " bug_id=bug_id,\n", + " bug_title=pre_fix_bug.get(\"title\"),\n", + " bug_comments=pre_fix_bug.get(\"comments\"),\n", + " git_commit=gh_failure_commits[0],\n", + " failure_tasks=[\n", + " f\n", + " for f in failures\n", + " if \"build\" in f[\"task_name\"] and \"test\" not in f[\"task_name\"]\n", + " ],\n", + " )\n", + " result: AgentResponse = await self.tool.run(\n", + " failure, worktree_path=worktree_path\n", + " )\n", + " return result.model_dump()\n", + " except Exception as e:\n", + " return {\n", + " \"error\": str(e),\n", + " \"diff\": \"\",\n", + " \"summary\": \"\",\n", + " \"analysis\": \"\",\n", + " \"cost_usd\": 0,\n", + " \"num_turns\": 0,\n", + " \"local_build_passed\": None,\n", + " \"try_build_passed\": None,\n", + " \"lando_job_id\": None,\n", + " \"treeherder_url\": None,\n", + " }\n", + " finally:\n", + " self.worktree_mgr.cleanup(wt_name)\n", + "\n", + "\n", + "model = BuildRepairModel(firefox_repo=FIREFOX_REPO)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run Evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from bugbug.tools.build_repair.scorer import (\n", + " BasicMetricsScorer,\n", + " BuildPassRateScorer,\n", + " LLMFixMatchingScorer,\n", + ")\n", + "\n", + "evaluation = weave.Evaluation(\n", + " dataset=dataset,\n", + " scorers=[BasicMetricsScorer(), BuildPassRateScorer(), LLMFixMatchingScorer()],\n", + ")\n", + "\n", + "results = await evaluation.evaluate(model)" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "## Visualizations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "basic = results.get(\"BasicMetricsScorer\", {})\n", + "build = results.get(\"BuildPassRateScorer\", {})\n", + "\n", + "metrics = {\n", + " \"success_rate\": basic.get(\"success_rate\", 0),\n", + " \"diff_rate\": basic.get(\"diff_rate\", 0),\n", + " \"local_build_pass_rate\": build.get(\"local_build_pass_rate\", 0),\n", + " \"try_build_pass_rate\": build.get(\"try_build_pass_rate\", 0),\n", + "}\n", + "\n", + "fig, ax = plt.subplots(figsize=(8, 4))\n", + "ax.bar(metrics.keys(), metrics.values())\n", + "ax.set_ylim(0, 1)\n", + "ax.set_ylabel(\"Rate\")\n", + "ax.set_title(\"Build Repair Evaluation Results\")\n", + "plt.tight_layout()\n", + "plt.show()\n", + "\n", + "print(f\"Total cost: ${basic.get('total_cost_usd', 0):.2f}\")\n", + "print(f\"Avg cost per example: ${basic.get('avg_cost_usd', 0):.2f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"Basic metrics: {basic}\")\n", + "print(f\"Build pass rates: {build}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## View in W&B\n", + "\n", + "Visit [W&B Weave](https://wandb.ai) to see detailed traces, compare evaluations, and explore individual predictions."
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "bugbug (3.12.7)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index f431d8fa49..7ee624df7f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ amqp==5.3.1 async-lru==2.1.0 beautifulsoup4==4.14.3 boto3==1.42.49 +claude-agent-sdk>=0.1.30 httpx==0.28.1 imbalanced-learn==0.14.1 langchain==1.2.10 diff --git a/scripts/build_repair_eval.py b/scripts/build_repair_eval.py new file mode 100644 index 0000000000..653faf16bc --- /dev/null +++ b/scripts/build_repair_eval.py @@ -0,0 +1,143 @@ +# -*- coding: utf-8 -*- +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. + +"""Standalone CLI for build repair evaluation. 
+ +Usage: + python scripts/build_repair_eval.py + python scripts/build_repair_eval.py --analysis-only + python scripts/build_repair_eval.py --trials 3 + python scripts/build_repair_eval.py --limit 5 + python scripts/build_repair_eval.py --parallelism 4 + python scripts/build_repair_eval.py --no-try-push +""" + +import argparse +import asyncio +import os +from functools import cached_property + +import weave + +from bugbug.tools.build_repair.agent import AgentResponse, BuildFailure, BuildRepairTool +from bugbug.tools.build_repair.scorer import ( + BasicMetricsScorer, + BuildPassRateScorer, + LLMFixMatchingScorer, +) +from bugbug.tools.build_repair.worktree import WorktreeManager + + +class BuildRepairModel(weave.Model): + """Weave Model wrapper that creates a worktree per example and runs BuildRepairTool.""" + + firefox_repo: str + analysis_only: bool = False + no_try_push: bool = False + trial_id: int = 0 + + @cached_property + def tool(self) -> BuildRepairTool: + return BuildRepairTool.create(analysis_only=self.analysis_only) + + @cached_property + def worktree_mgr(self) -> WorktreeManager: + return WorktreeManager(self.firefox_repo) + + @weave.op() + async def invoke( + self, + bug_id: int, + pre_fix_bug: dict, + gh_failure_commits: list[str], + failures: list[dict], + **kwargs, + ) -> dict: + wt_name = f"bug-{bug_id}-trial-{self.trial_id}" + worktree_path = self.worktree_mgr.create(gh_failure_commits[0], wt_name) + try: + failure = BuildFailure( + bug_id=bug_id, + bug_title=pre_fix_bug.get("title"), + bug_comments=pre_fix_bug.get("comments"), + git_commit=gh_failure_commits[0], + failure_tasks=[ + f + for f in failures + if "build" in f["task_name"] and "test" not in f["task_name"] + ], + ) + result: AgentResponse = await self.tool.run( + failure, + worktree_path=worktree_path, + skip_try_push=self.no_try_push, + ) + return result.model_dump() + except Exception as e: + return { + "error": str(e), + "diff": "", + "summary": "", + "analysis": "", + "cost_usd": 0, 
+ "num_turns": 0, + "local_build_passed": None, + "try_build_passed": None, + "lando_job_id": None, + "treeherder_url": None, + } + finally: + self.worktree_mgr.cleanup(wt_name) + + +def main() -> None: + parser = argparse.ArgumentParser(description="Build repair evaluation") + parser.add_argument("--limit", type=int, default=None) + parser.add_argument("--trials", type=int, default=1) + parser.add_argument("--parallelism", type=int, default=8) + parser.add_argument("--firefox-repo", default=os.environ.get("FIREFOX_GIT_REPO")) + parser.add_argument("--dataset", default="build_repair_one_commit_eval") + parser.add_argument("--analysis-only", action="store_true") + parser.add_argument("--no-try-push", action="store_true") + args = parser.parse_args() + + if not args.firefox_repo: + parser.error("--firefox-repo or FIREFOX_GIT_REPO env var is required") + + os.environ["WEAVE_PARALLELISM"] = str(args.parallelism) + weave.init("bugbug-build-repair-eval") + + dataset = weave.ref(args.dataset).get() + rows = dataset.rows + if args.limit: + rows = rows[: args.limit] + + scorers = [BasicMetricsScorer(), LLMFixMatchingScorer()] + if not args.analysis_only: + scorers.insert(1, BuildPassRateScorer()) + + for trial in range(args.trials): + model = BuildRepairModel( + firefox_repo=args.firefox_repo, + analysis_only=args.analysis_only, + no_try_push=args.no_try_push, + trial_id=trial, + ) + evaluation = weave.Evaluation( + name=f"build-repair-trial-{trial}", + dataset=rows, + scorers=scorers, + ) + results = asyncio.run(evaluation.evaluate(model)) + print(f"Trial {trial} results: {results}") + + # TODO: To compute pass@k across trials, collect per-row scores from each + # trial via the Weave API (weave.ref(...).get() on individual evaluation + # runs) and pass them to compute_pass_at_k(). The evaluate() return value + # only contains aggregated summaries, not per-row data. 
+ + +if __name__ == "__main__": + main() From a1af703cae692b51946a70349eede4b55794cf96 Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Wed, 18 Feb 2026 17:31:31 -0800 Subject: [PATCH 04/31] Prevent data contamination --- bugbug/tools/build_repair/agent.py | 4 +- bugbug/tools/build_repair/config.py | 19 ++- notebooks/build_repair_create_dataset.ipynb | 169 ++++++++------------ notebooks/build_repair_evaluation.ipynb | 98 ++---------- scripts/build_repair_eval.py | 35 ++-- 5 files changed, 131 insertions(+), 194 deletions(-) diff --git a/bugbug/tools/build_repair/agent.py b/bugbug/tools/build_repair/agent.py index d1e23bb06c..1b16176edf 100644 --- a/bugbug/tools/build_repair/agent.py +++ b/bugbug/tools/build_repair/agent.py @@ -60,8 +60,8 @@ class AgentResponse(BaseModel): class BuildRepairTool(GenerativeModelTool): """Two-stage build repair agent using Claude Agent SDK. - Stage 1 (Opus): Analyzes the failure and produces analysis/planning/summary docs. - Stage 2 (Sonnet): Reads the analysis and implements a fix. Skipped in analysis-only mode. + Stage 1: Analyzes the failure and produces analysis/planning/summary docs. + Stage 2: Reads the analysis and implements a fix. Skipped in analysis-only mode. After Stage 2, commits the fix, runs ./mach build, and optionally submits to try. """ diff --git a/bugbug/tools/build_repair/config.py b/bugbug/tools/build_repair/config.py index 1fa36523ff..faba55a81e 100644 --- a/bugbug/tools/build_repair/config.py +++ b/bugbug/tools/build_repair/config.py @@ -3,8 +3,10 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this file, # You can obtain one at http://mozilla.org/MPL/2.0/. 
+from datetime import date + ANALYSIS_MODEL = "claude-opus-4-6" -FIX_MODEL = "claude-sonnet-4-6" +FIX_MODEL = "claude-opus-4-6" DEFAULT_MAX_TURNS = 80 WORKTREE_BASE_DIR = "/tmp/build_repair_worktrees" TRY_PUSH_TIMEOUT_SECONDS = 7200 @@ -13,6 +15,21 @@ FIREFOX_MCP_URL = "https://mcp-dev.moz.tools/mcp" +# Training data cutoff dates per model, for data contamination filtering. +# Examples with fix_commit_date before the cutoff may have been in training data. +# Source: https://platform.claude.com/docs/en/about-claude/models/overview +MODEL_CUTOFF_DATES = { + "claude-opus-4-6": date(2025, 8, 1), + "claude-sonnet-4-6": date(2026, 1, 1), + "claude-haiku-4-5-20251001": date(2025, 7, 1), + "claude-sonnet-4-5-20250929": date(2025, 7, 1), + "claude-opus-4-5-20251101": date(2025, 8, 1), + "claude-opus-4-1-20250805": date(2025, 3, 1), + "claude-sonnet-4-20250514": date(2025, 3, 1), + "claude-3-7-sonnet-20250219": date(2024, 11, 1), + "claude-opus-4-20250514": date(2025, 3, 1), +} + CLAUDE_PERMISSIONS_CONFIG = { "permissions": { "allow": [ diff --git a/notebooks/build_repair_create_dataset.ipynb b/notebooks/build_repair_create_dataset.ipynb index 97e6f3c4be..f1da8fee77 100644 --- a/notebooks/build_repair_create_dataset.ipynb +++ b/notebooks/build_repair_create_dataset.ipynb @@ -41,24 +41,10 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "5bd97ef4", - "metadata": { - "ExecuteTime": { - "end_time": "2026-02-17T22:51:22.099537Z", - "start_time": "2026-02-17T22:51:19.783734Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "One commit fail and fix: 388\n", - "Build fails only: 89\n" - ] - } - ], + "metadata": {}, + "outputs": [], "source": [ "import pandas as pd\n", "\n", @@ -78,75 +64,6 @@ "print(f\"Build fails only: {len(df)}\")" ] }, - { - "cell_type": "markdown", - "id": "f5a96ec84466d611", - "metadata": {}, - "source": "### Check that corresponding bugs are public" - }, - { - "cell_type": "code", - 
"execution_count": 4, - "id": "b9289c8688226f08", - "metadata": { - "ExecuteTime": { - "end_time": "2026-02-17T22:56:51.134471Z", - "start_time": "2026-02-17T22:53:49.935290Z" - } - }, - "outputs": [ - { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mKeyboardInterrupt\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[4]\u001b[39m\u001b[32m, line 13\u001b[39m\n\u001b[32m 10\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 11\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m'\u001b[39m\u001b[33mUnexpected Bugzilla status code: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresp.status_code\u001b[38;5;250m \u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m'\u001b[39m)\n\u001b[32m---> \u001b[39m\u001b[32m13\u001b[39m df = df[\u001b[43mdf\u001b[49m\u001b[43m.\u001b[49m\u001b[43mbug_id\u001b[49m\u001b[43m.\u001b[49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[43mis_bug_public\u001b[49m\u001b[43m)\u001b[49m]\n\u001b[32m 14\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mBuild fails for public bugs: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(df)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/bugbug/lib/python3.13/site-packages/pandas/core/series.py:4943\u001b[39m, in \u001b[36mSeries.apply\u001b[39m\u001b[34m(self, func, convert_dtype, args, by_row, **kwargs)\u001b[39m\n\u001b[32m 4808\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mapply\u001b[39m(\n\u001b[32m 4809\u001b[39m \u001b[38;5;28mself\u001b[39m,\n\u001b[32m 4810\u001b[39m func: AggFuncType,\n\u001b[32m (...)\u001b[39m\u001b[32m 
4815\u001b[39m **kwargs,\n\u001b[32m 4816\u001b[39m ) -> DataFrame | Series:\n\u001b[32m 4817\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 4818\u001b[39m \u001b[33;03m Invoke function on values of Series.\u001b[39;00m\n\u001b[32m 4819\u001b[39m \n\u001b[32m (...)\u001b[39m\u001b[32m 4934\u001b[39m \u001b[33;03m dtype: float64\u001b[39;00m\n\u001b[32m 4935\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m 4936\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mSeriesApply\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 4937\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 4938\u001b[39m \u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4939\u001b[39m \u001b[43m \u001b[49m\u001b[43mconvert_dtype\u001b[49m\u001b[43m=\u001b[49m\u001b[43mconvert_dtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4940\u001b[39m \u001b[43m \u001b[49m\u001b[43mby_row\u001b[49m\u001b[43m=\u001b[49m\u001b[43mby_row\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4941\u001b[39m \u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[43m=\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4942\u001b[39m \u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m=\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m-> \u001b[39m\u001b[32m4943\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[43m.\u001b[49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/bugbug/lib/python3.13/site-packages/pandas/core/apply.py:1422\u001b[39m, in \u001b[36mSeriesApply.apply\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 1419\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m.apply_compat()\n\u001b[32m 1421\u001b[39m \u001b[38;5;66;03m# self.func is Callable\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m1422\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mapply_standard\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/bugbug/lib/python3.13/site-packages/pandas/core/apply.py:1502\u001b[39m, in \u001b[36mSeriesApply.apply_standard\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 1496\u001b[39m \u001b[38;5;66;03m# row-wise access\u001b[39;00m\n\u001b[32m 1497\u001b[39m \u001b[38;5;66;03m# apply doesn't have a `na_action` keyword and for backward compat reasons\u001b[39;00m\n\u001b[32m 1498\u001b[39m \u001b[38;5;66;03m# we need to give `na_action=\"ignore\"` for categorical data.\u001b[39;00m\n\u001b[32m 1499\u001b[39m \u001b[38;5;66;03m# TODO: remove the `na_action=\"ignore\"` when that default has been changed in\u001b[39;00m\n\u001b[32m 1500\u001b[39m \u001b[38;5;66;03m# Categorical (GH51645).\u001b[39;00m\n\u001b[32m 1501\u001b[39m action = \u001b[33m\"\u001b[39m\u001b[33mignore\u001b[39m\u001b[33m\"\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(obj.dtype, CategoricalDtype) \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m1502\u001b[39m mapped = \u001b[43mobj\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_map_values\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1503\u001b[39m \u001b[43m \u001b[49m\u001b[43mmapper\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcurried\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mna_action\u001b[49m\u001b[43m=\u001b[49m\u001b[43maction\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconvert\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mconvert_dtype\u001b[49m\n\u001b[32m 1504\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1506\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(mapped) \u001b[38;5;129;01mand\u001b[39;00m 
\u001b[38;5;28misinstance\u001b[39m(mapped[\u001b[32m0\u001b[39m], ABCSeries):\n\u001b[32m 1507\u001b[39m \u001b[38;5;66;03m# GH#43986 Need to do list(mapped) in order to get treated as nested\u001b[39;00m\n\u001b[32m 1508\u001b[39m \u001b[38;5;66;03m# See also GH#25959 regarding EA support\u001b[39;00m\n\u001b[32m 1509\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m obj._constructor_expanddim(\u001b[38;5;28mlist\u001b[39m(mapped), index=obj.index)\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/bugbug/lib/python3.13/site-packages/pandas/core/base.py:925\u001b[39m, in \u001b[36mIndexOpsMixin._map_values\u001b[39m\u001b[34m(self, mapper, na_action, convert)\u001b[39m\n\u001b[32m 922\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(arr, ExtensionArray):\n\u001b[32m 923\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m arr.map(mapper, na_action=na_action)\n\u001b[32m--> \u001b[39m\u001b[32m925\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43malgorithms\u001b[49m\u001b[43m.\u001b[49m\u001b[43mmap_array\u001b[49m\u001b[43m(\u001b[49m\u001b[43marr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmapper\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mna_action\u001b[49m\u001b[43m=\u001b[49m\u001b[43mna_action\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconvert\u001b[49m\u001b[43m=\u001b[49m\u001b[43mconvert\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/bugbug/lib/python3.13/site-packages/pandas/core/algorithms.py:1743\u001b[39m, in \u001b[36mmap_array\u001b[39m\u001b[34m(arr, mapper, na_action, convert)\u001b[39m\n\u001b[32m 1741\u001b[39m values = arr.astype(\u001b[38;5;28mobject\u001b[39m, copy=\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[32m 1742\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m na_action \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1743\u001b[39m 
\u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mlib\u001b[49m\u001b[43m.\u001b[49m\u001b[43mmap_infer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmapper\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconvert\u001b[49m\u001b[43m=\u001b[49m\u001b[43mconvert\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1744\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 1745\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m lib.map_infer_mask(\n\u001b[32m 1746\u001b[39m values, mapper, mask=isna(values).view(np.uint8), convert=convert\n\u001b[32m 1747\u001b[39m )\n", - "\u001b[36mFile \u001b[39m\u001b[32mpandas/_libs/lib.pyx:2999\u001b[39m, in \u001b[36mpandas._libs.lib.map_infer\u001b[39m\u001b[34m()\u001b[39m\n", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[4]\u001b[39m\u001b[32m, line 5\u001b[39m, in \u001b[36mis_bug_public\u001b[39m\u001b[34m(bug_id)\u001b[39m\n\u001b[32m 3\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mis_bug_public\u001b[39m(bug_id):\n\u001b[32m 4\u001b[39m url = \u001b[33mf\u001b[39m\u001b[33m'\u001b[39m\u001b[33mhttps://bugzilla.mozilla.org/rest/bug/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mbug_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m'\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m5\u001b[39m resp = \u001b[43mrequests\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[43murl\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 6\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m resp.status_code == \u001b[32m401\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m resp.status_code == \u001b[32m504\u001b[39m:\n\u001b[32m 7\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/bugbug/lib/python3.13/site-packages/requests/api.py:73\u001b[39m, in \u001b[36mget\u001b[39m\u001b[34m(url, params, **kwargs)\u001b[39m\n\u001b[32m 
62\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mget\u001b[39m(url, params=\u001b[38;5;28;01mNone\u001b[39;00m, **kwargs):\n\u001b[32m 63\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33mr\u001b[39m\u001b[33;03m\"\"\"Sends a GET request.\u001b[39;00m\n\u001b[32m 64\u001b[39m \n\u001b[32m 65\u001b[39m \u001b[33;03m :param url: URL for the new :class:`Request` object.\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 70\u001b[39m \u001b[33;03m :rtype: requests.Response\u001b[39;00m\n\u001b[32m 71\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m73\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mrequest\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mget\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[43m=\u001b[49m\u001b[43mparams\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/bugbug/lib/python3.13/site-packages/requests/api.py:59\u001b[39m, in \u001b[36mrequest\u001b[39m\u001b[34m(method, url, **kwargs)\u001b[39m\n\u001b[32m 55\u001b[39m \u001b[38;5;66;03m# By using the 'with' statement we are sure the session is closed, thus we\u001b[39;00m\n\u001b[32m 56\u001b[39m \u001b[38;5;66;03m# avoid leaving sockets open which can trigger a ResourceWarning in some\u001b[39;00m\n\u001b[32m 57\u001b[39m \u001b[38;5;66;03m# cases, and look like a memory leak in others.\u001b[39;00m\n\u001b[32m 58\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m sessions.Session() \u001b[38;5;28;01mas\u001b[39;00m session:\n\u001b[32m---> \u001b[39m\u001b[32m59\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[43msession\u001b[49m\u001b[43m.\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m=\u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m=\u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/bugbug/lib/python3.13/site-packages/requests/sessions.py:589\u001b[39m, in \u001b[36mSession.request\u001b[39m\u001b[34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[39m\n\u001b[32m 584\u001b[39m send_kwargs = {\n\u001b[32m 585\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mtimeout\u001b[39m\u001b[33m\"\u001b[39m: timeout,\n\u001b[32m 586\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mallow_redirects\u001b[39m\u001b[33m\"\u001b[39m: allow_redirects,\n\u001b[32m 587\u001b[39m }\n\u001b[32m 588\u001b[39m send_kwargs.update(settings)\n\u001b[32m--> \u001b[39m\u001b[32m589\u001b[39m resp = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43msend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprep\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43msend_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 591\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m resp\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/bugbug/lib/python3.13/site-packages/requests/sessions.py:703\u001b[39m, in \u001b[36mSession.send\u001b[39m\u001b[34m(self, request, **kwargs)\u001b[39m\n\u001b[32m 700\u001b[39m start = preferred_clock()\n\u001b[32m 702\u001b[39m \u001b[38;5;66;03m# Send the request\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m703\u001b[39m r = 
\u001b[43madapter\u001b[49m\u001b[43m.\u001b[49m\u001b[43msend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 705\u001b[39m \u001b[38;5;66;03m# Total elapsed time of the request (approximately)\u001b[39;00m\n\u001b[32m 706\u001b[39m elapsed = preferred_clock() - start\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/bugbug/lib/python3.13/site-packages/requests/adapters.py:644\u001b[39m, in \u001b[36mHTTPAdapter.send\u001b[39m\u001b[34m(self, request, stream, timeout, verify, cert, proxies)\u001b[39m\n\u001b[32m 641\u001b[39m timeout = TimeoutSauce(connect=timeout, read=timeout)\n\u001b[32m 643\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m644\u001b[39m resp = \u001b[43mconn\u001b[49m\u001b[43m.\u001b[49m\u001b[43murlopen\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 645\u001b[39m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m.\u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 646\u001b[39m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m=\u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 647\u001b[39m \u001b[43m \u001b[49m\u001b[43mbody\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m.\u001b[49m\u001b[43mbody\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 648\u001b[39m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m.\u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 649\u001b[39m \u001b[43m \u001b[49m\u001b[43mredirect\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 650\u001b[39m \u001b[43m 
\u001b[49m\u001b[43massert_same_host\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 651\u001b[39m \u001b[43m \u001b[49m\u001b[43mpreload_content\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 652\u001b[39m \u001b[43m \u001b[49m\u001b[43mdecode_content\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 653\u001b[39m \u001b[43m \u001b[49m\u001b[43mretries\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mmax_retries\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 654\u001b[39m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 655\u001b[39m \u001b[43m \u001b[49m\u001b[43mchunked\u001b[49m\u001b[43m=\u001b[49m\u001b[43mchunked\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 656\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 658\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m (ProtocolError, \u001b[38;5;167;01mOSError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m err:\n\u001b[32m 659\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mConnectionError\u001b[39;00m(err, request=request)\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/bugbug/lib/python3.13/site-packages/urllib3/connectionpool.py:787\u001b[39m, in \u001b[36mHTTPConnectionPool.urlopen\u001b[39m\u001b[34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, preload_content, decode_content, **response_kw)\u001b[39m\n\u001b[32m 784\u001b[39m response_conn = conn \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m release_conn \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 786\u001b[39m \u001b[38;5;66;03m# Make the request on the HTTPConnection 
object\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m787\u001b[39m response = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_make_request\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 788\u001b[39m \u001b[43m \u001b[49m\u001b[43mconn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 789\u001b[39m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 790\u001b[39m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 791\u001b[39m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtimeout_obj\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 792\u001b[39m \u001b[43m \u001b[49m\u001b[43mbody\u001b[49m\u001b[43m=\u001b[49m\u001b[43mbody\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 793\u001b[39m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m=\u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 794\u001b[39m \u001b[43m \u001b[49m\u001b[43mchunked\u001b[49m\u001b[43m=\u001b[49m\u001b[43mchunked\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 795\u001b[39m \u001b[43m \u001b[49m\u001b[43mretries\u001b[49m\u001b[43m=\u001b[49m\u001b[43mretries\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 796\u001b[39m \u001b[43m \u001b[49m\u001b[43mresponse_conn\u001b[49m\u001b[43m=\u001b[49m\u001b[43mresponse_conn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 797\u001b[39m \u001b[43m \u001b[49m\u001b[43mpreload_content\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpreload_content\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 798\u001b[39m \u001b[43m \u001b[49m\u001b[43mdecode_content\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdecode_content\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 799\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mresponse_kw\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 800\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 802\u001b[39m \u001b[38;5;66;03m# Everything went great!\u001b[39;00m\n\u001b[32m 803\u001b[39m clean_exit = 
\u001b[38;5;28;01mTrue\u001b[39;00m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/bugbug/lib/python3.13/site-packages/urllib3/connectionpool.py:534\u001b[39m, in \u001b[36mHTTPConnectionPool._make_request\u001b[39m\u001b[34m(self, conn, method, url, body, headers, retries, timeout, chunked, response_conn, preload_content, decode_content, enforce_content_length)\u001b[39m\n\u001b[32m 532\u001b[39m \u001b[38;5;66;03m# Receive the response from the server\u001b[39;00m\n\u001b[32m 533\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m534\u001b[39m response = \u001b[43mconn\u001b[49m\u001b[43m.\u001b[49m\u001b[43mgetresponse\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 535\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m (BaseSSLError, \u001b[38;5;167;01mOSError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[32m 536\u001b[39m \u001b[38;5;28mself\u001b[39m._raise_timeout(err=e, url=url, timeout_value=read_timeout)\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/bugbug/lib/python3.13/site-packages/urllib3/connection.py:571\u001b[39m, in \u001b[36mHTTPConnection.getresponse\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 568\u001b[39m _shutdown = \u001b[38;5;28mgetattr\u001b[39m(\u001b[38;5;28mself\u001b[39m.sock, \u001b[33m\"\u001b[39m\u001b[33mshutdown\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[32m 570\u001b[39m \u001b[38;5;66;03m# Get the response from http.client.HTTPConnection\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m571\u001b[39m httplib_response = \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m.\u001b[49m\u001b[43mgetresponse\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 573\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m 574\u001b[39m assert_header_parsing(httplib_response.msg)\n", - "\u001b[36mFile 
\u001b[39m\u001b[32m~/miniconda3/envs/bugbug/lib/python3.13/http/client.py:1450\u001b[39m, in \u001b[36mHTTPConnection.getresponse\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 1448\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m 1449\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1450\u001b[39m \u001b[43mresponse\u001b[49m\u001b[43m.\u001b[49m\u001b[43mbegin\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1451\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mConnectionError\u001b[39;00m:\n\u001b[32m 1452\u001b[39m \u001b[38;5;28mself\u001b[39m.close()\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/bugbug/lib/python3.13/http/client.py:336\u001b[39m, in \u001b[36mHTTPResponse.begin\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 334\u001b[39m \u001b[38;5;66;03m# read until we get a non-100 response\u001b[39;00m\n\u001b[32m 335\u001b[39m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m336\u001b[39m version, status, reason = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_read_status\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 337\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m status != CONTINUE:\n\u001b[32m 338\u001b[39m \u001b[38;5;28;01mbreak\u001b[39;00m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/bugbug/lib/python3.13/http/client.py:297\u001b[39m, in \u001b[36mHTTPResponse._read_status\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 296\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m_read_status\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[32m--> \u001b[39m\u001b[32m297\u001b[39m line = \u001b[38;5;28mstr\u001b[39m(\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mfp\u001b[49m\u001b[43m.\u001b[49m\u001b[43mreadline\u001b[49m\u001b[43m(\u001b[49m\u001b[43m_MAXLINE\u001b[49m\u001b[43m 
\u001b[49m\u001b[43m+\u001b[49m\u001b[43m \u001b[49m\u001b[32;43m1\u001b[39;49m\u001b[43m)\u001b[49m, \u001b[33m\"\u001b[39m\u001b[33miso-8859-1\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 298\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(line) > _MAXLINE:\n\u001b[32m 299\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m LineTooLong(\u001b[33m\"\u001b[39m\u001b[33mstatus line\u001b[39m\u001b[33m\"\u001b[39m)\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/bugbug/lib/python3.13/socket.py:719\u001b[39m, in \u001b[36mSocketIO.readinto\u001b[39m\u001b[34m(self, b)\u001b[39m\n\u001b[32m 717\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m(\u001b[33m\"\u001b[39m\u001b[33mcannot read from timed out object\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 718\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m719\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_sock\u001b[49m\u001b[43m.\u001b[49m\u001b[43mrecv_into\u001b[49m\u001b[43m(\u001b[49m\u001b[43mb\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 720\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m timeout:\n\u001b[32m 721\u001b[39m \u001b[38;5;28mself\u001b[39m._timeout_occurred = \u001b[38;5;28;01mTrue\u001b[39;00m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/bugbug/lib/python3.13/ssl.py:1304\u001b[39m, in \u001b[36mSSLSocket.recv_into\u001b[39m\u001b[34m(self, buffer, nbytes, flags)\u001b[39m\n\u001b[32m 1300\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m flags != \u001b[32m0\u001b[39m:\n\u001b[32m 1301\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[32m 1302\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mnon-zero flags not allowed in calls to recv_into() on \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[33m\"\u001b[39m %\n\u001b[32m 1303\u001b[39m 
\u001b[38;5;28mself\u001b[39m.\u001b[34m__class__\u001b[39m)\n\u001b[32m-> \u001b[39m\u001b[32m1304\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnbytes\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuffer\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1305\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 1306\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m().recv_into(buffer, nbytes, flags)\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/bugbug/lib/python3.13/ssl.py:1138\u001b[39m, in \u001b[36mSSLSocket.read\u001b[39m\u001b[34m(self, len, buffer)\u001b[39m\n\u001b[32m 1136\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m 1137\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m buffer \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1138\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_sslobj\u001b[49m\u001b[43m.\u001b[49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuffer\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1139\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 1140\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._sslobj.read(\u001b[38;5;28mlen\u001b[39m)\n", - "\u001b[31mKeyboardInterrupt\u001b[39m: " - ] - } - ], - "source": [ - "import requests\n", - "\n", - "\n", - "def is_bug_public(bug_id):\n", - " url = f\"https://bugzilla.mozilla.org/rest/bug/{bug_id}\"\n", - " resp = requests.get(url)\n", - " if resp.status_code == 401 or resp.status_code == 504:\n", - " return False\n", - " elif resp.status_code == 200:\n", - " return True\n", - " else:\n", - " raise ValueError(f\"Unexpected Bugzilla status 
code: {resp.status_code}\")\n", - "\n", - "\n", - "df = df[df.bug_id.apply(is_bug_public)]\n", - "print(f\"Build fails for public bugs: {len(df)}\")" - ] - }, { "cell_type": "markdown", "id": "3d5e260dc3245c36", @@ -161,10 +78,13 @@ { "cell_type": "code", "execution_count": null, - "id": "23e362967af3e2cf", + "id": "196561e8d8b0659f", "metadata": {}, "outputs": [], "source": [ + "import requests\n", + "\n", + "\n", "def get_git_rev(hg_revs):\n", " for rev in hg_revs:\n", " convert_url = f\"https://lando.moz.tools/api/hg2git/firefox/{rev}\"\n", @@ -192,7 +112,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cda623f631db857e", + "id": "7d43ac43a96c50c6", "metadata": {}, "outputs": [], "source": [ @@ -216,46 +136,85 @@ " yield comment[\"raw_text\"]\n", "\n", "\n", - "def get_bug_info_pre_fix(build_fail):\n", + "def _get_fix_commit_date(bug, fix_commit):\n", + " for comment in bug._metadata[\"comments\"]:\n", + " if (\n", + " comment[\"creator\"] == \"pulsebot@bmo.tld\"\n", + " and fix_commit[:12] in comment[\"raw_text\"]\n", + " ):\n", + " return comment[\"time\"]\n", + " raise None\n", + "\n", + "\n", + "def get_bug_info_and_fix_date(build_fail):\n", " bug_id = build_fail[\"bug_id\"]\n", " fail_commit = build_fail[\"hg_failure_commits\"][0]\n", + " fix_commit = build_fail[\"hg_fix_commits\"][0]\n", "\n", " try:\n", " bug = Bug.get(bug_id)\n", " except ValueError as ex:\n", " print(ex)\n", - " return None\n", + " return pd.Series([None, None])\n", "\n", - " return {\"title\": bug.summary, \"comments\": list(_get_comments(bug, fail_commit))}\n", + " return pd.Series(\n", + " [\n", + " {\"title\": bug.summary, \"comments\": list(_get_comments(bug, fail_commit))},\n", + " _get_fix_commit_date(bug, fix_commit),\n", + " ]\n", + " )\n", "\n", "\n", - "df[\"pre_fix_bug\"] = df.apply(get_bug_info_pre_fix, axis=1)\n", - "df = df[df[\"pre_fix_bug\"].notnull()]\n", - "print(f\"Final number of bugs: {len(df)}\")" + "df[[\"pre_fix_bug\", \"fix_commit_date\"]] = 
df.apply(get_bug_info_and_fix_date, axis=1)\n", + "df = df[df[\"pre_fix_bug\"].notnull() & df[\"fix_commit_date\"].notnull()]\n", + "print(f\"With bug info: {len(df)}\")" ] }, + { + "cell_type": "markdown", + "id": "ab9m8qftc2q", + "metadata": {}, + "source": "### Filter out data before model cutoff (data contamination prevention)" + }, { "cell_type": "code", "execution_count": null, - "id": "e1b6e97b2b16c638", + "id": "e0f792f5b775076d", "metadata": {}, "outputs": [], "source": [ - "df" + "CONTAMINATION_CUTOFF = \"2025-09-01\"\n", + "\n", + "before = len(df)\n", + "df = df[\n", + " df[\"fix_commit_date\"].apply(lambda d: d is not None and d >= CONTAMINATION_CUTOFF)\n", + "]\n", + "print(\n", + " f\"Filtered {before - len(df)} examples with fix date before {CONTAMINATION_CUTOFF}\"\n", + ")\n", + "print(f\"Final number of examples: {len(df)}\")" ] }, { - "cell_type": "markdown", - "id": "3c4f93a4", + "cell_type": "code", + "execution_count": null, + "id": "fde92cac54949c08", "metadata": {}, + "outputs": [], "source": [ - "## Save the Dataset" + "df" ] }, + { + "cell_type": "markdown", + "id": "1695783abf03124f", + "metadata": {}, + "source": "## Save the Dataset" + }, { "cell_type": "code", "execution_count": null, - "id": "c5041136", + "id": "d6096ee230ee902a", "metadata": {}, "outputs": [], "source": [ @@ -266,7 +225,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d132783a", + "id": "79d08ee1b92ca6de", "metadata": {}, "outputs": [], "source": [ @@ -278,6 +237,14 @@ "\n", "_ = weave.publish(dataset)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "384f041b61c6c2e7", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/notebooks/build_repair_evaluation.ipynb b/notebooks/build_repair_evaluation.ipynb index 5ffbad5179..3813cb00e9 100644 --- a/notebooks/build_repair_evaluation.ipynb +++ b/notebooks/build_repair_evaluation.ipynb @@ -19,19 +19,14 @@ { "cell_type": "code", "execution_count": null, - 
"metadata": { - "ExecuteTime": { - "end_time": "2026-02-14T00:50:08.220718Z", - "start_time": "2026-02-14T00:50:06.650538Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "import weave\n", "\n", - "os.environ[\"WEAVE_PARALLELISM\"] = \"8\"\n", + "os.environ[\"WEAVE_PARALLELISM\"] = \"4\"\n", "\n", "PROJECT_NAME = \"bugbug-build-repair-eval\"\n", "_ = weave.init(PROJECT_NAME)\n", @@ -49,16 +44,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2026-02-14T00:50:16.665581Z", - "start_time": "2026-02-14T00:50:16.092754Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "dataset = weave.ref(\"build_repair_one_commit_eval\").get()\n", - "print(f\"Dataset has {len(dataset.rows)} examples\")" + "dataset = dataset.rows[:1]\n", + "print(f\"Dataset has {len(dataset)} examples\")" ] }, { @@ -74,76 +65,17 @@ "metadata": {}, "outputs": [], "source": [ - "from functools import cached_property\n", - "\n", - "from bugbug.tools.build_repair.agent import AgentResponse, BuildFailure, BuildRepairTool\n", - "from bugbug.tools.build_repair.worktree import WorktreeManager\n", - "\n", - "\n", - "class BuildRepairModel(weave.Model):\n", - " firefox_repo: str\n", + "from scripts.build_repair_eval import BuildRepairModel\n", "\n", - " @cached_property\n", - " def tool(self) -> BuildRepairTool:\n", - " return BuildRepairTool.create()\n", - "\n", - " @cached_property\n", - " def worktree_mgr(self) -> WorktreeManager:\n", - " return WorktreeManager(self.firefox_repo)\n", - "\n", - " @weave.op()\n", - " async def invoke(\n", - " self,\n", - " bug_id: int,\n", - " pre_fix_bug: dict,\n", - " gh_failure_commits: list[str],\n", - " failures: list[dict],\n", - " **kwargs,\n", - " ) -> dict:\n", - " wt_name = f\"bug-{bug_id}\"\n", - " worktree_path = self.worktree_mgr.create(gh_failure_commits[0], wt_name)\n", - " try:\n", - " failure = BuildFailure(\n", - " bug_id=bug_id,\n", - " 
bug_title=pre_fix_bug.get(\"title\"),\n", - " bug_comments=pre_fix_bug.get(\"comments\"),\n", - " git_commit=gh_failure_commits[0],\n", - " failure_tasks=[\n", - " f\n", - " for f in failures\n", - " if \"build\" in f[\"task_name\"] and \"test\" not in f[\"task_name\"]\n", - " ],\n", - " )\n", - " result: AgentResponse = await self.tool.run(\n", - " failure, worktree_path=worktree_path\n", - " )\n", - " return result.model_dump()\n", - " except Exception as e:\n", - " return {\n", - " \"error\": str(e),\n", - " \"diff\": \"\",\n", - " \"summary\": \"\",\n", - " \"analysis\": \"\",\n", - " \"cost_usd\": 0,\n", - " \"num_turns\": 0,\n", - " \"local_build_passed\": None,\n", - " \"try_build_passed\": None,\n", - " \"lando_job_id\": None,\n", - " \"treeherder_url\": None,\n", - " }\n", - " finally:\n", - " self.worktree_mgr.cleanup(wt_name)\n", - "\n", - "\n", - "model = BuildRepairModel(firefox_repo=FIREFOX_REPO)" + "model = BuildRepairModel(\n", + " firefox_repo=FIREFOX_REPO, analysis_only=True, no_try_push=True\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, - "source": [ - "## Run Evaluation" - ] + "source": "## Run Evaluation" }, { "cell_type": "code", @@ -154,12 +86,16 @@ "from bugbug.tools.build_repair.scorer import (\n", " BasicMetricsScorer,\n", " BuildPassRateScorer,\n", - " LLMFixMatchingScorer,\n", ")\n", "\n", "evaluation = weave.Evaluation(\n", + " name=\"build-repair-test\",\n", " dataset=dataset,\n", - " scorers=[BasicMetricsScorer(), BuildPassRateScorer(), LLMFixMatchingScorer()],\n", + " scorers=[\n", + " BasicMetricsScorer(),\n", + " BuildPassRateScorer(),\n", + " # LLMFixMatchingScorer()\n", + " ],\n", ")\n", "\n", "results = await evaluation.evaluate(model)" @@ -243,4 +179,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/scripts/build_repair_eval.py b/scripts/build_repair_eval.py index 653faf16bc..8f423e4d4f 100644 --- a/scripts/build_repair_eval.py +++ b/scripts/build_repair_eval.py @@ -16,12 
+16,15 @@ import argparse import asyncio +import logging import os +from datetime import date from functools import cached_property import weave from bugbug.tools.build_repair.agent import AgentResponse, BuildFailure, BuildRepairTool +from bugbug.tools.build_repair.config import MODEL_CUTOFF_DATES from bugbug.tools.build_repair.scorer import ( BasicMetricsScorer, BuildPassRateScorer, @@ -29,6 +32,8 @@ ) from bugbug.tools.build_repair.worktree import WorktreeManager +logger = logging.getLogger(__name__) + class BuildRepairModel(weave.Model): """Weave Model wrapper that creates a worktree per example and runs BuildRepairTool.""" @@ -53,21 +58,33 @@ async def invoke( pre_fix_bug: dict, gh_failure_commits: list[str], failures: list[dict], + fix_commit_date: str, **kwargs, ) -> dict: wt_name = f"bug-{bug_id}-trial-{self.trial_id}" - worktree_path = self.worktree_mgr.create(gh_failure_commits[0], wt_name) + try: + cutoff = max( + MODEL_CUTOFF_DATES[self.tool.analysis_model], + MODEL_CUTOFF_DATES[self.tool.fix_model], + ) + if date.fromisoformat(fix_commit_date) < cutoff: + logger.warning( + "Skipping bug %d: fix date %s is before model cutoff %s", + bug_id, + fix_commit_date, + cutoff, + ) + raise ValueError("skipped_data_contamination") + + worktree_path = self.worktree_mgr.create(gh_failure_commits[0], wt_name) + failure = BuildFailure( bug_id=bug_id, - bug_title=pre_fix_bug.get("title"), - bug_comments=pre_fix_bug.get("comments"), + bug_title=pre_fix_bug["title"], + bug_comments=pre_fix_bug["comments"], git_commit=gh_failure_commits[0], - failure_tasks=[ - f - for f in failures - if "build" in f["task_name"] and "test" not in f["task_name"] - ], + failure_tasks=failures, ) result: AgentResponse = await self.tool.run( failure, @@ -96,7 +113,7 @@ def main() -> None: parser = argparse.ArgumentParser(description="Build repair evaluation") parser.add_argument("--limit", type=int, default=None) parser.add_argument("--trials", type=int, default=1) - 
parser.add_argument("--parallelism", type=int, default=8) + parser.add_argument("--parallelism", type=int, default=4) parser.add_argument("--firefox-repo", default=os.environ.get("FIREFOX_GIT_REPO")) parser.add_argument("--dataset", default="build_repair_one_commit_eval") parser.add_argument("--analysis-only", action="store_true") From 305d7c905303d805f3ba02a8632d3aa1c118b743 Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Thu, 19 Feb 2026 14:49:30 -0800 Subject: [PATCH 05/31] Adjust agent configuration and add logs --- bugbug/tools/build_repair/agent.py | 105 +++++++++++++++++++----- bugbug/tools/build_repair/config.py | 95 +++++++++++---------- bugbug/tools/build_repair/scorer.py | 8 +- bugbug/tools/build_repair/try_server.py | 63 +++++++++++++- bugbug/tools/build_repair/worktree.py | 11 +++ scripts/build_repair_eval.py | 43 +++++++++- 6 files changed, 252 insertions(+), 73 deletions(-) diff --git a/bugbug/tools/build_repair/agent.py b/bugbug/tools/build_repair/agent.py index 1b16176edf..e010ff7d6f 100644 --- a/bugbug/tools/build_repair/agent.py +++ b/bugbug/tools/build_repair/agent.py @@ -3,7 +3,6 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this file, # You can obtain one at http://mozilla.org/MPL/2.0/. 
-import json import subprocess from logging import getLogger from pathlib import Path @@ -13,11 +12,12 @@ from bugbug.tools.base import GenerativeModelTool from bugbug.tools.build_repair.config import ( + ADDITIONAL_DIRS, + ALLOWED_TOOLS, ANALYSIS_MODEL, - CLAUDE_PERMISSIONS_CONFIG, - DEFAULT_MAX_TURNS, FIREFOX_MCP_URL, FIX_MODEL, + SANDBOX_CONFIG, ) from bugbug.tools.build_repair.prompts import ( ANALYSIS_TEMPLATE, @@ -71,13 +71,11 @@ def __init__( analysis_only: bool = False, analysis_model: str = ANALYSIS_MODEL, fix_model: str = FIX_MODEL, - max_turns: int = DEFAULT_MAX_TURNS, ) -> None: self.target_software = target_software self.analysis_only = analysis_only self.analysis_model = analysis_model self.fix_model = fix_model - self.max_turns = max_turns @classmethod def create(cls, **kwargs): @@ -101,11 +99,11 @@ def _prepare_input_files(self, failure: BuildFailure, worktree_path: Path) -> No out_dir = worktree_path / "repair_agent" / "out" / str(failure.bug_id) out_dir.mkdir(parents=True, exist_ok=True) - def _write_settings(self, worktree_path: Path) -> None: - settings_dir = worktree_path / ".claude" - settings_dir.mkdir(exist_ok=True) - (settings_dir / "settings.json").write_text( - json.dumps(CLAUDE_PERMISSIONS_CONFIG, indent=2) + logger.info( + "Prepared input files for bug %d at %s (%d failure tasks)", + failure.bug_id, + in_dir, + len(failure.failure_tasks), ) def _read_output(self, failure: BuildFailure, worktree_path: Path, key: str) -> str: @@ -122,27 +120,41 @@ async def run( worktree_path: Path, skip_try_push: bool = False, ) -> AgentResponse: + logger.info( + "Starting build repair for bug %d (commit=%s, worktree=%s, " + "analysis_only=%s, skip_try_push=%s)", + failure.bug_id, + failure.git_commit, + worktree_path, + self.analysis_only, + skip_try_push, + ) self._prepare_input_files(failure, worktree_path) - self._write_settings(worktree_path) system_prompt = SYSTEM_PROMPT_TEMPLATE.format( target_software=self.target_software ) - mcp_servers = 
[{"url": FIREFOX_MCP_URL, "name": "firefox"}] + mcp_servers = {"firefox": {"type": "http", "url": FIREFOX_MCP_URL}} disallowed = ["AskUserQuestion", "Task"] total_cost = 0.0 total_turns = 0 + logger.info( + "Bug %d: starting Stage 1 (analysis) with model=%s", + failure.bug_id, + self.analysis_model, + ) # Stage 1: Analysis stage1_options = ClaudeAgentOptions( system_prompt=system_prompt, model=self.analysis_model, cwd=str(worktree_path), + allowed_tools=ALLOWED_TOOLS, disallowed_tools=disallowed, - permission_mode="default", - setting_sources=["project"], - max_turns=self.max_turns, - max_thinking_tokens=16000, + add_dirs=ADDITIONAL_DIRS, + sandbox=SANDBOX_CONFIG, + permission_mode="plan", + effort="high", mcp_servers=mcp_servers, ) analysis_prompt = ANALYSIS_TEMPLATE.format( @@ -155,16 +167,36 @@ async def run( total_cost += message.total_cost_usd or 0 total_turns += message.num_turns or 0 except Exception as e: + logger.error( + "Bug %d: Stage 1 (analysis) failed: %s", + failure.bug_id, + e, + exc_info=True, + ) return AgentResponse( error=str(e), cost_usd=total_cost, num_turns=total_turns, ) + logger.info( + "Bug %d: Stage 1 complete (cost=$%.4f, turns=%d)", + failure.bug_id, + total_cost, + total_turns, + ) + summary = self._read_output(failure, worktree_path, "summary") analysis = self._read_output(failure, worktree_path, "analysis") + logger.info( + "Bug %d: read output files (summary=%d chars, analysis=%d chars)", + failure.bug_id, + len(summary), + len(analysis), + ) if self.analysis_only: + logger.info("Bug %d: analysis-only mode, skipping Stage 2", failure.bug_id) return AgentResponse( summary=summary, analysis=analysis, @@ -172,15 +204,22 @@ async def run( num_turns=total_turns, ) + logger.info( + "Bug %d: starting Stage 2 (fix) with model=%s", + failure.bug_id, + self.fix_model, + ) # Stage 2: Fix stage2_options = ClaudeAgentOptions( system_prompt=system_prompt, model=self.fix_model, cwd=str(worktree_path), + allowed_tools=ALLOWED_TOOLS, 
disallowed_tools=disallowed, - permission_mode="default", - setting_sources=["project"], - max_turns=self.max_turns, + add_dirs=ADDITIONAL_DIRS, + sandbox=SANDBOX_CONFIG, + permission_mode="acceptEdits", + effort="low", mcp_servers=mcp_servers, ) fix_prompt = FIX_TEMPLATE.format(bug_id=failure.bug_id) @@ -190,6 +229,9 @@ async def run( total_cost += message.total_cost_usd or 0 total_turns += message.num_turns or 0 except Exception as e: + logger.error( + "Bug %d: Stage 2 (fix) failed: %s", failure.bug_id, e, exc_info=True + ) return AgentResponse( summary=summary, analysis=analysis, @@ -198,6 +240,13 @@ async def run( num_turns=total_turns, ) + logger.info( + "Bug %d: Stage 2 complete (cost=$%.4f, turns=%d)", + failure.bug_id, + total_cost, + total_turns, + ) + diff_result = subprocess.run( ["git", "diff", "HEAD"], cwd=worktree_path, @@ -205,8 +254,10 @@ async def run( text=True, ) diff = diff_result.stdout + logger.info("Bug %d: git diff produced %d chars", failure.bug_id, len(diff)) if not diff.strip(): + logger.warning("Bug %d: no diff produced, returning early", failure.bug_id) return AgentResponse( summary=summary, analysis=analysis, @@ -220,6 +271,12 @@ async def run( task_name = ( failure.failure_tasks[0]["task_name"] if failure.failure_tasks else "" ) + logger.info( + "Bug %d: starting try verification (task=%s, skip_try_push=%s)", + failure.bug_id, + task_name, + skip_try_push, + ) try_result = run_try_verification( worktree_path=worktree_path, bug_id=failure.bug_id, @@ -227,6 +284,16 @@ async def run( skip_try_push=skip_try_push, ) + logger.info( + "Bug %d: try verification done (local_build=%s, try_build=%s, " + "lando_job=%s, total_cost=$%.4f, total_turns=%d)", + failure.bug_id, + try_result.local_build_passed, + try_result.try_build_passed, + try_result.lando_job_id, + total_cost, + total_turns, + ) return AgentResponse( summary=summary, analysis=analysis, diff --git a/bugbug/tools/build_repair/config.py b/bugbug/tools/build_repair/config.py index 
faba55a81e..f4bde13a3a 100644 --- a/bugbug/tools/build_repair/config.py +++ b/bugbug/tools/build_repair/config.py @@ -5,6 +5,8 @@ from datetime import date +from claude_agent_sdk import SandboxNetworkConfig, SandboxSettings + ANALYSIS_MODEL = "claude-opus-4-6" FIX_MODEL = "claude-opus-4-6" DEFAULT_MAX_TURNS = 80 @@ -30,52 +32,47 @@ "claude-opus-4-20250514": date(2025, 3, 1), } -CLAUDE_PERMISSIONS_CONFIG = { - "permissions": { - "allow": [ - "Edit(~/.mozbuild)", - "Edit(~/.cache/uv)", - "Bash(./mach build:*)", - "Bash(./mach clobber:*)", - "Bash(./mach configure:*)", - "Bash(./mach run:*)", - "Bash(./mach test:*)", - "Bash(./mach wpt:*)", - "Bash(./mach lint:*)", - "Bash(./mach format:*)", - "Bash(./mach clang-format:*)", - "Bash(./mach try:*)", - "Bash(./mach help:*)", - "Bash(./mach vendor:*)", - "Bash(./mach bootstrap:*)", - "Bash(./mach artifact:*)", - "Bash(clang++:*)", - "Bash(rm:*)", - "Bash(timeout:*)", - "Bash(find:*)", - "Bash(grep:*)", - "Bash(tee:*)", - "Bash(kill:*)", - "Bash(searchfox-cli:*)", - "Bash(treeherder-cli:*)", - "Bash(jj:*)", - "WebFetch(domain:firefox-source-docs.mozilla.org)", - "WebFetch(domain:treeherder.mozilla.org)", - "WebFetch(domain:searchfox.org)", - "WebFetch(o1069899.ingest.sentry.io)", - ], - "deny": [], - "additionalDirectories": [ - "~/.mozbuild", - "~/.cache/uv/", - ], - }, - "sandbox": { - "enabled": True, - "autoAllowBashIfSandboxed": True, - "allowUnsandboxedCommands": False, - "network": { - "allowLocalBinding": True, - }, - }, -} +ALLOWED_TOOLS = [ + "Edit(~/.mozbuild)", + "Edit(~/.cache/uv)", + "Bash(./mach build:*)", + "Bash(./mach clobber:*)", + "Bash(./mach configure:*)", + "Bash(./mach run:*)", + "Bash(./mach test:*)", + "Bash(./mach wpt:*)", + "Bash(./mach lint:*)", + "Bash(./mach format:*)", + "Bash(./mach clang-format:*)", + "Bash(./mach try:*)", + "Bash(./mach help:*)", + "Bash(./mach vendor:*)", + "Bash(./mach bootstrap:*)", + "Bash(./mach artifact:*)", + "Bash(clang++:*)", + "Bash(rm:*)", + "Bash(timeout:*)", 
+ "Bash(find:*)", + "Bash(grep:*)", + "Bash(tee:*)", + "Bash(kill:*)", + "Bash(searchfox-cli:*)", + "Bash(treeherder-cli:*)", + "Bash(jj:*)", + "WebFetch(domain:firefox-source-docs.mozilla.org)", + "WebFetch(domain:treeherder.mozilla.org)", + "WebFetch(domain:searchfox.org)", + "WebFetch(o1069899.ingest.sentry.io)", +] + +ADDITIONAL_DIRS = [ + "~/.mozbuild", + "~/.cache/uv/", +] + +SANDBOX_CONFIG = SandboxSettings( + enabled=True, + autoAllowBashIfSandboxed=True, + allowUnsandboxedCommands=False, + network=SandboxNetworkConfig(allowLocalBinding=True), +) diff --git a/bugbug/tools/build_repair/scorer.py b/bugbug/tools/build_repair/scorer.py index 7cfac108a3..5efbbb1827 100644 --- a/bugbug/tools/build_repair/scorer.py +++ b/bugbug/tools/build_repair/scorer.py @@ -25,13 +25,15 @@ def score(self, output: dict) -> dict: def summarize(self, score_rows: list[dict]) -> dict: n = len(score_rows) costs = [r["cost_usd"] for r in score_rows] - return { + summary = { "success_rate": sum(r["successful"] for r in score_rows) / n if n else 0, "diff_rate": sum(r["has_diff"] for r in score_rows) / n if n else 0, "avg_cost_usd": sum(costs) / n if n else 0, "total_cost_usd": sum(costs), "num_examples": n, } + logger.info("BasicMetrics summary: %s", summary) + return summary class BuildPassRateScorer(weave.Scorer): @@ -49,7 +51,7 @@ def summarize(self, score_rows: list[dict]) -> dict: local_passed = sum(1 for r in score_rows if r["local_build_passed"] is True) try_known = [r for r in score_rows if r["try_build_passed"] is not None] try_passed = sum(1 for r in try_known if r["try_build_passed"] is True) - return { + summary = { "local_build_pass_rate": local_passed / n if n else 0, "local_builds_passed": local_passed, "try_build_pass_rate": try_passed / len(try_known) if try_known else 0, @@ -57,6 +59,8 @@ def summarize(self, score_rows: list[dict]) -> dict: "try_builds_timed_out": n - len(try_known), "num_examples": n, } + logger.info("BuildPassRate summary: %s", summary) + return 
summary class LLMFixMatchingScorer(weave.Scorer): diff --git a/bugbug/tools/build_repair/try_server.py b/bugbug/tools/build_repair/try_server.py index e09acb2e7f..fb17e51703 100644 --- a/bugbug/tools/build_repair/try_server.py +++ b/bugbug/tools/build_repair/try_server.py @@ -35,6 +35,7 @@ class TryPushResult: def _commit_fix(worktree_path: Path, bug_id: int) -> None: + logger.info("Committing fix for bug %d in %s", bug_id, worktree_path) subprocess.run( ["git", "add", "-A"], cwd=worktree_path, @@ -45,17 +46,26 @@ def _commit_fix(worktree_path: Path, bug_id: int) -> None: cwd=worktree_path, check=True, ) + logger.info("Bug %d: fix committed", bug_id) def _run_local_build(worktree_path: Path) -> bool: + logger.info("Running local build in %s", worktree_path) result = subprocess.run( ["./mach", "build"], cwd=worktree_path, ) - return result.returncode == 0 + passed = result.returncode == 0 + logger.info( + "Local build %s (returncode=%d)", + "passed" if passed else "failed", + result.returncode, + ) + return passed def _submit_try(worktree_path: Path, task_name: str) -> tuple[str | None, str | None]: + logger.info("Submitting try push for task=%s in %s", task_name, worktree_path) result = subprocess.run( ["./mach", "try", "fuzzy", "--query", task_name], cwd=worktree_path, @@ -63,6 +73,7 @@ def _submit_try(worktree_path: Path, task_name: str) -> tuple[str | None, str | text=True, ) stdout = result.stdout + result.stderr + logger.debug("Try push output: %s", stdout) match = _LANDO_JOB_ID_RE.search(stdout) if not match: logger.warning("Could not parse Lando job ID from try output: %s", stdout) @@ -70,6 +81,11 @@ def _submit_try(worktree_path: Path, task_name: str) -> tuple[str | None, str | lando_job_id = match.group(1) treeherder_url = f"{TREEHERDER_BASE_URL}/jobs?repo=try&landoCommitID={lando_job_id}" + logger.info( + "Try push submitted: lando_job_id=%s, treeherder=%s", + lando_job_id, + treeherder_url, + ) return lando_job_id, treeherder_url @@ -126,27 +142,59 @@ 
def _get_build_job_result(push_id: int, task_name: str) -> str | None: def _poll_treeherder(lando_job_id: str, task_name: str) -> bool | None: + logger.info( + "Polling Treeherder for lando_job_id=%s, task=%s (timeout=%ds, interval=%ds)", + lando_job_id, + task_name, + TRY_PUSH_TIMEOUT_SECONDS, + TRY_PUSH_POLL_INTERVAL_SECONDS, + ) deadline = time.monotonic() + TRY_PUSH_TIMEOUT_SECONDS push_id: int | None = None + poll_count = 0 while time.monotonic() < deadline: + poll_count += 1 if push_id is None: revision = _get_push_revision(lando_job_id) if revision: + logger.info( + "Resolved revision=%s for lando_job_id=%s", revision, lando_job_id + ) push = _get_push_by_revision(revision) if push: push_id = push["id"] + logger.info( + "Resolved push_id=%d for revision=%s", push_id, revision + ) if push_id is not None: result = _get_build_job_result(push_id, task_name) + logger.debug( + "Poll #%d: job result=%s for push_id=%d", poll_count, result, push_id + ) if result == "success": + logger.info("Try build succeeded for lando_job_id=%s", lando_job_id) return True if result in ("busted", "testfailed", "exception"): + logger.info( + "Try build failed (%s) for lando_job_id=%s", result, lando_job_id + ) return False + else: + logger.debug( + "Poll #%d: push not yet available for lando_job_id=%s", + poll_count, + lando_job_id, + ) time.sleep(TRY_PUSH_POLL_INTERVAL_SECONDS) - logger.warning("Try push polling timed out for lando job %s", lando_job_id) + logger.warning( + "Try push polling timed out after %d polls for lando job %s", + poll_count, + lando_job_id, + ) return None @@ -156,10 +204,17 @@ def run_try_verification( task_name: str, skip_try_push: bool = False, ) -> TryPushResult: + logger.info( + "Starting try verification for bug %d (task=%s, skip_try_push=%s)", + bug_id, + task_name, + skip_try_push, + ) _commit_fix(worktree_path, bug_id) local_passed = _run_local_build(worktree_path) if not local_passed: + logger.warning("Bug %d: local build failed, skipping try push", 
bug_id) return TryPushResult( local_build_passed=False, try_build_passed=None, @@ -168,6 +223,9 @@ def run_try_verification( ) if skip_try_push: + logger.info( + "Bug %d: local build passed, skipping try push as requested", bug_id + ) return TryPushResult( local_build_passed=True, try_build_passed=None, @@ -177,6 +235,7 @@ def run_try_verification( lando_job_id, treeherder_url = _submit_try(worktree_path, task_name) if not lando_job_id: + logger.warning("Bug %d: try push submission failed, no lando job ID", bug_id) return TryPushResult( local_build_passed=True, try_build_passed=None, diff --git a/bugbug/tools/build_repair/worktree.py b/bugbug/tools/build_repair/worktree.py index 10026ec545..68279feb2c 100644 --- a/bugbug/tools/build_repair/worktree.py +++ b/bugbug/tools/build_repair/worktree.py @@ -4,10 +4,13 @@ # You can obtain one at http://mozilla.org/MPL/2.0/. import subprocess +from logging import getLogger from pathlib import Path from bugbug.tools.build_repair.config import WORKTREE_BASE_DIR +logger = getLogger(__name__) + class WorktreeManager: """Manages git worktrees for parallel evaluation runs against a Firefox repo.""" @@ -23,23 +26,31 @@ def __init__( def create(self, commit_hash: str, name: str) -> Path: worktree_path = self.base_dir / name + logger.info( + "Creating worktree %s at %s (commit=%s)", name, worktree_path, commit_hash + ) subprocess.run( ["git", "worktree", "add", str(worktree_path), commit_hash], cwd=self.repo, check=True, ) + logger.info("Worktree %s created", name) return worktree_path def cleanup(self, name: str) -> None: + logger.info("Cleaning up worktree %s", name) subprocess.run( ["git", "worktree", "remove", str(self.base_dir / name), "--force"], cwd=self.repo, check=True, ) + logger.info("Worktree %s removed", name) def cleanup_all(self) -> None: + logger.info("Cleaning up all worktrees in %s", self.base_dir) for entry in self.base_dir.iterdir(): if entry.is_dir(): + logger.info("Removing worktree %s", entry) subprocess.run( 
["git", "worktree", "remove", str(entry), "--force"], cwd=self.repo, diff --git a/scripts/build_repair_eval.py b/scripts/build_repair_eval.py index 8f423e4d4f..67ed82c3f1 100644 --- a/scripts/build_repair_eval.py +++ b/scripts/build_repair_eval.py @@ -62,6 +62,13 @@ async def invoke( **kwargs, ) -> dict: wt_name = f"bug-{bug_id}-trial-{self.trial_id}" + logger.info( + "Invoking bug %d (trial=%d, commit=%s, %d failures)", + bug_id, + self.trial_id, + gh_failure_commits[0][:12], + len(failures), + ) try: cutoff = max( @@ -91,8 +98,20 @@ async def invoke( worktree_path=worktree_path, skip_try_push=self.no_try_push, ) + logger.info( + "Bug %d completed: error=%s, diff_len=%d, cost=$%.4f, turns=%d, " + "local_build=%s, try_build=%s", + bug_id, + result.error, + len(result.diff), + result.cost_usd, + result.num_turns, + result.local_build_passed, + result.try_build_passed, + ) return result.model_dump() except Exception as e: + logger.error("Bug %d failed with exception: %s", bug_id, e, exc_info=True) return { "error": str(e), "diff": "", @@ -106,6 +125,7 @@ async def invoke( "treeherder_url": None, } finally: + logger.info("Bug %d: cleaning up worktree %s", bug_id, wt_name) self.worktree_mgr.cleanup(wt_name) @@ -123,19 +143,40 @@ def main() -> None: if not args.firefox_repo: parser.error("--firefox-repo or FIREFOX_GIT_REPO env var is required") + logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s %(levelname)s %(name)s: %(message)s", + ) + + logger.info( + "Starting evaluation: dataset=%s, limit=%s, trials=%d, parallelism=%d, " + "analysis_only=%s, no_try_push=%s, firefox_repo=%s", + args.dataset, + args.limit, + args.trials, + args.parallelism, + args.analysis_only, + args.no_try_push, + args.firefox_repo, + ) + os.environ["WEAVE_PARALLELISM"] = str(args.parallelism) weave.init("bugbug-build-repair-eval") dataset = weave.ref(args.dataset).get() rows = dataset.rows + logger.info("Loaded dataset %s with %d rows", args.dataset, len(rows)) if args.limit: 
rows = rows[: args.limit] + logger.info("Limited to %d rows", len(rows)) scorers = [BasicMetricsScorer(), LLMFixMatchingScorer()] if not args.analysis_only: scorers.insert(1, BuildPassRateScorer()) + logger.info("Scorers: %s", [type(s).__name__ for s in scorers]) for trial in range(args.trials): + logger.info("Starting trial %d/%d", trial + 1, args.trials) model = BuildRepairModel( firefox_repo=args.firefox_repo, analysis_only=args.analysis_only, @@ -148,7 +189,7 @@ def main() -> None: scorers=scorers, ) results = asyncio.run(evaluation.evaluate(model)) - print(f"Trial {trial} results: {results}") + logger.info("Trial %d/%d results: %s", trial + 1, args.trials, results) # TODO: To compute pass@k across trials, collect per-row scores from each # trial via the Weave API (weave.ref(...).get() on individual evaluation From 5cb55a5344452d44f651dcda3748a65f6ef44494 Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Mon, 23 Feb 2026 15:46:32 -0800 Subject: [PATCH 06/31] Improve tracing --- bugbug/tools/build_repair/agent.py | 180 ++++++++++++++------- bugbug/tools/build_repair/scorer.py | 4 +- bugbug/tools/build_repair/try_server.py | 73 ++++----- bugbug/tools/build_repair/worktree.py | 35 +++-- docker/build_repair/Dockerfile | 2 - notebooks/build_repair_evaluation.ipynb | 182 ---------------------- scripts/build_repair_eval.py | 199 +++++++++++++++++++----- 7 files changed, 339 insertions(+), 336 deletions(-) delete mode 100644 notebooks/build_repair_evaluation.ipynb diff --git a/bugbug/tools/build_repair/agent.py b/bugbug/tools/build_repair/agent.py index e010ff7d6f..08c02a626a 100644 --- a/bugbug/tools/build_repair/agent.py +++ b/bugbug/tools/build_repair/agent.py @@ -4,6 +4,7 @@ # You can obtain one at http://mozilla.org/MPL/2.0/. 
import subprocess +from collections.abc import Callable from logging import getLogger from pathlib import Path @@ -55,6 +56,8 @@ class AgentResponse(BaseModel): try_build_passed: bool | None = Field(default=None) lando_job_id: str | None = Field(default=None) treeherder_url: str | None = Field(default=None) + stage1_transcript: list[dict] = Field(default_factory=list) + stage2_transcript: list[dict] = Field(default_factory=list) class BuildRepairTool(GenerativeModelTool): @@ -81,6 +84,68 @@ def __init__( def create(cls, **kwargs): return cls(**kwargs) + @staticmethod + def _serialize_message(message) -> dict: + data = {"type": type(message).__name__} + if hasattr(message, "model_dump"): + data.update(message.model_dump()) + elif hasattr(message, "__dict__"): + data.update(vars(message)) + else: + data["raw"] = str(message) + return data + + async def _run_stage( + self, + stage_name: str, + prompt: str, + system_prompt: str, + model: str, + options: ClaudeAgentOptions, + bug_id: int, + on_message: Callable[[str, dict], None] | None = None, + ) -> tuple[list[dict], float, int]: + transcript: list[dict] = [] + cost = 0.0 + turns = 0 + result_data: dict = {} + + if on_message: + on_message( + stage_name, + { + "type": "stage_start", + "prompt": prompt, + "system_prompt": system_prompt, + "model": model, + }, + ) + try: + async for message in query(prompt=prompt, options=options): + serialized = self._serialize_message(message) + transcript.append(serialized) + logger.info(f"Bug {bug_id}: {stage_name} [{serialized['type']}]") + logger.debug(f"Bug {bug_id}: {stage_name} detail: {serialized}") + if on_message: + on_message(stage_name, serialized) + if isinstance(message, ResultMessage): + cost += message.total_cost_usd or 0 + turns += message.num_turns or 0 + result_data = serialized + finally: + if on_message: + on_message( + stage_name, + { + "type": "stage_end", + "cost_usd": cost, + "num_turns": turns, + "result_data": result_data, + }, + ) + + return transcript, 
cost, turns + def _prepare_input_files(self, failure: BuildFailure, worktree_path: Path) -> None: in_dir = worktree_path / "repair_agent" / "in" / str(failure.bug_id) in_dir.mkdir(parents=True, exist_ok=True) @@ -100,10 +165,8 @@ def _prepare_input_files(self, failure: BuildFailure, worktree_path: Path) -> No out_dir.mkdir(parents=True, exist_ok=True) logger.info( - "Prepared input files for bug %d at %s (%d failure tasks)", - failure.bug_id, - in_dir, - len(failure.failure_tasks), + f"Prepared input files for bug {failure.bug_id} at {in_dir} " + f"({len(failure.failure_tasks)} failure tasks)" ) def _read_output(self, failure: BuildFailure, worktree_path: Path, key: str) -> str: @@ -119,15 +182,12 @@ async def run( failure: BuildFailure, worktree_path: Path, skip_try_push: bool = False, + on_message: Callable[[str, dict], None] | None = None, ) -> AgentResponse: logger.info( - "Starting build repair for bug %d (commit=%s, worktree=%s, " - "analysis_only=%s, skip_try_push=%s)", - failure.bug_id, - failure.git_commit, - worktree_path, - self.analysis_only, - skip_try_push, + f"Starting build repair for bug {failure.bug_id} " + f"(commit={failure.git_commit}, worktree={worktree_path}, " + f"analysis_only={self.analysis_only}, skip_try_push={skip_try_push})" ) self._prepare_input_files(failure, worktree_path) @@ -140,11 +200,9 @@ async def run( total_turns = 0 logger.info( - "Bug %d: starting Stage 1 (analysis) with model=%s", - failure.bug_id, - self.analysis_model, + f"Bug {failure.bug_id}: starting Stage 1 (analysis) " + f"with model={self.analysis_model}" ) - # Stage 1: Analysis stage1_options = ClaudeAgentOptions( system_prompt=system_prompt, model=self.analysis_model, @@ -153,7 +211,7 @@ async def run( disallowed_tools=disallowed, add_dirs=ADDITIONAL_DIRS, sandbox=SANDBOX_CONFIG, - permission_mode="plan", + permission_mode="acceptEdits", effort="high", mcp_servers=mcp_servers, ) @@ -162,16 +220,20 @@ async def run( target_software=self.target_software, ) try: - 
async for message in query(prompt=analysis_prompt, options=stage1_options): - if isinstance(message, ResultMessage): - total_cost += message.total_cost_usd or 0 - total_turns += message.num_turns or 0 + stage1_transcript, stage1_cost, stage1_turns = await self._run_stage( + "analysis", + analysis_prompt, + system_prompt, + self.analysis_model, + stage1_options, + failure.bug_id, + on_message, + ) + total_cost += stage1_cost + total_turns += stage1_turns except Exception as e: logger.error( - "Bug %d: Stage 1 (analysis) failed: %s", - failure.bug_id, - e, - exc_info=True, + f"Bug {failure.bug_id}: Stage 1 (analysis) failed: {e}", exc_info=True ) return AgentResponse( error=str(e), @@ -180,36 +242,30 @@ async def run( ) logger.info( - "Bug %d: Stage 1 complete (cost=$%.4f, turns=%d)", - failure.bug_id, - total_cost, - total_turns, + f"Bug {failure.bug_id}: Stage 1 complete " + f"(cost=${total_cost:.4f}, turns={total_turns})" ) summary = self._read_output(failure, worktree_path, "summary") analysis = self._read_output(failure, worktree_path, "analysis") logger.info( - "Bug %d: read output files (summary=%d chars, analysis=%d chars)", - failure.bug_id, - len(summary), - len(analysis), + f"Bug {failure.bug_id}: read output files " + f"(summary={len(summary)} chars, analysis={len(analysis)} chars)" ) if self.analysis_only: - logger.info("Bug %d: analysis-only mode, skipping Stage 2", failure.bug_id) + logger.info(f"Bug {failure.bug_id}: analysis-only mode, skipping Stage 2") return AgentResponse( summary=summary, analysis=analysis, cost_usd=total_cost, num_turns=total_turns, + stage1_transcript=stage1_transcript, ) logger.info( - "Bug %d: starting Stage 2 (fix) with model=%s", - failure.bug_id, - self.fix_model, + f"Bug {failure.bug_id}: starting Stage 2 (fix) with model={self.fix_model}" ) - # Stage 2: Fix stage2_options = ClaudeAgentOptions( system_prompt=system_prompt, model=self.fix_model, @@ -224,13 +280,20 @@ async def run( ) fix_prompt = 
FIX_TEMPLATE.format(bug_id=failure.bug_id) try: - async for message in query(prompt=fix_prompt, options=stage2_options): - if isinstance(message, ResultMessage): - total_cost += message.total_cost_usd or 0 - total_turns += message.num_turns or 0 + stage2_transcript, stage2_cost, stage2_turns = await self._run_stage( + "fix", + fix_prompt, + system_prompt, + self.fix_model, + stage2_options, + failure.bug_id, + on_message, + ) + total_cost += stage2_cost + total_turns += stage2_turns except Exception as e: logger.error( - "Bug %d: Stage 2 (fix) failed: %s", failure.bug_id, e, exc_info=True + f"Bug {failure.bug_id}: Stage 2 (fix) failed: {e}", exc_info=True ) return AgentResponse( summary=summary, @@ -241,10 +304,8 @@ async def run( ) logger.info( - "Bug %d: Stage 2 complete (cost=$%.4f, turns=%d)", - failure.bug_id, - total_cost, - total_turns, + f"Bug {failure.bug_id}: Stage 2 complete " + f"(cost=${total_cost:.4f}, turns={total_turns})" ) diff_result = subprocess.run( @@ -254,16 +315,18 @@ async def run( text=True, ) diff = diff_result.stdout - logger.info("Bug %d: git diff produced %d chars", failure.bug_id, len(diff)) + logger.info(f"Bug {failure.bug_id}: git diff produced {len(diff)} chars") if not diff.strip(): - logger.warning("Bug %d: no diff produced, returning early", failure.bug_id) + logger.warning(f"Bug {failure.bug_id}: no diff produced, returning early") return AgentResponse( summary=summary, analysis=analysis, diff=diff, cost_usd=total_cost, num_turns=total_turns, + stage1_transcript=stage1_transcript, + stage2_transcript=stage2_transcript, ) from bugbug.tools.build_repair.try_server import run_try_verification @@ -272,10 +335,8 @@ async def run( failure.failure_tasks[0]["task_name"] if failure.failure_tasks else "" ) logger.info( - "Bug %d: starting try verification (task=%s, skip_try_push=%s)", - failure.bug_id, - task_name, - skip_try_push, + f"Bug {failure.bug_id}: starting try verification " + f"(task={task_name}, skip_try_push={skip_try_push})" 
) try_result = run_try_verification( worktree_path=worktree_path, @@ -285,14 +346,11 @@ async def run( ) logger.info( - "Bug %d: try verification done (local_build=%s, try_build=%s, " - "lando_job=%s, total_cost=$%.4f, total_turns=%d)", - failure.bug_id, - try_result.local_build_passed, - try_result.try_build_passed, - try_result.lando_job_id, - total_cost, - total_turns, + f"Bug {failure.bug_id}: try verification done " + f"(local_build={try_result.local_build_passed}, " + f"try_build={try_result.try_build_passed}, " + f"lando_job={try_result.lando_job_id}, " + f"total_cost=${total_cost:.4f}, total_turns={total_turns})" ) return AgentResponse( summary=summary, @@ -304,4 +362,6 @@ async def run( try_build_passed=try_result.try_build_passed, lando_job_id=try_result.lando_job_id, treeherder_url=try_result.treeherder_url, + stage1_transcript=stage1_transcript, + stage2_transcript=stage2_transcript, ) diff --git a/bugbug/tools/build_repair/scorer.py b/bugbug/tools/build_repair/scorer.py index 5efbbb1827..709df41f45 100644 --- a/bugbug/tools/build_repair/scorer.py +++ b/bugbug/tools/build_repair/scorer.py @@ -32,7 +32,7 @@ def summarize(self, score_rows: list[dict]) -> dict: "total_cost_usd": sum(costs), "num_examples": n, } - logger.info("BasicMetrics summary: %s", summary) + logger.info(f"BasicMetrics summary: {summary}") return summary @@ -59,7 +59,7 @@ def summarize(self, score_rows: list[dict]) -> dict: "try_builds_timed_out": n - len(try_known), "num_examples": n, } - logger.info("BuildPassRate summary: %s", summary) + logger.info(f"BuildPassRate summary: {summary}") return summary diff --git a/bugbug/tools/build_repair/try_server.py b/bugbug/tools/build_repair/try_server.py index fb17e51703..dd547c81a7 100644 --- a/bugbug/tools/build_repair/try_server.py +++ b/bugbug/tools/build_repair/try_server.py @@ -35,7 +35,7 @@ class TryPushResult: def _commit_fix(worktree_path: Path, bug_id: int) -> None: - logger.info("Committing fix for bug %d in %s", bug_id, 
worktree_path) + logger.info(f"Committing fix for bug {bug_id} in {worktree_path}") subprocess.run( ["git", "add", "-A"], cwd=worktree_path, @@ -46,26 +46,23 @@ def _commit_fix(worktree_path: Path, bug_id: int) -> None: cwd=worktree_path, check=True, ) - logger.info("Bug %d: fix committed", bug_id) + logger.info(f"Bug {bug_id}: fix committed") def _run_local_build(worktree_path: Path) -> bool: - logger.info("Running local build in %s", worktree_path) + logger.info(f"Running local build in {worktree_path}") result = subprocess.run( ["./mach", "build"], cwd=worktree_path, ) passed = result.returncode == 0 - logger.info( - "Local build %s (returncode=%d)", - "passed" if passed else "failed", - result.returncode, - ) + status = "passed" if passed else "failed" + logger.info(f"Local build {status} (returncode={result.returncode})") return passed def _submit_try(worktree_path: Path, task_name: str) -> tuple[str | None, str | None]: - logger.info("Submitting try push for task=%s in %s", task_name, worktree_path) + logger.info(f"Submitting try push for task={task_name} in {worktree_path}") result = subprocess.run( ["./mach", "try", "fuzzy", "--query", task_name], cwd=worktree_path, @@ -73,18 +70,16 @@ def _submit_try(worktree_path: Path, task_name: str) -> tuple[str | None, str | text=True, ) stdout = result.stdout + result.stderr - logger.debug("Try push output: %s", stdout) + logger.debug(f"Try push output: {stdout}") match = _LANDO_JOB_ID_RE.search(stdout) if not match: - logger.warning("Could not parse Lando job ID from try output: %s", stdout) + logger.warning(f"Could not parse Lando job ID from try output: {stdout}") return None, None lando_job_id = match.group(1) treeherder_url = f"{TREEHERDER_BASE_URL}/jobs?repo=try&landoCommitID={lando_job_id}" logger.info( - "Try push submitted: lando_job_id=%s, treeherder=%s", - lando_job_id, - treeherder_url, + f"Try push submitted: lando_job_id={lando_job_id}, treeherder={treeherder_url}" ) return lando_job_id, treeherder_url 
@@ -102,7 +97,7 @@ def _get_push_revision(lando_job_id: str) -> str | None: if results: return results[0].get("revision") except Exception: - logger.exception("Error fetching push revision for lando job %s", lando_job_id) + logger.exception(f"Error fetching push revision for lando job {lando_job_id}") return None @@ -118,7 +113,7 @@ def _get_push_by_revision(revision: str) -> dict | None: results = resp.json().get("results", []) return results[0] if results else None except Exception: - logger.exception("Error fetching push by revision %s", revision) + logger.exception(f"Error fetching push by revision {revision}") return None @@ -137,17 +132,15 @@ def _get_build_job_result(push_id: int, task_name: str) -> str | None: return job["state"] return job["result"] except Exception: - logger.exception("Error fetching build job result for push %d", push_id) + logger.exception(f"Error fetching build job result for push {push_id}") return None def _poll_treeherder(lando_job_id: str, task_name: str) -> bool | None: logger.info( - "Polling Treeherder for lando_job_id=%s, task=%s (timeout=%ds, interval=%ds)", - lando_job_id, - task_name, - TRY_PUSH_TIMEOUT_SECONDS, - TRY_PUSH_POLL_INTERVAL_SECONDS, + f"Polling Treeherder for lando_job_id={lando_job_id}, task={task_name} " + f"(timeout={TRY_PUSH_TIMEOUT_SECONDS}s, " + f"interval={TRY_PUSH_POLL_INTERVAL_SECONDS}s)" ) deadline = time.monotonic() + TRY_PUSH_TIMEOUT_SECONDS push_id: int | None = None @@ -159,41 +152,37 @@ def _poll_treeherder(lando_job_id: str, task_name: str) -> bool | None: revision = _get_push_revision(lando_job_id) if revision: logger.info( - "Resolved revision=%s for lando_job_id=%s", revision, lando_job_id + f"Resolved revision={revision} for lando_job_id={lando_job_id}" ) push = _get_push_by_revision(revision) if push: push_id = push["id"] - logger.info( - "Resolved push_id=%d for revision=%s", push_id, revision - ) + logger.info(f"Resolved push_id={push_id} for revision={revision}") if push_id is not None: 
result = _get_build_job_result(push_id, task_name) logger.debug( - "Poll #%d: job result=%s for push_id=%d", poll_count, result, push_id + f"Poll #{poll_count}: job result={result} for push_id={push_id}" ) if result == "success": - logger.info("Try build succeeded for lando_job_id=%s", lando_job_id) + logger.info(f"Try build succeeded for lando_job_id={lando_job_id}") return True if result in ("busted", "testfailed", "exception"): logger.info( - "Try build failed (%s) for lando_job_id=%s", result, lando_job_id + f"Try build failed ({result}) for lando_job_id={lando_job_id}" ) return False else: logger.debug( - "Poll #%d: push not yet available for lando_job_id=%s", - poll_count, - lando_job_id, + f"Poll #{poll_count}: push not yet available for " + f"lando_job_id={lando_job_id}" ) time.sleep(TRY_PUSH_POLL_INTERVAL_SECONDS) logger.warning( - "Try push polling timed out after %d polls for lando job %s", - poll_count, - lando_job_id, + f"Try push polling timed out after {poll_count} polls " + f"for lando job {lando_job_id}" ) return None @@ -205,16 +194,14 @@ def run_try_verification( skip_try_push: bool = False, ) -> TryPushResult: logger.info( - "Starting try verification for bug %d (task=%s, skip_try_push=%s)", - bug_id, - task_name, - skip_try_push, + f"Starting try verification for bug {bug_id} " + f"(task={task_name}, skip_try_push={skip_try_push})" ) _commit_fix(worktree_path, bug_id) local_passed = _run_local_build(worktree_path) if not local_passed: - logger.warning("Bug %d: local build failed, skipping try push", bug_id) + logger.warning(f"Bug {bug_id}: local build failed, skipping try push") return TryPushResult( local_build_passed=False, try_build_passed=None, @@ -223,9 +210,7 @@ def run_try_verification( ) if skip_try_push: - logger.info( - "Bug %d: local build passed, skipping try push as requested", bug_id - ) + logger.info(f"Bug {bug_id}: local build passed, skipping try push as requested") return TryPushResult( local_build_passed=True, 
try_build_passed=None, @@ -235,7 +220,7 @@ def run_try_verification( lando_job_id, treeherder_url = _submit_try(worktree_path, task_name) if not lando_job_id: - logger.warning("Bug %d: try push submission failed, no lando job ID", bug_id) + logger.warning(f"Bug {bug_id}: try push submission failed, no lando job ID") return TryPushResult( local_build_passed=True, try_build_passed=None, diff --git a/bugbug/tools/build_repair/worktree.py b/bugbug/tools/build_repair/worktree.py index 68279feb2c..8ae10ea7da 100644 --- a/bugbug/tools/build_repair/worktree.py +++ b/bugbug/tools/build_repair/worktree.py @@ -27,32 +27,49 @@ def __init__( def create(self, commit_hash: str, name: str) -> Path: worktree_path = self.base_dir / name logger.info( - "Creating worktree %s at %s (commit=%s)", name, worktree_path, commit_hash + f"Creating worktree {name} at {worktree_path} (commit={commit_hash})" ) + if worktree_path.exists(): + self.cleanup(name) subprocess.run( - ["git", "worktree", "add", str(worktree_path), commit_hash], + [ + "git", + "worktree", + "add", + "--force", + "--force", + str(worktree_path), + commit_hash, + ], cwd=self.repo, check=True, ) - logger.info("Worktree %s created", name) + logger.info(f"Worktree {name} created") return worktree_path def cleanup(self, name: str) -> None: - logger.info("Cleaning up worktree %s", name) + logger.info(f"Cleaning up worktree {name}") subprocess.run( - ["git", "worktree", "remove", str(self.base_dir / name), "--force"], + [ + "git", + "worktree", + "remove", + "--force", + "--force", + str(self.base_dir / name), + ], cwd=self.repo, check=True, ) - logger.info("Worktree %s removed", name) + logger.info(f"Worktree {name} removed") def cleanup_all(self) -> None: - logger.info("Cleaning up all worktrees in %s", self.base_dir) + logger.info(f"Cleaning up all worktrees in {self.base_dir}") for entry in self.base_dir.iterdir(): if entry.is_dir(): - logger.info("Removing worktree %s", entry) + logger.info(f"Removing worktree {entry}") 
subprocess.run( - ["git", "worktree", "remove", str(entry), "--force"], + ["git", "worktree", "remove", "--force", "--force", str(entry)], cwd=self.repo, check=False, ) diff --git a/docker/build_repair/Dockerfile b/docker/build_repair/Dockerfile index 6aaff90873..1b7928cf52 100644 --- a/docker/build_repair/Dockerfile +++ b/docker/build_repair/Dockerfile @@ -8,5 +8,3 @@ RUN pip install -e . RUN pip install claude-agent-sdk jupyter ENV FIREFOX_GIT_REPO=/workspace/firefox - -ENTRYPOINT ["python", "scripts/build_repair_eval.py"] diff --git a/notebooks/build_repair_evaluation.ipynb b/notebooks/build_repair_evaluation.ipynb deleted file mode 100644 index 3813cb00e9..0000000000 --- a/notebooks/build_repair_evaluation.ipynb +++ /dev/null @@ -1,182 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Build Repair Agent Evaluation\n", - "\n", - "This notebook runs W&B Weave evaluations for the automatic build repair agent." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "import weave\n", - "\n", - "os.environ[\"WEAVE_PARALLELISM\"] = \"4\"\n", - "\n", - "PROJECT_NAME = \"bugbug-build-repair-eval\"\n", - "_ = weave.init(PROJECT_NAME)\n", - "\n", - "FIREFOX_REPO = os.environ[\"FIREFOX_GIT_REPO\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load Dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dataset = weave.ref(\"build_repair_one_commit_eval\").get()\n", - "dataset = dataset.rows[:1]\n", - "print(f\"Dataset has {len(dataset)} examples\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Configure Model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from 
scripts.build_repair_eval import BuildRepairModel\n", - "\n", - "model = BuildRepairModel(\n", - " firefox_repo=FIREFOX_REPO, analysis_only=True, no_try_push=True\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": "## Run Evaluation" - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from bugbug.tools.build_repair.scorer import (\n", - " BasicMetricsScorer,\n", - " BuildPassRateScorer,\n", - ")\n", - "\n", - "evaluation = weave.Evaluation(\n", - " name=\"build-repair-test\",\n", - " dataset=dataset,\n", - " scorers=[\n", - " BasicMetricsScorer(),\n", - " BuildPassRateScorer(),\n", - " # LLMFixMatchingScorer()\n", - " ],\n", - ")\n", - "\n", - "results = await evaluation.evaluate(model)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Visualizations" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "\n", - "basic = results.get(\"BasicMetricsScorer\", {})\n", - "build = results.get(\"BuildPassRateScorer\", {})\n", - "\n", - "metrics = {\n", - " \"success_rate\": basic.get(\"success_rate\", 0),\n", - " \"diff_rate\": basic.get(\"diff_rate\", 0),\n", - " \"local_build_pass_rate\": build.get(\"local_build_pass_rate\", 0),\n", - " \"try_build_pass_rate\": build.get(\"try_build_pass_rate\", 0),\n", - "}\n", - "\n", - "fig, ax = plt.subplots(figsize=(8, 4))\n", - "ax.bar(metrics.keys(), metrics.values())\n", - "ax.set_ylim(0, 1)\n", - "ax.set_ylabel(\"Rate\")\n", - "ax.set_title(\"Build Repair Evaluation Results\")\n", - "plt.tight_layout()\n", - "plt.show()\n", - "\n", - "print(f\"Total cost: ${basic.get('total_cost_usd', 0):.2f}\")\n", - "print(f\"Avg cost per example: ${basic.get('avg_cost_usd', 0):.2f}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(f\"Basic metrics: 
{basic}\")\n", - "print(f\"Build pass rates: {build}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 7. View in W&B\n", - "\n", - "Visit [W&B Weave](https://wandb.ai) to see detailed traces, compare evaluations, and explore individual predictions." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "bugbug (3.12.7)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.7" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/scripts/build_repair_eval.py b/scripts/build_repair_eval.py index 67ed82c3f1..9cfd7f1417 100644 --- a/scripts/build_repair_eval.py +++ b/scripts/build_repair_eval.py @@ -16,10 +16,12 @@ import argparse import asyncio +import json import logging import os -from datetime import date +from datetime import datetime from functools import cached_property +from typing import Any import weave @@ -34,6 +36,139 @@ logger = logging.getLogger(__name__) +# todo: verify tracing code + + +def _attr(obj, key, default=None): + if isinstance(obj, dict): + return obj.get(key, default) + return getattr(obj, key, default) + + +def _to_chat_message(data: dict) -> dict | None: + """Convert a serialized claude_agent_sdk message to OpenAI chat format. + + Content blocks may be dicts (from model_dump) or dataclass instances + (from vars), so we use _attr() for uniform access. 
+ """ + msg_type = data.get("type", "") + + if msg_type == "AssistantMessage": + blocks = data.get("content", []) + text_parts = [] + tool_calls = [] + for block in blocks: + text = _attr(block, "text") + if text is not None: + text_parts.append(text) + continue + name = _attr(block, "name") + block_id = _attr(block, "id") + if name is not None and block_id is not None: + tool_calls.append( + { + "id": block_id, + "type": "function", + "function": { + "name": name, + "arguments": json.dumps(_attr(block, "input", {})), + }, + } + ) + if not text_parts and not tool_calls: + return None + msg: dict = {"role": "assistant"} + if text_parts: + msg["content"] = "\n".join(text_parts) + if tool_calls: + msg["tool_calls"] = tool_calls + return msg + + if msg_type == "UserMessage": + content = data.get("content", "") + if isinstance(content, list): + for block in content: + tool_use_id = _attr(block, "tool_use_id") + if tool_use_id: + block_content = _attr(block, "content", "") + return { + "role": "tool", + "tool_call_id": tool_use_id, + "content": str(block_content) if block_content else "", + } + + return None + + +@weave.op(kind="llm") +def trace_llm_stage( + stage: str, + messages: list[dict], + model: str, + result_data: dict | None = None, +) -> dict: + last_assistant = "" + for msg in reversed(messages): + if msg.get("role") == "assistant" and msg.get("content"): + last_assistant = msg["content"] + break + + result: dict[str, Any] = { + "choices": [ + { + "message": {"role": "assistant", "content": last_assistant}, + } + ], + } + if result_data: + usage = { + k: result_data[k] + for k in ( + "input_tokens", + "output_tokens", + "total_tokens", + "cache_read_input_tokens", + "cache_creation_input_tokens", + "total_cost_usd", + "num_turns", + ) + if k in result_data + } + if usage: + result["usage"] = {model: usage} + return result + + +def _make_weave_callback(): + stages: dict[str, dict] = {} + + def on_message(stage: str, data: dict) -> None: + msg_type = 
data["type"] + if msg_type == "stage_start": + stages[stage] = { + "model": data["model"], + "messages": [ + {"role": "system", "content": data["system_prompt"]}, + {"role": "user", "content": data["prompt"]}, + ], + } + elif msg_type == "stage_end": + if stage in stages: + s = stages.pop(stage) + trace_llm_stage( + stage=stage, + messages=s["messages"], + model=s["model"], + result_data=data.get("result_data") or None, + ) + else: + if stage in stages: + chat_msg = _to_chat_message(data) + if chat_msg: + stages[stage]["messages"].append(chat_msg) + + return on_message + class BuildRepairModel(weave.Model): """Weave Model wrapper that creates a worktree per example and runs BuildRepairTool.""" @@ -63,11 +198,8 @@ async def invoke( ) -> dict: wt_name = f"bug-{bug_id}-trial-{self.trial_id}" logger.info( - "Invoking bug %d (trial=%d, commit=%s, %d failures)", - bug_id, - self.trial_id, - gh_failure_commits[0][:12], - len(failures), + f"Invoking bug {bug_id} (trial={self.trial_id}, " + f"commit={gh_failure_commits[0][:12]}, {len(failures)} failures)" ) try: @@ -75,12 +207,10 @@ async def invoke( MODEL_CUTOFF_DATES[self.tool.analysis_model], MODEL_CUTOFF_DATES[self.tool.fix_model], ) - if date.fromisoformat(fix_commit_date) < cutoff: + if datetime.fromisoformat(fix_commit_date).date() < cutoff: logger.warning( - "Skipping bug %d: fix date %s is before model cutoff %s", - bug_id, - fix_commit_date, - cutoff, + f"Skipping bug {bug_id}: fix date {fix_commit_date} " + f"is before model cutoff {cutoff}" ) raise ValueError("skipped_data_contamination") @@ -97,21 +227,18 @@ async def invoke( failure, worktree_path=worktree_path, skip_try_push=self.no_try_push, + on_message=_make_weave_callback(), ) logger.info( - "Bug %d completed: error=%s, diff_len=%d, cost=$%.4f, turns=%d, " - "local_build=%s, try_build=%s", - bug_id, - result.error, - len(result.diff), - result.cost_usd, - result.num_turns, - result.local_build_passed, - result.try_build_passed, + f"Bug {bug_id} completed: 
error={result.error}, " + f"diff_len={len(result.diff)}, cost=${result.cost_usd:.4f}, " + f"turns={result.num_turns}, " + f"local_build={result.local_build_passed}, " + f"try_build={result.try_build_passed}" ) return result.model_dump() except Exception as e: - logger.error("Bug %d failed with exception: %s", bug_id, e, exc_info=True) + logger.error(f"Bug {bug_id} failed with exception: {e}", exc_info=True) return { "error": str(e), "diff": "", @@ -123,9 +250,11 @@ async def invoke( "try_build_passed": None, "lando_job_id": None, "treeherder_url": None, + "stage1_transcript": [], + "stage2_transcript": [], } finally: - logger.info("Bug %d: cleaning up worktree %s", bug_id, wt_name) + logger.info(f"Bug {bug_id}: cleaning up worktree {wt_name}") self.worktree_mgr.cleanup(wt_name) @@ -133,7 +262,7 @@ def main() -> None: parser = argparse.ArgumentParser(description="Build repair evaluation") parser.add_argument("--limit", type=int, default=None) parser.add_argument("--trials", type=int, default=1) - parser.add_argument("--parallelism", type=int, default=4) + parser.add_argument("--parallelism", type=int, default=1) parser.add_argument("--firefox-repo", default=os.environ.get("FIREFOX_GIT_REPO")) parser.add_argument("--dataset", default="build_repair_one_commit_eval") parser.add_argument("--analysis-only", action="store_true") @@ -147,17 +276,13 @@ def main() -> None: level=logging.DEBUG, format="%(asctime)s %(levelname)s %(name)s: %(message)s", ) + logging.getLogger("httpx").setLevel(logging.WARNING) logger.info( - "Starting evaluation: dataset=%s, limit=%s, trials=%d, parallelism=%d, " - "analysis_only=%s, no_try_push=%s, firefox_repo=%s", - args.dataset, - args.limit, - args.trials, - args.parallelism, - args.analysis_only, - args.no_try_push, - args.firefox_repo, + f"Starting evaluation: dataset={args.dataset}, limit={args.limit}, " + f"trials={args.trials}, parallelism={args.parallelism}, " + f"analysis_only={args.analysis_only}, no_try_push={args.no_try_push}, " + 
f"firefox_repo={args.firefox_repo}" ) os.environ["WEAVE_PARALLELISM"] = str(args.parallelism) @@ -165,18 +290,18 @@ def main() -> None: dataset = weave.ref(args.dataset).get() rows = dataset.rows - logger.info("Loaded dataset %s with %d rows", args.dataset, len(rows)) + logger.info(f"Loaded dataset {args.dataset} with {len(rows)} rows") if args.limit: rows = rows[: args.limit] - logger.info("Limited to %d rows", len(rows)) + logger.info(f"Limited to {len(rows)} rows") scorers = [BasicMetricsScorer(), LLMFixMatchingScorer()] if not args.analysis_only: scorers.insert(1, BuildPassRateScorer()) - logger.info("Scorers: %s", [type(s).__name__ for s in scorers]) + logger.info(f"Scorers: {[type(s).__name__ for s in scorers]}") for trial in range(args.trials): - logger.info("Starting trial %d/%d", trial + 1, args.trials) + logger.info(f"Starting trial {trial + 1}/{args.trials}") model = BuildRepairModel( firefox_repo=args.firefox_repo, analysis_only=args.analysis_only, @@ -189,7 +314,7 @@ def main() -> None: scorers=scorers, ) results = asyncio.run(evaluation.evaluate(model)) - logger.info("Trial %d/%d results: %s", trial + 1, args.trials, results) + logger.info(f"Trial {trial + 1}/{args.trials} results: {results}") # TODO: To compute pass@k across trials, collect per-row scores from each # trial via the Weave API (weave.ref(...).get() on individual evaluation From 79871bc00335e73c41668e59bea77385d48f0b12 Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Mon, 23 Feb 2026 16:38:58 -0800 Subject: [PATCH 07/31] Add eval mode --- bugbug/tools/build_repair/agent.py | 10 ++++++++-- bugbug/tools/build_repair/prompts.py | 7 +++++++ scripts/build_repair_eval.py | 2 +- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/bugbug/tools/build_repair/agent.py b/bugbug/tools/build_repair/agent.py index 08c02a626a..21daac36c8 100644 --- a/bugbug/tools/build_repair/agent.py +++ b/bugbug/tools/build_repair/agent.py @@ -22,6 +22,7 @@ ) from bugbug.tools.build_repair.prompts import 
( ANALYSIS_TEMPLATE, + EVAL_PROMPT, FIX_TEMPLATE, SYSTEM_PROMPT_TEMPLATE, ) @@ -72,9 +73,11 @@ def __init__( self, target_software: str = "Mozilla Firefox", analysis_only: bool = False, + eval_mode: bool = False, analysis_model: str = ANALYSIS_MODEL, fix_model: str = FIX_MODEL, ) -> None: + self.eval_mode = eval_mode self.target_software = target_software self.analysis_only = analysis_only self.analysis_model = analysis_model @@ -212,12 +215,13 @@ async def run( add_dirs=ADDITIONAL_DIRS, sandbox=SANDBOX_CONFIG, permission_mode="acceptEdits", - effort="high", + effort="low", mcp_servers=mcp_servers, ) analysis_prompt = ANALYSIS_TEMPLATE.format( bug_id=failure.bug_id, target_software=self.target_software, + eval=EVAL_PROMPT if self.eval_mode else "", ) try: stage1_transcript, stage1_cost, stage1_turns = await self._run_stage( @@ -278,7 +282,9 @@ async def run( effort="low", mcp_servers=mcp_servers, ) - fix_prompt = FIX_TEMPLATE.format(bug_id=failure.bug_id) + fix_prompt = FIX_TEMPLATE.format( + bug_id=failure.bug_id, eval=EVAL_PROMPT if self.eval_mode else "" + ) try: stage2_transcript, stage2_cost, stage2_turns = await self._run_stage( "fix", diff --git a/bugbug/tools/build_repair/prompts.py b/bugbug/tools/build_repair/prompts.py index 6ba37636a2..003342fb76 100644 --- a/bugbug/tools/build_repair/prompts.py +++ b/bugbug/tools/build_repair/prompts.py @@ -23,6 +23,7 @@ 3. repair_agent/out/{bug_id}/summary.md with a brief one paragraph summary of analysis and planning that can point a developer in the right direction Do not prompt to edit those documents. +{eval} Do not write any code yet. Work fully autonomously, do not ask any questions. Think hard. """ @@ -30,6 +31,12 @@ FIX_TEMPLATE = """Read the following files and implement a fix of the failure: 1. repair_agent/out/{bug_id}/analysis.md with your detailed analysis on what caused the issues 2. repair_agent/out/{bug_id}/planning.md with a fixing plan +{eval} Do not prompt to edit files. 
Work fully autonomously, do not ask any questions. Use all allowed tools without prompting. """ + +EVAL_PROMPT = """ +Do not request bug info from Bugzilla or Phabricator. Use only the provided file with bug description. +Do not look at git commits other than the specified last commit. +""" diff --git a/scripts/build_repair_eval.py b/scripts/build_repair_eval.py index 9cfd7f1417..976677ab39 100644 --- a/scripts/build_repair_eval.py +++ b/scripts/build_repair_eval.py @@ -180,7 +180,7 @@ class BuildRepairModel(weave.Model): @cached_property def tool(self) -> BuildRepairTool: - return BuildRepairTool.create(analysis_only=self.analysis_only) + return BuildRepairTool.create(analysis_only=self.analysis_only, eval_mode=True) @cached_property def worktree_mgr(self) -> WorktreeManager: From 19addbade226957f86eae32c45f728d9fd24150f Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Tue, 24 Feb 2026 17:17:02 -0800 Subject: [PATCH 08/31] Fix local building --- bugbug/tools/build_repair/try_server.py | 31 ++++++++++++++++++++++++- docker/build_repair/Dockerfile | 16 +++++++++---- 2 files changed, 41 insertions(+), 6 deletions(-) diff --git a/bugbug/tools/build_repair/try_server.py b/bugbug/tools/build_repair/try_server.py index dd547c81a7..a51bb59500 100644 --- a/bugbug/tools/build_repair/try_server.py +++ b/bugbug/tools/build_repair/try_server.py @@ -3,6 +3,7 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this file, # You can obtain one at http://mozilla.org/MPL/2.0/. 
+import os import re import subprocess import time @@ -24,6 +25,12 @@ _LANDO_JOB_ID_RE = re.compile(r"landoCommitID=([A-Za-z0-9_-]+)") +def _mach_env(worktree_path: Path) -> dict[str, str]: + env = os.environ.copy() + env["MOZBUILD_STATE_PATH"] = str(worktree_path / ".mozbuild") + return env + + @dataclass class TryPushResult: """Result of local build verification and optional try push submission.""" @@ -42,7 +49,16 @@ def _commit_fix(worktree_path: Path, bug_id: int) -> None: check=True, ) subprocess.run( - ["git", "commit", "-m", f"Build repair fix for bug {bug_id}"], + [ + "git", + "-c", + "user.name=bugbug", + "-c", + "user.email=bugbug@mozilla.com", + "commit", + "-m", + f"Build repair fix for bug {bug_id}", + ], cwd=worktree_path, check=True, ) @@ -50,10 +66,22 @@ def _commit_fix(worktree_path: Path, bug_id: int) -> None: def _run_local_build(worktree_path: Path) -> bool: + logger.info(f"Running bootstrap in {worktree_path}") + result = subprocess.run( + ["./mach", "--no-interactive", "bootstrap"], + cwd=worktree_path, + env=_mach_env(worktree_path), + ) + if result.returncode != 0: + raise RuntimeError( + f"Local bootstrap failed with return code {result.returncode}" + ) + logger.info(f"Running local build in {worktree_path}") result = subprocess.run( ["./mach", "build"], cwd=worktree_path, + env=_mach_env(worktree_path), ) passed = result.returncode == 0 status = "passed" if passed else "failed" @@ -68,6 +96,7 @@ def _submit_try(worktree_path: Path, task_name: str) -> tuple[str | None, str | cwd=worktree_path, capture_output=True, text=True, + env=_mach_env(worktree_path), ) stdout = result.stdout + result.stderr logger.debug(f"Try push output: {stdout}") diff --git a/docker/build_repair/Dockerfile b/docker/build_repair/Dockerfile index 1b7928cf52..c473fbc124 100644 --- a/docker/build_repair/Dockerfile +++ b/docker/build_repair/Dockerfile @@ -1,10 +1,16 @@ -FROM python:3.12-slim - -RUN apt-get update && apt-get install -y git nodejs npm && rm -rf 
/var/lib/apt/lists/* +# ./mach taskgraph load-image +FROM debian12-amd64-build WORKDIR /app + +RUN apt-get install -y python3-pip python3-venv + +RUN python3 -m venv /opt/venv +ENV PATH="/opt/venv/bin:$PATH" + +RUN apt-get install -y git nodejs npm && rm -rf /var/lib/apt/lists/* +RUN pip install weave>=0.50.0 pydantic claude-agent-sdk requests + COPY . /app -RUN pip install -e . -RUN pip install claude-agent-sdk jupyter ENV FIREFOX_GIT_REPO=/workspace/firefox From 7a32858a6ee9c1464e991b7345292d6f1919b532 Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Fri, 27 Feb 2026 14:29:51 -0800 Subject: [PATCH 09/31] Improve cost tracing --- bugbug/tools/build_repair/agent.py | 45 +++++++++++++++++++-- bugbug/tools/build_repair/scorer.py | 14 +++++++ scripts/build_repair_eval.py | 63 ++++++++++++++++++++++------- 3 files changed, 103 insertions(+), 19 deletions(-) diff --git a/bugbug/tools/build_repair/agent.py b/bugbug/tools/build_repair/agent.py index 21daac36c8..f2484ba133 100644 --- a/bugbug/tools/build_repair/agent.py +++ b/bugbug/tools/build_repair/agent.py @@ -53,6 +53,10 @@ class AgentResponse(BaseModel): error: str | None = Field(default=None) cost_usd: float = Field(default=0.0) num_turns: int = Field(default=0) + input_tokens: int = Field(default=0) + output_tokens: int = Field(default=0) + cache_read_input_tokens: int = Field(default=0) + cache_creation_input_tokens: int = Field(default=0) local_build_passed: bool | None = Field(default=None) try_build_passed: bool | None = Field(default=None) lando_job_id: str | None = Field(default=None) @@ -87,6 +91,15 @@ def __init__( def create(cls, **kwargs): return cls(**kwargs) + @staticmethod + def _usage_fields(usage: dict) -> dict: + return { + "input_tokens": usage.get("input_tokens", 0), + "output_tokens": usage.get("output_tokens", 0), + "cache_read_input_tokens": usage.get("cache_read_input_tokens", 0), + "cache_creation_input_tokens": usage.get("cache_creation_input_tokens", 0), + } + @staticmethod def 
_serialize_message(message) -> dict: data = {"type": type(message).__name__} @@ -107,11 +120,12 @@ async def _run_stage( options: ClaudeAgentOptions, bug_id: int, on_message: Callable[[str, dict], None] | None = None, - ) -> tuple[list[dict], float, int]: + ) -> tuple[list[dict], float, int, dict]: transcript: list[dict] = [] cost = 0.0 turns = 0 result_data: dict = {} + usage: dict = {} if on_message: on_message( @@ -134,6 +148,7 @@ async def _run_stage( if isinstance(message, ResultMessage): cost += message.total_cost_usd or 0 turns += message.num_turns or 0 + usage = getattr(message, "usage", {}) or {} result_data = serialized finally: if on_message: @@ -147,7 +162,7 @@ async def _run_stage( }, ) - return transcript, cost, turns + return transcript, cost, turns, usage def _prepare_input_files(self, failure: BuildFailure, worktree_path: Path) -> None: in_dir = worktree_path / "repair_agent" / "in" / str(failure.bug_id) @@ -201,6 +216,7 @@ async def run( disallowed = ["AskUserQuestion", "Task"] total_cost = 0.0 total_turns = 0 + total_usage: dict = {} logger.info( f"Bug {failure.bug_id}: starting Stage 1 (analysis) " @@ -224,7 +240,12 @@ async def run( eval=EVAL_PROMPT if self.eval_mode else "", ) try: - stage1_transcript, stage1_cost, stage1_turns = await self._run_stage( + ( + stage1_transcript, + stage1_cost, + stage1_turns, + stage1_usage, + ) = await self._run_stage( "analysis", analysis_prompt, system_prompt, @@ -235,6 +256,9 @@ async def run( ) total_cost += stage1_cost total_turns += stage1_turns + for k, v in stage1_usage.items(): + if isinstance(v, (int, float)): + total_usage[k] = total_usage.get(k, 0) + v except Exception as e: logger.error( f"Bug {failure.bug_id}: Stage 1 (analysis) failed: {e}", exc_info=True @@ -243,6 +267,7 @@ async def run( error=str(e), cost_usd=total_cost, num_turns=total_turns, + **self._usage_fields(total_usage), ) logger.info( @@ -264,6 +289,7 @@ async def run( analysis=analysis, cost_usd=total_cost, num_turns=total_turns, + 
**self._usage_fields(total_usage), stage1_transcript=stage1_transcript, ) @@ -286,7 +312,12 @@ async def run( bug_id=failure.bug_id, eval=EVAL_PROMPT if self.eval_mode else "" ) try: - stage2_transcript, stage2_cost, stage2_turns = await self._run_stage( + ( + stage2_transcript, + stage2_cost, + stage2_turns, + stage2_usage, + ) = await self._run_stage( "fix", fix_prompt, system_prompt, @@ -297,6 +328,9 @@ async def run( ) total_cost += stage2_cost total_turns += stage2_turns + for k, v in stage2_usage.items(): + if isinstance(v, (int, float)): + total_usage[k] = total_usage.get(k, 0) + v except Exception as e: logger.error( f"Bug {failure.bug_id}: Stage 2 (fix) failed: {e}", exc_info=True @@ -307,6 +341,7 @@ async def run( error=str(e), cost_usd=total_cost, num_turns=total_turns, + **self._usage_fields(total_usage), ) logger.info( @@ -331,6 +366,7 @@ async def run( diff=diff, cost_usd=total_cost, num_turns=total_turns, + **self._usage_fields(total_usage), stage1_transcript=stage1_transcript, stage2_transcript=stage2_transcript, ) @@ -364,6 +400,7 @@ async def run( diff=diff, cost_usd=total_cost, num_turns=total_turns, + **self._usage_fields(total_usage), local_build_passed=try_result.local_build_passed, try_build_passed=try_result.try_build_passed, lando_job_id=try_result.lando_job_id, diff --git a/bugbug/tools/build_repair/scorer.py b/bugbug/tools/build_repair/scorer.py index 709df41f45..f1bde3d02b 100644 --- a/bugbug/tools/build_repair/scorer.py +++ b/bugbug/tools/build_repair/scorer.py @@ -20,16 +20,30 @@ def score(self, output: dict) -> dict: "has_diff": bool(output.get("diff", "").strip()), "cost_usd": output.get("cost_usd", 0), "num_turns": output.get("num_turns", 0), + "input_tokens": output.get("input_tokens", 0), + "output_tokens": output.get("output_tokens", 0), + "cache_read_input_tokens": output.get("cache_read_input_tokens", 0), + "cache_creation_input_tokens": output.get("cache_creation_input_tokens", 0), } def summarize(self, score_rows: list[dict]) 
-> dict: n = len(score_rows) costs = [r["cost_usd"] for r in score_rows] + input_toks = [r["input_tokens"] for r in score_rows] + output_toks = [r["output_tokens"] for r in score_rows] summary = { "success_rate": sum(r["successful"] for r in score_rows) / n if n else 0, "diff_rate": sum(r["has_diff"] for r in score_rows) / n if n else 0, "avg_cost_usd": sum(costs) / n if n else 0, "total_cost_usd": sum(costs), + "total_input_tokens": sum(input_toks), + "total_output_tokens": sum(output_toks), + "total_cache_read_tokens": sum( + r["cache_read_input_tokens"] for r in score_rows + ), + "total_cache_creation_tokens": sum( + r["cache_creation_input_tokens"] for r in score_rows + ), "num_examples": n, } logger.info(f"BasicMetrics summary: {summary}") diff --git a/scripts/build_repair_eval.py b/scripts/build_repair_eval.py index 976677ab39..1879db4282 100644 --- a/scripts/build_repair_eval.py +++ b/scripts/build_repair_eval.py @@ -114,6 +114,7 @@ def trace_llm_stage( break result: dict[str, Any] = { + "model": model, "choices": [ { "message": {"role": "assistant", "content": last_assistant}, @@ -121,24 +122,51 @@ def trace_llm_stage( ], } if result_data: - usage = { - k: result_data[k] - for k in ( - "input_tokens", - "output_tokens", - "total_tokens", - "cache_read_input_tokens", - "cache_creation_input_tokens", - "total_cost_usd", - "num_turns", - ) - if k in result_data + raw_usage = result_data.get("usage", {}) or {} + input_tokens = raw_usage.get("input_tokens", 0) + output_tokens = raw_usage.get("output_tokens", 0) + result["usage"] = { + "prompt_tokens": input_tokens, + "completion_tokens": output_tokens, + "total_tokens": input_tokens + output_tokens, + "cache_read_input_tokens": raw_usage.get("cache_read_input_tokens", 0), + "cache_creation_input_tokens": raw_usage.get( + "cache_creation_input_tokens", 0 + ), + "total_cost_usd": result_data.get("total_cost_usd", 0), + "num_turns": result_data.get("num_turns", 0), } - if usage: - result["usage"] = {model: usage} 
return result +# Per-token costs in USD (standard, non-cached rates). +# Weave uses these for its built-in cost UI; the SDK's total_cost_usd +# (which accounts for cache pricing) is tracked separately as the authoritative cost. +ANTHROPIC_TOKEN_COSTS: dict[str, tuple[float, float]] = { + "claude-opus-4-6": (15.0e-6, 75.0e-6), + "claude-sonnet-4-6": (3.0e-6, 15.0e-6), + "claude-haiku-4-5-20251001": (0.8e-6, 4.0e-6), + "claude-sonnet-4-5-20250929": (3.0e-6, 15.0e-6), + "claude-opus-4-5-20251101": (15.0e-6, 75.0e-6), + "claude-opus-4-1-20250805": (15.0e-6, 75.0e-6), + "claude-sonnet-4-20250514": (3.0e-6, 15.0e-6), + "claude-3-7-sonnet-20250219": (3.0e-6, 15.0e-6), + "claude-opus-4-20250514": (15.0e-6, 75.0e-6), +} + + +def _register_model_costs(client) -> None: + for model_id, (prompt_cost, completion_cost) in ANTHROPIC_TOKEN_COSTS.items(): + try: + client.add_cost( + llm_id=model_id, + prompt_token_cost=prompt_cost, + completion_token_cost=completion_cost, + ) + except Exception as e: + logger.debug(f"Could not register cost for {model_id}: {e}") + + def _make_weave_callback(): stages: dict[str, dict] = {} @@ -246,6 +274,10 @@ async def invoke( "analysis": "", "cost_usd": 0, "num_turns": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_input_tokens": 0, + "cache_creation_input_tokens": 0, "local_build_passed": None, "try_build_passed": None, "lando_job_id": None, @@ -286,7 +318,8 @@ def main() -> None: ) os.environ["WEAVE_PARALLELISM"] = str(args.parallelism) - weave.init("bugbug-build-repair-eval") + client = weave.init("bugbug-build-repair-eval") + _register_model_costs(client) dataset = weave.ref(args.dataset).get() rows = dataset.rows From 6e497df1efad81a2e8c10463906f6d17a5a7fd95 Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Fri, 27 Feb 2026 14:35:15 -0800 Subject: [PATCH 10/31] Think more on analysis --- bugbug/tools/build_repair/agent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bugbug/tools/build_repair/agent.py 
b/bugbug/tools/build_repair/agent.py index f2484ba133..f1677f2d2a 100644 --- a/bugbug/tools/build_repair/agent.py +++ b/bugbug/tools/build_repair/agent.py @@ -231,7 +231,7 @@ async def run( add_dirs=ADDITIONAL_DIRS, sandbox=SANDBOX_CONFIG, permission_mode="acceptEdits", - effort="low", + effort="high", mcp_servers=mcp_servers, ) analysis_prompt = ANALYSIS_TEMPLATE.format( From 2fa7c6263956430ae64898b50b8fa163b7bc8de3 Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Fri, 27 Feb 2026 15:09:23 -0800 Subject: [PATCH 11/31] Increase parallelizm --- scripts/build_repair_eval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/build_repair_eval.py b/scripts/build_repair_eval.py index 1879db4282..1a9bcd923b 100644 --- a/scripts/build_repair_eval.py +++ b/scripts/build_repair_eval.py @@ -294,7 +294,7 @@ def main() -> None: parser = argparse.ArgumentParser(description="Build repair evaluation") parser.add_argument("--limit", type=int, default=None) parser.add_argument("--trials", type=int, default=1) - parser.add_argument("--parallelism", type=int, default=1) + parser.add_argument("--parallelism", type=int, default=8) parser.add_argument("--firefox-repo", default=os.environ.get("FIREFOX_GIT_REPO")) parser.add_argument("--dataset", default="build_repair_one_commit_eval") parser.add_argument("--analysis-only", action="store_true") From 2f2832ef914127ac1d807839b8527271533bf368 Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Mon, 2 Mar 2026 11:18:18 -0800 Subject: [PATCH 12/31] Use default system prompt --- bugbug/tools/build_repair/agent.py | 10 ---------- bugbug/tools/build_repair/prompts.py | 4 ++-- scripts/build_repair_eval.py | 10 ++++++---- 3 files changed, 8 insertions(+), 16 deletions(-) diff --git a/bugbug/tools/build_repair/agent.py b/bugbug/tools/build_repair/agent.py index f1677f2d2a..d142aec11f 100644 --- a/bugbug/tools/build_repair/agent.py +++ b/bugbug/tools/build_repair/agent.py @@ -24,7 +24,6 @@ ANALYSIS_TEMPLATE, EVAL_PROMPT, 
FIX_TEMPLATE, - SYSTEM_PROMPT_TEMPLATE, ) logger = getLogger(__name__) @@ -115,7 +114,6 @@ async def _run_stage( self, stage_name: str, prompt: str, - system_prompt: str, model: str, options: ClaudeAgentOptions, bug_id: int, @@ -133,7 +131,6 @@ async def _run_stage( { "type": "stage_start", "prompt": prompt, - "system_prompt": system_prompt, "model": model, }, ) @@ -209,9 +206,6 @@ async def run( ) self._prepare_input_files(failure, worktree_path) - system_prompt = SYSTEM_PROMPT_TEMPLATE.format( - target_software=self.target_software - ) mcp_servers = {"firefox": {"type": "http", "url": FIREFOX_MCP_URL}} disallowed = ["AskUserQuestion", "Task"] total_cost = 0.0 @@ -223,7 +217,6 @@ async def run( f"with model={self.analysis_model}" ) stage1_options = ClaudeAgentOptions( - system_prompt=system_prompt, model=self.analysis_model, cwd=str(worktree_path), allowed_tools=ALLOWED_TOOLS, @@ -248,7 +241,6 @@ async def run( ) = await self._run_stage( "analysis", analysis_prompt, - system_prompt, self.analysis_model, stage1_options, failure.bug_id, @@ -297,7 +289,6 @@ async def run( f"Bug {failure.bug_id}: starting Stage 2 (fix) with model={self.fix_model}" ) stage2_options = ClaudeAgentOptions( - system_prompt=system_prompt, model=self.fix_model, cwd=str(worktree_path), allowed_tools=ALLOWED_TOOLS, @@ -320,7 +311,6 @@ async def run( ) = await self._run_stage( "fix", fix_prompt, - system_prompt, self.fix_model, stage2_options, failure.bug_id, diff --git a/bugbug/tools/build_repair/prompts.py b/bugbug/tools/build_repair/prompts.py index 003342fb76..cdab7e11e0 100644 --- a/bugbug/tools/build_repair/prompts.py +++ b/bugbug/tools/build_repair/prompts.py @@ -5,9 +5,9 @@ """Prompt templates for build repair agent.""" -SYSTEM_PROMPT_TEMPLATE = """You are an expert {target_software} engineer tasked with analyzing and fixing a build failure. """ +ANALYSIS_TEMPLATE = """You are an expert {target_software} engineer tasked with analyzing and fixing a build failure. 
-ANALYSIS_TEMPLATE = """Investigate why the last commit broke {target_software} build. +Investigate why the last commit broke {target_software} build. The last commit attempted to fix a bug from Bugzilla. diff --git a/scripts/build_repair_eval.py b/scripts/build_repair_eval.py index 1a9bcd923b..8672af19ca 100644 --- a/scripts/build_repair_eval.py +++ b/scripts/build_repair_eval.py @@ -173,12 +173,14 @@ def _make_weave_callback(): def on_message(stage: str, data: dict) -> None: msg_type = data["type"] if msg_type == "stage_start": + messages = [] + if "system_prompt" in data: + messages.append({"role": "system", "content": data["system_prompt"]}) + messages.append({"role": "user", "content": data["prompt"]}) + stages[stage] = { "model": data["model"], - "messages": [ - {"role": "system", "content": data["system_prompt"]}, - {"role": "user", "content": data["prompt"]}, - ], + "messages": messages, } elif msg_type == "stage_end": if stage in stages: From 983643e45a172c188ec6c8993fabf85b018d0638 Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Mon, 2 Mar 2026 11:50:49 -0800 Subject: [PATCH 13/31] Update Weave --- docker/build_repair/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/build_repair/Dockerfile b/docker/build_repair/Dockerfile index c473fbc124..597713be73 100644 --- a/docker/build_repair/Dockerfile +++ b/docker/build_repair/Dockerfile @@ -9,7 +9,7 @@ RUN python3 -m venv /opt/venv ENV PATH="/opt/venv/bin:$PATH" RUN apt-get install -y git nodejs npm && rm -rf /var/lib/apt/lists/* -RUN pip install weave>=0.50.0 pydantic claude-agent-sdk requests +RUN pip install weave>=0.52.29 pydantic claude-agent-sdk requests COPY . 
/app From 1cf9351e6bf0440ddd894928bbc73a8e45795a5f Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Mon, 2 Mar 2026 11:51:11 -0800 Subject: [PATCH 14/31] Fix dataset --- scripts/build_repair_eval.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/scripts/build_repair_eval.py b/scripts/build_repair_eval.py index 8672af19ca..b2410cdfe6 100644 --- a/scripts/build_repair_eval.py +++ b/scripts/build_repair_eval.py @@ -324,11 +324,10 @@ def main() -> None: _register_model_costs(client) dataset = weave.ref(args.dataset).get() - rows = dataset.rows - logger.info(f"Loaded dataset {args.dataset} with {len(rows)} rows") + logger.info(f"Loaded dataset {args.dataset} with {len(dataset.rows)} rows") if args.limit: - rows = rows[: args.limit] - logger.info(f"Limited to {len(rows)} rows") + dataset.rows = dataset.rows[: args.limit] + logger.info(f"Limited to {len(dataset.rows)} rows") scorers = [BasicMetricsScorer(), LLMFixMatchingScorer()] if not args.analysis_only: @@ -345,7 +344,7 @@ def main() -> None: ) evaluation = weave.Evaluation( name=f"build-repair-trial-{trial}", - dataset=rows, + dataset=dataset, scorers=scorers, ) results = asyncio.run(evaluation.evaluate(model)) From ffe5685b6642ed7be0116c5422af1bb1b277ad94 Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Mon, 2 Mar 2026 13:50:52 -0800 Subject: [PATCH 15/31] Log errors in weave --- bugbug/tools/build_repair/agent.py | 7 +++++ bugbug/tools/build_repair/scorer.py | 27 ++++++++++++++++--- scripts/build_repair_eval.py | 41 +++++++++++++---------------- 3 files changed, 49 insertions(+), 26 deletions(-) diff --git a/bugbug/tools/build_repair/agent.py b/bugbug/tools/build_repair/agent.py index d142aec11f..0845315871 100644 --- a/bugbug/tools/build_repair/agent.py +++ b/bugbug/tools/build_repair/agent.py @@ -4,6 +4,7 @@ # You can obtain one at http://mozilla.org/MPL/2.0/. 
import subprocess +import traceback from collections.abc import Callable from logging import getLogger from pathlib import Path @@ -50,6 +51,8 @@ class AgentResponse(BaseModel): analysis: str = Field(default="") diff: str = Field(default="") error: str | None = Field(default=None) + error_traceback: str | None = Field(default=None) + failure_stage: str | None = Field(default=None) cost_usd: float = Field(default=0.0) num_turns: int = Field(default=0) input_tokens: int = Field(default=0) @@ -257,6 +260,8 @@ async def run( ) return AgentResponse( error=str(e), + error_traceback=traceback.format_exc(), + failure_stage="analysis", cost_usd=total_cost, num_turns=total_turns, **self._usage_fields(total_usage), @@ -329,6 +334,8 @@ async def run( summary=summary, analysis=analysis, error=str(e), + error_traceback=traceback.format_exc(), + failure_stage="fix", cost_usd=total_cost, num_turns=total_turns, **self._usage_fields(total_usage), diff --git a/bugbug/tools/build_repair/scorer.py b/bugbug/tools/build_repair/scorer.py index f1bde3d02b..5e0eb983b2 100644 --- a/bugbug/tools/build_repair/scorer.py +++ b/bugbug/tools/build_repair/scorer.py @@ -14,7 +14,18 @@ class BasicMetricsScorer(weave.Scorer): """Scores success rate, diff production rate, cost, and turn count.""" @weave.op() - def score(self, output: dict) -> dict: + def score(self, output: dict | None) -> dict: + if output is None: + return { + "successful": False, + "has_diff": False, + "cost_usd": 0, + "num_turns": 0, + "input_tokens": 0, + "output_tokens": 0, + "cache_read_input_tokens": 0, + "cache_creation_input_tokens": 0, + } return { "successful": output.get("error") is None, "has_diff": bool(output.get("diff", "").strip()), @@ -54,7 +65,12 @@ class BuildPassRateScorer(weave.Scorer): """Scores local ./mach build and try push pass rates.""" @weave.op() - def score(self, output: dict) -> dict: + def score(self, output: dict | None) -> dict: + if output is None: + return { + "local_build_passed": None, + 
"try_build_passed": None, + } return { "local_build_passed": output.get("local_build_passed"), "try_build_passed": output.get("try_build_passed"), @@ -85,7 +101,12 @@ class LLMFixMatchingScorer(weave.Scorer): """ @weave.op() - async def score(self, output: dict, gh_fix_commits: list[str]) -> dict: + async def score(self, output: dict | None, gh_fix_commits: list[str]) -> dict: + if output is None: + return { + "match_score": None, + "match_category": "errored", + } return { "match_score": None, "match_category": "not_implemented", diff --git a/scripts/build_repair_eval.py b/scripts/build_repair_eval.py index b2410cdfe6..888fe7e042 100644 --- a/scripts/build_repair_eval.py +++ b/scripts/build_repair_eval.py @@ -200,6 +200,14 @@ def on_message(stage: str, data: dict) -> None: return on_message +class BuildRepairError(Exception): + """Raised when the agent completes but reports an error.""" + + def __init__(self, output: dict): + self.output = output + super().__init__(output.get("error", "Unknown error")) + + class BuildRepairModel(weave.Model): """Weave Model wrapper that creates a worktree per example and runs BuildRepairTool.""" @@ -232,6 +240,7 @@ async def invoke( f"commit={gh_failure_commits[0][:12]}, {len(failures)} failures)" ) + worktree_created = False try: cutoff = max( MODEL_CUTOFF_DATES[self.tool.analysis_model], @@ -245,6 +254,7 @@ async def invoke( raise ValueError("skipped_data_contamination") worktree_path = self.worktree_mgr.create(gh_failure_commits[0], wt_name) + worktree_created = True failure = BuildFailure( bug_id=bug_id, @@ -266,30 +276,15 @@ async def invoke( f"local_build={result.local_build_passed}, " f"try_build={result.try_build_passed}" ) - return result.model_dump() - except Exception as e: - logger.error(f"Bug {bug_id} failed with exception: {e}", exc_info=True) - return { - "error": str(e), - "diff": "", - "summary": "", - "analysis": "", - "cost_usd": 0, - "num_turns": 0, - "input_tokens": 0, - "output_tokens": 0, - 
"cache_read_input_tokens": 0, - "cache_creation_input_tokens": 0, - "local_build_passed": None, - "try_build_passed": None, - "lando_job_id": None, - "treeherder_url": None, - "stage1_transcript": [], - "stage2_transcript": [], - } + + output = result.model_dump() + if result.error: + raise BuildRepairError(output) + return output finally: - logger.info(f"Bug {bug_id}: cleaning up worktree {wt_name}") - self.worktree_mgr.cleanup(wt_name) + if worktree_created: + logger.info(f"Bug {bug_id}: cleaning up worktree {wt_name}") + self.worktree_mgr.cleanup(wt_name) def main() -> None: From 915a2b84744462d8c736bd1034ca9a50709a6319 Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Mon, 2 Mar 2026 14:08:29 -0800 Subject: [PATCH 16/31] Make logging less verbose --- bugbug/tools/build_repair/try_server.py | 38 +++++++++++++++++++------ scripts/build_repair_eval.py | 11 +++++-- 2 files changed, 38 insertions(+), 11 deletions(-) diff --git a/bugbug/tools/build_repair/try_server.py b/bugbug/tools/build_repair/try_server.py index a51bb59500..6ef558556e 100644 --- a/bugbug/tools/build_repair/try_server.py +++ b/bugbug/tools/build_repair/try_server.py @@ -3,6 +3,7 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this file, # You can obtain one at http://mozilla.org/MPL/2.0/. 
+import logging import os import re import subprocess @@ -65,27 +66,46 @@ def _commit_fix(worktree_path: Path, bug_id: int) -> None: logger.info(f"Bug {bug_id}: fix committed") -def _run_local_build(worktree_path: Path) -> bool: - logger.info(f"Running bootstrap in {worktree_path}") - result = subprocess.run( - ["./mach", "--no-interactive", "bootstrap"], +def _run_subprocess( + cmd: list[str], worktree_path: Path, capture: bool +) -> subprocess.CompletedProcess[str]: + if capture: + return subprocess.run( + cmd, + cwd=worktree_path, + env=_mach_env(worktree_path), + capture_output=True, + text=True, + ) + return subprocess.run( + cmd, cwd=worktree_path, env=_mach_env(worktree_path), + text=True, + ) + + +def _run_local_build(worktree_path: Path) -> bool: + capture = not logger.isEnabledFor(logging.DEBUG) + + logger.info(f"Running bootstrap in {worktree_path}") + result = _run_subprocess( + ["./mach", "--no-interactive", "bootstrap"], worktree_path, capture ) if result.returncode != 0: + if capture and result.stderr: + logger.warning(f"Bootstrap stderr:\n{result.stderr[-2000:]}") raise RuntimeError( f"Local bootstrap failed with return code {result.returncode}" ) logger.info(f"Running local build in {worktree_path}") - result = subprocess.run( - ["./mach", "build"], - cwd=worktree_path, - env=_mach_env(worktree_path), - ) + result = _run_subprocess(["./mach", "build"], worktree_path, capture) passed = result.returncode == 0 status = "passed" if passed else "failed" logger.info(f"Local build {status} (returncode={result.returncode})") + if not passed and capture and result.stderr: + logger.warning(f"Build stderr:\n{result.stderr[-2000:]}") return passed diff --git a/scripts/build_repair_eval.py b/scripts/build_repair_eval.py index 888fe7e042..dbf5f132ce 100644 --- a/scripts/build_repair_eval.py +++ b/scripts/build_repair_eval.py @@ -12,6 +12,7 @@ python scripts/build_repair_eval.py --limit 5 python scripts/build_repair_eval.py --parallelism 4 python 
scripts/build_repair_eval.py --no-try-push + python scripts/build_repair_eval.py --verbose """ import argparse @@ -296,16 +297,22 @@ def main() -> None: parser.add_argument("--dataset", default="build_repair_one_commit_eval") parser.add_argument("--analysis-only", action="store_true") parser.add_argument("--no-try-push", action="store_true") + parser.add_argument("--verbose", action="store_true", help="Enable DEBUG logging") args = parser.parse_args() if not args.firefox_repo: parser.error("--firefox-repo or FIREFOX_GIT_REPO env var is required") + log_level = logging.DEBUG if args.verbose else logging.INFO logging.basicConfig( - level=logging.DEBUG, + level=log_level, format="%(asctime)s %(levelname)s %(name)s: %(message)s", ) - logging.getLogger("httpx").setLevel(logging.WARNING) + if not args.verbose: + logging.getLogger("httpx").setLevel(logging.WARNING) + logging.getLogger("httpcore").setLevel(logging.WARNING) + logging.getLogger("hgitaly").setLevel(logging.WARNING) + logging.getLogger("urllib3").setLevel(logging.WARNING) logger.info( f"Starting evaluation: dataset={args.dataset}, limit={args.limit}, " From ab4fce1bbdcf0f655d2afabece21adb979e65c08 Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Mon, 2 Mar 2026 14:32:54 -0800 Subject: [PATCH 17/31] Support multiple trials --- bugbug/tools/build_repair/scorer.py | 44 +++++++++++++--------- scripts/build_repair_eval.py | 58 +++++++++++++++++------------ 2 files changed, 61 insertions(+), 41 deletions(-) diff --git a/bugbug/tools/build_repair/scorer.py b/bugbug/tools/build_repair/scorer.py index 5e0eb983b2..24b35d08c1 100644 --- a/bugbug/tools/build_repair/scorer.py +++ b/bugbug/tools/build_repair/scorer.py @@ -117,35 +117,43 @@ def summarize(self, score_rows: list[dict]) -> dict: def compute_pass_at_k( - trial_results: list[list[dict]], + result_rows: list[dict], + num_examples: int, + num_trials: int, + scorer_name: str, metric: str, -) -> dict: - """Compute pass@k metrics across multiple trial runs. 
+) -> dict[str, float]: + """Compute pass@k from Weave evaluation results with trials. - Args: - trial_results: list of k trial result lists, each with per-example scores - metric: which boolean metric to use (e.g. "local_build_passed", "successful") - - Returns: - pass@1, pass@3, pass@k and pass^k metrics + Rows are ordered: first num_examples = trial 0, next = trial 1, etc. """ - k = len(trial_results) - num_examples = len(trial_results[0]) + if num_trials < 2: + return {} - pass_at = {} - for n in [1, 3, k]: - if n > k: + pass_at: dict[str, float] = {} + for n in {1, 3, num_trials}: + if n > num_trials: continue successes = sum( - any(trial_results[t][i][metric] is True for t in range(n)) + any( + result_rows[t * num_examples + i]["scores"] + .get(scorer_name, {}) + .get(metric) + is True + for t in range(n) + ) for i in range(num_examples) ) - pass_at[f"pass@{n}"] = successes / num_examples + pass_at[f"pass@{n}"] = successes / num_examples if num_examples else 0 all_pass = sum( - all(trial_results[t][i][metric] is True for t in range(k)) + all( + result_rows[t * num_examples + i]["scores"].get(scorer_name, {}).get(metric) + is True + for t in range(num_trials) + ) for i in range(num_examples) ) - pass_at[f"pass^{k}"] = all_pass / num_examples + pass_at[f"pass^{num_trials}"] = all_pass / num_examples if num_examples else 0 return pass_at diff --git a/scripts/build_repair_eval.py b/scripts/build_repair_eval.py index dbf5f132ce..e787c6d206 100644 --- a/scripts/build_repair_eval.py +++ b/scripts/build_repair_eval.py @@ -20,6 +20,7 @@ import json import logging import os +import uuid from datetime import datetime from functools import cached_property from typing import Any @@ -32,6 +33,7 @@ BasicMetricsScorer, BuildPassRateScorer, LLMFixMatchingScorer, + compute_pass_at_k, ) from bugbug.tools.build_repair.worktree import WorktreeManager @@ -215,7 +217,6 @@ class BuildRepairModel(weave.Model): firefox_repo: str analysis_only: bool = False no_try_push: bool = False - 
trial_id: int = 0 @cached_property def tool(self) -> BuildRepairTool: @@ -235,10 +236,10 @@ async def invoke( fix_commit_date: str, **kwargs, ) -> dict: - wt_name = f"bug-{bug_id}-trial-{self.trial_id}" + wt_name = f"bug-{bug_id}-{uuid.uuid4().hex[:8]}" logger.info( - f"Invoking bug {bug_id} (trial={self.trial_id}, " - f"commit={gh_failure_commits[0][:12]}, {len(failures)} failures)" + f"Invoking bug {bug_id} " + f"(commit={gh_failure_commits[0][:12]}, {len(failures)} failures)" ) worktree_created = False @@ -336,26 +337,37 @@ def main() -> None: scorers.insert(1, BuildPassRateScorer()) logger.info(f"Scorers: {[type(s).__name__ for s in scorers]}") - for trial in range(args.trials): - logger.info(f"Starting trial {trial + 1}/{args.trials}") - model = BuildRepairModel( - firefox_repo=args.firefox_repo, - analysis_only=args.analysis_only, - no_try_push=args.no_try_push, - trial_id=trial, - ) - evaluation = weave.Evaluation( - name=f"build-repair-trial-{trial}", - dataset=dataset, - scorers=scorers, - ) - results = asyncio.run(evaluation.evaluate(model)) - logger.info(f"Trial {trial + 1}/{args.trials} results: {results}") + model = BuildRepairModel( + firefox_repo=args.firefox_repo, + analysis_only=args.analysis_only, + no_try_push=args.no_try_push, + ) + evaluation = weave.Evaluation( + name="build-repair", + dataset=dataset, + scorers=scorers, + trials=args.trials, + ) - # TODO: To compute pass@k across trials, collect per-row scores from each - # trial via the Weave API (weave.ref(...).get() on individual evaluation - # runs) and pass them to compute_pass_at_k(). The evaluate() return value - # only contains aggregated summaries, not per-row data. 
+ async def run_eval(): + eval_results = await evaluation.get_eval_results(model) + summary = await evaluation.summarize(eval_results) + return eval_results, summary + + eval_results, summary = asyncio.run(run_eval()) + logger.info(f"Evaluation results: {summary}") + + if args.trials > 1: + num_examples = len(dataset.rows) + rows = list(eval_results.rows) + pass_k_metrics = [("BasicMetricsScorer", "successful")] + if not args.analysis_only: + pass_k_metrics.append(("BuildPassRateScorer", "local_build_passed")) + for scorer_name, metric in pass_k_metrics: + pass_k = compute_pass_at_k( + rows, num_examples, args.trials, scorer_name, metric + ) + logger.info(f"pass@k ({metric}): {pass_k}") if __name__ == "__main__": From 955e048fc1645f7c079ddd9cd1ed9f95f03c3800 Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Mon, 2 Mar 2026 16:41:32 -0800 Subject: [PATCH 18/31] Fix scoring --- bugbug/tools/build_repair/scorer.py | 110 ++++++++++++++-------------- scripts/build_repair_eval.py | 27 +------ 2 files changed, 61 insertions(+), 76 deletions(-) diff --git a/bugbug/tools/build_repair/scorer.py b/bugbug/tools/build_repair/scorer.py index 24b35d08c1..566b384a6a 100644 --- a/bugbug/tools/build_repair/scorer.py +++ b/bugbug/tools/build_repair/scorer.py @@ -10,9 +10,44 @@ logger = getLogger(__name__) +def _pass_at_k( + score_rows: list[dict], + num_trials: int, + metric: str, +) -> dict[str, float]: + """Compute pass@k from scorer rows ordered by trial. + + Rows are ordered: first num_examples = trial 0, next = trial 1, etc. + Rows may be empty dicts when the model raised an exception. 
+ """ + num_examples = len(score_rows) // num_trials + pass_at: dict[str, float] = {} + for n in sorted({1, 3, num_trials}): + if n > num_trials: + continue + successes = sum( + any(score_rows[t * num_examples + i].get(metric) is True for t in range(n)) + for i in range(num_examples) + ) + pass_at[f"pass@{n}"] = successes / num_examples if num_examples else 0 + + all_pass = sum( + all( + score_rows[t * num_examples + i].get(metric) is True + for t in range(num_trials) + ) + for i in range(num_examples) + ) + pass_at[f"pass^{num_trials}"] = all_pass / num_examples if num_examples else 0 + + return pass_at + + class BasicMetricsScorer(weave.Scorer): """Scores success rate, diff production rate, cost, and turn count.""" + num_trials: int = 1 + @weave.op() def score(self, output: dict | None) -> dict: if output is None: @@ -39,24 +74,30 @@ def score(self, output: dict | None) -> dict: def summarize(self, score_rows: list[dict]) -> dict: n = len(score_rows) - costs = [r["cost_usd"] for r in score_rows] - input_toks = [r["input_tokens"] for r in score_rows] - output_toks = [r["output_tokens"] for r in score_rows] + costs = [r.get("cost_usd", 0) for r in score_rows] + input_toks = [r.get("input_tokens", 0) for r in score_rows] + output_toks = [r.get("output_tokens", 0) for r in score_rows] summary = { - "success_rate": sum(r["successful"] for r in score_rows) / n if n else 0, - "diff_rate": sum(r["has_diff"] for r in score_rows) / n if n else 0, + "success_rate": sum(r.get("successful", False) for r in score_rows) / n + if n + else 0, + "diff_rate": sum(r.get("has_diff", False) for r in score_rows) / n + if n + else 0, "avg_cost_usd": sum(costs) / n if n else 0, "total_cost_usd": sum(costs), "total_input_tokens": sum(input_toks), "total_output_tokens": sum(output_toks), "total_cache_read_tokens": sum( - r["cache_read_input_tokens"] for r in score_rows + r.get("cache_read_input_tokens", 0) for r in score_rows ), "total_cache_creation_tokens": sum( - 
r["cache_creation_input_tokens"] for r in score_rows + r.get("cache_creation_input_tokens", 0) for r in score_rows ), "num_examples": n, } + if self.num_trials > 1: + summary.update(_pass_at_k(score_rows, self.num_trials, "successful")) logger.info(f"BasicMetrics summary: {summary}") return summary @@ -64,6 +105,8 @@ def summarize(self, score_rows: list[dict]) -> dict: class BuildPassRateScorer(weave.Scorer): """Scores local ./mach build and try push pass rates.""" + num_trials: int = 1 + @weave.op() def score(self, output: dict | None) -> dict: if output is None: @@ -78,9 +121,9 @@ def score(self, output: dict | None) -> dict: def summarize(self, score_rows: list[dict]) -> dict: n = len(score_rows) - local_passed = sum(1 for r in score_rows if r["local_build_passed"] is True) - try_known = [r for r in score_rows if r["try_build_passed"] is not None] - try_passed = sum(1 for r in try_known if r["try_build_passed"] is True) + local_passed = sum(1 for r in score_rows if r.get("local_build_passed") is True) + try_known = [r for r in score_rows if r.get("try_build_passed") is not None] + try_passed = sum(1 for r in try_known if r.get("try_build_passed") is True) summary = { "local_build_pass_rate": local_passed / n if n else 0, "local_builds_passed": local_passed, @@ -89,6 +132,10 @@ def summarize(self, score_rows: list[dict]) -> dict: "try_builds_timed_out": n - len(try_known), "num_examples": n, } + if self.num_trials > 1: + summary.update( + _pass_at_k(score_rows, self.num_trials, "local_build_passed") + ) logger.info(f"BuildPassRate summary: {summary}") return summary @@ -114,46 +161,3 @@ async def score(self, output: dict | None, gh_fix_commits: list[str]) -> dict: def summarize(self, score_rows: list[dict]) -> dict: return {"status": "not_implemented"} - - -def compute_pass_at_k( - result_rows: list[dict], - num_examples: int, - num_trials: int, - scorer_name: str, - metric: str, -) -> dict[str, float]: - """Compute pass@k from Weave evaluation results with 
trials. - - Rows are ordered: first num_examples = trial 0, next = trial 1, etc. - """ - if num_trials < 2: - return {} - - pass_at: dict[str, float] = {} - for n in {1, 3, num_trials}: - if n > num_trials: - continue - successes = sum( - any( - result_rows[t * num_examples + i]["scores"] - .get(scorer_name, {}) - .get(metric) - is True - for t in range(n) - ) - for i in range(num_examples) - ) - pass_at[f"pass@{n}"] = successes / num_examples if num_examples else 0 - - all_pass = sum( - all( - result_rows[t * num_examples + i]["scores"].get(scorer_name, {}).get(metric) - is True - for t in range(num_trials) - ) - for i in range(num_examples) - ) - pass_at[f"pass^{num_trials}"] = all_pass / num_examples if num_examples else 0 - - return pass_at diff --git a/scripts/build_repair_eval.py b/scripts/build_repair_eval.py index e787c6d206..e925c2795b 100644 --- a/scripts/build_repair_eval.py +++ b/scripts/build_repair_eval.py @@ -33,7 +33,6 @@ BasicMetricsScorer, BuildPassRateScorer, LLMFixMatchingScorer, - compute_pass_at_k, ) from bugbug.tools.build_repair.worktree import WorktreeManager @@ -332,9 +331,9 @@ def main() -> None: dataset.rows = dataset.rows[: args.limit] logger.info(f"Limited to {len(dataset.rows)} rows") - scorers = [BasicMetricsScorer(), LLMFixMatchingScorer()] + scorers = [BasicMetricsScorer(num_trials=args.trials), LLMFixMatchingScorer()] if not args.analysis_only: - scorers.insert(1, BuildPassRateScorer()) + scorers.insert(1, BuildPassRateScorer(num_trials=args.trials)) logger.info(f"Scorers: {[type(s).__name__ for s in scorers]}") model = BuildRepairModel( @@ -348,26 +347,8 @@ def main() -> None: scorers=scorers, trials=args.trials, ) - - async def run_eval(): - eval_results = await evaluation.get_eval_results(model) - summary = await evaluation.summarize(eval_results) - return eval_results, summary - - eval_results, summary = asyncio.run(run_eval()) - logger.info(f"Evaluation results: {summary}") - - if args.trials > 1: - num_examples = 
len(dataset.rows) - rows = list(eval_results.rows) - pass_k_metrics = [("BasicMetricsScorer", "successful")] - if not args.analysis_only: - pass_k_metrics.append(("BuildPassRateScorer", "local_build_passed")) - for scorer_name, metric in pass_k_metrics: - pass_k = compute_pass_at_k( - rows, num_examples, args.trials, scorer_name, metric - ) - logger.info(f"pass@k ({metric}): {pass_k}") + results = asyncio.run(evaluation.evaluate(model)) + logger.info(f"Evaluation results: {results}") if __name__ == "__main__": From 33dc52c0c73aef2ffa1594f7f47075910f7748e7 Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Mon, 2 Mar 2026 16:46:47 -0800 Subject: [PATCH 19/31] Add docker compose --- docker-compose.dev.yml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 docker-compose.dev.yml diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml new file mode 100644 index 0000000000..ff72a232b1 --- /dev/null +++ b/docker-compose.dev.yml @@ -0,0 +1,18 @@ +services: + # Base Docker image requires running `./mach taskgraph load-image debian12-amd64-build:latest` + build-repair: + # TO minimize rebuilding use `DOCKER_DEFAULT_PLATFORM=linux/amd64 docker build -t build-repair-debian-base -f docker/build_repair/Dockerfile .` + # image: build-repair-debian-base + build: + context: . 
+ dockerfile: docker/build_repair/Dockerfile + volumes: + - .:/app # live code editing + - ${FIREFOX_REPO}:/workspace/firefox # Firefox repo + - build-repair-tmp:/tmp/build_repair_worktrees + environment: + - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY} + - WANDB_API_KEY=${WANDB_API_KEY} # for weave + - FIREFOX_GIT_REPO=/workspace/firefox +volumes: + build-repair-tmp: From 2a26a6cf2506c8a87384da43ecfb10718aec63e9 Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Mon, 2 Mar 2026 16:49:48 -0800 Subject: [PATCH 20/31] Change todo --- scripts/build_repair_eval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/build_repair_eval.py b/scripts/build_repair_eval.py index e925c2795b..aa7d91cc1f 100644 --- a/scripts/build_repair_eval.py +++ b/scripts/build_repair_eval.py @@ -38,7 +38,7 @@ logger = logging.getLogger(__name__) -# todo: verify tracing code +# TODO: replace with native tracing for Anthropic Agents SDK when released by W&B def _attr(obj, key, default=None): From daeb4bdc19b3f906f7b558bc1f34ecfb527770b6 Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Mon, 9 Mar 2026 16:21:44 -0700 Subject: [PATCH 21/31] Fix exception and typo --- notebooks/build_repair_create_dataset.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/notebooks/build_repair_create_dataset.ipynb b/notebooks/build_repair_create_dataset.ipynb index f1da8fee77..ed3bb28a1d 100644 --- a/notebooks/build_repair_create_dataset.ipynb +++ b/notebooks/build_repair_create_dataset.ipynb @@ -107,7 +107,7 @@ "cell_type": "markdown", "id": "9dc12cf3f6f9b844", "metadata": {}, - "source": "### Ger bugzilla comments before the fix" + "source": "### Get bugzilla comments before the fix" }, { "cell_type": "code", @@ -143,7 +143,7 @@ " and fix_commit[:12] in comment[\"raw_text\"]\n", " ):\n", " return comment[\"time\"]\n", - " raise None\n", + " raise ValueError(\"No fix commit date\")\n", "\n", "\n", "def get_bug_info_and_fix_date(build_fail):\n", From 
43301c9aec9822e417cd3fc64c3e879e7d515246 Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Mon, 9 Mar 2026 16:22:02 -0700 Subject: [PATCH 22/31] Remove unused definitions --- bugbug/tools/build_repair/__init__.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/bugbug/tools/build_repair/__init__.py b/bugbug/tools/build_repair/__init__.py index 2f3caac2c6..e69de29bb2 100644 --- a/bugbug/tools/build_repair/__init__.py +++ b/bugbug/tools/build_repair/__init__.py @@ -1,8 +0,0 @@ -# -*- coding: utf-8 -*- -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this file, -# You can obtain one at http://mozilla.org/MPL/2.0/. - -from bugbug.tools.build_repair.agent import AgentResponse, BuildFailure, BuildRepairTool - -__all__ = ["AgentResponse", "BuildFailure", "BuildRepairTool"] From 2bc233e61b354cf1ab4ee2094dcc69d9dd9c66dc Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Mon, 9 Mar 2026 16:22:25 -0700 Subject: [PATCH 23/31] Revert IDE specific paths --- .gitignore | 3 --- 1 file changed, 3 deletions(-) diff --git a/.gitignore b/.gitignore index 77ee39bd33..a00be42b73 100644 --- a/.gitignore +++ b/.gitignore @@ -49,6 +49,3 @@ node_modules/ *.log # Desktop Service Store *.DS_Store - -# JetBrains IDEs -.idea From 91d7092f424ca2272112b145cd44cfa725ef6c47 Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Mon, 9 Mar 2026 16:22:46 -0700 Subject: [PATCH 24/31] Clarify double force --- bugbug/tools/build_repair/worktree.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bugbug/tools/build_repair/worktree.py b/bugbug/tools/build_repair/worktree.py index 8ae10ea7da..caeebb7f98 100644 --- a/bugbug/tools/build_repair/worktree.py +++ b/bugbug/tools/build_repair/worktree.py @@ -31,6 +31,7 @@ def create(self, commit_hash: str, name: str) -> Path: ) if worktree_path.exists(): self.cleanup(name) + # --force twice to operate on locked worktrees (see 
https://git-scm.com/docs/git-worktree#_options) subprocess.run( [ "git", @@ -49,6 +50,7 @@ def create(self, commit_hash: str, name: str) -> Path: def cleanup(self, name: str) -> None: logger.info(f"Cleaning up worktree {name}") + # --force twice to operate on locked worktrees (see https://git-scm.com/docs/git-worktree#_options) subprocess.run( [ "git", From 69e2013b98a433f8361c687c48fdedf2e9c3c551 Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Mon, 9 Mar 2026 16:23:06 -0700 Subject: [PATCH 25/31] Rename env --- .../buildrepair/docker-compose.dev.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename docker-compose.dev.yml => services/buildrepair/docker-compose.dev.yml (91%) diff --git a/docker-compose.dev.yml b/services/buildrepair/docker-compose.dev.yml similarity index 91% rename from docker-compose.dev.yml rename to services/buildrepair/docker-compose.dev.yml index ff72a232b1..6bfb0d20bb 100644 --- a/docker-compose.dev.yml +++ b/services/buildrepair/docker-compose.dev.yml @@ -8,7 +8,7 @@ services: dockerfile: docker/build_repair/Dockerfile volumes: - .:/app # live code editing - - ${FIREFOX_REPO}:/workspace/firefox # Firefox repo + - ${FIREFOX_GIT_REPO}:/workspace/firefox # Firefox repo - build-repair-tmp:/tmp/build_repair_worktrees environment: - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY} From c4f2eec167d71d9df0900e1da65f72d2835d419a Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Mon, 9 Mar 2026 16:23:44 -0700 Subject: [PATCH 26/31] Add task ID example --- {docker/build_repair => services/buildrepair}/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename {docker/build_repair => services/buildrepair}/Dockerfile (84%) diff --git a/docker/build_repair/Dockerfile b/services/buildrepair/Dockerfile similarity index 84% rename from docker/build_repair/Dockerfile rename to services/buildrepair/Dockerfile index 597713be73..1a78aef42c 100644 --- a/docker/build_repair/Dockerfile +++ b/services/buildrepair/Dockerfile @@ -1,4 +1,4 @@ -# ./mach 
taskgraph load-image +# ./mach taskgraph load-image --task-id aQDejwXUQsSHxvwE2qQcQg FROM debian12-amd64-build WORKDIR /app From 6c5314c352a996cddcb6c6cfe3a913f6722d6efe Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Thu, 12 Mar 2026 14:43:30 -0700 Subject: [PATCH 27/31] Adjust Docker infra to reuse bugbug requirements --- services/buildrepair/Dockerfile | 30 ++++++++++++++++----- services/buildrepair/docker-compose.dev.yml | 12 ++++----- services/buildrepair/pyproject.toml | 12 +++++++++ 3 files changed, 42 insertions(+), 12 deletions(-) create mode 100644 services/buildrepair/pyproject.toml diff --git a/services/buildrepair/Dockerfile b/services/buildrepair/Dockerfile index 1a78aef42c..178537d664 100644 --- a/services/buildrepair/Dockerfile +++ b/services/buildrepair/Dockerfile @@ -1,16 +1,34 @@ -# ./mach taskgraph load-image --task-id aQDejwXUQsSHxvwE2qQcQg +# Load the base image by running this from the Firefox repo: +# ./mach taskgraph load-image --task-id aQDejwXUQsSHxvwE2qQcQg FROM debian12-amd64-build WORKDIR /app -RUN apt-get install -y python3-pip python3-venv +RUN apt-get update && \ + apt-get install -y git nodejs npm build-essential zlib1g-dev \ + libncurses5-dev libgdbm-dev libnss3-dev libssl-dev libreadline-dev \ + libffi-dev libsqlite3-dev wget libbz2-dev && \ + rm -rf /var/lib/apt/lists/* -RUN python3 -m venv /opt/venv +# bugbug requires Python 3.12 and there's no package for Debian 12 +RUN wget https://www.python.org/ftp/python/3.12.8/Python-3.12.8.tgz && \ + tar -xf Python-3.12.8.tgz && \ + cd Python-3.12.8 && \ + ./configure --enable-optimizations --prefix=/usr/local && \ + make -j$(nproc) && \ + make install && \ + cd .. 
&& rm -rf Python-3.12.8 Python-3.12.8.tgz + +RUN python3.12 -m venv /opt/venv ENV PATH="/opt/venv/bin:$PATH" -RUN apt-get install -y git nodejs npm && rm -rf /var/lib/apt/lists/* -RUN pip install weave>=0.52.29 pydantic claude-agent-sdk requests +COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv + +COPY requirements.txt /app/ +RUN uv pip install -r /app/requirements.txt COPY . /app -ENV FIREFOX_GIT_REPO=/workspace/firefox +ENV PYTHONPATH=/app +ENV PYTHONUNBUFFERED=1 +ENV FIREFOX_GIT_REPO=/workspace/firefox \ No newline at end of file diff --git a/services/buildrepair/docker-compose.dev.yml b/services/buildrepair/docker-compose.dev.yml index 6bfb0d20bb..ddd2d67364 100644 --- a/services/buildrepair/docker-compose.dev.yml +++ b/services/buildrepair/docker-compose.dev.yml @@ -1,13 +1,13 @@ services: - # Base Docker image requires running `./mach taskgraph load-image debian12-amd64-build:latest` build-repair: - # TO minimize rebuilding use `DOCKER_DEFAULT_PLATFORM=linux/amd64 docker build -t build-repair-debian-base -f docker/build_repair/Dockerfile .` - # image: build-repair-debian-base + # To minimize rebuilding use `DOCKER_DEFAULT_PLATFORM=linux/amd64 docker build -t build-repair-debian-base -f services/buildrepair/Dockerfile .` + # and replace the "build" section with: + # image: build-repair-debian-base build: - context: . - dockerfile: docker/build_repair/Dockerfile + context: ../.. 
+ dockerfile: services/buildrepair/Dockerfile volumes: - - .:/app # live code editing + - ../../:/app # live code editing - ${FIREFOX_GIT_REPO}:/workspace/firefox # Firefox repo - build-repair-tmp:/tmp/build_repair_worktrees environment: diff --git a/services/buildrepair/pyproject.toml b/services/buildrepair/pyproject.toml new file mode 100644 index 0000000000..621735a0f9 --- /dev/null +++ b/services/buildrepair/pyproject.toml @@ -0,0 +1,12 @@ +[project] +name = "bugbug-build-repair" +dynamic = ["version"] +description = "BugBug Build Repair Agent" +requires-python = ">=3.12" +dependencies = [ + "bugbug", +] + +[tool.uv.sources] +bugbug = { path = "../..", editable = true } + From 1ed87c3d9a82e6dc0c5138db8889f5ae8904d2c4 Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Thu, 12 Mar 2026 14:43:37 -0700 Subject: [PATCH 28/31] Add readme --- services/buildrepair/README.md | 60 ++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 services/buildrepair/README.md diff --git a/services/buildrepair/README.md b/services/buildrepair/README.md new file mode 100644 index 0000000000..240196088b --- /dev/null +++ b/services/buildrepair/README.md @@ -0,0 +1,60 @@ +# Build Repair Agent + +It can automatically analyze a build failure in Firefox and propose a fix. + +## Evaluation + +Weights and Biases Weave [dashboard](https://wandb.ai/moz-bugbug/bugbug-build-repair-eval/weave/evaluations). + +To run locally: + +1. Clone Firefox to a separate directory + +2. Prepare the Docker image + +Pull the base Docker image to build Firefox from Taskcluster. +From the Firefox repo run: +```bash +./mach taskgraph load-image --task-id aQDejwXUQsSHxvwE2qQcQg +``` + +Make sure to have enough resources available for the Docker engine (at least 16gb RAM and 128GB disk, better 256GB). + +3. 
Set environment variables + +```bash +# Full path to the Firefox repo +export FIREFOX_GIT_REPO=$(pwd) +export ANTHROPIC_API_KEY= +export WANDB_API_KEY= +# If on Mac with ARM CPU +export DOCKER_DEFAULT_PLATFORM=linux/amd64 +``` + +4. `cd` to this repo + +5. (Optional) Prebuild the Docker image and use `image: build-repair-debian-base` in `docker-compose.dev.yml` +```bash +docker build -t build-repair-debian-base -f services/buildrepair/Dockerfile . +``` + +6. Attach to the container by running: + +```bash +docker compose -f services/buildrepair/docker-compose.dev.yml run build-repair +``` + +7. Run the evaluation script. + +To test: +```bash +/opt/venv/bin/python scripts/build_repair_eval.py --no-try-push --limit 1 +``` + +To run full evaluation (with 3 trials): +```bash +/opt/venv/bin/python scripts/build_repair_eval.py --no-try-push --parallellism 8 --trials 3 +``` + +It will run each of 85 examples from the evaluation dataset 3 times. +It will build Firefox each time with the proposed fix, then write results to Weave. 
From 91f31c6b0387bc8e961a2b39b628e2170d286a3f Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Thu, 12 Mar 2026 14:49:09 -0700 Subject: [PATCH 29/31] Catch cleaning error --- bugbug/tools/build_repair/worktree.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/bugbug/tools/build_repair/worktree.py b/bugbug/tools/build_repair/worktree.py index caeebb7f98..b3c1c6dfee 100644 --- a/bugbug/tools/build_repair/worktree.py +++ b/bugbug/tools/build_repair/worktree.py @@ -51,7 +51,7 @@ def create(self, commit_hash: str, name: str) -> Path: def cleanup(self, name: str) -> None: logger.info(f"Cleaning up worktree {name}") # --force twice to operate on locked worktrees (see https://git-scm.com/docs/git-worktree#_options) - subprocess.run( + result = subprocess.run( [ "git", "worktree", @@ -61,9 +61,13 @@ def cleanup(self, name: str) -> None: str(self.base_dir / name), ], cwd=self.repo, - check=True, + capture_output=True, + text=True, ) - logger.info(f"Worktree {name} removed") + if result.returncode != 0: + logger.error(f"Failed to remove worktree {name}: {result.stderr.strip()}") + else: + logger.info(f"Worktree {name} removed") def cleanup_all(self) -> None: logger.info(f"Cleaning up all worktrees in {self.base_dir}") From 40c112ca7e8cf3599c2a25f746e72255df5e8f48 Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Thu, 12 Mar 2026 14:49:15 -0700 Subject: [PATCH 30/31] Reformat --- services/buildrepair/README.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/services/buildrepair/README.md b/services/buildrepair/README.md index 240196088b..3fd6603b0b 100644 --- a/services/buildrepair/README.md +++ b/services/buildrepair/README.md @@ -12,8 +12,9 @@ To run locally: 2. Prepare the Docker image -Pull the base Docker image to build Firefox from Taskcluster. +Pull the base Docker image to build Firefox from Taskcluster. 
From the Firefox repo run: + ```bash ./mach taskgraph load-image --task-id aQDejwXUQsSHxvwE2qQcQg ``` @@ -34,6 +35,7 @@ export DOCKER_DEFAULT_PLATFORM=linux/amd64 4. `cd` to this repo 5. (Optional) Prebuild the Docker image and use `image: build-repair-debian-base` in `docker-compose.dev.yml` + ```bash docker build -t build-repair-debian-base -f services/buildrepair/Dockerfile . ``` @@ -47,14 +49,16 @@ docker compose -f services/buildrepair/docker-compose.dev.yml run build-repair 7. Run the evaluation script. To test: + ```bash /opt/venv/bin/python scripts/build_repair_eval.py --no-try-push --limit 1 ``` To run full evaluation (with 3 trials): + ```bash /opt/venv/bin/python scripts/build_repair_eval.py --no-try-push --parallellism 8 --trials 3 ``` -It will run each of 85 examples from the evaluation dataset 3 times. +It will run each of 85 examples from the evaluation dataset 3 times. It will build Firefox each time with the proposed fix, then write results to Weave. From 8c8d5bc339e5941a1a77e640eb59f93c9aafc2c2 Mon Sep 17 00:00:00 2001 From: Evgeny Pavlov Date: Thu, 12 Mar 2026 14:59:47 -0700 Subject: [PATCH 31/31] Stage all new files --- bugbug/tools/build_repair/agent.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/bugbug/tools/build_repair/agent.py b/bugbug/tools/build_repair/agent.py index 0845315871..9eada330db 100644 --- a/bugbug/tools/build_repair/agent.py +++ b/bugbug/tools/build_repair/agent.py @@ -346,8 +346,13 @@ async def run( f"(cost=${total_cost:.4f}, turns={total_turns})" ) + subprocess.run( + ["git", "add", "-A"], + cwd=worktree_path, + capture_output=True, + ) diff_result = subprocess.run( - ["git", "diff", "HEAD"], + ["git", "diff", "--staged", "HEAD"], cwd=worktree_path, capture_output=True, text=True,