From 342a65caeefb10f2059d9f7bf3937dcc97f03583 Mon Sep 17 00:00:00 2001 From: Menglin Li Date: Mon, 15 Jun 2026 15:23:54 +0800 Subject: [PATCH] fix(octo): surface real failure detail in task:failed IM replies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The task:failed event payload only carried the coarse failure_reason classifier (and the FailTask broadcast path carried neither error nor chat_session_id). The Octo outbound relay only read error/error_message, so it always fell back to the English 'The agent run failed.' — IM users never saw the actual cause and chat tasks failed via the sweeper path weren't routed at all. - task.go: include redacted error + failure_reason on both task:failed publish paths, and chat_session_id on the sweeper-batch path so chat tasks route to the IM relay. - octo/outbound.go: replace errorMessageFromPayload with failureMessageFromPayload — prefer explicit error, fall back to a friendly Chinese description of failure_reason, then a generic default. Unknown reasons downgrade gracefully (no raw enum leak). - Lark benefits too: it already read error, which the FailTask path now populates. Tests cover explicit-error precedence, failure_reason fallback, unknown reason downgrade, and empty/non-map payloads. --- server/internal/integrations/octo/outbound.go | 71 +++++++++++++++---- .../integrations/octo/outbound_test.go | 67 +++++++++++++++++ server/internal/service/task.go | 36 ++++++++-- 3 files changed, 154 insertions(+), 20 deletions(-) diff --git a/server/internal/integrations/octo/outbound.go b/server/internal/integrations/octo/outbound.go index 5b8b24cbf9..deb809209b 100644 --- a/server/internal/integrations/octo/outbound.go +++ b/server/internal/integrations/octo/outbound.go @@ -124,11 +124,7 @@ func (p *Patcher) processEvent(ctx context.Context, e events.Event) error { case protocol.EventChatDone: return p.sendReply(ctx, inst, binding, taskID, chatDoneContent(e.Payload), token) case protocol.EventTaskFailed: - msg := errorMessageFromPayload(e.Payload) - if msg == "" { - msg = "The agent run failed." - } - return p.sendReply(ctx, inst, binding, taskID, "⚠️ "+msg, token) + return p.sendReply(ctx, inst, binding, taskID, "⚠️ "+failureMessageFromPayload(e.Payload), token) } return nil } @@ -213,14 +209,63 @@ func chatDoneContent(payload any) string { return "" } -func errorMessageFromPayload(payload any) string { - if m, ok := payload.(map[string]any); ok { - if s, ok := m["error"].(string); ok && s != "" { - return s - } - if s, ok := m["error_message"].(string); ok && s != "" { - return s +// failureMessageFromPayload builds the user-facing text for a task:failed +// event. Precedence: +// 1. The explicit error / error_message string (the redacted detail the +// daemon reported) — most actionable. +// 2. A friendly Chinese description of the coarse failure_reason classifier. +// 3. A generic fallback when neither is present. +// +// The IM user should never be left with a bare "运行失败" when the backend +// actually knows what went wrong. +func failureMessageFromPayload(payload any) string { + m, ok := payload.(map[string]any) + if !ok { + return defaultFailureMessage + } + if s, ok := m["error"].(string); ok && s != "" { + return s + } + if s, ok := m["error_message"].(string); ok && s != "" { + return s + } + if reason, ok := m["failure_reason"].(string); ok && reason != "" { + if desc, ok := failureReasonText[reason]; ok { + return desc } + // Unknown reason (a classifier value added server-side later): + // downgrade to the generic message rather than leaking a raw enum. + return defaultFailureMessage } - return "" + return defaultFailureMessage +} + +const defaultFailureMessage = "Agent 运行失败,请稍后重试或联系工作区管理员。" + +// failureReasonText maps the taskfailure.Reason string values to friendly +// Chinese copy. Keep the keys in sync with server/pkg/taskfailure/failure.go; +// a missing key falls back to defaultFailureMessage, so drift downgrades +// gracefully rather than crashing. +var failureReasonText = map[string]string{ + "queued_expired": "任务排队超时,未被任何 runtime 领取。请确认 Agent 的 daemon 在线。", + "runtime_offline": "Agent 的 runtime 当前离线,消息已记录。请确认 daemon 在线后重试。", + "runtime_recovery": "Agent 的 runtime 正在恢复中,请稍后重试。", + "timeout": "Agent 运行超时,请稍后重试。", + "iteration_limit": "Agent 达到迭代上限,未能完成。请简化请求或重试。", + "agent_blocked": "Agent 被阻塞,无法继续。请联系工作区管理员。", + "api_invalid_request": "请求无效,Agent 无法处理。请调整后重试。", + "agent_error.provider_auth_or_access": "模型服务认证失败,请检查 Agent runtime 的 API Key 配置。", + "agent_error.provider_quota_limit": "模型服务额度已用尽,请检查账户额度。", + "agent_error.provider_capacity_or_rate_limit": "模型服务繁忙或触发限流,请稍后重试。", + "agent_error.provider_server_error": "模型服务返回错误,请稍后重试。", + "agent_error.provider_network": "连接模型服务失败,请检查网络后重试。", + "agent_error.process_failure": "Agent 进程异常退出,请联系工作区管理员。", + "agent_error.empty_or_unparseable_output": "Agent 未返回有效结果,请重试。", + "agent_error.agent_timeout": "Agent 运行超时,请稍后重试。", + "agent_error.context_overflow": "对话上下文过长,Agent 无法处理。请精简内容后重试。", + "agent_error.missing_config": "Agent runtime 缺少必要配置(如环境变量),请联系工作区管理员。", + "agent_error.model_not_found_or_unavailable": "指定的模型不存在或不可用,请检查 Agent 的模型配置。", + "agent_error.runtime_version_unsupported": "Agent runtime 版本不受支持,请升级后重试。", + "agent_error.runtime_missing_executable": "Agent runtime 缺少所需的可执行文件,请检查安装。", + "agent_error.unknown": defaultFailureMessage, } diff --git a/server/internal/integrations/octo/outbound_test.go b/server/internal/integrations/octo/outbound_test.go index b2d488ba4f..ddbf0111ac 100644 --- a/server/internal/integrations/octo/outbound_test.go +++ b/server/internal/integrations/octo/outbound_test.go @@ -119,6 +119,73 @@ func TestProcessEvent_TaskFailed_SendsError(t *testing.T) { } } +func TestProcessEvent_TaskFailed_FallsBackToFailureReason(t *testing.T) { + q := &fakePatcherQueries{binding: octoBinding(), inst: activeInst()} + s := &fakeSender{} + p := newPatcher(q, s) + + // No explicit error string — only the coarse classifier. The relay must + // translate it into the friendly Chinese copy, not the generic fallback. + e := events.Event{ + Type: protocol.EventTaskFailed, + TaskID: "11111111-1111-1111-1111-111111111111", + ChatSessionID: "22222222-2222-2222-2222-222222222222", + Payload: map[string]any{"failure_reason": "agent_error.provider_auth_or_access"}, + } + if err := p.processEvent(context.Background(), e); err != nil { + t.Fatalf("processEvent: %v", err) + } + want := "⚠️ " + failureReasonText["agent_error.provider_auth_or_access"] + if s.sent != 1 || s.lastTxt != want { + t.Errorf("sent=%d lastTxt=%q, want %q", s.sent, s.lastTxt, want) + } +} + +func TestProcessEvent_TaskFailed_DefaultWhenNoDetail(t *testing.T) { + q := &fakePatcherQueries{binding: octoBinding(), inst: activeInst()} + s := &fakeSender{} + p := newPatcher(q, s) + + // Neither error nor failure_reason — the user still gets an actionable, + // non-empty message rather than a bare or English fallback. + e := events.Event{ + Type: protocol.EventTaskFailed, + TaskID: "11111111-1111-1111-1111-111111111111", + ChatSessionID: "22222222-2222-2222-2222-222222222222", + Payload: map[string]any{}, + } + if err := p.processEvent(context.Background(), e); err != nil { + t.Fatalf("processEvent: %v", err) + } + if s.sent != 1 || s.lastTxt != "⚠️ "+defaultFailureMessage { + t.Errorf("sent=%d lastTxt=%q, want default", s.sent, s.lastTxt) + } +} + +func TestFailureMessageFromPayload(t *testing.T) { + cases := []struct { + name string + payload any + want string + }{ + {"explicit error wins", map[string]any{"error": "boom", "failure_reason": "timeout"}, "boom"}, + {"error_message alias", map[string]any{"error_message": "kaboom"}, "kaboom"}, + {"known reason", map[string]any{"failure_reason": "runtime_offline"}, failureReasonText["runtime_offline"]}, + {"unknown reason downgrades", map[string]any{"failure_reason": "some_future_reason"}, defaultFailureMessage}, + {"empty map", map[string]any{}, defaultFailureMessage}, + {"non-map payload", "not a map", defaultFailureMessage}, + {"nil payload", nil, defaultFailureMessage}, + {"empty error falls through to reason", map[string]any{"error": "", "failure_reason": "timeout"}, failureReasonText["timeout"]}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + if got := failureMessageFromPayload(tc.payload); got != tc.want { + t.Errorf("failureMessageFromPayload(%v) = %q, want %q", tc.payload, got, tc.want) + } + }) + } +} + func TestProcessEvent_WebOnlySession_Skips(t *testing.T) { q := &fakePatcherQueries{bindingErr: pgx.ErrNoRows} s := &fakeSender{} diff --git a/server/internal/service/task.go b/server/internal/service/task.go index 58471129f4..37c28668d2 100644 --- a/server/internal/service/task.go +++ b/server/internal/service/task.go @@ -1766,17 +1766,28 @@ func (s *TaskService) HandleFailedTasks(ctx context.Context, tasks []db.AgentTas } if workspaceID != "" { + payload := map[string]any{ + "task_id": util.UUIDToString(t.ID), + "agent_id": util.UUIDToString(t.AgentID), + "issue_id": util.UUIDToString(t.IssueID), + "status": "failed", + "failure_reason": failureReason, + } + // Chat tasks carry no issue_id; the outbound relays route on the + // chat session, so it must be present or the IM reply is dropped. + if t.ChatSessionID.Valid { + payload["chat_session_id"] = util.UUIDToString(t.ChatSessionID) + } + // Redacted human-readable detail so the IM user sees the actual + // cause, not just "agent run failed". + if t.Error.Valid && t.Error.String != "" { + payload["error"] = redact.Text(t.Error.String) + } s.Bus.Publish(events.Event{ Type: protocol.EventTaskFailed, WorkspaceID: workspaceID, ActorType: "system", - Payload: map[string]any{ - "task_id": util.UUIDToString(t.ID), - "agent_id": util.UUIDToString(t.AgentID), - "issue_id": util.UUIDToString(t.IssueID), - "status": "failed", - "failure_reason": failureReason, - }, + Payload: payload, }) } @@ -2007,6 +2018,17 @@ func (s *TaskService) broadcastTaskEvent(ctx context.Context, eventType string, if task.ChatSessionID.Valid { payload["chat_session_id"] = util.UUIDToString(task.ChatSessionID) } + // Surface the failure detail to event consumers (e.g. the Octo/Lark + // outbound relays) so an IM user sees what went wrong instead of a generic + // "agent run failed". error is the redacted human-readable message; the + // coarse failure_reason is the machine classifier. Both are only set on a + // failed task row, so a completed/cancelled task simply omits them. + if task.FailureReason.Valid && task.FailureReason.String != "" { + payload["failure_reason"] = task.FailureReason.String + } + if task.Error.Valid && task.Error.String != "" { + payload["error"] = redact.Text(task.Error.String) + } s.Bus.Publish(events.Event{ Type: eventType, WorkspaceID: workspaceID,