From 342a65caeefb10f2059d9f7bf3937dcc97f03583 Mon Sep 17 00:00:00 2001
From: Menglin Li <limenglin5911@gmail.com>
Date: Mon, 15 Jun 2026 15:23:54 +0800
Subject: [PATCH] fix(octo): surface real failure detail in task:failed IM
 replies
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The task:failed event payload only carried the coarse failure_reason
classifier (and the FailTask broadcast path carried neither error nor
chat_session_id). The Octo outbound relay only read error/error_message,
so it always fell back to the English 'The agent run failed.' — IM users
never saw the actual cause and chat tasks failed via the sweeper path
weren't routed at all.

- task.go: include redacted error + failure_reason on both task:failed
  publish paths, and chat_session_id on the sweeper-batch path so chat
  tasks route to the IM relay.
- octo/outbound.go: replace errorMessageFromPayload with
  failureMessageFromPayload — prefer explicit error, fall back to a
  friendly Chinese description of failure_reason, then a generic default.
  Unknown reasons downgrade gracefully (no raw enum leak).
- Lark benefits too: it already read error, which the FailTask path now
  populates.

Tests cover explicit-error precedence, failure_reason fallback, unknown
reason downgrade, and empty/non-map payloads.
---
 server/internal/integrations/octo/outbound.go | 71 +++++++++++++++----
 .../integrations/octo/outbound_test.go        | 67 +++++++++++++++++
 server/internal/service/task.go               | 36 ++++++++--
 3 files changed, 154 insertions(+), 20 deletions(-)

diff --git a/server/internal/integrations/octo/outbound.go b/server/internal/integrations/octo/outbound.go
index 5b8b24cbf9..deb809209b 100644
--- a/server/internal/integrations/octo/outbound.go
+++ b/server/internal/integrations/octo/outbound.go
@@ -124,11 +124,7 @@ func (p *Patcher) processEvent(ctx context.Context, e events.Event) error {
 	case protocol.EventChatDone:
 		return p.sendReply(ctx, inst, binding, taskID, chatDoneContent(e.Payload), token)
 	case protocol.EventTaskFailed:
-		msg := errorMessageFromPayload(e.Payload)
-		if msg == "" {
-			msg = "The agent run failed."
-		}
-		return p.sendReply(ctx, inst, binding, taskID, "⚠️ "+msg, token)
+		return p.sendReply(ctx, inst, binding, taskID, "⚠️ "+failureMessageFromPayload(e.Payload), token)
 	}
 	return nil
 }
@@ -213,14 +209,63 @@ func chatDoneContent(payload any) string {
 	return ""
 }
 
-func errorMessageFromPayload(payload any) string {
-	if m, ok := payload.(map[string]any); ok {
-		if s, ok := m["error"].(string); ok && s != "" {
-			return s
-		}
-		if s, ok := m["error_message"].(string); ok && s != "" {
-			return s
+// failureMessageFromPayload builds the user-facing text for a task:failed
+// event. Precedence:
+//  1. The explicit error / error_message string (the redacted detail the
+//     daemon reported) — most actionable.
+//  2. A friendly Chinese description of the coarse failure_reason classifier.
+//  3. A generic fallback when neither is present.
+//
+// The IM user should never be left with a bare "运行失败" when the backend
+// actually knows what went wrong.
+func failureMessageFromPayload(payload any) string {
+	m, ok := payload.(map[string]any)
+	if !ok {
+		return defaultFailureMessage
+	}
+	if s, ok := m["error"].(string); ok && s != "" {
+		return s
+	}
+	if s, ok := m["error_message"].(string); ok && s != "" {
+		return s
+	}
+	if reason, ok := m["failure_reason"].(string); ok && reason != "" {
+		if desc, ok := failureReasonText[reason]; ok {
+			return desc
 		}
+		// Unknown reason (a classifier value added server-side later):
+		// downgrade to the generic message rather than leaking a raw enum.
+		return defaultFailureMessage
 	}
-	return ""
+	return defaultFailureMessage
+}
+
+const defaultFailureMessage = "Agent 运行失败，请稍后重试或联系工作区管理员。"
+
+// failureReasonText maps the taskfailure.Reason string values to friendly
+// Chinese copy. Keep the keys in sync with server/pkg/taskfailure/failure.go;
+// a missing key falls back to defaultFailureMessage, so drift downgrades
+// gracefully rather than crashing.
+var failureReasonText = map[string]string{
+	"queued_expired":                              "任务排队超时，未被任何 runtime 领取。请确认 Agent 的 daemon 在线。",
+	"runtime_offline":                             "Agent 的 runtime 当前离线，消息已记录。请确认 daemon 在线后重试。",
+	"runtime_recovery":                            "Agent 的 runtime 正在恢复中，请稍后重试。",
+	"timeout":                                     "Agent 运行超时，请稍后重试。",
+	"iteration_limit":                             "Agent 达到迭代上限，未能完成。请简化请求或重试。",
+	"agent_blocked":                               "Agent 被阻塞，无法继续。请联系工作区管理员。",
+	"api_invalid_request":                         "请求无效，Agent 无法处理。请调整后重试。",
+	"agent_error.provider_auth_or_access":         "模型服务认证失败，请检查 Agent runtime 的 API Key 配置。",
+	"agent_error.provider_quota_limit":            "模型服务额度已用尽，请检查账户额度。",
+	"agent_error.provider_capacity_or_rate_limit": "模型服务繁忙或触发限流，请稍后重试。",
+	"agent_error.provider_server_error":           "模型服务返回错误，请稍后重试。",
+	"agent_error.provider_network":                "连接模型服务失败，请检查网络后重试。",
+	"agent_error.process_failure":                 "Agent 进程异常退出，请联系工作区管理员。",
+	"agent_error.empty_or_unparseable_output":     "Agent 未返回有效结果，请重试。",
+	"agent_error.agent_timeout":                   "Agent 运行超时，请稍后重试。",
+	"agent_error.context_overflow":                "对话上下文过长，Agent 无法处理。请精简内容后重试。",
+	"agent_error.missing_config":                  "Agent runtime 缺少必要配置（如环境变量），请联系工作区管理员。",
+	"agent_error.model_not_found_or_unavailable":  "指定的模型不存在或不可用，请检查 Agent 的模型配置。",
+	"agent_error.runtime_version_unsupported":     "Agent runtime 版本不受支持，请升级后重试。",
+	"agent_error.runtime_missing_executable":      "Agent runtime 缺少所需的可执行文件，请检查安装。",
+	"agent_error.unknown":                         defaultFailureMessage,
 }
diff --git a/server/internal/integrations/octo/outbound_test.go b/server/internal/integrations/octo/outbound_test.go
index b2d488ba4f..ddbf0111ac 100644
--- a/server/internal/integrations/octo/outbound_test.go
+++ b/server/internal/integrations/octo/outbound_test.go
@@ -119,6 +119,73 @@ func TestProcessEvent_TaskFailed_SendsError(t *testing.T) {
 	}
 }
 
+func TestProcessEvent_TaskFailed_FallsBackToFailureReason(t *testing.T) {
+	q := &fakePatcherQueries{binding: octoBinding(), inst: activeInst()}
+	s := &fakeSender{}
+	p := newPatcher(q, s)
+
+	// No explicit error string — only the coarse classifier. The relay must
+	// translate it into the friendly Chinese copy, not the generic fallback.
+	e := events.Event{
+		Type:          protocol.EventTaskFailed,
+		TaskID:        "11111111-1111-1111-1111-111111111111",
+		ChatSessionID: "22222222-2222-2222-2222-222222222222",
+		Payload:       map[string]any{"failure_reason": "agent_error.provider_auth_or_access"},
+	}
+	if err := p.processEvent(context.Background(), e); err != nil {
+		t.Fatalf("processEvent: %v", err)
+	}
+	want := "⚠️ " + failureReasonText["agent_error.provider_auth_or_access"]
+	if s.sent != 1 || s.lastTxt != want {
+		t.Errorf("sent=%d lastTxt=%q, want %q", s.sent, s.lastTxt, want)
+	}
+}
+
+func TestProcessEvent_TaskFailed_DefaultWhenNoDetail(t *testing.T) {
+	q := &fakePatcherQueries{binding: octoBinding(), inst: activeInst()}
+	s := &fakeSender{}
+	p := newPatcher(q, s)
+
+	// Neither error nor failure_reason — the user still gets an actionable,
+	// non-empty message rather than a bare or English fallback.
+	e := events.Event{
+		Type:          protocol.EventTaskFailed,
+		TaskID:        "11111111-1111-1111-1111-111111111111",
+		ChatSessionID: "22222222-2222-2222-2222-222222222222",
+		Payload:       map[string]any{},
+	}
+	if err := p.processEvent(context.Background(), e); err != nil {
+		t.Fatalf("processEvent: %v", err)
+	}
+	if s.sent != 1 || s.lastTxt != "⚠️ "+defaultFailureMessage {
+		t.Errorf("sent=%d lastTxt=%q, want default", s.sent, s.lastTxt)
+	}
+}
+
+func TestFailureMessageFromPayload(t *testing.T) {
+	cases := []struct {
+		name    string
+		payload any
+		want    string
+	}{
+		{"explicit error wins", map[string]any{"error": "boom", "failure_reason": "timeout"}, "boom"},
+		{"error_message alias", map[string]any{"error_message": "kaboom"}, "kaboom"},
+		{"known reason", map[string]any{"failure_reason": "runtime_offline"}, failureReasonText["runtime_offline"]},
+		{"unknown reason downgrades", map[string]any{"failure_reason": "some_future_reason"}, defaultFailureMessage},
+		{"empty map", map[string]any{}, defaultFailureMessage},
+		{"non-map payload", "not a map", defaultFailureMessage},
+		{"nil payload", nil, defaultFailureMessage},
+		{"empty error falls through to reason", map[string]any{"error": "", "failure_reason": "timeout"}, failureReasonText["timeout"]},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			if got := failureMessageFromPayload(tc.payload); got != tc.want {
+				t.Errorf("failureMessageFromPayload(%v) = %q, want %q", tc.payload, got, tc.want)
+			}
+		})
+	}
+}
+
 func TestProcessEvent_WebOnlySession_Skips(t *testing.T) {
 	q := &fakePatcherQueries{bindingErr: pgx.ErrNoRows}
 	s := &fakeSender{}
diff --git a/server/internal/service/task.go b/server/internal/service/task.go
index 58471129f4..37c28668d2 100644
--- a/server/internal/service/task.go
+++ b/server/internal/service/task.go
@@ -1766,17 +1766,28 @@ func (s *TaskService) HandleFailedTasks(ctx context.Context, tasks []db.AgentTas
 		}
 
 		if workspaceID != "" {
+			payload := map[string]any{
+				"task_id":        util.UUIDToString(t.ID),
+				"agent_id":       util.UUIDToString(t.AgentID),
+				"issue_id":       util.UUIDToString(t.IssueID),
+				"status":         "failed",
+				"failure_reason": failureReason,
+			}
+			// Chat tasks carry no issue_id; the outbound relays route on the
+			// chat session, so it must be present or the IM reply is dropped.
+			if t.ChatSessionID.Valid {
+				payload["chat_session_id"] = util.UUIDToString(t.ChatSessionID)
+			}
+			// Redacted human-readable detail so the IM user sees the actual
+			// cause, not just "agent run failed".
+			if t.Error.Valid && t.Error.String != "" {
+				payload["error"] = redact.Text(t.Error.String)
+			}
 			s.Bus.Publish(events.Event{
 				Type:        protocol.EventTaskFailed,
 				WorkspaceID: workspaceID,
 				ActorType:   "system",
-				Payload: map[string]any{
-					"task_id":        util.UUIDToString(t.ID),
-					"agent_id":       util.UUIDToString(t.AgentID),
-					"issue_id":       util.UUIDToString(t.IssueID),
-					"status":         "failed",
-					"failure_reason": failureReason,
-				},
+				Payload:     payload,
 			})
 		}
 
@@ -2007,6 +2018,17 @@ func (s *TaskService) broadcastTaskEvent(ctx context.Context, eventType string,
 	if task.ChatSessionID.Valid {
 		payload["chat_session_id"] = util.UUIDToString(task.ChatSessionID)
 	}
+	// Surface the failure detail to event consumers (e.g. the Octo/Lark
+	// outbound relays) so an IM user sees what went wrong instead of a generic
+	// "agent run failed". error is the redacted human-readable message; the
+	// coarse failure_reason is the machine classifier. Both are only set on a
+	// failed task row, so a completed/cancelled task simply omits them.
+	if task.FailureReason.Valid && task.FailureReason.String != "" {
+		payload["failure_reason"] = task.FailureReason.String
+	}
+	if task.Error.Valid && task.Error.String != "" {
+		payload["error"] = redact.Text(task.Error.String)
+	}
 	s.Bus.Publish(events.Event{
 		Type:        eventType,
 		WorkspaceID: workspaceID,