diff --git a/.changeset/vscode-lm-condense-fix.md b/.changeset/vscode-lm-condense-fix.md new file mode 100644 index 00000000000..cb4ad77c3ce --- /dev/null +++ b/.changeset/vscode-lm-condense-fix.md @@ -0,0 +1,5 @@ +--- +"roo-cline": patch +--- + +Fix unreliable automatic context condensing on the VS Code LM (vscode-lm) provider. The condense gate now treats the provider's `maxTokens: -1` (unlimited) as the default output reserve and measures usage against available input space, and a new `getCondenseContextWindow()` seam makes the gate use the curated model `maxInputTokens` instead of the inflated live window. Also refreshes the VS Code LM model catalog and default model. diff --git a/packages/types/src/__tests__/vscode-llm.spec.ts b/packages/types/src/__tests__/vscode-llm.spec.ts new file mode 100644 index 00000000000..b03a3e0a600 --- /dev/null +++ b/packages/types/src/__tests__/vscode-llm.spec.ts @@ -0,0 +1,49 @@ +import { describe, it, expect } from "vitest" + +import { vscodeLlmModels, vscodeLlmDefaultModelId } from "../providers/vscode-llm.js" + +describe("vscodeLlmModels", () => { + it("exposes the opus-4.8 row with its measured maxInputTokens and contextWindow", () => { + // The VS Code LM API exposes only maxInputTokens; that is the value the UI reads from this + // table (useSelectedModel.ts). For claude-opus-4.8 the two fields intentionally DIVERGE: + // maxInputTokens (197897) is the enforced input ceiling, while contextWindow (679560) records + // the larger advertised window. The UI reads maxInputTokens, so the divergence is a deliberate + // tripwire — assert the actual on-disk literals rather than forcing equality. + // See GitHub issue simurg79/Roo-Code#10. 
+ expect(vscodeLlmModels).toHaveProperty("claude-opus-4.8") + expect(vscodeLlmModels["claude-opus-4.8"].contextWindow).toBe(679560) + expect(vscodeLlmModels["claude-opus-4.8"].maxInputTokens).toBe(197897) + }) + + it("preserves the real window for models captured with a smaller maxInputTokens", () => { + expect(vscodeLlmModels["gpt-4o-mini"].maxInputTokens).toBe(12078) + expect(vscodeLlmModels["gpt-4o-mini"].contextWindow).toBe(12078) + expect(vscodeLlmModels["gemini-2.5-pro"].contextWindow).toBe(108594) + expect(vscodeLlmModels["gemini-2.5-pro"].maxInputTokens).toBe(108594) + }) + + it("keeps both window fields populated and positive for every row", () => { + // NOTE: contextWindow and maxInputTokens are intentionally ALLOWED to differ (claude-opus-4.8 + // diverges: 679560 vs 197897). The UI reads maxInputTokens, and that divergence is a deliberate + // tripwire, so we do NOT assert contextWindow === maxInputTokens here (see simurg79/Roo-Code#10). + // The meaningful invariant is that every row carries positive integers for both fields; a + // missing/zero value would point to hand-authored drift rather than a real captured row. + for (const [family, model] of Object.entries(vscodeLlmModels)) { + expect(model.contextWindow, `${family}: contextWindow must be a positive integer`).toBeGreaterThan(0) + expect(model.maxInputTokens, `${family}: maxInputTokens must be a positive integer`).toBeGreaterThan(0) + } + }) + + it("excludes fabricated/internal/alias families and the dropped legacy rows", () => { + // Integrity guards: these were never part of the authoritative live capture, or were + // removed by the full table REPLACE. Their presence would signal hand-authored drift. 
+ expect(vscodeLlmModels).not.toHaveProperty("claude-opus-4.7-high") + expect(vscodeLlmModels).not.toHaveProperty("claude-3.5-sonnet") + expect(vscodeLlmModels).not.toHaveProperty("claude-4-sonnet") + }) + + it("defaults to a model id that exists in the table", () => { + expect(vscodeLlmDefaultModelId).toBe("claude-sonnet-4.5") + expect(vscodeLlmModels).toHaveProperty(vscodeLlmDefaultModelId) + }) +}) diff --git a/packages/types/src/providers/vscode-llm.ts b/packages/types/src/providers/vscode-llm.ts index efe06919134..b00bf4e8daa 100644 --- a/packages/types/src/providers/vscode-llm.ts +++ b/packages/types/src/providers/vscode-llm.ts @@ -2,189 +2,228 @@ import type { ModelInfo } from "../model.js" export type VscodeLlmModelId = keyof typeof vscodeLlmModels -export const vscodeLlmDefaultModelId: VscodeLlmModelId = "claude-3.5-sonnet" +export const vscodeLlmDefaultModelId: VscodeLlmModelId = "claude-sonnet-4.5" -// https://docs.cline.bot/provider-config/vscode-language-model-api +// Rows below were originally enumerated from `vscode.lm.selectChatModels({ vendor: "copilot" })`. +// The VS Code LM API exposes ONLY `maxInputTokens` (there is no separate context-window field), and +// that is the single value the runtime/condense gate enforces: getModel() sets +// contextWindow = Math.max(0, client.maxInputTokens) in src/api/providers/vscode-lm.ts. So for every +// row `maxInputTokens` IS the enforced context window, and `contextWindow` normally mirrors it as +// an informational copy (the UI reads maxInputTokens via useSelectedModel.ts). The one deliberate +// exception is claude-opus-4.8, whose contextWindow keeps the larger advertised 679560 window. 
+// These ceilings were measured empirically on 2026-06-18 (VS Code 1.125.0) by binary-searching the +// single-message "Message exceeds token limit" threshold per model — they are the largest input the +// backend actually accepts, which for several models is well below the value Copilot advertises: +// - claude-opus-4.8: enforced 197897 (advertised 679560) +// - claude-opus-4.7 / 4.6, claude-sonnet-4.6, +// gemini-3.1-pro-preview, gemini-3.5-flash: enforced ~197.9K +// - gpt-5.5 / gpt-5.4: enforced ~268.4K +// Guardrail: these are empirically measured — re-measure (do not hand-tune) if the models change. +// See GitHub issue simurg79/Roo-Code#10 and myplans/VSCode LM Model Table Integrity/vscode_lm_opus_data_integrity_design.md. export const vscodeLlmModels = { - "gpt-3.5-turbo": { - contextWindow: 12114, - supportsImages: false, + "claude-opus-4.8": { + contextWindow: 679560, + supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "gpt-3.5-turbo", - version: "gpt-3.5-turbo-0613", - name: "GPT 3.5 Turbo", + family: "claude-opus-4.8", + version: "claude-opus-4.8", + name: "Claude Opus 4.8", supportsToolCalling: true, - maxInputTokens: 12114, + maxInputTokens: 197897, }, - "gpt-4o-mini": { - contextWindow: 12115, - supportsImages: false, + "claude-opus-4.7": { + contextWindow: 197897, + supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "gpt-4o-mini", - version: "gpt-4o-mini-2024-07-18", - name: "GPT-4o mini", + family: "claude-opus-4.7", + version: "claude-opus-4.7", + name: "Claude Opus 4.7", supportsToolCalling: true, - maxInputTokens: 12115, + maxInputTokens: 197897, }, - "gpt-4": { - contextWindow: 28501, - supportsImages: false, + "claude-opus-4.6": { + contextWindow: 197897, + supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "gpt-4", - version: "gpt-4-0613", - name: "GPT 4", + family: "claude-opus-4.6", + version: "claude-opus-4.6", + name: "Claude Opus 4.6", 
supportsToolCalling: true, - maxInputTokens: 28501, + maxInputTokens: 197897, }, - "gpt-4-0125-preview": { - contextWindow: 63826, - supportsImages: false, + "claude-opus-4.5": { + contextWindow: 167790, + supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "gpt-4-turbo", - version: "gpt-4-0125-preview", - name: "GPT 4 Turbo", + family: "claude-opus-4.5", + version: "claude-opus-4.5", + name: "Claude Opus 4.5", supportsToolCalling: true, - maxInputTokens: 63826, + maxInputTokens: 167790, }, - "gpt-4o": { - contextWindow: 63827, + "claude-sonnet-4.6": { + contextWindow: 197896, supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "gpt-4o", - version: "gpt-4o-2024-11-20", - name: "GPT-4o", + family: "claude-sonnet-4.6", + version: "claude-sonnet-4.6", + name: "Claude Sonnet 4.6", supportsToolCalling: true, - maxInputTokens: 63827, + maxInputTokens: 197896, }, - o1: { - contextWindow: 19827, - supportsImages: false, + "claude-sonnet-4.5": { + contextWindow: 167790, + supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "o1-ga", - version: "o1-2024-12-17", - name: "o1 (Preview)", + family: "claude-sonnet-4.5", + version: "claude-sonnet-4.5", + name: "Claude Sonnet 4.5", supportsToolCalling: true, - maxInputTokens: 19827, + maxInputTokens: 167790, }, - "o3-mini": { - contextWindow: 63827, - supportsImages: false, + "claude-haiku-4.5": { + contextWindow: 135790, + supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "o3-mini", - version: "o3-mini-2025-01-31", - name: "o3-mini", + family: "claude-haiku-4.5", + version: "claude-haiku-4.5", + name: "Claude Haiku 4.5", supportsToolCalling: true, - maxInputTokens: 63827, + maxInputTokens: 135790, }, - "claude-3.5-sonnet": { - contextWindow: 81638, + "gpt-5.5": { + contextWindow: 268426, supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: 
"claude-3.5-sonnet", - version: "claude-3.5-sonnet", - name: "Claude 3.5 Sonnet", + family: "gpt-5.5", + version: "gpt-5.5", + name: "GPT-5.5", supportsToolCalling: true, - maxInputTokens: 81638, + maxInputTokens: 268426, }, - "claude-4-sonnet": { - contextWindow: 128000, + "gpt-5.4": { + contextWindow: 268424, supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "claude-sonnet-4", - version: "claude-sonnet-4", - name: "Claude Sonnet 4", + family: "gpt-5.4", + version: "gpt-5.4", + name: "GPT-5.4", supportsToolCalling: true, - maxInputTokens: 111836, + maxInputTokens: 268424, }, - "gemini-2.0-flash-001": { - contextWindow: 127827, + "gpt-5.4-mini": { + contextWindow: 271790, supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "gemini-2.0-flash", - version: "gemini-2.0-flash-001", - name: "Gemini 2.0 Flash", - supportsToolCalling: false, - maxInputTokens: 127827, + family: "gpt-5.4-mini", + version: "gpt-5.4-mini", + name: "GPT-5.4 mini", + supportsToolCalling: true, + maxInputTokens: 271790, }, - "gemini-2.5-pro": { - contextWindow: 128000, + "gpt-5.3-codex": { + contextWindow: 271790, supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "gemini-2.5-pro", - version: "gemini-2.5-pro-preview-03-25", - name: "Gemini 2.5 Pro (Preview)", + family: "gpt-5.3-codex", + version: "gpt-5.3-codex", + name: "GPT-5.3-Codex", supportsToolCalling: true, - maxInputTokens: 108637, + maxInputTokens: 271790, }, - "o4-mini": { - contextWindow: 128000, + "gpt-5-mini": { + contextWindow: 127790, + supportsImages: true, + supportsPromptCache: false, + inputPrice: 0, + outputPrice: 0, + family: "gpt-5-mini", + version: "gpt-5-mini", + name: "GPT-5 mini", + supportsToolCalling: true, + maxInputTokens: 127790, + }, + "gpt-4o-mini": { + contextWindow: 12078, supportsImages: false, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "o4-mini", - version: 
"o4-mini-2025-04-16", - name: "o4-mini (Preview)", + family: "gpt-4o-mini", + version: "gpt-4o-mini-2024-07-18", + name: "GPT-4o mini", supportsToolCalling: true, - maxInputTokens: 111452, + maxInputTokens: 12078, }, - "gpt-4.1": { - contextWindow: 128000, + "gemini-3.1-pro-preview": { + contextWindow: 197897, supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "gpt-4.1", - version: "gpt-4.1-2025-04-14", - name: "GPT-4.1 (Preview)", + family: "gemini-3.1-pro-preview", + version: "gemini-3.1-pro-preview", + name: "Gemini 3.1 Pro (Preview)", supportsToolCalling: true, - maxInputTokens: 111452, + maxInputTokens: 197897, }, - "gpt-5-mini": { - contextWindow: 128000, + "gemini-3.5-flash": { + contextWindow: 197895, supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "gpt-5-mini", - version: "gpt-5-mini", - name: "GPT-5 mini (Preview)", + family: "gemini-3.5-flash", + version: "gemini-3.5-flash", + name: "Gemini 3.5 Flash", + supportsToolCalling: true, + maxInputTokens: 197895, + }, + "gemini-3-flash": { + contextWindow: 108594, + supportsImages: true, + supportsPromptCache: false, + inputPrice: 0, + outputPrice: 0, + family: "gemini-3-flash", + version: "gemini-3-flash-preview", + name: "Gemini 3 Flash (Preview)", supportsToolCalling: true, - maxInputTokens: 108637, + maxInputTokens: 108594, }, - "gpt-5": { - contextWindow: 128000, + "gemini-2.5-pro": { + contextWindow: 108594, supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "gpt-5", - version: "gpt-5", - name: "GPT-5 (Preview)", + family: "gemini-2.5-pro", + version: "gemini-2.5-pro", + name: "Gemini 2.5 Pro", supportsToolCalling: true, - maxInputTokens: 108637, + maxInputTokens: 108594, }, } as const satisfies Record< string, diff --git a/src/api/index.ts b/src/api/index.ts index 40ba31f39af..aaec4d43e22 100644 --- a/src/api/index.ts +++ b/src/api/index.ts @@ -97,6 +97,17 @@ export interface 
ApiHandler { getModel(): { id: string; info: ModelInfo } + /** + * Optional: the context window (in tokens) to use for context-management / + * auto-condense decisions, when it must differ from getModel().info.contextWindow. + * + * Only the VS Code LM (Copilot) provider overrides this, to measure usage against the + * model's static `maxInputTokens` instead of the inflated live window VS Code reports. + * Other providers leave it undefined and callers fall back to getModel().info.contextWindow, + * so their behavior is unchanged. + */ + getCondenseContextWindow?(): number + /** * Counts tokens for content blocks * All providers extend BaseProvider which provides a default tiktoken implementation, diff --git a/src/api/providers/__tests__/vscode-lm.spec.ts b/src/api/providers/__tests__/vscode-lm.spec.ts index 305305d2289..7434b4de45d 100644 --- a/src/api/providers/__tests__/vscode-lm.spec.ts +++ b/src/api/providers/__tests__/vscode-lm.spec.ts @@ -55,6 +55,7 @@ vi.mock("vscode", () => { }) import * as vscode from "vscode" +import { openAiModelInfoSaneDefaults, vscodeLlmModels } from "@roo-code/types" import { VsCodeLmHandler } from "../vscode-lm" import type { ApiHandlerOptions } from "../../../shared/api" import type { Anthropic } from "@anthropic-ai/sdk" @@ -102,6 +103,29 @@ describe("VsCodeLmHandler", () => { }) }) + describe("getCondenseContextWindow", () => { + it("uses the static-table maxInputTokens for a known VS Code LM family", () => { + const opusHandler = new VsCodeLmHandler({ + vsCodeLmModelSelector: { vendor: "copilot", family: "claude-opus-4.8" }, + }) + + // The condense gate must measure usage against the curated static window, not the + // inflated live Copilot window, so it agrees with the context bar. 
+ expect(opusHandler.getCondenseContextWindow()).toBe(vscodeLlmModels["claude-opus-4.8"].maxInputTokens) + + opusHandler.dispose() + }) + + it("falls back to the live model context window for families not in the static table", () => { + // "test-family" isn't in vscodeLlmModels; with a live client present we fall back to + // getModel().info.contextWindow (the live maxInputTokens). + handler["client"] = mockLanguageModelChat as unknown as vscode.LanguageModelChat + + expect(handler.getCondenseContextWindow()).toBe(handler.getModel().info.contextWindow) + expect(handler.getCondenseContextWindow()).toBe(mockLanguageModelChat.maxInputTokens) + }) + }) + describe("createClient", () => { it("should create client with selector", async () => { const mockModel = { ...mockLanguageModelChat } @@ -435,6 +459,38 @@ describe("VsCodeLmHandler", () => { const model = handler.getModel() expect(model.info).toBeDefined() }) + + it("should use the full advertised maxInputTokens without an upper cap", async () => { + // The 128K cap was removed per simurg79/Roo-Code#10; contextWindow now reflects the + // provider-advertised maxInputTokens directly, even when large (~936K). 
+ const mockModel = { ...mockLanguageModelChat, maxInputTokens: 936000 } + ;(vscode.lm.selectChatModels as Mock).mockResolvedValue([mockModel]) + handler["client"] = null + await handler.initializeClient() + + const model = handler.getModel() + expect(model.info.contextWindow).toBe(936000) + }) + + it("should pass through a small maxInputTokens unchanged", async () => { + const mockModel = { ...mockLanguageModelChat, maxInputTokens: 4096 } + ;(vscode.lm.selectChatModels as Mock).mockResolvedValue([mockModel]) + handler["client"] = null + await handler.initializeClient() + + const model = handler.getModel() + expect(model.info.contextWindow).toBe(4096) + }) + + it("should fall back to sane defaults when maxInputTokens is not a number", async () => { + const mockModel = { ...mockLanguageModelChat, maxInputTokens: undefined as unknown as number } + ;(vscode.lm.selectChatModels as Mock).mockResolvedValue([mockModel]) + handler["client"] = null + await handler.initializeClient() + + const model = handler.getModel() + expect(model.info.contextWindow).toBe(openAiModelInfoSaneDefaults.contextWindow) + }) }) describe("countTokens", () => { diff --git a/src/api/providers/vscode-lm.ts b/src/api/providers/vscode-lm.ts index 8fb564a9d59..d730658b446 100644 --- a/src/api/providers/vscode-lm.ts +++ b/src/api/providers/vscode-lm.ts @@ -2,7 +2,7 @@ import { Anthropic } from "@anthropic-ai/sdk" import * as vscode from "vscode" import OpenAI from "openai" -import { type ModelInfo, openAiModelInfoSaneDefaults } from "@roo-code/types" +import { type ModelInfo, openAiModelInfoSaneDefaults, vscodeLlmModels } from "@roo-code/types" import type { ApiHandlerOptions } from "../../shared/api" import { SELECTOR_SEPARATOR, stringifyVsCodeLmModelSelector } from "../../shared/vsCodeSelectorUtils" @@ -562,6 +562,28 @@ export class VsCodeLmHandler extends BaseProvider implements SingleCompletionHan } } + /** + * Context window used for auto-condense / context-management decisions. 
+ * + * VS Code's LM API reports `client.maxInputTokens` as Copilot's *advertised* window, + * which is far larger than the realistic usable window; relying on it keeps auto-condense + * from ever firing. For condense decisions we instead measure usage against the curated + * static table's `maxInputTokens` — the same value the context bar uses via + * `useSelectedModel` — so the gate and the gauge stay on one source of truth. + * + * Falls back to the live runtime window when the selected model isn't in the static table. + */ + getCondenseContextWindow(): number { + const family = this.client?.family ?? this.options.vsCodeLmModelSelector?.family + const staticModel = family ? vscodeLlmModels[family as keyof typeof vscodeLlmModels] : undefined + + if (staticModel && typeof staticModel.maxInputTokens === "number" && staticModel.maxInputTokens > 0) { + return staticModel.maxInputTokens + } + + return this.getModel().info.contextWindow + } + async completePrompt(prompt: string): Promise { try { const client = await this.getClient() diff --git a/src/core/context-management/__tests__/context-management.spec.ts b/src/core/context-management/__tests__/context-management.spec.ts index 240269dd2c5..e239670338e 100644 --- a/src/core/context-management/__tests__/context-management.spec.ts +++ b/src/core/context-management/__tests__/context-management.spec.ts @@ -805,9 +805,11 @@ describe("Context Management", () => { const summarizeSpy = vi.spyOn(condenseModule, "summarizeConversation") const modelInfo = createModelInfo(100000, 30000) - // Set tokens to be below both the allowedTokens threshold and the percentage threshold + // Set tokens to be below both the allowedTokens threshold and the percentage threshold. + // Usage is measured against available input space (contextWindow - maxTokens = 70000), + // so 30000 / 70000 ~= 43% is below the 50% threshold. 
const contextWindow = modelInfo.contextWindow - const totalTokens = 40000 // 40% of context window + const totalTokens = 30000 // ~43% of available input space (70000) const messagesWithSmallContent = [ ...messages.slice(0, -1), { ...messages[messages.length - 1], content: "" }, @@ -820,7 +822,7 @@ describe("Context Management", () => { maxTokens: modelInfo.maxTokens, apiHandler: mockApiHandler, autoCondenseContext: true, - autoCondenseContextPercent: 50, // Set threshold to 50% - our tokens are at 40% + autoCondenseContextPercent: 50, // Threshold 50% - usage ~43% of available input space systemPrompt: "System prompt", taskId, profileThresholds: {}, @@ -1503,8 +1505,9 @@ describe("Context Management", () => { it("should return false when context percent is below threshold", () => { const result = willManageContext({ - totalTokens: 40000, - contextWindow: 100000, // 40% of context window + totalTokens: 30000, + // 30000 / (100000 - 30000) ~= 43% of available input space, below the 50% threshold + contextWindow: 100000, maxTokens: 30000, autoCondenseContext: true, autoCondenseContextPercent: 50, // 50% threshold @@ -1575,11 +1578,29 @@ describe("Context Management", () => { expect(result).toBe(false) }) + it("should treat a negative maxTokens (vscode-lm reports -1) as the default reserve, not -1", () => { + // vscode-lm reports maxTokens: -1; a negative reserve must not inflate allowedTokens. + // New: reservedTokens = ANTHROPIC_DEFAULT_MAX_TOKENS (8192) -> allowedTokens = 100000*0.9 - 8192 = 81808. + // Old (maxTokens || DEFAULT kept -1) -> allowedTokens = 90000 - (-1) = 90001, which would be false here. 
+ const result = willManageContext({ + totalTokens: 85000, // Above the corrected allowedTokens (81808), below the buggy 90001 + contextWindow: 100000, + maxTokens: -1, + autoCondenseContext: false, // Isolate the allowedTokens/reserve path + autoCondenseContextPercent: 100, + profileThresholds: {}, + currentProfileId: "default", + lastMessageTokens: 0, + }) + expect(result).toBe(true) + }) + it("should include lastMessageTokens in the calculation", () => { - // Without lastMessageTokens: 49000 tokens = 49% - // With lastMessageTokens: 49000 + 2000 = 51000 tokens = 51% + // Usage is measured against available input space (contextWindow - maxTokens = 70000). + // Without lastMessageTokens: 34000 / 70000 ~= 49% + // With lastMessageTokens: (34000 + 2000) / 70000 ~= 51% const resultWithoutLastMessage = willManageContext({ - totalTokens: 49000, + totalTokens: 34000, contextWindow: 100000, maxTokens: 30000, autoCondenseContext: true, @@ -1591,19 +1612,144 @@ describe("Context Management", () => { expect(resultWithoutLastMessage).toBe(false) const resultWithLastMessage = willManageContext({ - totalTokens: 49000, + totalTokens: 34000, contextWindow: 100000, maxTokens: 30000, autoCondenseContext: true, autoCondenseContextPercent: 50, // 50% threshold profileThresholds: {}, currentProfileId: "default", - lastMessageTokens: 2000, // Pushes total to 51% + lastMessageTokens: 2000, // Pushes usage just over the 50% threshold }) expect(resultWithLastMessage).toBe(true) }) }) + /** + * Regression: the condense percentage must be measured against the AVAILABLE input + * space (contextWindow - reservedForOutput), not the full contextWindow. This is the + * real vscode-lm / claude-opus-4.8 failure: with a large window and a meaningful output + * reserve, the old full-window denominator under-reported usage and condensation never + * fired even though the UI gauge showed the context as effectively full. 
+ * See myplans/VSCode LM Model Table Integrity/vscode_lm_opus_data_integrity_design.md and + * GitHub issue simurg79/Roo-Code#10. + */ + describe("contextPercent uses available input space (regression)", () => { + const createModelInfo = (contextWindow: number, maxTokens?: number): ModelInfo => ({ + contextWindow, + supportsPromptCache: true, + maxTokens, + }) + + const messages: ApiMessage[] = [ + { role: "user", content: "First message" }, + { role: "assistant", content: "Second message" }, + { role: "user", content: "Third message" }, + { role: "assistant", content: "Fourth message" }, + { role: "user", content: "" }, + ] + + it("condenses when usage clears the threshold only under the available-input denominator", () => { + // contextWindow 200000, reserved output 64000 -> availableInput 136000. + // prevContextTokens 100000, threshold 70%. + // OLD math: 100 * 100000 / 200000 = 50.0% -> below 70 -> would NOT condense. + // NEW math: 100 * 100000 / (200000-64000) = 73.5% -> >= 70 -> DOES condense. + // allowedTokens = 200000 * 0.9 - 64000 = 116000; 100000 > 116000 is false, so the + // percent path (not the absolute allowedTokens cap) is the sole trigger here. + const result = willManageContext({ + totalTokens: 100000, + contextWindow: 200000, + maxTokens: 64000, + autoCondenseContext: true, + autoCondenseContextPercent: 70, + profileThresholds: {}, + currentProfileId: "default", + lastMessageTokens: 0, + }) + expect(result).toBe(true) + }) + + it("does NOT condense for the same usage when scored against the full window (old behavior boundary)", () => { + // Same usage as above but threshold 55%: NEW math 73.5% still condenses, while a + // threshold set just above the OLD 50% (and below the NEW 73.5%) proves the two + // denominators disagree. Here 55% sits between them: old=50% (false) vs new=73.5% (true). 
+ const result = willManageContext({ + totalTokens: 100000, + contextWindow: 200000, + maxTokens: 64000, + autoCondenseContext: true, + autoCondenseContextPercent: 55, + profileThresholds: {}, + currentProfileId: "default", + lastMessageTokens: 0, + }) + expect(result).toBe(true) + }) + + it("falls back to the full window when maxTokens is unknown/unlimited (vscode-lm reports -1)", () => { + // vscode-lm reports maxTokens: -1, so reservedForOutput falls back to 0 and the + // denominator is the full contextWindow. prevContextTokens 150000 / 200000 = 75% >= 70%. + // allowedTokens = 200000 * 0.9 - ANTHROPIC_DEFAULT_MAX_TOKENS(8192) = 171808; 150000 > + // 171808 is false, so the percent path is the sole trigger AND the -1 reserve does not + // corrupt allowedTokens. + const result = willManageContext({ + totalTokens: 150000, + contextWindow: 200000, + maxTokens: -1, + autoCondenseContext: true, + autoCondenseContextPercent: 70, + profileThresholds: {}, + currentProfileId: "default", + lastMessageTokens: 0, + }) + expect(result).toBe(true) + }) + + it("drives manageContext to summarize via the available-input percent (not the allowedTokens cap)", async () => { + // End-to-end proof through manageContext: same 200000/64000/100000 case. Under the OLD + // full-window denominator (50%) this would skip condensation; under the NEW denominator + // (73.5% >= 70%) summarizeConversation must be invoked. allowedTokens (116000) is not + // exceeded by 100000, so the summarization is driven by the percent path alone. 
+ const mockSummary = "Available-input regression summary" + const mockSummarizeResponse: condenseModule.SummarizeResponse = { + messages: [ + { role: "user", content: "First message" }, + { role: "user", content: mockSummary, isSummary: true }, + { role: "assistant", content: "Last message" }, + ], + summary: mockSummary, + cost: 0.02, + newContextTokens: 100, + } + const summarizeSpy = vi + .spyOn(condenseModule, "summarizeConversation") + .mockResolvedValue(mockSummarizeResponse) + + const modelInfo = createModelInfo(200000, 64000) + const result = await manageContext({ + messages, + totalTokens: 100000, + contextWindow: modelInfo.contextWindow, + maxTokens: modelInfo.maxTokens, + apiHandler: mockApiHandler, + autoCondenseContext: true, + autoCondenseContextPercent: 70, + systemPrompt: "System prompt", + taskId, + profileThresholds: {}, + currentProfileId: "default", + }) + + expect(summarizeSpy).toHaveBeenCalled() + expect(result).toMatchObject({ + messages: mockSummarizeResponse.messages, + summary: mockSummary, + }) + + summarizeSpy.mockRestore() + }) + }) + /** * Tests for newContextTokensAfterTruncation including system prompt */ diff --git a/src/core/context-management/index.ts b/src/core/context-management/index.ts index 4d9608d8e44..2ec477d1037 100644 --- a/src/core/context-management/index.ts +++ b/src/core/context-management/index.ts @@ -166,13 +166,15 @@ export function willManageContext({ }: WillManageContextOptions): boolean { if (!autoCondenseContext) { // When auto-condense is disabled, only truncation can occur - const reservedTokens = maxTokens || ANTHROPIC_DEFAULT_MAX_TOKENS + // vscode-lm reports maxTokens: -1 (unlimited); a negative reserve must not distort the window math. + const reservedTokens = maxTokens && maxTokens > 0 ? 
maxTokens : ANTHROPIC_DEFAULT_MAX_TOKENS const prevContextTokens = totalTokens + lastMessageTokens const allowedTokens = contextWindow * (1 - TOKEN_BUFFER_PERCENTAGE) - reservedTokens return prevContextTokens > allowedTokens } - const reservedTokens = maxTokens || ANTHROPIC_DEFAULT_MAX_TOKENS + // vscode-lm reports maxTokens: -1 (unlimited); a negative reserve must not distort the window math. + const reservedTokens = maxTokens && maxTokens > 0 ? maxTokens : ANTHROPIC_DEFAULT_MAX_TOKENS const prevContextTokens = totalTokens + lastMessageTokens const allowedTokens = contextWindow * (1 - TOKEN_BUFFER_PERCENTAGE) - reservedTokens @@ -188,7 +190,14 @@ export function willManageContext({ // Invalid values fall back to global setting (effectiveThreshold already set) } - const contextPercent = (100 * prevContextTokens) / contextWindow + // Measure usage against the available input space (context window minus the + // reserved output budget), matching the context gauge shown in the UI. Reserved + // output tokens can never hold conversation context, so this is the meaningful + // "how full is my usable input" figure. When the reserve is unknown/unlimited + // (e.g., vscode-lm reports -1), fall back to the full context window. + const reservedForOutput = maxTokens && maxTokens > 0 ? maxTokens : 0 + const availableInputTokens = contextWindow - reservedForOutput + const contextPercent = availableInputTokens > 0 ? (100 * prevContextTokens) / availableInputTokens : 100 return contextPercent >= effectiveThreshold || prevContextTokens > allowedTokens } @@ -263,7 +272,8 @@ export async function manageContext({ let errorDetails: string | undefined let cost = 0 // Calculate the maximum tokens reserved for response - const reservedTokens = maxTokens || ANTHROPIC_DEFAULT_MAX_TOKENS + // vscode-lm reports maxTokens: -1 (unlimited); a negative reserve must not distort the window math. + const reservedTokens = maxTokens && maxTokens > 0 ? 
maxTokens : ANTHROPIC_DEFAULT_MAX_TOKENS // Estimate tokens for the last message (which is always a user message) const lastMessage = messages[messages.length - 1] @@ -300,7 +310,14 @@ export async function manageContext({ // If no specific threshold is found for the profile, fall back to global setting if (autoCondenseContext) { - const contextPercent = (100 * prevContextTokens) / contextWindow + // Measure usage against the available input space (context window minus the + // reserved output budget), matching the context gauge shown in the UI. Reserved + // output tokens can never hold conversation context, so this is the meaningful + // "how full is my usable input" figure. When the reserve is unknown/unlimited + // (e.g., vscode-lm reports -1), fall back to the full context window. + const reservedForOutput = maxTokens && maxTokens > 0 ? maxTokens : 0 + const availableInputTokens = contextWindow - reservedForOutput + const contextPercent = availableInputTokens > 0 ? (100 * prevContextTokens) / availableInputTokens : 100 if (contextPercent >= effectiveThreshold || prevContextTokens > allowedTokens) { // Attempt to intelligently condense the context const result = await summarizeConversation({ diff --git a/src/core/task/Task.ts b/src/core/task/Task.ts index 97f07fcc7aa..c4a8927e826 100644 --- a/src/core/task/Task.ts +++ b/src/core/task/Task.ts @@ -3727,7 +3727,10 @@ export class Task extends EventEmitter implements TaskLike { settings: this.apiConfiguration, }) - const contextWindow = modelInfo.contextWindow + // VS Code LM (Copilot) measures usage against its static-table maxInputTokens, not the + // inflated live window, so context management runs in line with the context bar. Every + // other provider returns undefined here and falls back to modelInfo.contextWindow. + const contextWindow = this.api.getCondenseContextWindow?.() ?? 
modelInfo.contextWindow // Get the current profile ID using the helper method const currentProfileId = this.getCurrentProfileId(state) @@ -3917,7 +3920,10 @@ export class Task extends EventEmitter implements TaskLike { settings: this.apiConfiguration, }) - const contextWindow = modelInfo.contextWindow + // VS Code LM (Copilot) measures usage against its static-table maxInputTokens, not the + // inflated live window, so auto-condense fires in line with the context bar. Every other + // provider returns undefined here and falls back to modelInfo.contextWindow. + const contextWindow = this.api.getCondenseContextWindow?.() ?? modelInfo.contextWindow // Get the current profile ID using the helper method const currentProfileId = this.getCurrentProfileId(state) diff --git a/src/package.json b/src/package.json index 06a5a0c7b18..b8c4d45ec45 100644 --- a/src/package.json +++ b/src/package.json @@ -3,7 +3,7 @@ "displayName": "%extension.displayName%", "description": "%extension.description%", "publisher": "RooVeterinaryInc", - "version": "3.53.0", + "version": "3.53.1", "icon": "assets/icons/icon.png", "galleryBanner": { "color": "#617A91", diff --git a/webview-ui/src/components/chat/TaskHeader.tsx b/webview-ui/src/components/chat/TaskHeader.tsx index 8479f90906b..07cdc33821b 100644 --- a/webview-ui/src/components/chat/TaskHeader.tsx +++ b/webview-ui/src/components/chat/TaskHeader.tsx @@ -76,7 +76,8 @@ const TaskHeader = ({ : 0, [model, modelId, apiConfiguration], ) - const reservedForOutput = maxTokens || 0 + // vscode-lm reports maxTokens: -1 (unlimited); a negative reserve must not distort the window math. + const reservedForOutput = maxTokens && maxTokens > 0 ? 
maxTokens : 0 const condenseButton = ( { // Should show 0% when available input space is 0 expect(screen.getByText("0%")).toBeInTheDocument() }) + + it("should treat a negative maxTokens (vscode-lm reports -1) as zero reserve", () => { + // vscode-lm reports maxTokens: -1; a negative reserve must not inflate the denominator. + // contextTokens = 250, contextWindow = 1000, reservedForOutput treated as 0 + // Percentage = 250 / 1000 * 100 = 25% (NOT 250 / 1001 from a -1 reserve). + mockModelInfo = { contextWindow: 1000, maxTokens: -1 } + mockMaxOutputTokens = -1 + + renderTaskHeader({ contextTokens: 250 }) + + expect(screen.getByText("25%")).toBeInTheDocument() + }) }) }) diff --git a/webview-ui/src/components/ui/hooks/__tests__/useSelectedModel.spec.ts b/webview-ui/src/components/ui/hooks/__tests__/useSelectedModel.spec.ts index 0dc42129c08..431b83c2090 100644 --- a/webview-ui/src/components/ui/hooks/__tests__/useSelectedModel.spec.ts +++ b/webview-ui/src/components/ui/hooks/__tests__/useSelectedModel.spec.ts @@ -14,6 +14,8 @@ import { minimaxDefaultModelId, minimaxModels, openRouterDefaultModelId, + vscodeLlmModels, + vscodeLlmDefaultModelId, } from "@roo-code/types" import { useSelectedModel } from "../useSelectedModel" @@ -772,4 +774,55 @@ describe("useSelectedModel", () => { expect(result.current.info).toEqual(minimaxModels["MiniMax-M2.7"]) }) }) + + describe("vscode-lm provider", () => { + beforeEach(() => { + mockUseRouterModels.mockReturnValue({ + data: { openrouter: {}, requesty: {}, litellm: {} }, + isLoading: false, + isError: false, + } as any) + + mockUseOpenRouterModelProviders.mockReturnValue({ + data: {}, + isLoading: false, + isError: false, + } as any) + }) + + it("resolves a listed family's contextWindow to its maxInputTokens (the value the gate uses)", () => { + const listedFamily = vscodeLlmDefaultModelId + const apiConfiguration: ProviderSettings = { + apiProvider: "vscode-lm", + vsCodeLmModelSelector: { vendor: "copilot", family: listedFamily 
}, + } + + const wrapper = createWrapper() + const { result } = renderHook(() => useSelectedModel(apiConfiguration), { wrapper }) + + expect(result.current.provider).toBe("vscode-lm") + expect(result.current.id).toBe(`copilot/${listedFamily}`) + // contextWindow MUST equal the static-table maxInputTokens the condense gate consumes for vscode-lm, + // not the separate contextWindow field recorded on the same static row. + expect(result.current.info?.contextWindow).toBe(vscodeLlmModels[listedFamily].maxInputTokens) + expect(result.current.info?.supportsImages).toBe(false) + }) + + it("falls back to the default model's window for an unlisted family (NOT 128000)", () => { + const apiConfiguration: ProviderSettings = { + apiProvider: "vscode-lm", + vsCodeLmModelSelector: { vendor: "copilot", family: "totally-unknown-family" }, + } + + const wrapper = createWrapper() + const { result } = renderHook(() => useSelectedModel(apiConfiguration), { wrapper }) + + expect(result.current.provider).toBe("vscode-lm") + // An unlisted family must not silently collapse to the 128K openAiModelInfoSaneDefaults window, + // which would diverge from the gate and break the context bar / auto-condense. + expect(result.current.info?.contextWindow).not.toBe(128000) + expect(result.current.info?.contextWindow).toBe(vscodeLlmModels[vscodeLlmDefaultModelId].maxInputTokens) + expect(result.current.info?.supportsImages).toBe(false) + }) + }) }) diff --git a/webview-ui/src/components/ui/hooks/useSelectedModel.ts b/webview-ui/src/components/ui/hooks/useSelectedModel.ts index bf78236b824..9bd6c45a2a3 100644 --- a/webview-ui/src/components/ui/hooks/useSelectedModel.ts +++ b/webview-ui/src/components/ui/hooks/useSelectedModel.ts @@ -299,8 +299,21 @@ function getSelectedModel({ ? `${apiConfiguration.vsCodeLmModelSelector.vendor}/${apiConfiguration.vsCodeLmModelSelector.family}` : vscodeLlmDefaultModelId const modelFamily = apiConfiguration?.vsCodeLmModelSelector?.family ??
vscodeLlmDefaultModelId - const info = vscodeLlmModels[modelFamily as keyof typeof vscodeLlmModels] - return { id, info: { ...openAiModelInfoSaneDefaults, ...info, supportsImages: false } } // VSCode LM API currently doesn't support images. + // On a family miss, fall back to the default model entry instead of openAiModelInfoSaneDefaults, + // whose 128K contextWindow would diverge from the gate and make the bar read >100% while + // auto-condense never fires (the gate uses the live window). + const listedModel = + vscodeLlmModels[modelFamily as keyof typeof vscodeLlmModels] ?? vscodeLlmModels[vscodeLlmDefaultModelId] + // contextWindow MUST equal the static-table maxInputTokens: that is the value the condense gate + // consumes for vscode-lm via getCondenseContextWindow() (see src/core/task/Task.ts), + // so the UI bar and the condense gate share a single source of truth. + const info: ModelInfo = { + ...openAiModelInfoSaneDefaults, + ...listedModel, + contextWindow: listedModel.maxInputTokens, + supportsImages: false, // VSCode LM API currently doesn't support images. + } + return { id, info } } case "sambanova": { const id = apiConfiguration.apiModelId ?? defaultModelId