diff --git a/packages/types/src/__tests__/vscode-llm.spec.ts b/packages/types/src/__tests__/vscode-llm.spec.ts new file mode 100644 index 0000000000..041bc3c8b4 --- /dev/null +++ b/packages/types/src/__tests__/vscode-llm.spec.ts @@ -0,0 +1,33 @@ +import { describe, it, expect } from "vitest" +import { vscodeLlmModels, vscodeLlmDefaultModelId } from "../providers/vscode-llm.js" + +describe("vscodeLlmModels", () => { + it("exposes the opus-4.8 row with its measured maxInputTokens and contextWindow", () => { + // claude-opus-4.8 intentionally diverges: maxInputTokens (197897) is the enforced ceiling the + // UI reads, contextWindow (679560) the advertised window. Assert the on-disk literals as a tripwire. + expect(vscodeLlmModels).toHaveProperty("claude-opus-4.8") + expect(vscodeLlmModels["claude-opus-4.8"].contextWindow).toBe(679560) + expect(vscodeLlmModels["claude-opus-4.8"].maxInputTokens).toBe(197897) + }) + it("preserves the real window for models captured with a smaller maxInputTokens", () => { + expect(vscodeLlmModels["gpt-4o-mini"].maxInputTokens).toBe(12078) + expect(vscodeLlmModels["gpt-4o-mini"].contextWindow).toBe(12078) + expect(vscodeLlmModels["gemini-2.5-pro"].contextWindow).toBe(108594) + expect(vscodeLlmModels["gemini-2.5-pro"].maxInputTokens).toBe(108594) + }) + it("keeps both window fields populated and positive for every row", () => { + for (const [family, model] of Object.entries(vscodeLlmModels)) { + expect(model.contextWindow, `${family}: contextWindow must be a positive integer`).toBeGreaterThan(0) + expect(model.maxInputTokens, `${family}: maxInputTokens must be a positive integer`).toBeGreaterThan(0) + } + }) + it("excludes fabricated/internal/alias families and the dropped legacy rows", () => { + expect(vscodeLlmModels).not.toHaveProperty("claude-opus-4.7-high") + expect(vscodeLlmModels).not.toHaveProperty("claude-3.5-sonnet") + expect(vscodeLlmModels).not.toHaveProperty("claude-4-sonnet") + }) + it("defaults to a model id that exists in 
the table", () => { + expect(vscodeLlmDefaultModelId).toBe("claude-sonnet-4.5") + expect(vscodeLlmModels).toHaveProperty(vscodeLlmDefaultModelId) + }) +}) diff --git a/packages/types/src/providers/vscode-llm.ts b/packages/types/src/providers/vscode-llm.ts index efe0691913..5286b0ed28 100644 --- a/packages/types/src/providers/vscode-llm.ts +++ b/packages/types/src/providers/vscode-llm.ts @@ -2,189 +2,215 @@ import type { ModelInfo } from "../model.js" export type VscodeLlmModelId = keyof typeof vscodeLlmModels -export const vscodeLlmDefaultModelId: VscodeLlmModelId = "claude-3.5-sonnet" +export const vscodeLlmDefaultModelId: VscodeLlmModelId = "claude-sonnet-4.5" -// https://docs.cline.bot/provider-config/vscode-language-model-api +// Curated VS Code LM (GitHub Copilot) model catalog. +// The API exposes only `maxInputTokens`; the UI and condense gate read this table's value. `contextWindow` is +// the advertised window, set on every row but differing from that ceiling only on claude-opus-4.8. export const vscodeLlmModels = { - "gpt-3.5-turbo": { - contextWindow: 12114, - supportsImages: false, + "claude-opus-4.8": { + contextWindow: 679560, + supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "gpt-3.5-turbo", - version: "gpt-3.5-turbo-0613", - name: "GPT 3.5 Turbo", + family: "claude-opus-4.8", + version: "claude-opus-4.8", + name: "Claude Opus 4.8", supportsToolCalling: true, - maxInputTokens: 12114, + maxInputTokens: 197897, }, - "gpt-4o-mini": { - contextWindow: 12115, - supportsImages: false, + "claude-opus-4.7": { + contextWindow: 197897, + supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "gpt-4o-mini", - version: "gpt-4o-mini-2024-07-18", - name: "GPT-4o mini", + family: "claude-opus-4.7", + version: "claude-opus-4.7", + name: "Claude Opus 4.7", supportsToolCalling: true, - maxInputTokens: 12115, + maxInputTokens: 197897, }, - "gpt-4": { - contextWindow: 28501, - supportsImages: false, + 
"claude-opus-4.6": { + contextWindow: 197897, + supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "gpt-4", - version: "gpt-4-0613", - name: "GPT 4", + family: "claude-opus-4.6", + version: "claude-opus-4.6", + name: "Claude Opus 4.6", supportsToolCalling: true, - maxInputTokens: 28501, + maxInputTokens: 197897, }, - "gpt-4-0125-preview": { - contextWindow: 63826, - supportsImages: false, + "claude-opus-4.5": { + contextWindow: 167790, + supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "gpt-4-turbo", - version: "gpt-4-0125-preview", - name: "GPT 4 Turbo", + family: "claude-opus-4.5", + version: "claude-opus-4.5", + name: "Claude Opus 4.5", supportsToolCalling: true, - maxInputTokens: 63826, + maxInputTokens: 167790, }, - "gpt-4o": { - contextWindow: 63827, + "claude-sonnet-4.6": { + contextWindow: 197896, supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "gpt-4o", - version: "gpt-4o-2024-11-20", - name: "GPT-4o", + family: "claude-sonnet-4.6", + version: "claude-sonnet-4.6", + name: "Claude Sonnet 4.6", supportsToolCalling: true, - maxInputTokens: 63827, + maxInputTokens: 197896, }, - o1: { - contextWindow: 19827, - supportsImages: false, + "claude-sonnet-4.5": { + contextWindow: 167790, + supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "o1-ga", - version: "o1-2024-12-17", - name: "o1 (Preview)", + family: "claude-sonnet-4.5", + version: "claude-sonnet-4.5", + name: "Claude Sonnet 4.5", supportsToolCalling: true, - maxInputTokens: 19827, + maxInputTokens: 167790, }, - "o3-mini": { - contextWindow: 63827, - supportsImages: false, + "claude-haiku-4.5": { + contextWindow: 135790, + supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "o3-mini", - version: "o3-mini-2025-01-31", - name: "o3-mini", + family: "claude-haiku-4.5", + version: "claude-haiku-4.5", + name: 
"Claude Haiku 4.5", supportsToolCalling: true, - maxInputTokens: 63827, + maxInputTokens: 135790, }, - "claude-3.5-sonnet": { - contextWindow: 81638, + "gpt-5.5": { + contextWindow: 268426, supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "claude-3.5-sonnet", - version: "claude-3.5-sonnet", - name: "Claude 3.5 Sonnet", + family: "gpt-5.5", + version: "gpt-5.5", + name: "GPT-5.5", supportsToolCalling: true, - maxInputTokens: 81638, + maxInputTokens: 268426, }, - "claude-4-sonnet": { - contextWindow: 128000, + "gpt-5.4": { + contextWindow: 268424, supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "claude-sonnet-4", - version: "claude-sonnet-4", - name: "Claude Sonnet 4", + family: "gpt-5.4", + version: "gpt-5.4", + name: "GPT-5.4", supportsToolCalling: true, - maxInputTokens: 111836, + maxInputTokens: 268424, }, - "gemini-2.0-flash-001": { - contextWindow: 127827, + "gpt-5.4-mini": { + contextWindow: 271790, supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "gemini-2.0-flash", - version: "gemini-2.0-flash-001", - name: "Gemini 2.0 Flash", - supportsToolCalling: false, - maxInputTokens: 127827, + family: "gpt-5.4-mini", + version: "gpt-5.4-mini", + name: "GPT-5.4 mini", + supportsToolCalling: true, + maxInputTokens: 271790, }, - "gemini-2.5-pro": { - contextWindow: 128000, + "gpt-5.3-codex": { + contextWindow: 271790, supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "gemini-2.5-pro", - version: "gemini-2.5-pro-preview-03-25", - name: "Gemini 2.5 Pro (Preview)", + family: "gpt-5.3-codex", + version: "gpt-5.3-codex", + name: "GPT-5.3-Codex", supportsToolCalling: true, - maxInputTokens: 108637, + maxInputTokens: 271790, }, - "o4-mini": { - contextWindow: 128000, + "gpt-5-mini": { + contextWindow: 127790, + supportsImages: true, + supportsPromptCache: false, + inputPrice: 0, + outputPrice: 0, + family: 
"gpt-5-mini", + version: "gpt-5-mini", + name: "GPT-5 mini", + supportsToolCalling: true, + maxInputTokens: 127790, + }, + "gpt-4o-mini": { + contextWindow: 12078, supportsImages: false, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "o4-mini", - version: "o4-mini-2025-04-16", - name: "o4-mini (Preview)", + family: "gpt-4o-mini", + version: "gpt-4o-mini-2024-07-18", + name: "GPT-4o mini", supportsToolCalling: true, - maxInputTokens: 111452, + maxInputTokens: 12078, }, - "gpt-4.1": { - contextWindow: 128000, + "gemini-3.1-pro-preview": { + contextWindow: 197897, supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "gpt-4.1", - version: "gpt-4.1-2025-04-14", - name: "GPT-4.1 (Preview)", + family: "gemini-3.1-pro-preview", + version: "gemini-3.1-pro-preview", + name: "Gemini 3.1 Pro (Preview)", supportsToolCalling: true, - maxInputTokens: 111452, + maxInputTokens: 197897, }, - "gpt-5-mini": { - contextWindow: 128000, + "gemini-3.5-flash": { + contextWindow: 197895, supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "gpt-5-mini", - version: "gpt-5-mini", - name: "GPT-5 mini (Preview)", + family: "gemini-3.5-flash", + version: "gemini-3.5-flash", + name: "Gemini 3.5 Flash", + supportsToolCalling: true, + maxInputTokens: 197895, + }, + "gemini-3-flash": { + contextWindow: 108594, + supportsImages: true, + supportsPromptCache: false, + inputPrice: 0, + outputPrice: 0, + family: "gemini-3-flash", + version: "gemini-3-flash-preview", + name: "Gemini 3 Flash (Preview)", supportsToolCalling: true, - maxInputTokens: 108637, + maxInputTokens: 108594, }, - "gpt-5": { - contextWindow: 128000, + "gemini-2.5-pro": { + contextWindow: 108594, supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "gpt-5", - version: "gpt-5", - name: "GPT-5 (Preview)", + family: "gemini-2.5-pro", + version: "gemini-2.5-pro", + name: "Gemini 2.5 Pro", 
supportsToolCalling: true, - maxInputTokens: 108637, + maxInputTokens: 108594, }, } as const satisfies Record< string, diff --git a/src/api/index.ts b/src/api/index.ts index 0c901f8e23..9e4ba3bfb5 100644 --- a/src/api/index.ts +++ b/src/api/index.ts @@ -107,6 +107,13 @@ export interface ApiHandler { getModel(): { id: string; info: ModelInfo } + /** + * Optional context window for context-management / auto-condense when it must differ from + * getModel().info.contextWindow. Only VS Code LM overrides it (static `maxInputTokens` vs its + * inflated live window); others leave it undefined and callers fall back. + */ + getCondenseContextWindow?(): number + /** * Counts tokens for content blocks * All providers extend BaseProvider which provides a default tiktoken implementation, diff --git a/src/api/providers/__tests__/vscode-lm.spec.ts b/src/api/providers/__tests__/vscode-lm.spec.ts index a79a5a4bcb..5c425b5e25 100644 --- a/src/api/providers/__tests__/vscode-lm.spec.ts +++ b/src/api/providers/__tests__/vscode-lm.spec.ts @@ -63,6 +63,7 @@ import * as vscode from "vscode" import { VsCodeLmHandler } from "../vscode-lm" import type { ApiHandlerOptions } from "../../../shared/api" import type { Anthropic } from "@anthropic-ai/sdk" +import { openAiModelInfoSaneDefaults, vscodeLlmModels } from "@roo-code/types" const mockLanguageModelChat = { id: "test-model", @@ -440,6 +441,85 @@ describe("VsCodeLmHandler", () => { const model = handler.getModel() expect(model.info).toBeDefined() }) + + it("should use the full advertised maxInputTokens without an upper cap", async () => { + // A large advertised window is surfaced as-is, not clamped to a smaller default. 
+ const mockModel = { ...mockLanguageModelChat, maxInputTokens: 936000 } + ;(vscode.lm.selectChatModels as Mock).mockResolvedValue([mockModel]) + handler["client"] = null + await handler.initializeClient() + + const model = handler.getModel() + expect(model.info.contextWindow).toBe(936000) + }) + + it("should pass through a small maxInputTokens unchanged", async () => { + const mockModel = { ...mockLanguageModelChat, maxInputTokens: 4096 } + ;(vscode.lm.selectChatModels as Mock).mockResolvedValue([mockModel]) + handler["client"] = null + await handler.initializeClient() + + const model = handler.getModel() + expect(model.info.contextWindow).toBe(4096) + }) + + it("should fall back to sane defaults when maxInputTokens is not a number", async () => { + const mockModel = { ...mockLanguageModelChat, maxInputTokens: undefined as unknown as number } + ;(vscode.lm.selectChatModels as Mock).mockResolvedValue([mockModel]) + handler["client"] = null + await handler.initializeClient() + + const model = handler.getModel() + expect(model.info.contextWindow).toBe(openAiModelInfoSaneDefaults.contextWindow) + }) + }) + + describe("getCondenseContextWindow", () => { + it("uses the static-table maxInputTokens for a known VS Code LM family", () => { + const opusHandler = new VsCodeLmHandler({ + vsCodeLmModelSelector: { vendor: "copilot", family: "claude-opus-4.8" }, + }) + expect(opusHandler.getCondenseContextWindow()).toBe(vscodeLlmModels["claude-opus-4.8"].maxInputTokens) + opusHandler.dispose() + }) + + it("falls back to the live model context window for families not in the static table", () => { + // Not a curated row, so the gate uses the live runtime window. 
+ handler["client"] = mockLanguageModelChat as unknown as vscode.LanguageModelChat + expect(handler.getCondenseContextWindow()).toBe(handler.getModel().info.contextWindow) + expect(handler.getCondenseContextWindow()).toBe(mockLanguageModelChat.maxInputTokens) + }) + + it("falls back to the live window when no family is resolvable (no client, no selector family)", () => { + // No client and no selector family means `family` is undefined, so the gate skips the + // static lookup and uses getModel().info.contextWindow. + const noFamilyHandler = new VsCodeLmHandler({ vsCodeLmModelSelector: { vendor: "copilot" } }) + noFamilyHandler["client"] = null + expect(noFamilyHandler.getCondenseContextWindow()).toBe(noFamilyHandler.getModel().info.contextWindow) + expect(noFamilyHandler.getCondenseContextWindow()).toBe(openAiModelInfoSaneDefaults.contextWindow) + noFamilyHandler.dispose() + }) + + it("falls back to the derived window when the static row exists but maxInputTokens is non-positive", () => { + // A curated row exists but its maxInputTokens is <= 0, so the `> 0` guard fails and the gate + // falls back to getModel().info.contextWindow. + const family = "claude-opus-4.8" + const original = vscodeLlmModels[family].maxInputTokens + try { + ;(vscodeLlmModels[family] as { maxInputTokens: number }).maxInputTokens = 0 + const guardHandler = new VsCodeLmHandler({ + vsCodeLmModelSelector: { vendor: "copilot", family }, + }) + // Leave the client unset so `family` resolves from the selector, forcing the zeroed + // static row to be read instead of a live client's family. 
+ guardHandler["client"] = null + expect(guardHandler.getCondenseContextWindow()).toBe(guardHandler.getModel().info.contextWindow) + expect(guardHandler.getCondenseContextWindow()).toBe(openAiModelInfoSaneDefaults.contextWindow) + guardHandler.dispose() + } finally { + ;(vscodeLlmModels[family] as { maxInputTokens: number }).maxInputTokens = original + } + }) }) describe("countTokens", () => { diff --git a/src/api/providers/vscode-lm.ts b/src/api/providers/vscode-lm.ts index 8fb564a9d5..9adcefa972 100644 --- a/src/api/providers/vscode-lm.ts +++ b/src/api/providers/vscode-lm.ts @@ -2,7 +2,7 @@ import { Anthropic } from "@anthropic-ai/sdk" import * as vscode from "vscode" import OpenAI from "openai" -import { type ModelInfo, openAiModelInfoSaneDefaults } from "@roo-code/types" +import { type ModelInfo, openAiModelInfoSaneDefaults, vscodeLlmModels } from "@roo-code/types" import type { ApiHandlerOptions } from "../../shared/api" import { SELECTOR_SEPARATOR, stringifyVsCodeLmModelSelector } from "../../shared/vsCodeSelectorUtils" @@ -562,6 +562,23 @@ export class VsCodeLmHandler extends BaseProvider implements SingleCompletionHan } } + /** + * Context window for auto-condense. The API's advertised `client.maxInputTokens` is far larger + * than usable, so relying on it stops auto-condense from firing; measure against the curated + * static table's `maxInputTokens` instead (the same value the bar uses). Fall back to the live + * window when the model isn't in the table. + */ + getCondenseContextWindow(): number { + const family = this.client?.family ?? this.options.vsCodeLmModelSelector?.family + const staticModel = family ? 
vscodeLlmModels[family as keyof typeof vscodeLlmModels] : undefined + + if (staticModel && typeof staticModel.maxInputTokens === "number" && staticModel.maxInputTokens > 0) { + return staticModel.maxInputTokens + } + + return this.getModel().info.contextWindow + } + async completePrompt(prompt: string): Promise { try { const client = await this.getClient() diff --git a/src/core/context-management/__tests__/context-management.spec.ts b/src/core/context-management/__tests__/context-management.spec.ts index 9950ec536b..89797b045f 100644 --- a/src/core/context-management/__tests__/context-management.spec.ts +++ b/src/core/context-management/__tests__/context-management.spec.ts @@ -810,9 +810,9 @@ describe("Context Management", () => { const summarizeSpy = vi.spyOn(condenseModule, "summarizeConversation") const modelInfo = createModelInfo(100000, 30000) - // Set tokens to be below both the allowedTokens threshold and the percentage threshold + // Usage stays below both the allowedTokens limit and the percentage threshold (default full-window denominator). const contextWindow = modelInfo.contextWindow - const totalTokens = 40000 // 40% of context window + const totalTokens = 30000 const messagesWithSmallContent = [ ...messages.slice(0, -1), { ...messages[messages.length - 1], content: "" }, @@ -825,7 +825,7 @@ describe("Context Management", () => { maxTokens: modelInfo.maxTokens, apiHandler: mockApiHandler, autoCondenseContext: true, - autoCondenseContextPercent: 50, // Set threshold to 50% - our tokens are at 40% + autoCondenseContextPercent: 50, // Set threshold to 50% - usage is ~30% of the full window (no opt-in flag on this call) systemPrompt: "System prompt", taskId, profileThresholds: {}, @@ -1507,19 +1507,36 @@ describe("Context Management", () => { }) it("should return false when context percent is below threshold", () => { + // Opt-in available-input denominator: usage stays below threshold. 
const result = willManageContext({ - totalTokens: 40000, - contextWindow: 100000, // 40% of context window + totalTokens: 30000, + contextWindow: 100000, maxTokens: 30000, autoCondenseContext: true, - autoCondenseContextPercent: 50, // 50% threshold + autoCondenseContextPercent: 50, // 50% threshold; usage is ~43% of available input profileThresholds: {}, currentProfileId: "default", lastMessageTokens: 0, + useAvailableInputForContextPercent: true, }) expect(result).toBe(false) }) + it("should treat a negative maxTokens (vscode-lm reports -1) as the default reserve, not -1", () => { + // A -1 reserve must be treated as unknown (default reserve), not kept as -1. + const result = willManageContext({ + totalTokens: 85000, + contextWindow: 100000, + maxTokens: -1, + autoCondenseContext: false, + autoCondenseContextPercent: 50, + profileThresholds: {}, + currentProfileId: "default", + lastMessageTokens: 0, + }) + expect(result).toBe(true) + }) + it("should return true when tokens exceed allowedTokens even if autoCondenseContext is false", () => { // allowedTokens = contextWindow * (1 - 0.1) - reservedTokens = 100000 * 0.9 - 30000 = 60000 const result = willManageContext({ @@ -1581,10 +1598,9 @@ describe("Context Management", () => { }) it("should include lastMessageTokens in the calculation", () => { - // Without lastMessageTokens: 49000 tokens = 49% - // With lastMessageTokens: 49000 + 2000 = 51000 tokens = 51% + // Adding lastMessageTokens pushes usage over the threshold (opt-in available-input denominator). 
const resultWithoutLastMessage = willManageContext({ - totalTokens: 49000, + totalTokens: 34000, contextWindow: 100000, maxTokens: 30000, autoCondenseContext: true, @@ -1592,18 +1608,20 @@ describe("Context Management", () => { profileThresholds: {}, currentProfileId: "default", lastMessageTokens: 0, + useAvailableInputForContextPercent: true, }) expect(resultWithoutLastMessage).toBe(false) const resultWithLastMessage = willManageContext({ - totalTokens: 49000, + totalTokens: 34000, contextWindow: 100000, maxTokens: 30000, autoCondenseContext: true, autoCondenseContextPercent: 50, // 50% threshold profileThresholds: {}, currentProfileId: "default", - lastMessageTokens: 2000, // Pushes total to 51% + lastMessageTokens: 2000, // Pushes usage over 50% of available input + useAvailableInputForContextPercent: true, }) expect(resultWithLastMessage).toBe(true) }) @@ -1701,4 +1719,292 @@ describe("Context Management", () => { expect(result.newContextTokensAfterTruncation).toBeGreaterThan(0) }) }) + + /** + * Regression: with the opt-in flag on, the gate measures usage against available input space + * (contextWindow - reserved output) so it stays in lockstep with the UI gauge and fires for vscode-lm. + */ + describe("contextPercent uses available input space (opt-in, regression)", () => { + const createModelInfo = (contextWindow: number, maxTokens?: number): ModelInfo => ({ + contextWindow, + supportsPromptCache: true, + maxTokens, + }) + + const messages: ApiMessage[] = [ + { role: "user", content: "First message" }, + { role: "assistant", content: "Second message" }, + { role: "user", content: "Third message" }, + { role: "assistant", content: "Fourth message" }, + { role: "user", content: "Fifth message" }, + ] + + it("willManageContext measures the percentage against available input, not the full window", () => { + // Dividing by available input clears the threshold; the full window would keep the gate closed. 
+ const result = willManageContext({ + totalTokens: 100000, + contextWindow: 200000, + maxTokens: 64000, + autoCondenseContext: true, + autoCondenseContextPercent: 70, + profileThresholds: {}, + currentProfileId: "default", + lastMessageTokens: 0, + useAvailableInputForContextPercent: true, + }) + expect(result).toBe(true) + }) + + it("willManageContext stays below threshold when usage is under available input", () => { + // Usage under available input stays below threshold. + const result = willManageContext({ + totalTokens: 90000, + contextWindow: 200000, + maxTokens: 64000, + autoCondenseContext: true, + autoCondenseContextPercent: 70, + profileThresholds: {}, + currentProfileId: "default", + lastMessageTokens: 0, + useAvailableInputForContextPercent: true, + }) + expect(result).toBe(false) + }) + + it("willManageContext treats an unlimited (-1) reserve as zero reserve for the percentage", () => { + // A -1 reserve falls back to the full window (zero reserve) for the percentage. + const result = willManageContext({ + totalTokens: 150000, + contextWindow: 200000, + maxTokens: -1, + autoCondenseContext: true, + autoCondenseContextPercent: 70, + profileThresholds: {}, + currentProfileId: "default", + lastMessageTokens: 0, + useAvailableInputForContextPercent: true, + }) + expect(result).toBe(true) + }) + + it("willManageContext falls back to 100% when the reserve is >= the window (availableInput <= 0)", () => { + // Non-positive available input must short-circuit contextPercent to 100 rather than divide. 
+ const result = willManageContext({ + totalTokens: 1, + contextWindow: 50000, + maxTokens: 60000, // reserve > window → availableInput = -10000 + autoCondenseContext: true, + autoCondenseContextPercent: 80, + profileThresholds: {}, + currentProfileId: "default", + lastMessageTokens: 0, + useAvailableInputForContextPercent: true, + }) + expect(result).toBe(true) + }) + + it("willManageContext falls back to 100% when the reserve exactly equals the window (availableInput === 0)", () => { + // Boundary: reserve === window → available input 0, still the non-positive guard. + const result = willManageContext({ + totalTokens: 1, + contextWindow: 50000, + maxTokens: 50000, + autoCondenseContext: true, + autoCondenseContextPercent: 90, + profileThresholds: {}, + currentProfileId: "default", + lastMessageTokens: 0, + useAvailableInputForContextPercent: true, + }) + expect(result).toBe(true) + }) + + it("manageContext summarizes via the 100% fallback when the reserve >= the window (availableInput <= 0)", async () => { + // reserve >= window forces contextPercent to 100, so summarization triggers. 
+ const mockSummary = "Reserve-exceeds-window summary" + const mockSummarizeResponse: condenseModule.SummarizeResponse = { + messages: [ + { role: "user", content: "First message" }, + { role: "user", content: mockSummary, isSummary: true }, + { role: "assistant", content: "Last message" }, + ], + summary: mockSummary, + cost: 0.05, + newContextTokens: 100, + } + const summarizeSpy = vi + .spyOn(condenseModule, "summarizeConversation") + .mockResolvedValue(mockSummarizeResponse) + + const messagesWithSmallContent = [ + ...messages.slice(0, -1), + { ...messages[messages.length - 1], content: "" }, + ] + + const result = await manageContext({ + messages: messagesWithSmallContent, + totalTokens: 1, + contextWindow: 50000, + maxTokens: 60000, + apiHandler: mockApiHandler, + autoCondenseContext: true, + autoCondenseContextPercent: 80, + systemPrompt: "System prompt", + taskId, + profileThresholds: {}, + currentProfileId: "default", + useAvailableInputForContextPercent: true, + }) + + expect(summarizeSpy).toHaveBeenCalled() + expect(result).toMatchObject({ + summary: mockSummary, + prevContextTokens: 1, + }) + + summarizeSpy.mockRestore() + }) + + it("manageContext summarizes based on available input space, end-to-end", async () => { + const mockSummary = "Available-input summary" + const mockSummarizeResponse: condenseModule.SummarizeResponse = { + messages: [ + { role: "user", content: "First message" }, + { role: "user", content: mockSummary, isSummary: true }, + { role: "assistant", content: "Last message" }, + ], + summary: mockSummary, + cost: 0.05, + newContextTokens: 100, + } + const summarizeSpy = vi + .spyOn(condenseModule, "summarizeConversation") + .mockResolvedValue(mockSummarizeResponse) + + const modelInfo = createModelInfo(200000, 64000) + // Clears the threshold against available input but not the raw window; end-to-end must summarize. 
+ const totalTokens = 100000 + const messagesWithSmallContent = [ + ...messages.slice(0, -1), + { ...messages[messages.length - 1], content: "" }, + ] + + const result = await manageContext({ + messages: messagesWithSmallContent, + totalTokens, + contextWindow: modelInfo.contextWindow, + maxTokens: modelInfo.maxTokens, + apiHandler: mockApiHandler, + autoCondenseContext: true, + autoCondenseContextPercent: 70, + systemPrompt: "System prompt", + taskId, + profileThresholds: {}, + currentProfileId: "default", + useAvailableInputForContextPercent: true, + }) + + expect(summarizeSpy).toHaveBeenCalled() + expect(result).toMatchObject({ + summary: mockSummary, + prevContextTokens: totalTokens, + }) + + summarizeSpy.mockRestore() + }) + }) + + /** + * Scoping: the available-input denominator is opt-in; default divides by the full window. + * The maxTokens: -1 reserve guard stays global on the default path. + */ + describe("contextPercent denominator is opt-in (default = full window)", () => { + const messages: ApiMessage[] = [ + { role: "user", content: "First message" }, + { role: "assistant", content: "Second message" }, + { role: "user", content: "Third message" }, + { role: "assistant", content: "Fourth message" }, + { role: "user", content: "Fifth message" }, + ] + + it("willManageContext divides by the full window when the flag is omitted (default)", () => { + // Default divides by the full window, staying below threshold where the opt-in math would fire. + const result = willManageContext({ + totalTokens: 100000, + contextWindow: 200000, + maxTokens: 64000, + autoCondenseContext: true, + autoCondenseContextPercent: 70, + profileThresholds: {}, + currentProfileId: "default", + lastMessageTokens: 0, + }) + expect(result).toBe(false) + }) + + it("willManageContext fires on the same inputs when the opt-in flag is true", () => { + // Same inputs, flag on: dividing by available input clears the threshold. 
+ const result = willManageContext({ + totalTokens: 100000, + contextWindow: 200000, + maxTokens: 64000, + autoCondenseContext: true, + autoCondenseContextPercent: 70, + profileThresholds: {}, + currentProfileId: "default", + lastMessageTokens: 0, + useAvailableInputForContextPercent: true, + }) + expect(result).toBe(true) + }) + + it("keeps the maxTokens:-1 reserve guard on the default (full-window) path", () => { + // The -1 reserve guard is global, independent of the percent denominator. + const result = willManageContext({ + totalTokens: 85000, + contextWindow: 100000, + maxTokens: -1, + autoCondenseContext: false, + autoCondenseContextPercent: 50, + profileThresholds: {}, + currentProfileId: "default", + lastMessageTokens: 0, + }) + expect(result).toBe(true) + }) + + it("manageContext does NOT summarize on the default path where the opt-in math would have", async () => { + // Default full-window math leaves this case below threshold; the opt-in flag would summarize it. + const summarizeSpy = vi.spyOn(condenseModule, "summarizeConversation") + + const messagesWithSmallContent = [ + ...messages.slice(0, -1), + { ...messages[messages.length - 1], content: "" }, + ] + + const result = await manageContext({ + messages: messagesWithSmallContent, + totalTokens: 100000, + contextWindow: 200000, + maxTokens: 64000, + apiHandler: mockApiHandler, + autoCondenseContext: true, + autoCondenseContextPercent: 70, + systemPrompt: "System prompt", + taskId, + profileThresholds: {}, + currentProfileId: "default", + }) + + expect(summarizeSpy).not.toHaveBeenCalled() + expect(result).toEqual({ + messages: messagesWithSmallContent, + summary: "", + cost: 0, + prevContextTokens: 100000, + }) + + summarizeSpy.mockRestore() + }) + }) }) diff --git a/src/core/context-management/index.ts b/src/core/context-management/index.ts index 243d7bd797..ed2ee6be5f 100644 --- a/src/core/context-management/index.ts +++ b/src/core/context-management/index.ts @@ -147,6 +147,11 @@ export type 
WillManageContextOptions = { profileThresholds: Record currentProfileId: string lastMessageTokens: number + /** + * Opt-in (vscode-lm): measure the condense percentage against available input space + * (contextWindow - reserved output) instead of the full window. Others leave it undefined. + */ + useAvailableInputForContextPercent?: boolean } /** @@ -167,16 +172,19 @@ export function willManageContext({ profileThresholds, currentProfileId, lastMessageTokens, + useAvailableInputForContextPercent, }: WillManageContextOptions): boolean { if (!autoCondenseContext) { // When auto-condense is disabled, only truncation can occur - const reservedTokens = maxTokens || ANTHROPIC_DEFAULT_MAX_TOKENS + // vscode-lm reports maxTokens: -1 (unlimited); a negative reserve must not distort the window math. + const reservedTokens = maxTokens && maxTokens > 0 ? maxTokens : ANTHROPIC_DEFAULT_MAX_TOKENS const prevContextTokens = totalTokens + lastMessageTokens const allowedTokens = contextWindow * (1 - TOKEN_BUFFER_PERCENTAGE) - reservedTokens return prevContextTokens > allowedTokens } - const reservedTokens = maxTokens || ANTHROPIC_DEFAULT_MAX_TOKENS + // vscode-lm reports maxTokens: -1 (unlimited); a negative reserve must not distort the window math. + const reservedTokens = maxTokens && maxTokens > 0 ? maxTokens : ANTHROPIC_DEFAULT_MAX_TOKENS const prevContextTokens = totalTokens + lastMessageTokens const allowedTokens = contextWindow * (1 - TOKEN_BUFFER_PERCENTAGE) - reservedTokens @@ -192,7 +200,16 @@ export function willManageContext({ // Invalid values fall back to global setting (effectiveThreshold already set) } - const contextPercent = (100 * prevContextTokens) / contextWindow + // Default: divide by the full context window. Opt-in (vscode-lm) divides by available input + // (window minus reserved output); an unknown/unlimited reserve (-1) falls back to the full window. 
+ let contextPercent: number + if (useAvailableInputForContextPercent) { + const reservedForOutput = maxTokens && maxTokens > 0 ? maxTokens : 0 + const availableInputTokens = contextWindow - reservedForOutput + contextPercent = availableInputTokens > 0 ? (100 * prevContextTokens) / availableInputTokens : 100 + } else { + contextPercent = (100 * prevContextTokens) / contextWindow + } return contextPercent >= effectiveThreshold || prevContextTokens > allowedTokens } @@ -229,6 +246,11 @@ export type ContextManagementOptions = { cwd?: string /** Optional controller for file access validation */ rooIgnoreController?: RooIgnoreController + /** + * Opt-in (vscode-lm): measure the condense percentage against available input space + * (contextWindow - reserved output) instead of the full window. Others leave it undefined. + */ + useAvailableInputForContextPercent?: boolean } export type ContextManagementResult = SummarizeResponse & { @@ -262,12 +284,14 @@ export async function manageContext({ filesReadByRoo, cwd, rooIgnoreController, + useAvailableInputForContextPercent, }: ContextManagementOptions): Promise { let error: string | undefined let errorDetails: string | undefined let cost = 0 // Calculate the maximum tokens reserved for response - const reservedTokens = maxTokens || ANTHROPIC_DEFAULT_MAX_TOKENS + // vscode-lm reports maxTokens: -1 (unlimited); a negative reserve must not distort the window math. + const reservedTokens = maxTokens && maxTokens > 0 ? maxTokens : ANTHROPIC_DEFAULT_MAX_TOKENS // Estimate tokens for the last message (which is always a user message) const lastMessage = messages[messages.length - 1] @@ -304,7 +328,16 @@ export async function manageContext({ // If no specific threshold is found for the profile, fall back to global setting if (autoCondenseContext) { - const contextPercent = (100 * prevContextTokens) / contextWindow + // Default: divide by the full context window. 
Opt-in (vscode-lm) divides by available input + // (window minus reserved output); an unknown/unlimited reserve (-1) falls back to the full window. + let contextPercent: number + if (useAvailableInputForContextPercent) { + const reservedForOutput = maxTokens && maxTokens > 0 ? maxTokens : 0 + const availableInputTokens = contextWindow - reservedForOutput + contextPercent = availableInputTokens > 0 ? (100 * prevContextTokens) / availableInputTokens : 100 + } else { + contextPercent = (100 * prevContextTokens) / contextWindow + } if (contextPercent >= effectiveThreshold || prevContextTokens > allowedTokens) { // Attempt to intelligently condense the context const result = await summarizeConversation({ diff --git a/src/core/task/Task.ts b/src/core/task/Task.ts index 50d4674fd0..ce9e5bcec2 100644 --- a/src/core/task/Task.ts +++ b/src/core/task/Task.ts @@ -2688,9 +2688,13 @@ export class Task extends EventEmitter implements TaskLike { if (signal.aborted) { reject(new Error("Request cancelled by user")) } else { - signal.addEventListener("abort", () => { - reject(new Error("Request cancelled by user")) - }, { once: true }) + signal.addEventListener( + "abort", + () => { + reject(new Error("Request cancelled by user")) + }, + { once: true }, + ) } }) return await Promise.race([nextPromise, abortPromise]) @@ -3734,7 +3738,10 @@ export class Task extends EventEmitter implements TaskLike { settings: this.apiConfiguration, }) - const contextWindow = modelInfo.contextWindow + // vscode-lm condenses against its static-table maxInputTokens (not the inflated live window); + // only it implements getCondenseContextWindow, so others fall back to the full contextWindow. + const contextWindow = this.api.getCondenseContextWindow?.() ?? 
modelInfo.contextWindow + const useAvailableInputForContextPercent = typeof this.api.getCondenseContextWindow === "function" // Get the current profile ID using the helper method const currentProfileId = this.getCurrentProfileId(state) @@ -3803,6 +3810,7 @@ export class Task extends EventEmitter implements TaskLike { currentProfileId, metadata, environmentDetails, + useAvailableInputForContextPercent, }) if (truncateResult.messages !== this.apiConversationHistory) { @@ -3930,7 +3938,10 @@ export class Task extends EventEmitter implements TaskLike { settings: this.apiConfiguration, }) - const contextWindow = modelInfo.contextWindow + // vscode-lm condenses against its static-table maxInputTokens (not the inflated live window); + // only it implements getCondenseContextWindow, so others fall back to the full contextWindow. + const contextWindow = this.api.getCondenseContextWindow?.() ?? modelInfo.contextWindow + const useAvailableInputForContextPercent = typeof this.api.getCondenseContextWindow === "function" // Get the current profile ID using the helper method const currentProfileId = this.getCurrentProfileId(state) @@ -3955,6 +3966,7 @@ export class Task extends EventEmitter implements TaskLike { profileThresholds, currentProfileId, lastMessageTokens, + useAvailableInputForContextPercent, }) // Send condenseTaskContextStarted BEFORE manageContext to show in-progress indicator @@ -4037,6 +4049,7 @@ export class Task extends EventEmitter implements TaskLike { filesReadByRoo: contextMgmtFilesReadByRoo, cwd: this.cwd, rooIgnoreController: this.rooIgnoreController, + useAvailableInputForContextPercent, }) if (truncateResult.messages !== this.apiConversationHistory) { await this.overwriteApiConversationHistory(truncateResult.messages) @@ -4191,10 +4204,14 @@ export class Task extends EventEmitter implements TaskLike { const iterator = stream[Symbol.asyncIterator]() // Set up abort handling - when the signal is aborted, clean up the controller reference - 
abortSignal.addEventListener("abort", () => { - console.log(`[Task#${this.taskId}.${this.instanceId}] AbortSignal triggered for current request`) - this.currentRequestAbortController = undefined - }, { once: true }) + abortSignal.addEventListener( + "abort", + () => { + console.log(`[Task#${this.taskId}.${this.instanceId}] AbortSignal triggered for current request`) + this.currentRequestAbortController = undefined + }, + { once: true }, + ) try { // Awaiting first chunk to see if it will throw an error. @@ -4206,9 +4223,13 @@ export class Task extends EventEmitter implements TaskLike { if (abortSignal.aborted) { reject(new Error("Request cancelled by user")) } else { - abortSignal.addEventListener("abort", () => { - reject(new Error("Request cancelled by user")) - }, { once: true }) + abortSignal.addEventListener( + "abort", + () => { + reject(new Error("Request cancelled by user")) + }, + { once: true }, + ) } }) diff --git a/webview-ui/src/components/chat/TaskHeader.tsx b/webview-ui/src/components/chat/TaskHeader.tsx index 4ddf5ef35c..927d3d057d 100644 --- a/webview-ui/src/components/chat/TaskHeader.tsx +++ b/webview-ui/src/components/chat/TaskHeader.tsx @@ -76,7 +76,8 @@ const TaskHeader = ({ : 0, [model, modelId, apiConfiguration], ) - const reservedForOutput = maxTokens || 0 + // vscode-lm reports maxTokens: -1 (unlimited); a negative reserve must not distort the window math. + const reservedForOutput = maxTokens && maxTokens > 0 ? maxTokens : 0 const condenseButton = ( { // Should show 0% when available input space is 0 expect(screen.getByText("0%")).toBeInTheDocument() }) + + it("should treat a negative maxTokens (vscode-lm reports -1) as zero reserve", () => { + // vscode-lm reports maxTokens: -1 (unlimited). The guard must treat that negative reserve + // as zero, so available space == contextWindow rather than being inflated by a kept -1. 
+ mockModelInfo = { contextWindow: 1000, maxTokens: -1 } + mockMaxOutputTokens = -1 + + renderTaskHeader({ contextTokens: 250 }) + + expect(screen.getByText("25%")).toBeInTheDocument() + }) }) }) diff --git a/webview-ui/src/components/ui/hooks/__tests__/useSelectedModel.spec.ts b/webview-ui/src/components/ui/hooks/__tests__/useSelectedModel.spec.ts index 0dc42129c0..f4fd51cffc 100644 --- a/webview-ui/src/components/ui/hooks/__tests__/useSelectedModel.spec.ts +++ b/webview-ui/src/components/ui/hooks/__tests__/useSelectedModel.spec.ts @@ -14,6 +14,8 @@ import { minimaxDefaultModelId, minimaxModels, openRouterDefaultModelId, + vscodeLlmModels, + vscodeLlmDefaultModelId, } from "@roo-code/types" import { useSelectedModel } from "../useSelectedModel" @@ -772,4 +774,75 @@ describe("useSelectedModel", () => { expect(result.current.info).toEqual(minimaxModels["MiniMax-M2.7"]) }) }) + + describe("vscode-lm provider", () => { + beforeEach(() => { + mockUseRouterModels.mockReturnValue({ + data: { + openrouter: {}, + requesty: {}, + litellm: {}, + }, + isLoading: false, + isError: false, + } as any) + + mockUseOpenRouterModelProviders.mockReturnValue({ + data: {}, + isLoading: false, + isError: false, + } as any) + }) + + it("resolves a listed family's contextWindow to its maxInputTokens", () => { + const family = vscodeLlmDefaultModelId + const apiConfiguration: ProviderSettings = { + apiProvider: "vscode-lm", + vsCodeLmModelSelector: { vendor: "copilot", family }, + } + + const wrapper = createWrapper() + const { result } = renderHook(() => useSelectedModel(apiConfiguration), { wrapper }) + + expect(result.current.provider).toBe("vscode-lm") + expect(result.current.id).toBe(`copilot/${family}`) + // The bar and the condense gate share one source of truth: contextWindow === maxInputTokens. 
+ expect(result.current.info?.contextWindow).toBe(vscodeLlmModels[family].maxInputTokens) + expect(result.current.info?.supportsImages).toBe(false) + }) + + it("pins a divergent family's contextWindow to maxInputTokens, not its advertised window", () => { + // claude-opus-4.8 is the row where contextWindow and maxInputTokens differ; a field swap to + // the advertised window would be caught here. + const family = "claude-opus-4.8" + const apiConfiguration: ProviderSettings = { + apiProvider: "vscode-lm", + vsCodeLmModelSelector: { vendor: "copilot", family }, + } + + const wrapper = createWrapper() + const { result } = renderHook(() => useSelectedModel(apiConfiguration), { wrapper }) + + expect(result.current.provider).toBe("vscode-lm") + expect(result.current.id).toBe(`copilot/${family}`) + expect(result.current.info?.contextWindow).toBe(vscodeLlmModels[family].maxInputTokens) // 197897 + expect(result.current.info?.contextWindow).not.toBe(vscodeLlmModels[family].contextWindow) // NOT 679560 + expect(result.current.info?.supportsImages).toBe(false) + }) + + it("falls back to the default model's window for an unlisted family (NOT 128000)", () => { + const apiConfiguration: ProviderSettings = { + apiProvider: "vscode-lm", + vsCodeLmModelSelector: { vendor: "copilot", family: "totally-unknown-family" }, + } + + const wrapper = createWrapper() + const { result } = renderHook(() => useSelectedModel(apiConfiguration), { wrapper }) + + // A family miss must not use the 128000 sane-defaults window; use the default model's instead. 
+ expect(result.current.info?.contextWindow).not.toBe(128000) + expect(result.current.info?.contextWindow).toBe(vscodeLlmModels[vscodeLlmDefaultModelId].maxInputTokens) + expect(result.current.info?.supportsImages).toBe(false) + }) + }) }) diff --git a/webview-ui/src/components/ui/hooks/useSelectedModel.ts b/webview-ui/src/components/ui/hooks/useSelectedModel.ts index d3ebb6c0dd..ddc1a19755 100644 --- a/webview-ui/src/components/ui/hooks/useSelectedModel.ts +++ b/webview-ui/src/components/ui/hooks/useSelectedModel.ts @@ -310,8 +310,20 @@ function getSelectedModel({ ? `${apiConfiguration.vsCodeLmModelSelector.vendor}/${apiConfiguration.vsCodeLmModelSelector.family}` : vscodeLlmDefaultModelId const modelFamily = apiConfiguration?.vsCodeLmModelSelector?.family ?? vscodeLlmDefaultModelId - const info = vscodeLlmModels[modelFamily as keyof typeof vscodeLlmModels] - return { id, info: { ...openAiModelInfoSaneDefaults, ...info, supportsImages: false } } // VSCode LM API currently doesn't support images. + // On a family miss, fall back to the default model entry, not openAiModelInfoSaneDefaults + // (whose 128K contextWindow would diverge from the gate and skew the bar). + const listedModel = + vscodeLlmModels[modelFamily as keyof typeof vscodeLlmModels] ?? vscodeLlmModels[vscodeLlmDefaultModelId] + // Set contextWindow = maxInputTokens so the UI bar shares one source of truth with the gate, + // whose primary window is getCondenseContextWindow() (static-table maxInputTokens); this + // info.contextWindow is only the gate's fallback. + const info: ModelInfo = { + ...openAiModelInfoSaneDefaults, + ...listedModel, + contextWindow: listedModel.maxInputTokens, + supportsImages: false, // VSCode LM API currently doesn't support images. + } + return { id, info } } case "sambanova": { const id = apiConfiguration.apiModelId ?? defaultModelId