From 1eadaea4bd702514281a5b8093e18fa64bf70498 Mon Sep 17 00:00:00 2001 From: Bertan Ari Date: Tue, 23 Jun 2026 17:45:17 -0700 Subject: [PATCH 1/5] fix(vscode-lm): reliable auto context condensing Port of simurg79/Roo-Code#11 into Zoo-Code. --- .changeset/vscode-lm-condense-fix.md | 5 + .../types/src/__tests__/vscode-llm.spec.ts | 36 +++ packages/types/src/providers/vscode-llm.ts | 231 ++++++++++-------- src/api/index.ts | 11 + src/api/providers/__tests__/vscode-lm.spec.ts | 50 ++++ src/api/providers/vscode-lm.ts | 24 +- .../__tests__/context-management.spec.ts | 165 ++++++++++++- src/core/context-management/index.ts | 27 +- src/core/task/Task.ts | 42 +++- webview-ui/src/components/chat/TaskHeader.tsx | 3 +- .../chat/__tests__/TaskHeader.spec.tsx | 14 ++ .../hooks/__tests__/useSelectedModel.spec.ts | 55 +++++ .../components/ui/hooks/useSelectedModel.ts | 17 +- 13 files changed, 549 insertions(+), 131 deletions(-) create mode 100644 .changeset/vscode-lm-condense-fix.md create mode 100644 packages/types/src/__tests__/vscode-llm.spec.ts diff --git a/.changeset/vscode-lm-condense-fix.md b/.changeset/vscode-lm-condense-fix.md new file mode 100644 index 0000000000..a592361786 --- /dev/null +++ b/.changeset/vscode-lm-condense-fix.md @@ -0,0 +1,5 @@ +--- +"zoo-code": patch +--- + +Fix unreliable automatic context condensing on the VS Code LM (vscode-lm) provider. The condense gate now treats the provider's `maxTokens: -1` (unlimited) as the default output reserve and measures usage against available input space, and a new `getCondenseContextWindow()` seam makes the gate use the curated model `maxInputTokens` instead of the inflated live window. Also refreshes the VS Code LM model catalog and default model. 
diff --git a/packages/types/src/__tests__/vscode-llm.spec.ts b/packages/types/src/__tests__/vscode-llm.spec.ts new file mode 100644 index 0000000000..7a2eabddf7 --- /dev/null +++ b/packages/types/src/__tests__/vscode-llm.spec.ts @@ -0,0 +1,36 @@ +import { describe, it, expect } from "vitest" +import { vscodeLlmModels, vscodeLlmDefaultModelId } from "../providers/vscode-llm.js" + +describe("vscodeLlmModels", () => { + it("exposes the opus-4.8 row with its measured maxInputTokens and contextWindow", () => { + // The VS Code LM API exposes only maxInputTokens; that is the value the UI reads from this + // table (useSelectedModel.ts). For claude-opus-4.8 the two fields intentionally DIVERGE: + // maxInputTokens (197897) is the enforced input ceiling, while contextWindow (679560) records + // the larger advertised window. The UI reads maxInputTokens, so the divergence is a deliberate + // tripwire — assert the actual on-disk literals rather than forcing equality. + expect(vscodeLlmModels).toHaveProperty("claude-opus-4.8") + expect(vscodeLlmModels["claude-opus-4.8"].contextWindow).toBe(679560) + expect(vscodeLlmModels["claude-opus-4.8"].maxInputTokens).toBe(197897) + }) + it("preserves the real window for models captured with a smaller maxInputTokens", () => { + expect(vscodeLlmModels["gpt-4o-mini"].maxInputTokens).toBe(12078) + expect(vscodeLlmModels["gpt-4o-mini"].contextWindow).toBe(12078) + expect(vscodeLlmModels["gemini-2.5-pro"].contextWindow).toBe(108594) + expect(vscodeLlmModels["gemini-2.5-pro"].maxInputTokens).toBe(108594) + }) + it("keeps both window fields populated and positive for every row", () => { + for (const [family, model] of Object.entries(vscodeLlmModels)) { + expect(model.contextWindow, `${family}: contextWindow must be a positive integer`).toBeGreaterThan(0) + expect(model.maxInputTokens, `${family}: maxInputTokens must be a positive integer`).toBeGreaterThan(0) + } + }) + it("excludes fabricated/internal/alias families and the dropped legacy 
rows", () => { + expect(vscodeLlmModels).not.toHaveProperty("claude-opus-4.7-high") + expect(vscodeLlmModels).not.toHaveProperty("claude-3.5-sonnet") + expect(vscodeLlmModels).not.toHaveProperty("claude-4-sonnet") + }) + it("defaults to a model id that exists in the table", () => { + expect(vscodeLlmDefaultModelId).toBe("claude-sonnet-4.5") + expect(vscodeLlmModels).toHaveProperty(vscodeLlmDefaultModelId) + }) +}) diff --git a/packages/types/src/providers/vscode-llm.ts b/packages/types/src/providers/vscode-llm.ts index efe0691913..46df75fac9 100644 --- a/packages/types/src/providers/vscode-llm.ts +++ b/packages/types/src/providers/vscode-llm.ts @@ -2,189 +2,222 @@ import type { ModelInfo } from "../model.js" export type VscodeLlmModelId = keyof typeof vscodeLlmModels -export const vscodeLlmDefaultModelId: VscodeLlmModelId = "claude-3.5-sonnet" +export const vscodeLlmDefaultModelId: VscodeLlmModelId = "claude-sonnet-4.5" -// https://docs.cline.bot/provider-config/vscode-language-model-api +// Curated VS Code LM (GitHub Copilot) model catalog. +// +// The VS Code LM API only exposes `maxInputTokens` per model; it does NOT report a separate +// total context window. For each row, `contextWindow` records the model's advertised window +// while `maxInputTokens` is the enforced input ceiling the UI actually reads (via +// useSelectedModel.ts) and the condense gate measures against. For most rows the two values +// match. They intentionally DIVERGE only where the provider advertises a larger window than the +// usable input ceiling (e.g. claude-opus-4.8): keeping both fields lets the context bar and the +// auto-condense gate stay on a single source of truth (maxInputTokens) without losing the real +// advertised window. 
export const vscodeLlmModels = { - "gpt-3.5-turbo": { - contextWindow: 12114, - supportsImages: false, + "claude-opus-4.8": { + contextWindow: 679560, + supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "gpt-3.5-turbo", - version: "gpt-3.5-turbo-0613", - name: "GPT 3.5 Turbo", + family: "claude-opus-4.8", + version: "claude-opus-4.8", + name: "Claude Opus 4.8", supportsToolCalling: true, - maxInputTokens: 12114, + maxInputTokens: 197897, }, - "gpt-4o-mini": { - contextWindow: 12115, - supportsImages: false, + "claude-opus-4.7": { + contextWindow: 197897, + supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "gpt-4o-mini", - version: "gpt-4o-mini-2024-07-18", - name: "GPT-4o mini", + family: "claude-opus-4.7", + version: "claude-opus-4.7", + name: "Claude Opus 4.7", supportsToolCalling: true, - maxInputTokens: 12115, + maxInputTokens: 197897, }, - "gpt-4": { - contextWindow: 28501, - supportsImages: false, + "claude-opus-4.6": { + contextWindow: 197897, + supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "gpt-4", - version: "gpt-4-0613", - name: "GPT 4", + family: "claude-opus-4.6", + version: "claude-opus-4.6", + name: "Claude Opus 4.6", supportsToolCalling: true, - maxInputTokens: 28501, + maxInputTokens: 197897, }, - "gpt-4-0125-preview": { - contextWindow: 63826, - supportsImages: false, + "claude-opus-4.5": { + contextWindow: 167790, + supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "gpt-4-turbo", - version: "gpt-4-0125-preview", - name: "GPT 4 Turbo", + family: "claude-opus-4.5", + version: "claude-opus-4.5", + name: "Claude Opus 4.5", supportsToolCalling: true, - maxInputTokens: 63826, + maxInputTokens: 167790, }, - "gpt-4o": { - contextWindow: 63827, + "claude-sonnet-4.6": { + contextWindow: 197896, supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: 
"gpt-4o", - version: "gpt-4o-2024-11-20", - name: "GPT-4o", + family: "claude-sonnet-4.6", + version: "claude-sonnet-4.6", + name: "Claude Sonnet 4.6", supportsToolCalling: true, - maxInputTokens: 63827, + maxInputTokens: 197896, }, - o1: { - contextWindow: 19827, - supportsImages: false, + "claude-sonnet-4.5": { + contextWindow: 167790, + supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "o1-ga", - version: "o1-2024-12-17", - name: "o1 (Preview)", + family: "claude-sonnet-4.5", + version: "claude-sonnet-4.5", + name: "Claude Sonnet 4.5", supportsToolCalling: true, - maxInputTokens: 19827, + maxInputTokens: 167790, }, - "o3-mini": { - contextWindow: 63827, - supportsImages: false, + "claude-haiku-4.5": { + contextWindow: 135790, + supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "o3-mini", - version: "o3-mini-2025-01-31", - name: "o3-mini", + family: "claude-haiku-4.5", + version: "claude-haiku-4.5", + name: "Claude Haiku 4.5", supportsToolCalling: true, - maxInputTokens: 63827, + maxInputTokens: 135790, }, - "claude-3.5-sonnet": { - contextWindow: 81638, + "gpt-5.5": { + contextWindow: 268426, supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "claude-3.5-sonnet", - version: "claude-3.5-sonnet", - name: "Claude 3.5 Sonnet", + family: "gpt-5.5", + version: "gpt-5.5", + name: "GPT-5.5", supportsToolCalling: true, - maxInputTokens: 81638, + maxInputTokens: 268426, }, - "claude-4-sonnet": { - contextWindow: 128000, + "gpt-5.4": { + contextWindow: 268424, supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "claude-sonnet-4", - version: "claude-sonnet-4", - name: "Claude Sonnet 4", + family: "gpt-5.4", + version: "gpt-5.4", + name: "GPT-5.4", supportsToolCalling: true, - maxInputTokens: 111836, + maxInputTokens: 268424, }, - "gemini-2.0-flash-001": { - contextWindow: 127827, + "gpt-5.4-mini": { + contextWindow: 
271790, supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "gemini-2.0-flash", - version: "gemini-2.0-flash-001", - name: "Gemini 2.0 Flash", - supportsToolCalling: false, - maxInputTokens: 127827, + family: "gpt-5.4-mini", + version: "gpt-5.4-mini", + name: "GPT-5.4 mini", + supportsToolCalling: true, + maxInputTokens: 271790, }, - "gemini-2.5-pro": { - contextWindow: 128000, + "gpt-5.3-codex": { + contextWindow: 271790, supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "gemini-2.5-pro", - version: "gemini-2.5-pro-preview-03-25", - name: "Gemini 2.5 Pro (Preview)", + family: "gpt-5.3-codex", + version: "gpt-5.3-codex", + name: "GPT-5.3-Codex", supportsToolCalling: true, - maxInputTokens: 108637, + maxInputTokens: 271790, }, - "o4-mini": { - contextWindow: 128000, + "gpt-5-mini": { + contextWindow: 127790, + supportsImages: true, + supportsPromptCache: false, + inputPrice: 0, + outputPrice: 0, + family: "gpt-5-mini", + version: "gpt-5-mini", + name: "GPT-5 mini", + supportsToolCalling: true, + maxInputTokens: 127790, + }, + "gpt-4o-mini": { + contextWindow: 12078, supportsImages: false, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "o4-mini", - version: "o4-mini-2025-04-16", - name: "o4-mini (Preview)", + family: "gpt-4o-mini", + version: "gpt-4o-mini-2024-07-18", + name: "GPT-4o mini", supportsToolCalling: true, - maxInputTokens: 111452, + maxInputTokens: 12078, }, - "gpt-4.1": { - contextWindow: 128000, + "gemini-3.1-pro-preview": { + contextWindow: 197897, supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "gpt-4.1", - version: "gpt-4.1-2025-04-14", - name: "GPT-4.1 (Preview)", + family: "gemini-3.1-pro-preview", + version: "gemini-3.1-pro-preview", + name: "Gemini 3.1 Pro (Preview)", supportsToolCalling: true, - maxInputTokens: 111452, + maxInputTokens: 197897, }, - "gpt-5-mini": { - contextWindow: 128000, + 
"gemini-3.5-flash": { + contextWindow: 197895, supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "gpt-5-mini", - version: "gpt-5-mini", - name: "GPT-5 mini (Preview)", + family: "gemini-3.5-flash", + version: "gemini-3.5-flash", + name: "Gemini 3.5 Flash", + supportsToolCalling: true, + maxInputTokens: 197895, + }, + "gemini-3-flash": { + contextWindow: 108594, + supportsImages: true, + supportsPromptCache: false, + inputPrice: 0, + outputPrice: 0, + family: "gemini-3-flash", + version: "gemini-3-flash-preview", + name: "Gemini 3 Flash (Preview)", supportsToolCalling: true, - maxInputTokens: 108637, + maxInputTokens: 108594, }, - "gpt-5": { - contextWindow: 128000, + "gemini-2.5-pro": { + contextWindow: 108594, supportsImages: true, supportsPromptCache: false, inputPrice: 0, outputPrice: 0, - family: "gpt-5", - version: "gpt-5", - name: "GPT-5 (Preview)", + family: "gemini-2.5-pro", + version: "gemini-2.5-pro", + name: "Gemini 2.5 Pro", supportsToolCalling: true, - maxInputTokens: 108637, + maxInputTokens: 108594, }, } as const satisfies Record< string, diff --git a/src/api/index.ts b/src/api/index.ts index 0c901f8e23..00201b0d21 100644 --- a/src/api/index.ts +++ b/src/api/index.ts @@ -107,6 +107,17 @@ export interface ApiHandler { getModel(): { id: string; info: ModelInfo } + /** + * Optional: the context window (in tokens) to use for context-management / + * auto-condense decisions, when it must differ from getModel().info.contextWindow. + * + * Only the VS Code LM (Copilot) provider overrides this, to measure usage against the + * model's static `maxInputTokens` instead of the inflated live window VS Code reports. + * Other providers leave it undefined and callers fall back to getModel().info.contextWindow, + * so their behavior is unchanged. 
+ */ + getCondenseContextWindow?(): number + /** * Counts tokens for content blocks * All providers extend BaseProvider which provides a default tiktoken implementation, diff --git a/src/api/providers/__tests__/vscode-lm.spec.ts b/src/api/providers/__tests__/vscode-lm.spec.ts index a79a5a4bcb..0e1797ab44 100644 --- a/src/api/providers/__tests__/vscode-lm.spec.ts +++ b/src/api/providers/__tests__/vscode-lm.spec.ts @@ -63,6 +63,7 @@ import * as vscode from "vscode" import { VsCodeLmHandler } from "../vscode-lm" import type { ApiHandlerOptions } from "../../../shared/api" import type { Anthropic } from "@anthropic-ai/sdk" +import { openAiModelInfoSaneDefaults, vscodeLlmModels } from "@roo-code/types" const mockLanguageModelChat = { id: "test-model", @@ -440,6 +441,55 @@ describe("VsCodeLmHandler", () => { const model = handler.getModel() expect(model.info).toBeDefined() }) + + it("should use the full advertised maxInputTokens without an upper cap", async () => { + // VS Code can report a very large advertised window; getModel surfaces it as-is + // (Math.max(0, maxInputTokens)) rather than clamping to a smaller default. 
+ const mockModel = { ...mockLanguageModelChat, maxInputTokens: 936000 } + ;(vscode.lm.selectChatModels as Mock).mockResolvedValue([mockModel]) + handler["client"] = null + await handler.initializeClient() + + const model = handler.getModel() + expect(model.info.contextWindow).toBe(936000) + }) + + it("should pass through a small maxInputTokens unchanged", async () => { + const mockModel = { ...mockLanguageModelChat, maxInputTokens: 4096 } + ;(vscode.lm.selectChatModels as Mock).mockResolvedValue([mockModel]) + handler["client"] = null + await handler.initializeClient() + + const model = handler.getModel() + expect(model.info.contextWindow).toBe(4096) + }) + + it("should fall back to sane defaults when maxInputTokens is not a number", async () => { + const mockModel = { ...mockLanguageModelChat, maxInputTokens: undefined as unknown as number } + ;(vscode.lm.selectChatModels as Mock).mockResolvedValue([mockModel]) + handler["client"] = null + await handler.initializeClient() + + const model = handler.getModel() + expect(model.info.contextWindow).toBe(openAiModelInfoSaneDefaults.contextWindow) + }) + }) + + describe("getCondenseContextWindow", () => { + it("uses the static-table maxInputTokens for a known VS Code LM family", () => { + const opusHandler = new VsCodeLmHandler({ + vsCodeLmModelSelector: { vendor: "copilot", family: "claude-opus-4.8" }, + }) + expect(opusHandler.getCondenseContextWindow()).toBe(vscodeLlmModels["claude-opus-4.8"].maxInputTokens) + opusHandler.dispose() + }) + + it("falls back to the live model context window for families not in the static table", () => { + // test-family is not a curated row, so the gate uses the live runtime window. 
+ handler["client"] = mockLanguageModelChat as unknown as vscode.LanguageModelChat + expect(handler.getCondenseContextWindow()).toBe(handler.getModel().info.contextWindow) + expect(handler.getCondenseContextWindow()).toBe(mockLanguageModelChat.maxInputTokens) + }) }) describe("countTokens", () => { diff --git a/src/api/providers/vscode-lm.ts b/src/api/providers/vscode-lm.ts index 8fb564a9d5..d730658b44 100644 --- a/src/api/providers/vscode-lm.ts +++ b/src/api/providers/vscode-lm.ts @@ -2,7 +2,7 @@ import { Anthropic } from "@anthropic-ai/sdk" import * as vscode from "vscode" import OpenAI from "openai" -import { type ModelInfo, openAiModelInfoSaneDefaults } from "@roo-code/types" +import { type ModelInfo, openAiModelInfoSaneDefaults, vscodeLlmModels } from "@roo-code/types" import type { ApiHandlerOptions } from "../../shared/api" import { SELECTOR_SEPARATOR, stringifyVsCodeLmModelSelector } from "../../shared/vsCodeSelectorUtils" @@ -562,6 +562,28 @@ export class VsCodeLmHandler extends BaseProvider implements SingleCompletionHan } } + /** + * Context window used for auto-condense / context-management decisions. + * + * VS Code's LM API reports `client.maxInputTokens` as Copilot's *advertised* window, + * which is far larger than the realistic usable window; relying on it keeps auto-condense + * from ever firing. For condense decisions we instead measure usage against the curated + * static table's `maxInputTokens` — the same value the context bar uses via + * `useSelectedModel` — so the gate and the gauge stay on one source of truth. + * + * Falls back to the live runtime window when the selected model isn't in the static table. + */ + getCondenseContextWindow(): number { + const family = this.client?.family ?? this.options.vsCodeLmModelSelector?.family + const staticModel = family ? 
vscodeLlmModels[family as keyof typeof vscodeLlmModels] : undefined + + if (staticModel && typeof staticModel.maxInputTokens === "number" && staticModel.maxInputTokens > 0) { + return staticModel.maxInputTokens + } + + return this.getModel().info.contextWindow + } + async completePrompt(prompt: string): Promise { try { const client = await this.getClient() diff --git a/src/core/context-management/__tests__/context-management.spec.ts b/src/core/context-management/__tests__/context-management.spec.ts index 9950ec536b..2e81d61e9f 100644 --- a/src/core/context-management/__tests__/context-management.spec.ts +++ b/src/core/context-management/__tests__/context-management.spec.ts @@ -810,9 +810,10 @@ describe("Context Management", () => { const summarizeSpy = vi.spyOn(condenseModule, "summarizeConversation") const modelInfo = createModelInfo(100000, 30000) - // Set tokens to be below both the allowedTokens threshold and the percentage threshold + // Usage is measured against available input space (contextWindow - maxTokens reserve). + // available = 100000 - 30000 = 70000; 30000 / 70000 ≈ 43% < 50% threshold. const contextWindow = modelInfo.contextWindow - const totalTokens = 40000 // 40% of context window + const totalTokens = 30000 const messagesWithSmallContent = [ ...messages.slice(0, -1), { ...messages[messages.length - 1], content: "" }, @@ -825,7 +826,7 @@ describe("Context Management", () => { maxTokens: modelInfo.maxTokens, apiHandler: mockApiHandler, autoCondenseContext: true, - autoCondenseContextPercent: 50, // Set threshold to 50% - our tokens are at 40% + autoCondenseContextPercent: 50, // Set threshold to 50% - usage is ~43% of available input systemPrompt: "System prompt", taskId, profileThresholds: {}, @@ -1507,12 +1508,14 @@ describe("Context Management", () => { }) it("should return false when context percent is below threshold", () => { + // Usage is measured against available input space (contextWindow - maxTokens reserve). 
+ // available = 100000 - 30000 = 70000; 30000 / 70000 ≈ 43% < 50% threshold. const result = willManageContext({ - totalTokens: 40000, - contextWindow: 100000, // 40% of context window + totalTokens: 30000, + contextWindow: 100000, maxTokens: 30000, autoCondenseContext: true, - autoCondenseContextPercent: 50, // 50% threshold + autoCondenseContextPercent: 50, // 50% threshold; usage is ~43% of available input profileThresholds: {}, currentProfileId: "default", lastMessageTokens: 0, @@ -1520,6 +1523,26 @@ describe("Context Management", () => { expect(result).toBe(false) }) + it("should treat a negative maxTokens (vscode-lm reports -1) as the default reserve, not -1", () => { + // vscode-lm reports maxTokens: -1 (unlimited). A naive `maxTokens || DEFAULT` keeps -1, + // which would make allowedTokens balloon past the window and skew the percentage. The + // guard must treat -1 like an unknown reserve (ANTHROPIC_DEFAULT_MAX_TOKENS for the + // allowed-tokens math, zero reserve for the available-input percentage). + // With autoCondenseContext disabled, only the allowedTokens path can trigger: + // allowedTokens = 100000 * 0.9 - 8192 = 81808; totalTokens 85000 > 81808 → true. 
+ const result = willManageContext({ + totalTokens: 85000, + contextWindow: 100000, + maxTokens: -1, + autoCondenseContext: false, + autoCondenseContextPercent: 50, + profileThresholds: {}, + currentProfileId: "default", + lastMessageTokens: 0, + }) + expect(result).toBe(true) + }) + it("should return true when tokens exceed allowedTokens even if autoCondenseContext is false", () => { // allowedTokens = contextWindow * (1 - 0.1) - reservedTokens = 100000 * 0.9 - 30000 = 60000 const result = willManageContext({ @@ -1581,10 +1604,12 @@ describe("Context Management", () => { }) it("should include lastMessageTokens in the calculation", () => { - // Without lastMessageTokens: 49000 tokens = 49% - // With lastMessageTokens: 49000 + 2000 = 51000 tokens = 51% + // Usage is measured against available input space (contextWindow - maxTokens reserve). + // available = 100000 - 30000 = 70000. + // Without lastMessageTokens: 34000 / 70000 ≈ 48.6% < 50% threshold. + // With lastMessageTokens: (34000 + 2000) / 70000 ≈ 51.4% ≥ 50% threshold. 
const resultWithoutLastMessage = willManageContext({ - totalTokens: 49000, + totalTokens: 34000, contextWindow: 100000, maxTokens: 30000, autoCondenseContext: true, @@ -1596,14 +1621,14 @@ describe("Context Management", () => { expect(resultWithoutLastMessage).toBe(false) const resultWithLastMessage = willManageContext({ - totalTokens: 49000, + totalTokens: 34000, contextWindow: 100000, maxTokens: 30000, autoCondenseContext: true, autoCondenseContextPercent: 50, // 50% threshold profileThresholds: {}, currentProfileId: "default", - lastMessageTokens: 2000, // Pushes total to 51% + lastMessageTokens: 2000, // Pushes usage over 50% of available input }) expect(resultWithLastMessage).toBe(true) }) @@ -1701,4 +1726,122 @@ describe("Context Management", () => { expect(result.newContextTokensAfterTruncation).toBeGreaterThan(0) }) }) + + /** + * Regression tests: the condense gate must measure usage against available input space + * (contextWindow - reserved output), not the raw context window. This keeps the gate in + * lockstep with the UI context gauge and ensures it actually fires for providers like + * vscode-lm that report maxTokens: -1. + */ + describe("contextPercent uses available input space (regression)", () => { + const createModelInfo = (contextWindow: number, maxTokens?: number): ModelInfo => ({ + contextWindow, + supportsPromptCache: true, + maxTokens, + }) + + const messages: ApiMessage[] = [ + { role: "user", content: "First message" }, + { role: "assistant", content: "Second message" }, + { role: "user", content: "Third message" }, + { role: "assistant", content: "Fourth message" }, + { role: "user", content: "Fifth message" }, + ] + + it("willManageContext measures the percentage against available input, not the full window", () => { + // contextWindow 200000, reserve 64000 → available input 136000. + // totalTokens 100000 → 100000 / 136000 ≈ 73.5%, which clears the 70% threshold. 
+ // Against the full window it would be only 50% and the gate would (wrongly) stay closed. + const result = willManageContext({ + totalTokens: 100000, + contextWindow: 200000, + maxTokens: 64000, + autoCondenseContext: true, + autoCondenseContextPercent: 70, + profileThresholds: {}, + currentProfileId: "default", + lastMessageTokens: 0, + }) + expect(result).toBe(true) + }) + + it("willManageContext stays below threshold when usage is under available input", () => { + // available input 136000; totalTokens 90000 → ≈ 66.2% < 70% threshold. + const result = willManageContext({ + totalTokens: 90000, + contextWindow: 200000, + maxTokens: 64000, + autoCondenseContext: true, + autoCondenseContextPercent: 70, + profileThresholds: {}, + currentProfileId: "default", + lastMessageTokens: 0, + }) + expect(result).toBe(false) + }) + + it("willManageContext treats an unlimited (-1) reserve as zero reserve for the percentage", () => { + // vscode-lm reports maxTokens: -1. The percentage denominator should fall back to the + // full window (zero reserve): 150000 / 200000 = 75% ≥ 70% threshold. 
+ const result = willManageContext({ + totalTokens: 150000, + contextWindow: 200000, + maxTokens: -1, + autoCondenseContext: true, + autoCondenseContextPercent: 70, + profileThresholds: {}, + currentProfileId: "default", + lastMessageTokens: 0, + }) + expect(result).toBe(true) + }) + + it("manageContext summarizes based on available input space, end-to-end", async () => { + const mockSummary = "Available-input summary" + const mockSummarizeResponse: condenseModule.SummarizeResponse = { + messages: [ + { role: "user", content: "First message" }, + { role: "user", content: mockSummary, isSummary: true }, + { role: "assistant", content: "Last message" }, + ], + summary: mockSummary, + cost: 0.05, + newContextTokens: 100, + } + const summarizeSpy = vi + .spyOn(condenseModule, "summarizeConversation") + .mockResolvedValue(mockSummarizeResponse) + + const modelInfo = createModelInfo(200000, 64000) + // available input 136000; totalTokens 100000 → ≈ 73.5% ≥ 70% threshold, but only 50% of + // the raw window. The end-to-end path must trigger summarization on the available-input math. 
+ const totalTokens = 100000 + const messagesWithSmallContent = [ + ...messages.slice(0, -1), + { ...messages[messages.length - 1], content: "" }, + ] + + const result = await manageContext({ + messages: messagesWithSmallContent, + totalTokens, + contextWindow: modelInfo.contextWindow, + maxTokens: modelInfo.maxTokens, + apiHandler: mockApiHandler, + autoCondenseContext: true, + autoCondenseContextPercent: 70, + systemPrompt: "System prompt", + taskId, + profileThresholds: {}, + currentProfileId: "default", + }) + + expect(summarizeSpy).toHaveBeenCalled() + expect(result).toMatchObject({ + summary: mockSummary, + prevContextTokens: totalTokens, + }) + + summarizeSpy.mockRestore() + }) + }) }) diff --git a/src/core/context-management/index.ts b/src/core/context-management/index.ts index 243d7bd797..cc4d5ba3d7 100644 --- a/src/core/context-management/index.ts +++ b/src/core/context-management/index.ts @@ -170,13 +170,15 @@ export function willManageContext({ }: WillManageContextOptions): boolean { if (!autoCondenseContext) { // When auto-condense is disabled, only truncation can occur - const reservedTokens = maxTokens || ANTHROPIC_DEFAULT_MAX_TOKENS + // vscode-lm reports maxTokens: -1 (unlimited); a negative reserve must not distort the window math. + const reservedTokens = maxTokens && maxTokens > 0 ? maxTokens : ANTHROPIC_DEFAULT_MAX_TOKENS const prevContextTokens = totalTokens + lastMessageTokens const allowedTokens = contextWindow * (1 - TOKEN_BUFFER_PERCENTAGE) - reservedTokens return prevContextTokens > allowedTokens } - const reservedTokens = maxTokens || ANTHROPIC_DEFAULT_MAX_TOKENS + // vscode-lm reports maxTokens: -1 (unlimited); a negative reserve must not distort the window math. + const reservedTokens = maxTokens && maxTokens > 0 ? 
maxTokens : ANTHROPIC_DEFAULT_MAX_TOKENS const prevContextTokens = totalTokens + lastMessageTokens const allowedTokens = contextWindow * (1 - TOKEN_BUFFER_PERCENTAGE) - reservedTokens @@ -192,7 +194,14 @@ export function willManageContext({ // Invalid values fall back to global setting (effectiveThreshold already set) } - const contextPercent = (100 * prevContextTokens) / contextWindow + // Measure usage against the available input space (context window minus the + // reserved output budget), matching the context gauge shown in the UI. Reserved + // output tokens can never hold conversation context, so this is the meaningful + // "how full is my usable input" figure. When the reserve is unknown/unlimited + // (e.g., vscode-lm reports -1), fall back to the full context window. + const reservedForOutput = maxTokens && maxTokens > 0 ? maxTokens : 0 + const availableInputTokens = contextWindow - reservedForOutput + const contextPercent = availableInputTokens > 0 ? (100 * prevContextTokens) / availableInputTokens : 100 return contextPercent >= effectiveThreshold || prevContextTokens > allowedTokens } @@ -267,7 +276,8 @@ export async function manageContext({ let errorDetails: string | undefined let cost = 0 // Calculate the maximum tokens reserved for response - const reservedTokens = maxTokens || ANTHROPIC_DEFAULT_MAX_TOKENS + // vscode-lm reports maxTokens: -1 (unlimited); a negative reserve must not distort the window math. + const reservedTokens = maxTokens && maxTokens > 0 ? 
maxTokens : ANTHROPIC_DEFAULT_MAX_TOKENS // Estimate tokens for the last message (which is always a user message) const lastMessage = messages[messages.length - 1] @@ -304,7 +314,14 @@ export async function manageContext({ // If no specific threshold is found for the profile, fall back to global setting if (autoCondenseContext) { - const contextPercent = (100 * prevContextTokens) / contextWindow + // Measure usage against the available input space (context window minus the + // reserved output budget), matching the context gauge shown in the UI. Reserved + // output tokens can never hold conversation context, so this is the meaningful + // "how full is my usable input" figure. When the reserve is unknown/unlimited + // (e.g., vscode-lm reports -1), fall back to the full context window. + const reservedForOutput = maxTokens && maxTokens > 0 ? maxTokens : 0 + const availableInputTokens = contextWindow - reservedForOutput + const contextPercent = availableInputTokens > 0 ? (100 * prevContextTokens) / availableInputTokens : 100 if (contextPercent >= effectiveThreshold || prevContextTokens > allowedTokens) { // Attempt to intelligently condense the context const result = await summarizeConversation({ diff --git a/src/core/task/Task.ts b/src/core/task/Task.ts index 50d4674fd0..183b0cd191 100644 --- a/src/core/task/Task.ts +++ b/src/core/task/Task.ts @@ -2688,9 +2688,13 @@ export class Task extends EventEmitter implements TaskLike { if (signal.aborted) { reject(new Error("Request cancelled by user")) } else { - signal.addEventListener("abort", () => { - reject(new Error("Request cancelled by user")) - }, { once: true }) + signal.addEventListener( + "abort", + () => { + reject(new Error("Request cancelled by user")) + }, + { once: true }, + ) } }) return await Promise.race([nextPromise, abortPromise]) @@ -3734,7 +3738,10 @@ export class Task extends EventEmitter implements TaskLike { settings: this.apiConfiguration, }) - const contextWindow = modelInfo.contextWindow + // 
VS Code LM (Copilot) measures usage against its static-table maxInputTokens, not the + // inflated live window, so context management runs in line with the context bar. Every other + // provider returns undefined here and falls back to modelInfo.contextWindow. + const contextWindow = this.api.getCondenseContextWindow?.() ?? modelInfo.contextWindow // Get the current profile ID using the helper method const currentProfileId = this.getCurrentProfileId(state) @@ -3930,7 +3937,10 @@ export class Task extends EventEmitter implements TaskLike { settings: this.apiConfiguration, }) - const contextWindow = modelInfo.contextWindow + // VS Code LM (Copilot) measures usage against its static-table maxInputTokens, not the + // inflated live window, so context management runs in line with the context bar. Every other + // provider returns undefined here and falls back to modelInfo.contextWindow. + const contextWindow = this.api.getCondenseContextWindow?.() ?? modelInfo.contextWindow // Get the current profile ID using the helper method const currentProfileId = this.getCurrentProfileId(state) @@ -4191,10 +4201,14 @@ export class Task extends EventEmitter implements TaskLike { const iterator = stream[Symbol.asyncIterator]() // Set up abort handling - when the signal is aborted, clean up the controller reference - abortSignal.addEventListener("abort", () => { - console.log(`[Task#${this.taskId}.${this.instanceId}] AbortSignal triggered for current request`) - this.currentRequestAbortController = undefined - }, { once: true }) + abortSignal.addEventListener( + "abort", + () => { + console.log(`[Task#${this.taskId}.${this.instanceId}] AbortSignal triggered for current request`) + this.currentRequestAbortController = undefined + }, + { once: true }, + ) try { // Awaiting first chunk to see if it will throw an error. 
@@ -4206,9 +4220,13 @@ export class Task extends EventEmitter implements TaskLike { if (abortSignal.aborted) { reject(new Error("Request cancelled by user")) } else { - abortSignal.addEventListener("abort", () => { - reject(new Error("Request cancelled by user")) - }, { once: true }) + abortSignal.addEventListener( + "abort", + () => { + reject(new Error("Request cancelled by user")) + }, + { once: true }, + ) } }) diff --git a/webview-ui/src/components/chat/TaskHeader.tsx b/webview-ui/src/components/chat/TaskHeader.tsx index 4ddf5ef35c..927d3d057d 100644 --- a/webview-ui/src/components/chat/TaskHeader.tsx +++ b/webview-ui/src/components/chat/TaskHeader.tsx @@ -76,7 +76,8 @@ const TaskHeader = ({ : 0, [model, modelId, apiConfiguration], ) - const reservedForOutput = maxTokens || 0 + // vscode-lm reports maxTokens: -1 (unlimited); a negative reserve must not distort the window math. + const reservedForOutput = maxTokens && maxTokens > 0 ? maxTokens : 0 const condenseButton = ( { // Should show 0% when available input space is 0 expect(screen.getByText("0%")).toBeInTheDocument() }) + + it("should treat a negative maxTokens (vscode-lm reports -1) as zero reserve", () => { + // vscode-lm reports maxTokens: -1 (unlimited). A naive `maxTokens || 0` keeps -1, + // which would inflate available input space and skew the percentage. The guard must + // treat -1 as a zero reserve so available space == contextWindow. 
+ // contextTokens = 250, contextWindow = 1000, reservedForOutput = 0 + // Percentage = 250 / 1000 * 100 = 25% + mockModelInfo = { contextWindow: 1000, maxTokens: -1 } + mockMaxOutputTokens = -1 + + renderTaskHeader({ contextTokens: 250 }) + + expect(screen.getByText("25%")).toBeInTheDocument() + }) }) }) diff --git a/webview-ui/src/components/ui/hooks/__tests__/useSelectedModel.spec.ts b/webview-ui/src/components/ui/hooks/__tests__/useSelectedModel.spec.ts index 0dc42129c0..6f3f1edc4f 100644 --- a/webview-ui/src/components/ui/hooks/__tests__/useSelectedModel.spec.ts +++ b/webview-ui/src/components/ui/hooks/__tests__/useSelectedModel.spec.ts @@ -14,6 +14,8 @@ import { minimaxDefaultModelId, minimaxModels, openRouterDefaultModelId, + vscodeLlmModels, + vscodeLlmDefaultModelId, } from "@roo-code/types" import { useSelectedModel } from "../useSelectedModel" @@ -772,4 +774,57 @@ describe("useSelectedModel", () => { expect(result.current.info).toEqual(minimaxModels["MiniMax-M2.7"]) }) }) + + describe("vscode-lm provider", () => { + beforeEach(() => { + mockUseRouterModels.mockReturnValue({ + data: { + openrouter: {}, + requesty: {}, + litellm: {}, + }, + isLoading: false, + isError: false, + } as any) + + mockUseOpenRouterModelProviders.mockReturnValue({ + data: {}, + isLoading: false, + isError: false, + } as any) + }) + + it("resolves a listed family's contextWindow to its maxInputTokens", () => { + const family = vscodeLlmDefaultModelId + const apiConfiguration: ProviderSettings = { + apiProvider: "vscode-lm", + vsCodeLmModelSelector: { vendor: "copilot", family }, + } + + const wrapper = createWrapper() + const { result } = renderHook(() => useSelectedModel(apiConfiguration), { wrapper }) + + expect(result.current.provider).toBe("vscode-lm") + expect(result.current.id).toBe(`copilot/${family}`) + // The bar and the condense gate share one source of truth: contextWindow === maxInputTokens. 
+ expect(result.current.info?.contextWindow).toBe(vscodeLlmModels[family].maxInputTokens) + expect(result.current.info?.supportsImages).toBe(false) + }) + + it("falls back to the default model's window for an unlisted family (NOT 128000)", () => { + const apiConfiguration: ProviderSettings = { + apiProvider: "vscode-lm", + vsCodeLmModelSelector: { vendor: "copilot", family: "totally-unknown-family" }, + } + + const wrapper = createWrapper() + const { result } = renderHook(() => useSelectedModel(apiConfiguration), { wrapper }) + + // On a family miss we must NOT fall back to openAiModelInfoSaneDefaults' 128000 window, + // which would diverge from the gate. Instead, use the default model's maxInputTokens. + expect(result.current.info?.contextWindow).not.toBe(128000) + expect(result.current.info?.contextWindow).toBe(vscodeLlmModels[vscodeLlmDefaultModelId].maxInputTokens) + expect(result.current.info?.supportsImages).toBe(false) + }) + }) }) diff --git a/webview-ui/src/components/ui/hooks/useSelectedModel.ts b/webview-ui/src/components/ui/hooks/useSelectedModel.ts index d3ebb6c0dd..8c8a5360da 100644 --- a/webview-ui/src/components/ui/hooks/useSelectedModel.ts +++ b/webview-ui/src/components/ui/hooks/useSelectedModel.ts @@ -310,8 +310,21 @@ function getSelectedModel({ ? `${apiConfiguration.vsCodeLmModelSelector.vendor}/${apiConfiguration.vsCodeLmModelSelector.family}` : vscodeLlmDefaultModelId const modelFamily = apiConfiguration?.vsCodeLmModelSelector?.family ?? vscodeLlmDefaultModelId - const info = vscodeLlmModels[modelFamily as keyof typeof vscodeLlmModels] - return { id, info: { ...openAiModelInfoSaneDefaults, ...info, supportsImages: false } } // VSCode LM API currently doesn't support images. + // On a family miss, fall back to the default model entry instead of openAiModelInfoSaneDefaults, + // whose 128K contextWindow would diverge from the gate and make the bar read >100% while + // auto-condense never fires (the gate uses the live window). 
+ const listedModel = + vscodeLlmModels[modelFamily as keyof typeof vscodeLlmModels] ?? vscodeLlmModels[vscodeLlmDefaultModelId] + // contextWindow MUST equal maxInputTokens: that is the exact value the gate consumes via + // getModel().info.contextWindow = Math.max(0, client.maxInputTokens) in src/api/providers/vscode-lm.ts, + // so the UI bar and the condense gate share a single source of truth. + const info: ModelInfo = { + ...openAiModelInfoSaneDefaults, + ...listedModel, + contextWindow: listedModel.maxInputTokens, + supportsImages: false, // VSCode LM API currently doesn't support images. + } + return { id, info } } case "sambanova": { const id = apiConfiguration.apiModelId ?? defaultModelId From 62a556c97e06f7f78837d314c0e419d06a066d61 Mon Sep 17 00:00:00 2001 From: Bertan Ari Date: Wed, 24 Jun 2026 08:23:15 -0700 Subject: [PATCH 2/5] test(vscode-lm): cover condense-window edge branches for codecov/patch Add targeted tests for the previously-uncovered ported branches: the availableInputTokens<=0 fallback to 100% in willManageContext/manageContext, getCondenseContextWindow() guard fallbacks, and the vscode-lm UI family-miss window resolution. Raises patch coverage to satisfy the codecov/patch 80% gate. 
--- src/api/providers/__tests__/vscode-lm.spec.ts | 29 +++++++ .../__tests__/context-management.spec.ts | 81 +++++++++++++++++++ 2 files changed, 110 insertions(+) diff --git a/src/api/providers/__tests__/vscode-lm.spec.ts b/src/api/providers/__tests__/vscode-lm.spec.ts index 0e1797ab44..42779e813c 100644 --- a/src/api/providers/__tests__/vscode-lm.spec.ts +++ b/src/api/providers/__tests__/vscode-lm.spec.ts @@ -490,6 +490,35 @@ describe("VsCodeLmHandler", () => { expect(handler.getCondenseContextWindow()).toBe(handler.getModel().info.contextWindow) expect(handler.getCondenseContextWindow()).toBe(mockLanguageModelChat.maxInputTokens) }) + + it("falls back to the live window when no family is resolvable (no client, no selector family)", () => { + // With neither a client nor a selector family, `family` is undefined, so the static-table + // lookup is skipped entirely and the gate uses getModel().info.contextWindow (fallback info). + const noFamilyHandler = new VsCodeLmHandler({ vsCodeLmModelSelector: { vendor: "copilot" } }) + noFamilyHandler["client"] = null + expect(noFamilyHandler.getCondenseContextWindow()).toBe(noFamilyHandler.getModel().info.contextWindow) + expect(noFamilyHandler.getCondenseContextWindow()).toBe(openAiModelInfoSaneDefaults.contextWindow) + noFamilyHandler.dispose() + }) + + it("falls back to the live window when the static row exists but maxInputTokens is non-positive", () => { + // Guard sub-condition: a curated family is found but its maxInputTokens is <= 0 (corrupt/zeroed). + // The gate must NOT trust that value; it falls through to the live runtime window instead. 
+ const family = "claude-opus-4.8" + const original = vscodeLlmModels[family].maxInputTokens + try { + ;(vscodeLlmModels[family] as { maxInputTokens: number }).maxInputTokens = 0 + const guardHandler = new VsCodeLmHandler({ + vsCodeLmModelSelector: { vendor: "copilot", family }, + }) + guardHandler["client"] = mockLanguageModelChat as unknown as vscode.LanguageModelChat + expect(guardHandler.getCondenseContextWindow()).toBe(guardHandler.getModel().info.contextWindow) + expect(guardHandler.getCondenseContextWindow()).toBe(mockLanguageModelChat.maxInputTokens) + guardHandler.dispose() + } finally { + ;(vscodeLlmModels[family] as { maxInputTokens: number }).maxInputTokens = original + } + }) }) describe("countTokens", () => { diff --git a/src/core/context-management/__tests__/context-management.spec.ts b/src/core/context-management/__tests__/context-management.spec.ts index 2e81d61e9f..b05ebefabc 100644 --- a/src/core/context-management/__tests__/context-management.spec.ts +++ b/src/core/context-management/__tests__/context-management.spec.ts @@ -1796,6 +1796,87 @@ describe("Context Management", () => { expect(result).toBe(true) }) + it("willManageContext falls back to 100% when the reserve is >= the window (availableInput <= 0)", () => { + // When maxTokens (reserve) >= contextWindow, availableInputTokens = window - reserve <= 0. + // The denominator guard must short-circuit contextPercent to 100 rather than divide by + // a non-positive number, so the gate fires regardless of the (tiny) totalTokens. + const result = willManageContext({ + totalTokens: 1, + contextWindow: 50000, + maxTokens: 60000, // reserve > window → availableInput = -10000 + autoCondenseContext: true, + autoCondenseContextPercent: 80, + profileThresholds: {}, + currentProfileId: "default", + lastMessageTokens: 0, + }) + // contextPercent === 100 >= 80 threshold → true. 
+ expect(result).toBe(true) + }) + + it("willManageContext falls back to 100% when the reserve exactly equals the window (availableInput === 0)", () => { + // Boundary: reserve === window → availableInputTokens === 0, still the FALSE branch (> 0 is false). + const result = willManageContext({ + totalTokens: 1, + contextWindow: 50000, + maxTokens: 50000, + autoCondenseContext: true, + autoCondenseContextPercent: 90, + profileThresholds: {}, + currentProfileId: "default", + lastMessageTokens: 0, + }) + expect(result).toBe(true) + }) + + it("manageContext summarizes via the 100% fallback when the reserve >= the window (availableInput <= 0)", async () => { + // Mirror the willManageContext edge for the manageContext path: reserve >= window forces + // contextPercent to 100 via the denominator guard, so summarization triggers even though + // totalTokens is small relative to the raw window. + const mockSummary = "Reserve-exceeds-window summary" + const mockSummarizeResponse: condenseModule.SummarizeResponse = { + messages: [ + { role: "user", content: "First message" }, + { role: "user", content: mockSummary, isSummary: true }, + { role: "assistant", content: "Last message" }, + ], + summary: mockSummary, + cost: 0.05, + newContextTokens: 100, + } + const summarizeSpy = vi + .spyOn(condenseModule, "summarizeConversation") + .mockResolvedValue(mockSummarizeResponse) + + // contextWindow 50000, maxTokens 60000 → availableInput = -10000 → contextPercent = 100. 
+ const messagesWithSmallContent = [ + ...messages.slice(0, -1), + { ...messages[messages.length - 1], content: "" }, + ] + + const result = await manageContext({ + messages: messagesWithSmallContent, + totalTokens: 1, + contextWindow: 50000, + maxTokens: 60000, + apiHandler: mockApiHandler, + autoCondenseContext: true, + autoCondenseContextPercent: 80, + systemPrompt: "System prompt", + taskId, + profileThresholds: {}, + currentProfileId: "default", + }) + + expect(summarizeSpy).toHaveBeenCalled() + expect(result).toMatchObject({ + summary: mockSummary, + prevContextTokens: 1, + }) + + summarizeSpy.mockRestore() + }) + it("manageContext summarizes based on available input space, end-to-end", async () => { const mockSummary = "Available-input summary" const mockSummarizeResponse: condenseModule.SummarizeResponse = { From e45155d3b8f784a8af6d2676141e00a691474390 Mon Sep 17 00:00:00 2001 From: Bertan Ari Date: Wed, 24 Jun 2026 09:00:17 -0700 Subject: [PATCH 3/5] =?UTF-8?q?chore(vscode-lm):=20address=20review=20?= =?UTF-8?q?=E2=80=94=20drop=20changeset,=20fix=20condense-window=20guard?= =?UTF-8?q?=20test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove .changeset/vscode-lm-condense-fix.md (changesets are maintainer-managed per AGENTS.md; CodeRabbit flagged). - Fix getCondenseContextWindow() non-positive-guard test so the selector family (claude-opus-4.8) drives the lookup and the zeroed static row actually exercises the maxInputTokens > 0 guard before falling back. 
--- .changeset/vscode-lm-condense-fix.md | 5 ----- src/api/providers/__tests__/vscode-lm.spec.ts | 12 ++++++++---- 2 files changed, 8 insertions(+), 9 deletions(-) delete mode 100644 .changeset/vscode-lm-condense-fix.md diff --git a/.changeset/vscode-lm-condense-fix.md b/.changeset/vscode-lm-condense-fix.md deleted file mode 100644 index a592361786..0000000000 --- a/.changeset/vscode-lm-condense-fix.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"zoo-code": patch ---- - -Fix unreliable automatic context condensing on the VS Code LM (vscode-lm) provider. The condense gate now treats the provider's `maxTokens: -1` (unlimited) as the default output reserve and measures usage against available input space, and a new `getCondenseContextWindow()` seam makes the gate use the curated model `maxInputTokens` instead of the inflated live window. Also refreshes the VS Code LM model catalog and default model. diff --git a/src/api/providers/__tests__/vscode-lm.spec.ts b/src/api/providers/__tests__/vscode-lm.spec.ts index 42779e813c..eb026e8169 100644 --- a/src/api/providers/__tests__/vscode-lm.spec.ts +++ b/src/api/providers/__tests__/vscode-lm.spec.ts @@ -501,9 +501,11 @@ describe("VsCodeLmHandler", () => { noFamilyHandler.dispose() }) - it("falls back to the live window when the static row exists but maxInputTokens is non-positive", () => { + it("falls back to the derived window when the static row exists but maxInputTokens is non-positive", () => { // Guard sub-condition: a curated family is found but its maxInputTokens is <= 0 (corrupt/zeroed). - // The gate must NOT trust that value; it falls through to the live runtime window instead. + // With the selector family `claude-opus-4.8` and no live client, the zeroed static row is the one + // consulted, so the `maxInputTokens > 0` guard fails and the gate falls back to the derived window + // from getModel().info.contextWindow (sane defaults here, since there is no live client). 
const family = "claude-opus-4.8" const original = vscodeLlmModels[family].maxInputTokens try { @@ -511,9 +513,11 @@ describe("VsCodeLmHandler", () => { const guardHandler = new VsCodeLmHandler({ vsCodeLmModelSelector: { vendor: "copilot", family }, }) - guardHandler["client"] = mockLanguageModelChat as unknown as vscode.LanguageModelChat + // Leave the client unset so `family` resolves from the selector (claude-opus-4.8), + // forcing the zeroed static row to be read instead of a live client's family. + guardHandler["client"] = null expect(guardHandler.getCondenseContextWindow()).toBe(guardHandler.getModel().info.contextWindow) - expect(guardHandler.getCondenseContextWindow()).toBe(mockLanguageModelChat.maxInputTokens) + expect(guardHandler.getCondenseContextWindow()).toBe(openAiModelInfoSaneDefaults.contextWindow) guardHandler.dispose() } finally { ;(vscodeLlmModels[family] as { maxInputTokens: number }).maxInputTokens = original From d128a5c1d5dbd2a6c5fc360c32f8dd2bcac89b0d Mon Sep 17 00:00:00 2001 From: Bertan Ari Date: Fri, 26 Jun 2026 06:14:42 -0700 Subject: [PATCH 4/5] fix(vscode-lm): scope available-input condense denominator to vscode-lm; address review Address review feedback from edelauna on #710: - Scope the available-input-space condense percent denominator to vscode-lm only (via the getCondenseContextWindow seam); all other providers keep dividing by the full context window. The maxTokens:-1 reserve guard remains global. - Correct the misleading useSelectedModel comment: the gate's primary window is getCondenseContextWindow() (static maxInputTokens), not getModel().info.contextWindow. - Strengthen the listed-family test with a claude-opus-4.8 case (contextWindow != maxInputTokens) to catch a field swap. 
--- .../__tests__/context-management.spec.ts | 131 ++++++++++++++++-- src/core/context-management/index.ts | 62 ++++++--- src/core/task/Task.ts | 11 ++ .../hooks/__tests__/useSelectedModel.spec.ts | 20 +++ .../components/ui/hooks/useSelectedModel.ts | 7 +- 5 files changed, 203 insertions(+), 28 deletions(-) diff --git a/src/core/context-management/__tests__/context-management.spec.ts b/src/core/context-management/__tests__/context-management.spec.ts index b05ebefabc..ba0a77aacf 100644 --- a/src/core/context-management/__tests__/context-management.spec.ts +++ b/src/core/context-management/__tests__/context-management.spec.ts @@ -1508,8 +1508,8 @@ describe("Context Management", () => { }) it("should return false when context percent is below threshold", () => { - // Usage is measured against available input space (contextWindow - maxTokens reserve). - // available = 100000 - 30000 = 70000; 30000 / 70000 ≈ 43% < 50% threshold. + // Available-input denominator (opt-in): available = 100000 - 30000 = 70000; + // 30000 / 70000 ≈ 43% < 50% threshold. const result = willManageContext({ totalTokens: 30000, contextWindow: 100000, @@ -1519,6 +1519,7 @@ describe("Context Management", () => { profileThresholds: {}, currentProfileId: "default", lastMessageTokens: 0, + useAvailableInputForContextPercent: true, }) expect(result).toBe(false) }) @@ -1604,10 +1605,10 @@ describe("Context Management", () => { }) it("should include lastMessageTokens in the calculation", () => { - // Usage is measured against available input space (contextWindow - maxTokens reserve). - // available = 100000 - 30000 = 70000. + // Available-input denominator (opt-in): available = 100000 - 30000 = 70000. // Without lastMessageTokens: 34000 / 70000 ≈ 48.6% < 50% threshold. // With lastMessageTokens: (34000 + 2000) / 70000 ≈ 51.4% ≥ 50% threshold. + // (Against the full window both cases are < 50%, so this case requires the opt-in flag.) 
const resultWithoutLastMessage = willManageContext({ totalTokens: 34000, contextWindow: 100000, @@ -1617,6 +1618,7 @@ describe("Context Management", () => { profileThresholds: {}, currentProfileId: "default", lastMessageTokens: 0, + useAvailableInputForContextPercent: true, }) expect(resultWithoutLastMessage).toBe(false) @@ -1629,6 +1631,7 @@ describe("Context Management", () => { profileThresholds: {}, currentProfileId: "default", lastMessageTokens: 2000, // Pushes usage over 50% of available input + useAvailableInputForContextPercent: true, }) expect(resultWithLastMessage).toBe(true) }) @@ -1728,12 +1731,13 @@ describe("Context Management", () => { }) /** - * Regression tests: the condense gate must measure usage against available input space - * (contextWindow - reserved output), not the raw context window. This keeps the gate in - * lockstep with the UI context gauge and ensures it actually fires for providers like - * vscode-lm that report maxTokens: -1. + * Regression tests for the opt-in available-input denominator (vscode-lm). With the flag on, + * the condense gate measures usage against available input space (contextWindow - reserved + * output), not the raw context window. This keeps the gate in lockstep with the UI context + * gauge and ensures it actually fires for vscode-lm, which reports maxTokens: -1. The default + * (full-window) behavior for every other provider is covered by the sibling describe below. 
*/ - describe("contextPercent uses available input space (regression)", () => { + describe("contextPercent uses available input space (opt-in, regression)", () => { const createModelInfo = (contextWindow: number, maxTokens?: number): ModelInfo => ({ contextWindow, supportsPromptCache: true, @@ -1761,6 +1765,7 @@ describe("Context Management", () => { profileThresholds: {}, currentProfileId: "default", lastMessageTokens: 0, + useAvailableInputForContextPercent: true, }) expect(result).toBe(true) }) @@ -1776,6 +1781,7 @@ describe("Context Management", () => { profileThresholds: {}, currentProfileId: "default", lastMessageTokens: 0, + useAvailableInputForContextPercent: true, }) expect(result).toBe(false) }) @@ -1792,6 +1798,7 @@ describe("Context Management", () => { profileThresholds: {}, currentProfileId: "default", lastMessageTokens: 0, + useAvailableInputForContextPercent: true, }) expect(result).toBe(true) }) @@ -1809,6 +1816,7 @@ describe("Context Management", () => { profileThresholds: {}, currentProfileId: "default", lastMessageTokens: 0, + useAvailableInputForContextPercent: true, }) // contextPercent === 100 >= 80 threshold → true. expect(result).toBe(true) @@ -1825,6 +1833,7 @@ describe("Context Management", () => { profileThresholds: {}, currentProfileId: "default", lastMessageTokens: 0, + useAvailableInputForContextPercent: true, }) expect(result).toBe(true) }) @@ -1866,6 +1875,7 @@ describe("Context Management", () => { taskId, profileThresholds: {}, currentProfileId: "default", + useAvailableInputForContextPercent: true, }) expect(summarizeSpy).toHaveBeenCalled() @@ -1914,6 +1924,7 @@ describe("Context Management", () => { taskId, profileThresholds: {}, currentProfileId: "default", + useAvailableInputForContextPercent: true, }) expect(summarizeSpy).toHaveBeenCalled() @@ -1925,4 +1936,106 @@ describe("Context Management", () => { summarizeSpy.mockRestore() }) }) + + /** + * Scoping tests: the available-input denominator is opt-in. 
By default (flag omitted), the gate + * divides by the FULL context window, exactly as every non-vscode-lm provider did before the + * vscode-lm fix. The maxTokens: -1 reserve guard, however, remains global on the default path. + */ + describe("contextPercent denominator is opt-in (default = full window)", () => { + const messages: ApiMessage[] = [ + { role: "user", content: "First message" }, + { role: "assistant", content: "Second message" }, + { role: "user", content: "Third message" }, + { role: "assistant", content: "Fourth message" }, + { role: "user", content: "Fifth message" }, + ] + + it("willManageContext divides by the full window when the flag is omitted (default)", () => { + // Same inputs as the regression block: contextWindow 200000, reserve 64000, totalTokens 100000. + // Default (full window): 100000 / 200000 = 50% < 70% threshold → false. Under the opt-in + // available-input math it would be ≈ 73.5% and fire — this proves the scoping. + const result = willManageContext({ + totalTokens: 100000, + contextWindow: 200000, + maxTokens: 64000, + autoCondenseContext: true, + autoCondenseContextPercent: 70, + profileThresholds: {}, + currentProfileId: "default", + lastMessageTokens: 0, + }) + expect(result).toBe(false) + }) + + it("willManageContext fires on the same inputs when the opt-in flag is true", () => { + // Identical inputs, flag on: available input 136000 → 100000 / 136000 ≈ 73.5% ≥ 70% → true. + const result = willManageContext({ + totalTokens: 100000, + contextWindow: 200000, + maxTokens: 64000, + autoCondenseContext: true, + autoCondenseContextPercent: 70, + profileThresholds: {}, + currentProfileId: "default", + lastMessageTokens: 0, + useAvailableInputForContextPercent: true, + }) + expect(result).toBe(true) + }) + + it("keeps the maxTokens:-1 reserve guard on the default (full-window) path", () => { + // The reserve guard is global, independent of the percent denominator. 
With auto-condense + // off, only the allowedTokens path can fire: allowedTokens = 100000 * 0.9 - 8192 = 81808; + // totalTokens 85000 > 81808 → true. (A naive `maxTokens || DEFAULT` keeping -1 would break this.) + const result = willManageContext({ + totalTokens: 85000, + contextWindow: 100000, + maxTokens: -1, + autoCondenseContext: false, + autoCondenseContextPercent: 50, + profileThresholds: {}, + currentProfileId: "default", + lastMessageTokens: 0, + }) + expect(result).toBe(true) + }) + + it("manageContext does NOT summarize on the default path where the opt-in math would have", async () => { + // contextWindow 200000, reserve 64000, totalTokens 100000. Default full-window percent is + // 50% < 70% threshold, and allowedTokens = 200000 * 0.9 - 64000 = 116000 > 100000, so neither + // condense nor truncation runs. With the opt-in flag this same case summarizes (asserted above + // in the regression block), proving the default path reverts to pre-fix behavior. + const summarizeSpy = vi.spyOn(condenseModule, "summarizeConversation") + + const messagesWithSmallContent = [ + ...messages.slice(0, -1), + { ...messages[messages.length - 1], content: "" }, + ] + + const result = await manageContext({ + messages: messagesWithSmallContent, + totalTokens: 100000, + contextWindow: 200000, + maxTokens: 64000, + apiHandler: mockApiHandler, + autoCondenseContext: true, + autoCondenseContextPercent: 70, + systemPrompt: "System prompt", + taskId, + profileThresholds: {}, + currentProfileId: "default", + }) + + expect(summarizeSpy).not.toHaveBeenCalled() + expect(result).toEqual({ + messages: messagesWithSmallContent, + summary: "", + cost: 0, + prevContextTokens: 100000, + }) + + summarizeSpy.mockRestore() + }) + }) }) diff --git a/src/core/context-management/index.ts b/src/core/context-management/index.ts index cc4d5ba3d7..b4d89487fd 100644 --- a/src/core/context-management/index.ts +++ b/src/core/context-management/index.ts @@ -147,6 +147,14 @@ export type 
WillManageContextOptions = { profileThresholds: Record currentProfileId: string lastMessageTokens: number + /** + * Opt-in: measure the condense percentage against the available input space + * (contextWindow - reserved output) instead of the full context window. Only providers + * whose advertised live window is inflated relative to the usable input ceiling (vscode-lm, + * which exposes the seam via getCondenseContextWindow) set this. All other providers leave it + * undefined and keep dividing by the full context window (original behavior). + */ + useAvailableInputForContextPercent?: boolean } /** @@ -167,6 +175,7 @@ export function willManageContext({ profileThresholds, currentProfileId, lastMessageTokens, + useAvailableInputForContextPercent, }: WillManageContextOptions): boolean { if (!autoCondenseContext) { // When auto-condense is disabled, only truncation can occur @@ -194,14 +203,20 @@ export function willManageContext({ // Invalid values fall back to global setting (effectiveThreshold already set) } - // Measure usage against the available input space (context window minus the - // reserved output budget), matching the context gauge shown in the UI. Reserved - // output tokens can never hold conversation context, so this is the meaningful - // "how full is my usable input" figure. When the reserve is unknown/unlimited - // (e.g., vscode-lm reports -1), fall back to the full context window. - const reservedForOutput = maxTokens && maxTokens > 0 ? maxTokens : 0 - const availableInputTokens = contextWindow - reservedForOutput - const contextPercent = availableInputTokens > 0 ? (100 * prevContextTokens) / availableInputTokens : 100 + // By default, measure usage against the full context window (original behavior shared by all + // providers). 
Opt-in (vscode-lm via getCondenseContextWindow) measures against the available + // input space (context window minus the reserved output budget) to match the UI context gauge, + // because that provider's advertised window is inflated relative to its usable input ceiling. + // Reserved output tokens can never hold conversation context. When the reserve is + // unknown/unlimited (e.g., vscode-lm reports -1), fall back to the full context window. + let contextPercent: number + if (useAvailableInputForContextPercent) { + const reservedForOutput = maxTokens && maxTokens > 0 ? maxTokens : 0 + const availableInputTokens = contextWindow - reservedForOutput + contextPercent = availableInputTokens > 0 ? (100 * prevContextTokens) / availableInputTokens : 100 + } else { + contextPercent = (100 * prevContextTokens) / contextWindow + } return contextPercent >= effectiveThreshold || prevContextTokens > allowedTokens } @@ -238,6 +253,14 @@ export type ContextManagementOptions = { cwd?: string /** Optional controller for file access validation */ rooIgnoreController?: RooIgnoreController + /** + * Opt-in: measure the condense percentage against the available input space + * (contextWindow - reserved output) instead of the full context window. Only providers + * whose advertised live window is inflated relative to the usable input ceiling (vscode-lm, + * which exposes the seam via getCondenseContextWindow) set this. All other providers leave it + * undefined and keep dividing by the full context window (original behavior). 
+ */ + useAvailableInputForContextPercent?: boolean } export type ContextManagementResult = SummarizeResponse & { @@ -271,6 +294,7 @@ export async function manageContext({ filesReadByRoo, cwd, rooIgnoreController, + useAvailableInputForContextPercent, }: ContextManagementOptions): Promise { let error: string | undefined let errorDetails: string | undefined @@ -314,14 +338,20 @@ export async function manageContext({ // If no specific threshold is found for the profile, fall back to global setting if (autoCondenseContext) { - // Measure usage against the available input space (context window minus the - // reserved output budget), matching the context gauge shown in the UI. Reserved - // output tokens can never hold conversation context, so this is the meaningful - // "how full is my usable input" figure. When the reserve is unknown/unlimited - // (e.g., vscode-lm reports -1), fall back to the full context window. - const reservedForOutput = maxTokens && maxTokens > 0 ? maxTokens : 0 - const availableInputTokens = contextWindow - reservedForOutput - const contextPercent = availableInputTokens > 0 ? (100 * prevContextTokens) / availableInputTokens : 100 + // By default, measure usage against the full context window (original behavior shared by all + // providers). Opt-in (vscode-lm via getCondenseContextWindow) measures against the available + // input space (context window minus the reserved output budget) to match the UI context gauge, + // because that provider's advertised window is inflated relative to its usable input ceiling. + // Reserved output tokens can never hold conversation context. When the reserve is + // unknown/unlimited (e.g., vscode-lm reports -1), fall back to the full context window. + let contextPercent: number + if (useAvailableInputForContextPercent) { + const reservedForOutput = maxTokens && maxTokens > 0 ? maxTokens : 0 + const availableInputTokens = contextWindow - reservedForOutput + contextPercent = availableInputTokens > 0 ? 
(100 * prevContextTokens) / availableInputTokens : 100 + } else { + contextPercent = (100 * prevContextTokens) / contextWindow + } if (contextPercent >= effectiveThreshold || prevContextTokens > allowedTokens) { // Attempt to intelligently condense the context const result = await summarizeConversation({ diff --git a/src/core/task/Task.ts b/src/core/task/Task.ts index 183b0cd191..81a2435452 100644 --- a/src/core/task/Task.ts +++ b/src/core/task/Task.ts @@ -3743,6 +3743,10 @@ export class Task extends EventEmitter implements TaskLike { // provider returns undefined here and falls back to modelInfo.contextWindow. const contextWindow = this.api.getCondenseContextWindow?.() ?? modelInfo.contextWindow + // Only vscode-lm implements getCondenseContextWindow, so its presence scopes the + // available-input condense denominator to that provider; all others use the full window. + const useAvailableInputForContextPercent = typeof this.api.getCondenseContextWindow === "function" + // Get the current profile ID using the helper method const currentProfileId = this.getCurrentProfileId(state) @@ -3810,6 +3814,7 @@ export class Task extends EventEmitter implements TaskLike { currentProfileId, metadata, environmentDetails, + useAvailableInputForContextPercent, }) if (truncateResult.messages !== this.apiConversationHistory) { @@ -3942,6 +3947,10 @@ export class Task extends EventEmitter implements TaskLike { // provider returns undefined here and falls back to modelInfo.contextWindow. const contextWindow = this.api.getCondenseContextWindow?.() ?? modelInfo.contextWindow + // Only vscode-lm implements getCondenseContextWindow, so its presence scopes the + // available-input condense denominator to that provider; all others use the full window. 
+ const useAvailableInputForContextPercent = typeof this.api.getCondenseContextWindow === "function" + // Get the current profile ID using the helper method const currentProfileId = this.getCurrentProfileId(state) // Check if context management will likely run (threshold check) @@ -3965,6 +3974,7 @@ export class Task extends EventEmitter implements TaskLike { profileThresholds, currentProfileId, lastMessageTokens, + useAvailableInputForContextPercent, }) // Send condenseTaskContextStarted BEFORE manageContext to show in-progress indicator @@ -4047,6 +4057,7 @@ export class Task extends EventEmitter implements TaskLike { filesReadByRoo: contextMgmtFilesReadByRoo, cwd: this.cwd, rooIgnoreController: this.rooIgnoreController, + useAvailableInputForContextPercent, }) if (truncateResult.messages !== this.apiConversationHistory) { await this.overwriteApiConversationHistory(truncateResult.messages) diff --git a/webview-ui/src/components/ui/hooks/__tests__/useSelectedModel.spec.ts b/webview-ui/src/components/ui/hooks/__tests__/useSelectedModel.spec.ts index 6f3f1edc4f..3ffe85e144 100644 --- a/webview-ui/src/components/ui/hooks/__tests__/useSelectedModel.spec.ts +++ b/webview-ui/src/components/ui/hooks/__tests__/useSelectedModel.spec.ts @@ -811,6 +811,26 @@ describe("useSelectedModel", () => { expect(result.current.info?.supportsImages).toBe(false) }) + it("pins a divergent family's contextWindow to maxInputTokens, not its advertised window", () => { + // claude-opus-4.8 is the row where contextWindow (679560) and maxInputTokens (197897) DIFFER. + // The hook must surface maxInputTokens so the bar matches the condense gate; a field swap to + // the advertised contextWindow would be caught here (unlike the default model where they match). 
+ const family = "claude-opus-4.8" + const apiConfiguration: ProviderSettings = { + apiProvider: "vscode-lm", + vsCodeLmModelSelector: { vendor: "copilot", family }, + } + + const wrapper = createWrapper() + const { result } = renderHook(() => useSelectedModel(apiConfiguration), { wrapper }) + + expect(result.current.provider).toBe("vscode-lm") + expect(result.current.id).toBe(`copilot/${family}`) + expect(result.current.info?.contextWindow).toBe(vscodeLlmModels[family].maxInputTokens) // 197897 + expect(result.current.info?.contextWindow).not.toBe(vscodeLlmModels[family].contextWindow) // NOT 679560 + expect(result.current.info?.supportsImages).toBe(false) + }) + it("falls back to the default model's window for an unlisted family (NOT 128000)", () => { const apiConfiguration: ProviderSettings = { apiProvider: "vscode-lm", diff --git a/webview-ui/src/components/ui/hooks/useSelectedModel.ts b/webview-ui/src/components/ui/hooks/useSelectedModel.ts index 8c8a5360da..a5940ba7d3 100644 --- a/webview-ui/src/components/ui/hooks/useSelectedModel.ts +++ b/webview-ui/src/components/ui/hooks/useSelectedModel.ts @@ -315,9 +315,10 @@ function getSelectedModel({ // auto-condense never fires (the gate uses the live window). const listedModel = vscodeLlmModels[modelFamily as keyof typeof vscodeLlmModels] ?? vscodeLlmModels[vscodeLlmDefaultModelId] - // contextWindow MUST equal maxInputTokens: that is the exact value the gate consumes via - // getModel().info.contextWindow = Math.max(0, client.maxInputTokens) in src/api/providers/vscode-lm.ts, - // so the UI bar and the condense gate share a single source of truth. + // Set contextWindow = maxInputTokens so the UI bar matches what the condense gate uses for + // vscode-lm. The gate's primary window comes from getCondenseContextWindow() (which returns the + // static-table maxInputTokens); getModel().info.contextWindow is only the fallback. Sharing + // maxInputTokens keeps the bar and the gate on a single source of truth. 
const info: ModelInfo = { ...openAiModelInfoSaneDefaults, ...listedModel, From 30389d3ac8a8c693446c702126a4752d0638abee Mon Sep 17 00:00:00 2001 From: Bertan Ari Date: Fri, 26 Jun 2026 13:06:48 -0700 Subject: [PATCH 5/5] docs(vscode-lm): tighten auto-condense comments Simplify comments added in PR #710 to be brief and rationale-focused; no logic, assertions, or test values changed. --- .../types/src/__tests__/vscode-llm.spec.ts | 7 +- packages/types/src/providers/vscode-llm.ts | 11 +-- src/api/index.ts | 10 +-- src/api/providers/__tests__/vscode-lm.spec.ts | 19 +++-- src/api/providers/vscode-lm.ts | 13 ++-- .../__tests__/context-management.spec.ts | 69 +++++-------------- src/core/context-management/index.ts | 30 +++----- src/core/task/Task.ts | 16 ++--- .../chat/__tests__/TaskHeader.spec.tsx | 7 +- .../hooks/__tests__/useSelectedModel.spec.ts | 8 +-- .../components/ui/hooks/useSelectedModel.ts | 12 ++-- 11 files changed, 60 insertions(+), 142 deletions(-) diff --git a/packages/types/src/__tests__/vscode-llm.spec.ts b/packages/types/src/__tests__/vscode-llm.spec.ts index 7a2eabddf7..041bc3c8b4 100644 --- a/packages/types/src/__tests__/vscode-llm.spec.ts +++ b/packages/types/src/__tests__/vscode-llm.spec.ts @@ -3,11 +3,8 @@ import { vscodeLlmModels, vscodeLlmDefaultModelId } from "../providers/vscode-ll describe("vscodeLlmModels", () => { it("exposes the opus-4.8 row with its measured maxInputTokens and contextWindow", () => { - // The VS Code LM API exposes only maxInputTokens; that is the value the UI reads from this - // table (useSelectedModel.ts). For claude-opus-4.8 the two fields intentionally DIVERGE: - // maxInputTokens (197897) is the enforced input ceiling, while contextWindow (679560) records - // the larger advertised window. The UI reads maxInputTokens, so the divergence is a deliberate - // tripwire — assert the actual on-disk literals rather than forcing equality. 
+ // claude-opus-4.8 intentionally diverges: maxInputTokens (197897) is the enforced ceiling the + // UI reads, contextWindow (679560) the advertised window. Assert the on-disk literals as a tripwire. expect(vscodeLlmModels).toHaveProperty("claude-opus-4.8") expect(vscodeLlmModels["claude-opus-4.8"].contextWindow).toBe(679560) expect(vscodeLlmModels["claude-opus-4.8"].maxInputTokens).toBe(197897) diff --git a/packages/types/src/providers/vscode-llm.ts b/packages/types/src/providers/vscode-llm.ts index 46df75fac9..5286b0ed28 100644 --- a/packages/types/src/providers/vscode-llm.ts +++ b/packages/types/src/providers/vscode-llm.ts @@ -5,15 +5,8 @@ export type VscodeLlmModelId = keyof typeof vscodeLlmModels export const vscodeLlmDefaultModelId: VscodeLlmModelId = "claude-sonnet-4.5" // Curated VS Code LM (GitHub Copilot) model catalog. -// -// The VS Code LM API only exposes `maxInputTokens` per model; it does NOT report a separate -// total context window. For each row, `contextWindow` records the model's advertised window -// while `maxInputTokens` is the enforced input ceiling the UI actually reads (via -// useSelectedModel.ts) and the condense gate measures against. For most rows the two values -// match. They intentionally DIVERGE only where the provider advertises a larger window than the -// usable input ceiling (e.g. claude-opus-4.8): keeping both fields lets the context bar and the -// auto-condense gate stay on a single source of truth (maxInputTokens) without losing the real -// advertised window. +// The API exposes only `maxInputTokens`; the UI and condense gate read that. `contextWindow` is +// the advertised window, kept for rows where it diverges from the ceiling (e.g. claude-opus-4.8). 
export const vscodeLlmModels = { "claude-opus-4.8": { contextWindow: 679560, diff --git a/src/api/index.ts b/src/api/index.ts index 00201b0d21..9e4ba3bfb5 100644 --- a/src/api/index.ts +++ b/src/api/index.ts @@ -108,13 +108,9 @@ export interface ApiHandler { getModel(): { id: string; info: ModelInfo } /** - * Optional: the context window (in tokens) to use for context-management / - * auto-condense decisions, when it must differ from getModel().info.contextWindow. - * - * Only the VS Code LM (Copilot) provider overrides this, to measure usage against the - * model's static `maxInputTokens` instead of the inflated live window VS Code reports. - * Other providers leave it undefined and callers fall back to getModel().info.contextWindow, - * so their behavior is unchanged. + * Optional context window for context-management / auto-condense when it must differ from + * getModel().info.contextWindow. Only VS Code LM overrides it (static `maxInputTokens` vs its + * inflated live window); others leave it undefined and callers fall back. */ getCondenseContextWindow?(): number diff --git a/src/api/providers/__tests__/vscode-lm.spec.ts b/src/api/providers/__tests__/vscode-lm.spec.ts index eb026e8169..5c425b5e25 100644 --- a/src/api/providers/__tests__/vscode-lm.spec.ts +++ b/src/api/providers/__tests__/vscode-lm.spec.ts @@ -443,8 +443,7 @@ describe("VsCodeLmHandler", () => { }) it("should use the full advertised maxInputTokens without an upper cap", async () => { - // VS Code can report a very large advertised window; getModel surfaces it as-is - // (Math.max(0, maxInputTokens)) rather than clamping to a smaller default. + // A large advertised window is surfaced as-is, not clamped to a smaller default. 
const mockModel = { ...mockLanguageModelChat, maxInputTokens: 936000 } ;(vscode.lm.selectChatModels as Mock).mockResolvedValue([mockModel]) handler["client"] = null @@ -485,15 +484,15 @@ describe("VsCodeLmHandler", () => { }) it("falls back to the live model context window for families not in the static table", () => { - // test-family is not a curated row, so the gate uses the live runtime window. + // Not a curated row, so the gate uses the live runtime window. handler["client"] = mockLanguageModelChat as unknown as vscode.LanguageModelChat expect(handler.getCondenseContextWindow()).toBe(handler.getModel().info.contextWindow) expect(handler.getCondenseContextWindow()).toBe(mockLanguageModelChat.maxInputTokens) }) it("falls back to the live window when no family is resolvable (no client, no selector family)", () => { - // With neither a client nor a selector family, `family` is undefined, so the static-table - // lookup is skipped entirely and the gate uses getModel().info.contextWindow (fallback info). + // No client and no selector family means `family` is undefined, so the gate skips the + // static lookup and uses getModel().info.contextWindow. const noFamilyHandler = new VsCodeLmHandler({ vsCodeLmModelSelector: { vendor: "copilot" } }) noFamilyHandler["client"] = null expect(noFamilyHandler.getCondenseContextWindow()).toBe(noFamilyHandler.getModel().info.contextWindow) @@ -502,10 +501,8 @@ describe("VsCodeLmHandler", () => { }) it("falls back to the derived window when the static row exists but maxInputTokens is non-positive", () => { - // Guard sub-condition: a curated family is found but its maxInputTokens is <= 0 (corrupt/zeroed). - // With the selector family `claude-opus-4.8` and no live client, the zeroed static row is the one - // consulted, so the `maxInputTokens > 0` guard fails and the gate falls back to the derived window - // from getModel().info.contextWindow (sane defaults here, since there is no live client). 
+ // A curated row exists but its maxInputTokens is <= 0, so the `> 0` guard fails and the gate + // falls back to getModel().info.contextWindow. const family = "claude-opus-4.8" const original = vscodeLlmModels[family].maxInputTokens try { @@ -513,8 +510,8 @@ describe("VsCodeLmHandler", () => { const guardHandler = new VsCodeLmHandler({ vsCodeLmModelSelector: { vendor: "copilot", family }, }) - // Leave the client unset so `family` resolves from the selector (claude-opus-4.8), - // forcing the zeroed static row to be read instead of a live client's family. + // Leave the client unset so `family` resolves from the selector, forcing the zeroed + // static row to be read instead of a live client's family. guardHandler["client"] = null expect(guardHandler.getCondenseContextWindow()).toBe(guardHandler.getModel().info.contextWindow) expect(guardHandler.getCondenseContextWindow()).toBe(openAiModelInfoSaneDefaults.contextWindow) diff --git a/src/api/providers/vscode-lm.ts b/src/api/providers/vscode-lm.ts index d730658b44..9adcefa972 100644 --- a/src/api/providers/vscode-lm.ts +++ b/src/api/providers/vscode-lm.ts @@ -563,15 +563,10 @@ export class VsCodeLmHandler extends BaseProvider implements SingleCompletionHan } /** - * Context window used for auto-condense / context-management decisions. - * - * VS Code's LM API reports `client.maxInputTokens` as Copilot's *advertised* window, - * which is far larger than the realistic usable window; relying on it keeps auto-condense - * from ever firing. For condense decisions we instead measure usage against the curated - * static table's `maxInputTokens` — the same value the context bar uses via - * `useSelectedModel` — so the gate and the gauge stay on one source of truth. - * - * Falls back to the live runtime window when the selected model isn't in the static table. + * Context window for auto-condense. 
The API's advertised `client.maxInputTokens` is far larger + * than usable, so relying on it stops auto-condense from firing; measure against the curated + * static table's `maxInputTokens` instead (the same value the bar uses). Fall back to the live + * window when the model isn't in the table. */ getCondenseContextWindow(): number { const family = this.client?.family ?? this.options.vsCodeLmModelSelector?.family diff --git a/src/core/context-management/__tests__/context-management.spec.ts b/src/core/context-management/__tests__/context-management.spec.ts index ba0a77aacf..89797b045f 100644 --- a/src/core/context-management/__tests__/context-management.spec.ts +++ b/src/core/context-management/__tests__/context-management.spec.ts @@ -810,8 +810,7 @@ describe("Context Management", () => { const summarizeSpy = vi.spyOn(condenseModule, "summarizeConversation") const modelInfo = createModelInfo(100000, 30000) - // Usage is measured against available input space (contextWindow - maxTokens reserve). - // available = 100000 - 30000 = 70000; 30000 / 70000 ≈ 43% < 50% threshold. + // Usage measured against available input space stays below the threshold. const contextWindow = modelInfo.contextWindow const totalTokens = 30000 const messagesWithSmallContent = [ @@ -1508,8 +1507,7 @@ describe("Context Management", () => { }) it("should return false when context percent is below threshold", () => { - // Available-input denominator (opt-in): available = 100000 - 30000 = 70000; - // 30000 / 70000 ≈ 43% < 50% threshold. + // Opt-in available-input denominator: usage stays below threshold. const result = willManageContext({ totalTokens: 30000, contextWindow: 100000, @@ -1525,12 +1523,7 @@ describe("Context Management", () => { }) it("should treat a negative maxTokens (vscode-lm reports -1) as the default reserve, not -1", () => { - // vscode-lm reports maxTokens: -1 (unlimited). 
A naive `maxTokens || DEFAULT` keeps -1, - // which would make allowedTokens balloon past the window and skew the percentage. The - // guard must treat -1 like an unknown reserve (ANTHROPIC_DEFAULT_MAX_TOKENS for the - // allowed-tokens math, zero reserve for the available-input percentage). - // With autoCondenseContext disabled, only the allowedTokens path can trigger: - // allowedTokens = 100000 * 0.9 - 8192 = 81808; totalTokens 85000 > 81808 → true. + // A -1 reserve must be treated as unknown (default reserve), not kept as -1. const result = willManageContext({ totalTokens: 85000, contextWindow: 100000, @@ -1605,10 +1598,7 @@ describe("Context Management", () => { }) it("should include lastMessageTokens in the calculation", () => { - // Available-input denominator (opt-in): available = 100000 - 30000 = 70000. - // Without lastMessageTokens: 34000 / 70000 ≈ 48.6% < 50% threshold. - // With lastMessageTokens: (34000 + 2000) / 70000 ≈ 51.4% ≥ 50% threshold. - // (Against the full window both cases are < 50%, so this case requires the opt-in flag.) + // Adding lastMessageTokens pushes usage over the threshold (opt-in available-input denominator). const resultWithoutLastMessage = willManageContext({ totalTokens: 34000, contextWindow: 100000, @@ -1731,11 +1721,8 @@ describe("Context Management", () => { }) /** - * Regression tests for the opt-in available-input denominator (vscode-lm). With the flag on, - * the condense gate measures usage against available input space (contextWindow - reserved - * output), not the raw context window. This keeps the gate in lockstep with the UI context - * gauge and ensures it actually fires for vscode-lm, which reports maxTokens: -1. The default - * (full-window) behavior for every other provider is covered by the sibling describe below. 
+ * Regression: with the opt-in flag on, the gate measures usage against available input space + * (contextWindow - reserved output) so it stays in lockstep with the UI gauge and fires for vscode-lm. */ describe("contextPercent uses available input space (opt-in, regression)", () => { const createModelInfo = (contextWindow: number, maxTokens?: number): ModelInfo => ({ @@ -1753,9 +1740,7 @@ describe("Context Management", () => { ] it("willManageContext measures the percentage against available input, not the full window", () => { - // contextWindow 200000, reserve 64000 → available input 136000. - // totalTokens 100000 → 100000 / 136000 ≈ 73.5%, which clears the 70% threshold. - // Against the full window it would be only 50% and the gate would (wrongly) stay closed. + // Dividing by available input clears the threshold; the full window would keep the gate closed. const result = willManageContext({ totalTokens: 100000, contextWindow: 200000, @@ -1771,7 +1756,7 @@ describe("Context Management", () => { }) it("willManageContext stays below threshold when usage is under available input", () => { - // available input 136000; totalTokens 90000 → ≈ 66.2% < 70% threshold. + // Usage under available input stays below threshold. const result = willManageContext({ totalTokens: 90000, contextWindow: 200000, @@ -1787,8 +1772,7 @@ describe("Context Management", () => { }) it("willManageContext treats an unlimited (-1) reserve as zero reserve for the percentage", () => { - // vscode-lm reports maxTokens: -1. The percentage denominator should fall back to the - // full window (zero reserve): 150000 / 200000 = 75% ≥ 70% threshold. + // A -1 reserve falls back to the full window (zero reserve) for the percentage. 
const result = willManageContext({ totalTokens: 150000, contextWindow: 200000, @@ -1804,9 +1788,7 @@ describe("Context Management", () => { }) it("willManageContext falls back to 100% when the reserve is >= the window (availableInput <= 0)", () => { - // When maxTokens (reserve) >= contextWindow, availableInputTokens = window - reserve <= 0. - // The denominator guard must short-circuit contextPercent to 100 rather than divide by - // a non-positive number, so the gate fires regardless of the (tiny) totalTokens. + // Non-positive available input must short-circuit contextPercent to 100 rather than divide. const result = willManageContext({ totalTokens: 1, contextWindow: 50000, @@ -1818,12 +1800,11 @@ describe("Context Management", () => { lastMessageTokens: 0, useAvailableInputForContextPercent: true, }) - // contextPercent === 100 >= 80 threshold → true. expect(result).toBe(true) }) it("willManageContext falls back to 100% when the reserve exactly equals the window (availableInput === 0)", () => { - // Boundary: reserve === window → availableInputTokens === 0, still the FALSE branch (> 0 is false). + // Boundary: reserve === window → available input 0, still the non-positive guard. const result = willManageContext({ totalTokens: 1, contextWindow: 50000, @@ -1839,9 +1820,7 @@ describe("Context Management", () => { }) it("manageContext summarizes via the 100% fallback when the reserve >= the window (availableInput <= 0)", async () => { - // Mirror the willManageContext edge for the manageContext path: reserve >= window forces - // contextPercent to 100 via the denominator guard, so summarization triggers even though - // totalTokens is small relative to the raw window. + // reserve >= window forces contextPercent to 100, so summarization triggers. 
const mockSummary = "Reserve-exceeds-window summary" const mockSummarizeResponse: condenseModule.SummarizeResponse = { messages: [ @@ -1857,7 +1836,6 @@ describe("Context Management", () => { .spyOn(condenseModule, "summarizeConversation") .mockResolvedValue(mockSummarizeResponse) - // contextWindow 50000, maxTokens 60000 → availableInput = -10000 → contextPercent = 100. const messagesWithSmallContent = [ ...messages.slice(0, -1), { ...messages[messages.length - 1], content: "" }, @@ -1904,8 +1882,7 @@ describe("Context Management", () => { .mockResolvedValue(mockSummarizeResponse) const modelInfo = createModelInfo(200000, 64000) - // available input 136000; totalTokens 100000 → ≈ 73.5% ≥ 70% threshold, but only 50% of - // the raw window. The end-to-end path must trigger summarization on the available-input math. + // Clears the threshold against available input but not the raw window; end-to-end must summarize. const totalTokens = 100000 const messagesWithSmallContent = [ ...messages.slice(0, -1), @@ -1938,9 +1915,8 @@ describe("Context Management", () => { }) /** - * Scoping tests: the available-input denominator is opt-in. By default (flag omitted), the gate - * divides by the FULL context window, exactly as every non-vscode-lm provider did before the - * vscode-lm fix. The maxTokens: -1 reserve guard, however, remains global on the default path. + * Scoping: the available-input denominator is opt-in; default divides by the full window. + * The maxTokens: -1 reserve guard stays global on the default path. */ describe("contextPercent denominator is opt-in (default = full window)", () => { const messages: ApiMessage[] = [ @@ -1952,9 +1928,7 @@ describe("Context Management", () => { ] it("willManageContext divides by the full window when the flag is omitted (default)", () => { - // Same inputs as the regression block: contextWindow 200000, reserve 64000, totalTokens 100000. - // Default (full window): 100000 / 200000 = 50% < 70% threshold → false. 
Under the opt-in - // available-input math it would be ≈ 73.5% and fire — this proves the scoping. + // Default divides by the full window, staying below threshold where the opt-in math would fire. const result = willManageContext({ totalTokens: 100000, contextWindow: 200000, @@ -1969,7 +1943,7 @@ describe("Context Management", () => { }) it("willManageContext fires on the same inputs when the opt-in flag is true", () => { - // Identical inputs, flag on: available input 136000 → 100000 / 136000 ≈ 73.5% ≥ 70% → true. + // Same inputs, flag on: dividing by available input clears the threshold. const result = willManageContext({ totalTokens: 100000, contextWindow: 200000, @@ -1985,9 +1959,7 @@ describe("Context Management", () => { }) it("keeps the maxTokens:-1 reserve guard on the default (full-window) path", () => { - // The reserve guard is global, independent of the percent denominator. With auto-condense - // off, only the allowedTokens path can fire: allowedTokens = 100000 * 0.9 - 8192 = 81808; - // totalTokens 85000 > 81808 → true. (A naive `maxTokens || DEFAULT` keeping -1 would break this.) + // The -1 reserve guard is global, independent of the percent denominator. const result = willManageContext({ totalTokens: 85000, contextWindow: 100000, @@ -2002,10 +1974,7 @@ describe("Context Management", () => { }) it("manageContext does NOT summarize on the default path where the opt-in math would have", async () => { - // contextWindow 200000, reserve 64000, totalTokens 100000. Default full-window percent is - // 50% < 70% threshold, and allowedTokens = 200000 * 0.9 - 64000 = 116000 > 100000, so neither - // condense nor truncation runs. With the opt-in flag this same case summarizes (asserted above - // in the regression block), proving the default path reverts to pre-fix behavior. + // Default full-window math leaves this case below threshold; the opt-in flag would summarize it. 
const summarizeSpy = vi.spyOn(condenseModule, "summarizeConversation") const messagesWithSmallContent = [ diff --git a/src/core/context-management/index.ts b/src/core/context-management/index.ts index b4d89487fd..ed2ee6be5f 100644 --- a/src/core/context-management/index.ts +++ b/src/core/context-management/index.ts @@ -148,11 +148,8 @@ export type WillManageContextOptions = { currentProfileId: string lastMessageTokens: number /** - * Opt-in: measure the condense percentage against the available input space - * (contextWindow - reserved output) instead of the full context window. Only providers - * whose advertised live window is inflated relative to the usable input ceiling (vscode-lm, - * which exposes the seam via getCondenseContextWindow) set this. All other providers leave it - * undefined and keep dividing by the full context window (original behavior). + * Opt-in (vscode-lm): measure the condense percentage against available input space + * (contextWindow - reserved output) instead of the full window. Others leave it undefined. */ useAvailableInputForContextPercent?: boolean } @@ -203,12 +200,8 @@ export function willManageContext({ // Invalid values fall back to global setting (effectiveThreshold already set) } - // By default, measure usage against the full context window (original behavior shared by all - // providers). Opt-in (vscode-lm via getCondenseContextWindow) measures against the available - // input space (context window minus the reserved output budget) to match the UI context gauge, - // because that provider's advertised window is inflated relative to its usable input ceiling. - // Reserved output tokens can never hold conversation context. When the reserve is - // unknown/unlimited (e.g., vscode-lm reports -1), fall back to the full context window. + // Default: divide by the full context window. Opt-in (vscode-lm) divides by available input + // (window minus reserved output); an unknown/unlimited reserve (-1) falls back to the full window. 
let contextPercent: number if (useAvailableInputForContextPercent) { const reservedForOutput = maxTokens && maxTokens > 0 ? maxTokens : 0 @@ -254,11 +247,8 @@ export type ContextManagementOptions = { /** Optional controller for file access validation */ rooIgnoreController?: RooIgnoreController /** - * Opt-in: measure the condense percentage against the available input space - * (contextWindow - reserved output) instead of the full context window. Only providers - * whose advertised live window is inflated relative to the usable input ceiling (vscode-lm, - * which exposes the seam via getCondenseContextWindow) set this. All other providers leave it - * undefined and keep dividing by the full context window (original behavior). + * Opt-in (vscode-lm): measure the condense percentage against available input space + * (contextWindow - reserved output) instead of the full window. Others leave it undefined. */ useAvailableInputForContextPercent?: boolean } @@ -338,12 +328,8 @@ export async function manageContext({ // If no specific threshold is found for the profile, fall back to global setting if (autoCondenseContext) { - // By default, measure usage against the full context window (original behavior shared by all - // providers). Opt-in (vscode-lm via getCondenseContextWindow) measures against the available - // input space (context window minus the reserved output budget) to match the UI context gauge, - // because that provider's advertised window is inflated relative to its usable input ceiling. - // Reserved output tokens can never hold conversation context. When the reserve is - // unknown/unlimited (e.g., vscode-lm reports -1), fall back to the full context window. + // Default: divide by the full context window. Opt-in (vscode-lm) divides by available input + // (window minus reserved output); an unknown/unlimited reserve (-1) falls back to the full window. 
let contextPercent: number if (useAvailableInputForContextPercent) { const reservedForOutput = maxTokens && maxTokens > 0 ? maxTokens : 0 diff --git a/src/core/task/Task.ts b/src/core/task/Task.ts index 81a2435452..ce9e5bcec2 100644 --- a/src/core/task/Task.ts +++ b/src/core/task/Task.ts @@ -3738,13 +3738,9 @@ export class Task extends EventEmitter implements TaskLike { settings: this.apiConfiguration, }) - // VS Code LM (Copilot) measures usage against its static-table maxInputTokens, not the - // inflated live window, so context management runs in line with the context bar. Every other - // provider returns undefined here and falls back to modelInfo.contextWindow. + // vscode-lm condenses against its static-table maxInputTokens (not the inflated live window); + // only it implements getCondenseContextWindow, so others fall back to the full contextWindow. const contextWindow = this.api.getCondenseContextWindow?.() ?? modelInfo.contextWindow - - // Only vscode-lm implements getCondenseContextWindow, so its presence scopes the - // available-input condense denominator to that provider; all others use the full window. const useAvailableInputForContextPercent = typeof this.api.getCondenseContextWindow === "function" // Get the current profile ID using the helper method @@ -3942,13 +3938,9 @@ export class Task extends EventEmitter implements TaskLike { settings: this.apiConfiguration, }) - // VS Code LM (Copilot) measures usage against its static-table maxInputTokens, not the - // inflated live window, so context management runs in line with the context bar. Every other - // provider returns undefined here and falls back to modelInfo.contextWindow. + // vscode-lm condenses against its static-table maxInputTokens (not the inflated live window); + // only it implements getCondenseContextWindow, so others fall back to the full contextWindow. const contextWindow = this.api.getCondenseContextWindow?.() ?? 
modelInfo.contextWindow - - // Only vscode-lm implements getCondenseContextWindow, so its presence scopes the - // available-input condense denominator to that provider; all others use the full window. const useAvailableInputForContextPercent = typeof this.api.getCondenseContextWindow === "function" // Get the current profile ID using the helper method diff --git a/webview-ui/src/components/chat/__tests__/TaskHeader.spec.tsx b/webview-ui/src/components/chat/__tests__/TaskHeader.spec.tsx index c845382632..252cbbb722 100644 --- a/webview-ui/src/components/chat/__tests__/TaskHeader.spec.tsx +++ b/webview-ui/src/components/chat/__tests__/TaskHeader.spec.tsx @@ -269,11 +269,8 @@ describe("TaskHeader", () => { }) it("should treat a negative maxTokens (vscode-lm reports -1) as zero reserve", () => { - // vscode-lm reports maxTokens: -1 (unlimited). A naive `maxTokens || 0` keeps -1, - // which would inflate available input space and skew the percentage. The guard must - // treat -1 as a zero reserve so available space == contextWindow. - // contextTokens = 250, contextWindow = 1000, reservedForOutput = 0 - // Percentage = 250 / 1000 * 100 = 25% + // vscode-lm reports maxTokens: -1 (unlimited). The guard must treat that negative reserve + // as zero, so available space == contextWindow rather than being inflated by a kept -1. 
mockModelInfo = { contextWindow: 1000, maxTokens: -1 } mockMaxOutputTokens = -1 diff --git a/webview-ui/src/components/ui/hooks/__tests__/useSelectedModel.spec.ts b/webview-ui/src/components/ui/hooks/__tests__/useSelectedModel.spec.ts index 3ffe85e144..f4fd51cffc 100644 --- a/webview-ui/src/components/ui/hooks/__tests__/useSelectedModel.spec.ts +++ b/webview-ui/src/components/ui/hooks/__tests__/useSelectedModel.spec.ts @@ -812,9 +812,8 @@ describe("useSelectedModel", () => { }) it("pins a divergent family's contextWindow to maxInputTokens, not its advertised window", () => { - // claude-opus-4.8 is the row where contextWindow (679560) and maxInputTokens (197897) DIFFER. - // The hook must surface maxInputTokens so the bar matches the condense gate; a field swap to - // the advertised contextWindow would be caught here (unlike the default model where they match). + // claude-opus-4.8 is the row where contextWindow and maxInputTokens differ; a field swap to + // the advertised window would be caught here. const family = "claude-opus-4.8" const apiConfiguration: ProviderSettings = { apiProvider: "vscode-lm", @@ -840,8 +839,7 @@ describe("useSelectedModel", () => { const wrapper = createWrapper() const { result } = renderHook(() => useSelectedModel(apiConfiguration), { wrapper }) - // On a family miss we must NOT fall back to openAiModelInfoSaneDefaults' 128000 window, - // which would diverge from the gate. Instead, use the default model's maxInputTokens. + // A family miss must not use the 128000 sane-defaults window; use the default model's instead. 
expect(result.current.info?.contextWindow).not.toBe(128000) expect(result.current.info?.contextWindow).toBe(vscodeLlmModels[vscodeLlmDefaultModelId].maxInputTokens) expect(result.current.info?.supportsImages).toBe(false) diff --git a/webview-ui/src/components/ui/hooks/useSelectedModel.ts b/webview-ui/src/components/ui/hooks/useSelectedModel.ts index a5940ba7d3..ddc1a19755 100644 --- a/webview-ui/src/components/ui/hooks/useSelectedModel.ts +++ b/webview-ui/src/components/ui/hooks/useSelectedModel.ts @@ -310,15 +310,13 @@ function getSelectedModel({ ? `${apiConfiguration.vsCodeLmModelSelector.vendor}/${apiConfiguration.vsCodeLmModelSelector.family}` : vscodeLlmDefaultModelId const modelFamily = apiConfiguration?.vsCodeLmModelSelector?.family ?? vscodeLlmDefaultModelId - // On a family miss, fall back to the default model entry instead of openAiModelInfoSaneDefaults, - // whose 128K contextWindow would diverge from the gate and make the bar read >100% while - // auto-condense never fires (the gate uses the live window). + // On a family miss, fall back to the default model entry, not openAiModelInfoSaneDefaults + // (whose 128K contextWindow would diverge from the gate and skew the bar). const listedModel = vscodeLlmModels[modelFamily as keyof typeof vscodeLlmModels] ?? vscodeLlmModels[vscodeLlmDefaultModelId] - // Set contextWindow = maxInputTokens so the UI bar matches what the condense gate uses for - // vscode-lm. The gate's primary window comes from getCondenseContextWindow() (which returns the - // static-table maxInputTokens); getModel().info.contextWindow is only the fallback. Sharing - // maxInputTokens keeps the bar and the gate on a single source of truth. + // Set contextWindow = maxInputTokens so the UI bar shares one source of truth with the gate, + // whose primary window is getCondenseContextWindow() (static-table maxInputTokens); the + // provider's getModel().info.contextWindow is only the gate's fallback.
const info: ModelInfo = { ...openAiModelInfoSaneDefaults, ...listedModel,