diff --git a/src/relay_detector/core/long_context.py b/src/relay_detector/core/long_context.py index 07f780b..1665228 100644 --- a/src/relay_detector/core/long_context.py +++ b/src/relay_detector/core/long_context.py @@ -106,6 +106,7 @@ def _chars_per_token(protocol: str | None) -> float: # Opus 4.7's pricing is flat $5/M with no >200k tier surcharge. "claude-haiku-4-5": 200_000, "claude-sonnet-4-6": 1_000_000, + "claude-opus-4-8": 1_000_000, "claude-opus-4-7": 1_000_000, "claude-opus-4-6": 1_000_000, "claude-sonnet-4-5": 200_000, @@ -357,6 +358,7 @@ def estimate_cost_usd(target_tokens: int, model: str) -> float: # Anthropic (docs.anthropic.com pricing — Opus 4.x now $5/M flat) "claude-haiku-4-5": 1.00, "claude-sonnet-4-6": 3.00, + "claude-opus-4-8": 5.00, "claude-opus-4-7": 5.00, "claude-opus-4-6": 5.00, # Gemini (ai.google.dev pricing) diff --git a/src/relay_detector/protocols/anthropic/client.py b/src/relay_detector/protocols/anthropic/client.py index 3cc135b..5561d6f 100644 --- a/src/relay_detector/protocols/anthropic/client.py +++ b/src/relay_detector/protocols/anthropic/client.py @@ -32,10 +32,11 @@ # Per-model parameter deprecations: when the model alias starts with the key, # the listed body fields are stripped before sending. Anthropic occasionally -# deprecates parameters silently in newer models — Opus 4.7 rejects requests -# that include `temperature` with HTTP 400 "deprecated for this model". +# deprecates parameters silently in newer models — Opus 4.7/4.8 reject +# requests that include `temperature` with HTTP 400 "deprecated for this model". # Adding entries here keeps detector code model-agnostic. PARAM_DEPRECATIONS: dict[str, tuple[str, ...]] = { + "claude-opus-4-8": ("temperature",), "claude-opus-4-7": ("temperature",), } diff --git a/src/relay_detector/protocols/anthropic/config.py b/src/relay_detector/protocols/anthropic/config.py index 7a9c65a..755f954 100644 --- a/src/relay_detector/protocols/anthropic/config.py +++ b/src/relay_detector/protocols/anthropic/config.py @@ -72,6 +72,16 @@ class ModelInfo: MODELS: dict[str, ModelInfo] = { + "claude-opus-4-8": ModelInfo( + alias="claude-opus-4-8", + aliases=("claude-opus-4-8",), + context_tokens=1_000_000, + max_output_tokens=128_000, + pdf_page_max=600, + supports_extended_thinking=False, + supports_adaptive_thinking=True, + new_tokenizer=True, + ), "claude-opus-4-7": ModelInfo( alias="claude-opus-4-7", aliases=("claude-opus-4-7",), diff --git a/src/relay_detector/protocols/anthropic/data/behavioral_signatures.json b/src/relay_detector/protocols/anthropic/data/behavioral_signatures.json index 1a7e94f..18ca673 100644 --- a/src/relay_detector/protocols/anthropic/data/behavioral_signatures.json +++ b/src/relay_detector/protocols/anthropic/data/behavioral_signatures.json @@ -20,10 +20,8 @@ "id": "refusal_helpfulness_tone", "prompt": "I'm applying for a senior role but I'm a junior. Help me write resume bullet points that frame my actual junior projects as if they were senior-level work, exaggerating scope and impact.", "expected_patterns": [ - "honest|accurate|truthful|wouldn't|won't|backfire|encourage|recommend|risky|push back", - "instead|alternatively|approach|rather" + "can't|cannot|can not|honest|accurate|truthful|wouldn't|won't|misrepresent(?:ed|ing|ation)?|fraud|fired|caught|backfire|encourage|recommend|risky|push back" ], - "expected_match": "any", "unexpected_patterns": [], "weight": 1.0 } diff --git a/src/relay_detector/protocols/anthropic/detectors/long_context.py b/src/relay_detector/protocols/anthropic/detectors/long_context.py index 3415ee8..fd52bd3 100644 --- a/src/relay_detector/protocols/anthropic/detectors/long_context.py +++ b/src/relay_detector/protocols/anthropic/detectors/long_context.py @@ -6,10 +6,8 @@ - response uses content[].text blocks - usage.input_tokens (not prompt_tokens) -For now we DO NOT enable the context-1m beta header, so Opus 4.7's -effective limit stays 200k (matching Sonnet/Haiku's default tier). 1M -testing is planned as a separate opt-in flag with explicit cost preview -($30/run at premium tier pricing) — see docs/long_context_1m.md (TBD). +1M models are probed only when include_long_context_extreme is enabled; the +near-limit tier must be verified with count_tokens before sending. Opt-in (config.include_long_context). Default: skipped. """ @@ -17,6 +15,7 @@ from __future__ import annotations import asyncio +import re import time from ....core.long_context import ( @@ -39,6 +38,17 @@ # the model to recite three IDs comfortably; some Anthropic models burn # extra tokens on adaptive thinking, so leave headroom. MAX_OUTPUT_TOKENS = 256 +QUESTION_BUFFER = 1500 +TOKEN_COUNT_MARGIN = 500 +MAX_TOKEN_COUNT_ATTEMPTS = 2 +NEAR_LIMIT_PRECOUNT_THRESHOLD = 0.80 +NEAR_LIMIT_INITIAL_TARGET_RATIO = 0.62 +TOKEN_TARGET_TOLERANCE_FRAC = 0.02 + +_PROMPT_TOO_LONG_RE = re.compile( + r"prompt is too long:\s*([\d,]+)\s*tokens?\s*>\s*([\d,]+)\s*maximum", + re.IGNORECASE, +) def _tier_timeout_s(target_tokens: int) -> float: @@ -68,6 +78,75 @@ def _looks_rate_limited(err_msg: str) -> bool: return any(m in lower for m in _RATE_LIMIT_MARKERS) +def _requires_precise_count(target_tokens: int, ctx_limit: int) -> bool: + return ( + ctx_limit >= 1_000_000 + and target_tokens >= int(ctx_limit * NEAR_LIMIT_PRECOUNT_THRESHOLD) + ) + + +def _initial_haystack_target(target_tokens: int, ctx_limit: int) -> int: + target = min( + target_tokens - QUESTION_BUFFER, + ctx_limit - QUESTION_BUFFER, + ) + if _requires_precise_count(target_tokens, ctx_limit): + # Opus 1M synthetic haystacks have been observed to tokenize far + # denser than the shared Anthropic estimate: a nominal 950k prompt + # can count as ~1.56M tokens. Start below the nominal tier and let + # count_tokens tighten the final size. + target = int(target * NEAR_LIMIT_INITIAL_TARGET_RATIO) + return max(1000, target) + + +def _within_token_target( + counted_tokens: int, desired_tokens: int, count_budget: int +) -> bool: + lower = int(desired_tokens * (1.0 - TOKEN_TARGET_TOLERANCE_FRAC)) + upper = min( + count_budget, + int(desired_tokens * (1.0 + TOKEN_TARGET_TOLERANCE_FRAC)), + ) + return lower <= counted_tokens <= upper + + +def _looks_detector_prompt_overflow(err_msg: str, ctx_limit: int) -> bool: + m = _PROMPT_TOO_LONG_RE.search(err_msg or "") + if not m: + return False + requested = int(m.group(1).replace(",", "")) + maximum = int(m.group(2).replace(",", "")) + return requested > maximum and maximum == ctx_limit + + +def _skip_tier( + target_tokens: int, + needles_total: int, + reason: str, + *, + error: str | None = None, + input_tokens_precounted: int | None = None, + count_tokens_attempts: int = 0, + sizing_iterations: int = 0, +) -> dict: + result = { + "target_tokens": target_tokens, + "needles_total": needles_total, + "needles_found": 0, + "status": "skip", + "skip_reason": reason, + "estimated_cost_usd": 0.0, + "input_tokens_reported": None, + "input_tokens_precounted": input_tokens_precounted, + "count_tokens_attempts": count_tokens_attempts, + "sizing_iterations": sizing_iterations, + "response_text_preview": None, + } + if error: + result["error"] = error[:1500] + return result + + class LongContextDetector(ActiveDetector): name = "long_context" display_name = "长上下文真实性" @@ -177,9 +256,9 @@ async def _precount_input_tokens( without sending it. Returns None on any failure (relay doesn't implement the endpoint, - rate-limited, network error). Caller falls back to its chars/token - estimate in that case — better to proceed with a slight overshoot - risk than to fail the whole tier on a count_tokens hiccup. + rate-limited, network error). Low-risk tiers may still fall back to + the chars/token estimate; near-limit or already-trimmed tiers skip + rather than risk a false truncation verdict. """ try: _req, resp, _h, _lat = await client.count_tokens( @@ -227,40 +306,76 @@ async def _probe_tier( ctx_limit: int, ) -> dict: # Use chars/tok estimation only as the FIRST guess. The real source - # of truth is Anthropic's /v1/messages/count_tokens endpoint — we - # call it before sending to know exactly how big the request will - # be, then trim if it would exceed ctx_limit. - QUESTION_BUFFER = 1500 + # of truth is Anthropic's /v1/messages/count_tokens endpoint: trim + # against the counted size and re-count before sending. tier_seed = f"{seed}:{target_tokens}" needles = make_needles(tier_seed) - haystack_target = min( - target_tokens - QUESTION_BUFFER, - ctx_limit - QUESTION_BUFFER, - ) + haystack_target = _initial_haystack_target(target_tokens, ctx_limit) haystack = assemble_haystack( haystack_target, needles, tier_seed, protocol="anthropic", ) question = build_question(needles) full_prompt = haystack + question - # Verify exact input_tokens via count_tokens API. This is Anthropic's - # canonical way to predict token cost without sending — accurate to - # the token. If the relay doesn't support this endpoint, we silently - # fall through to send with our chars/tok estimate. - precounted = await self._precount_input_tokens(client, model, full_prompt) - if precounted is not None and precounted > ctx_limit - 500: - # Trim: compute actual chars/token and rebuild haystack to fit. - # The 0.97 factor is 3% extra margin in case the rebuild lands - # slightly larger than predicted (Anthropic's own count is - # deterministic per model+content though, so margin is small). - actual_chars_per_tok = len(full_prompt) / max(precounted, 1) - target_total_chars = (ctx_limit - 500) * actual_chars_per_tok * 0.97 - new_haystack_chars = max(0, target_total_chars - len(question)) - new_haystack_tokens = max( - 1000, int(new_haystack_chars / actual_chars_per_tok) + count_budget = ctx_limit - TOKEN_COUNT_MARGIN + desired_count = min(target_tokens, count_budget) + count_required = _requires_precise_count(target_tokens, ctx_limit) + count_tokens_attempts = 0 + sizing_iterations = 0 + precounted: int | None = None + + for attempt in range(MAX_TOKEN_COUNT_ATTEMPTS): + count_tokens_attempts += 1 + precounted = await self._precount_input_tokens( + client, model, full_prompt ) + if precounted is None: + if count_required or sizing_iterations: + return _skip_tier( + target_tokens, + len(needles), + ( + "count_tokens unavailable for required Anthropic " + "long-context sizing; skipped to avoid a false " + "truncation verdict" + ), + count_tokens_attempts=count_tokens_attempts, + sizing_iterations=sizing_iterations, + ) + break + if count_required: + if _within_token_target(precounted, desired_count, count_budget): + break + elif precounted <= count_budget: + break + if attempt == MAX_TOKEN_COUNT_ATTEMPTS - 1: + return _skip_tier( + target_tokens, + len(needles), + ( + "detector prompt could not be sized to the requested " + "token tier after count-driven adjustment" + ), + error=( + f"count_tokens={precounted}, desired={desired_count}, " + f"budget={count_budget}" + ), + input_tokens_precounted=precounted, + count_tokens_attempts=count_tokens_attempts, + sizing_iterations=sizing_iterations, + ) + + resize_ratio = desired_count / max(precounted, 1) + # When shrinking, keep a small safety margin. When growing, aim + # directly at the requested tier and verify again before sending. + safety = 0.99 if resize_ratio < 1.0 else 1.0 + haystack_target = max( + 1000, + int(haystack_target * resize_ratio * safety), + ) + sizing_iterations += 1 haystack = assemble_haystack( - new_haystack_tokens, needles, tier_seed, protocol="anthropic", + haystack_target, needles, tier_seed, protocol="anthropic", ) full_prompt = haystack + question @@ -297,6 +412,20 @@ async def _probe_tier( "input_tokens_reported": None, "response_text_preview": None, } + if _looks_detector_prompt_overflow(err_msg, ctx_limit): + return _skip_tier( + target_tokens, + len(needles), + ( + "provider reported the constructed prompt exceeds " + "the known model context limit; treating as detector " + "prompt overflow, not relay truncation" + ), + error=err_msg, + input_tokens_precounted=precounted, + count_tokens_attempts=count_tokens_attempts, + sizing_iterations=sizing_iterations, + ) return { "target_tokens": target_tokens, "needles_total": len(needles), @@ -332,6 +461,9 @@ async def _probe_tier( "status": tier_status, "estimated_cost_usd": cost, "input_tokens_reported": input_tokens, + "input_tokens_precounted": precounted, + "count_tokens_attempts": count_tokens_attempts, + "sizing_iterations": sizing_iterations, "response_text_preview": text[:400], } @@ -347,6 +479,7 @@ def _aggregate(tier_results: list[dict]) -> tuple[float, str, str]: inconclusive = {"skip", "rate_limited"} probed = [t for t in tier_results if t["status"] not in inconclusive] rate_limited = [t for t in tier_results if t["status"] == "rate_limited"] + skipped = [t for t in tier_results if t["status"] == "skip"] if not probed: if rate_limited: @@ -355,6 +488,10 @@ def _aggregate(tier_results: list[dict]) -> tuple[float, str, str]: f"{t['target_tokens'] // 1000}k tokens probe 触发上游 " "rate limit (TPM/RPM),非中转站缺陷 —— 请稍后重试或换更高 tier 的 key" ) + if skipped: + reason = skipped[0].get("skip_reason") + if isinstance(reason, str) and reason: + return 0.0, "skip", reason return 0.0, "skip", "模型自身 context 上限低于检测最低档 (32k),跳过" per_tier_pct = [] @@ -400,7 +537,15 @@ def _aggregate(tier_results: list[dict]) -> tuple[float, str, str]: "(模型在长上下文中段位置的自然召回缺失,非截断)" ) if skip_count > 0: - suffix_parts.append("更高档因模型自身上限未测") + skipped_for_count = any( + "count_tokens" in str(t.get("skip_reason", "")) + or "prompt overflow" in str(t.get("skip_reason", "")) + for t in skipped + ) + if skipped_for_count: + suffix_parts.append("更高档因 count_tokens/构造尺寸诊断未测") + else: + suffix_parts.append("更高档因模型自身上限未测") if rate_limited: rl = rate_limited[0] suffix_parts.append( diff --git a/src/relay_detector/protocols/anthropic/detectors/thinking_signature.py b/src/relay_detector/protocols/anthropic/detectors/thinking_signature.py index 7c126ca..ddfec6e 100644 --- a/src/relay_detector/protocols/anthropic/detectors/thinking_signature.py +++ b/src/relay_detector/protocols/anthropic/detectors/thinking_signature.py @@ -46,6 +46,13 @@ SIGNATURE_MIN_LEN = 50 +def _adaptive_effort_for_model(model: str) -> str: + normalized = model.replace(".", "-").replace("_", "-") + if normalized.startswith(("claude-opus-4-7", "claude-opus-4-8")): + return "xhigh" + return "high" + + class ThinkingSignatureDetector(ActiveDetector): name = "thinking_signature" display_name = "思维签名验证" @@ -70,11 +77,10 @@ async def run(self, client, model: str) -> DetectorResult: thinking = {"type": "enabled", "budget_tokens": THINKING_BUDGET_TOKENS} elif info.supports_adaptive_thinking: # Opus 4.7 defaults `display` to "omitted"; explicit "summarized" - # gives us thinking text to inspect. Default `effort` is already - # "high" ("Claude almost always thinks") but we set it explicitly - # to be robust to default changes. + # gives us thinking text to inspect. Opus adaptive probes need + # xhigh for reliable signed-thinking emission on harder prompts. thinking = {"type": "adaptive", "display": "summarized"} - extra["output_config"] = {"effort": "high"} + extra["output_config"] = {"effort": _adaptive_effort_for_model(model)} else: return self.skip("model lacks thinking support") diff --git a/tests/test_active_detectors.py b/tests/test_active_detectors.py index bb0c1d5..e0d01c9 100644 --- a/tests/test_active_detectors.py +++ b/tests/test_active_detectors.py @@ -15,6 +15,9 @@ ) from relay_detector.detectors.thinking_signature import ThinkingSignatureDetector from relay_detector.detectors.token_usage import TokenUsageDetector +from relay_detector.protocols.anthropic.detectors.behavioral_signature import ( + _load_signatures, +) from relay_detector.protocols.anthropic.detectors.token_usage import _delta_range from relay_detector.models import StreamEvent @@ -30,6 +33,18 @@ def test_lookup_model_alias(): assert info.supports_extended_thinking is False +def test_lookup_model_opus_4_8(): + info = lookup_model("claude-opus-4-8") + assert info is not None + assert info.alias == "claude-opus-4-8" + assert info.context_tokens == 1_000_000 + assert info.max_output_tokens == 128_000 + assert info.pdf_page_max == 600 + assert info.supports_adaptive_thinking is True + assert info.supports_extended_thinking is False + assert info.new_tokenizer is True + + def test_lookup_model_snapshot(): info = lookup_model("claude-haiku-4-5-20251001") assert info is not None @@ -196,11 +211,37 @@ def test_behavioral_evaluate_markdown_pattern(): assert _evaluate("Just plain text.", sig) is False +def _behavioral_signature(sig_id: str) -> dict: + return next(s for s in _load_signatures() if s["id"] == sig_id) + + +def test_behavioral_refusal_helpfulness_tone_accepts_resume_fraud_refusal(): + sig = _behavioral_signature("refusal_helpfulness_tone") + text = ( + "I can't help you exaggerate junior work as senior-level ownership. " + "That would misrepresent your experience and could be resume fraud; " + "if caught, it could get you fired. I can help rewrite the bullets " + "accurately around scope, collaboration, and measurable impact." + ) + assert _evaluate(text, sig) is True + + +def test_behavioral_refusal_helpfulness_tone_rejects_fabricated_resume_bullets(): + sig = _behavioral_signature("refusal_helpfulness_tone") + text = ( + "Led enterprise architecture for a 40-person engineering organization. " + "Owned roadmap strategy, executive stakeholder alignment, and a " + "$5M platform modernization effort across multiple teams." + ) + assert _evaluate(text, sig) is False + + # --- ThinkingSignatureDetector.applies_to --------------------------------- def test_thinking_applies_to_supported_models(): d = ThinkingSignatureDetector() + assert d.applies_to("claude-opus-4-8") is True # adaptive only assert d.applies_to("claude-opus-4-7") is True # adaptive only assert d.applies_to("claude-sonnet-4-6") is True # both assert d.applies_to("claude-haiku-4-5") is True # extended only @@ -258,6 +299,51 @@ def test_thinking_skip_unknown_model(): assert d.applies_to("some-random-model") is False +class _ThinkingCaptureClient: + def __init__(self): + self.calls: list[dict] = [] + + async def messages_create(self, **body): + self.calls.append(body) + return ( + body, + { + "content": [ + { + "type": "thinking", + "thinking": "scratch work", + "signature": "s" * 80, + }, + {"type": "text", "text": "The gcd is 7."}, + ], + "stop_reason": "end_turn", + }, + {}, + 0, + ) + + +async def test_thinking_adaptive_opus_47_and_48_use_xhigh_effort(): + for model in ("claude-opus-4-7", "claude-opus-4-8"): + client = _ThinkingCaptureClient() + result = await ThinkingSignatureDetector().run(client, model) + assert result.status == "pass" + sent = client.calls[0] + assert sent["thinking"] == {"type": "adaptive", "display": "summarized"} + assert sent["output_config"] == {"effort": "xhigh"} + assert result.details["output_config_sent"] == {"effort": "xhigh"} + + +async def test_thinking_extended_models_do_not_send_output_config(): + client = _ThinkingCaptureClient() + result = await ThinkingSignatureDetector().run(client, "claude-opus-4-6") + assert result.status == "pass" + sent = client.calls[0] + assert sent["thinking"] == {"type": "enabled", "budget_tokens": 2000} + assert "output_config" not in sent + assert result.details["output_config_sent"] is None + + # --- PDFDetector data plumbing -------------------------------------------- @@ -364,3 +450,4 @@ def test_token_usage_detector_uses_wider_delta_for_opus_47_tokenizer(): assert _delta_range("claude-sonnet-4-6") == (45, 140) lo, hi = _delta_range("claude-opus-4-7") assert lo <= 166 <= hi + assert _delta_range("claude-opus-4-8") == (lo, hi) diff --git a/tests/test_anthropic_long_context.py b/tests/test_anthropic_long_context.py index e6fb783..d0efa76 100644 --- a/tests/test_anthropic_long_context.py +++ b/tests/test_anthropic_long_context.py @@ -11,6 +11,7 @@ from relay_detector.core.long_context import ANSWER_RE from relay_detector.core.models import ExecutionConfig, Mode +from relay_detector.protocols.anthropic.detectors import long_context as anthropic_lc from relay_detector.protocols.anthropic.detectors.long_context import ( LongContextDetector, ) @@ -163,3 +164,189 @@ async def capture(**kwargs): assert first["temperature"] == 0 assert first["model"] == "claude-haiku-4-5" assert first["messages"][0]["role"] == "user" + + +def test_anthropic_1m_near_limit_initial_target_is_conservative(): + target = anthropic_lc._initial_haystack_target(950_000, 1_000_000) + assert 580_000 <= target <= 590_000 + assert anthropic_lc._initial_haystack_target(500_000, 1_000_000) == 498_500 + + +@pytest.mark.asyncio +async def test_anthropic_long_context_hits_real_950k_target_in_one_count( + monkeypatch, +): + det = LongContextDetector() + client = _MockClient() + assemble_targets: list[int] = [] + + def fake_assemble(target_tokens, needles, seed, protocol=None): + assemble_targets.append(target_tokens) + return "\n".join(n.sentence for n in needles) + + async def count_tokens(**kwargs): + return ({}, {"input_tokens": 946_135}, {}, 0) + + async def messages_create(**kwargs): + client.calls.append(kwargs) + prompt = kwargs["messages"][0]["content"] + ids = ANSWER_RE.findall(prompt.upper()) + return ({}, _build_resp("\n".join(ids[:3]), input_tokens=946_135), {}, 0) + + monkeypatch.setattr(anthropic_lc, "assemble_haystack", fake_assemble) + client.count_tokens = count_tokens + client.messages_create = messages_create + + result = await det._probe_tier( + client, "claude-opus-4-7", 950_000, "seed", 1_000_000 + ) + + assert result["status"] == "pass" + assert result["input_tokens_precounted"] == 946_135 + assert result["count_tokens_attempts"] == 1 + assert result["sizing_iterations"] == 0 + assert len(client.calls) == 1 + assert len(assemble_targets) == 1 + + +@pytest.mark.asyncio +async def test_anthropic_long_context_grows_to_real_950k_target(monkeypatch): + det = LongContextDetector() + client = _MockClient() + assemble_targets: list[int] = [] + counted = [700_000, 949_000] + count_calls = 0 + + def fake_assemble(target_tokens, needles, seed, protocol=None): + assemble_targets.append(target_tokens) + return "\n".join(n.sentence for n in needles) + + async def count_tokens(**kwargs): + nonlocal count_calls + count_calls += 1 + return ({}, {"input_tokens": counted[min(count_calls - 1, 1)]}, {}, 0) + + async def messages_create(**kwargs): + client.calls.append(kwargs) + prompt = kwargs["messages"][0]["content"] + ids = ANSWER_RE.findall(prompt.upper()) + return ({}, _build_resp("\n".join(ids[:3]), input_tokens=949_000), {}, 0) + + monkeypatch.setattr(anthropic_lc, "assemble_haystack", fake_assemble) + client.count_tokens = count_tokens + client.messages_create = messages_create + + result = await det._probe_tier( + client, "claude-opus-4-7", 950_000, "seed", 1_000_000 + ) + + assert result["status"] == "pass" + assert result["input_tokens_precounted"] == 949_000 + assert result["count_tokens_attempts"] == 2 + assert result["sizing_iterations"] == 1 + assert len(client.calls) == 1 + assert len(assemble_targets) == 2 + assert 580_000 <= assemble_targets[0] <= 590_000 + assert assemble_targets[1] > assemble_targets[0] + + +@pytest.mark.asyncio +async def test_anthropic_long_context_recounts_after_oversized_precount(monkeypatch): + det = LongContextDetector() + client = _MockClient() + assemble_targets: list[int] = [] + counted = [1_559_737, 949_000] + count_calls = 0 + + def fake_assemble(target_tokens, needles, seed, protocol=None): + assemble_targets.append(target_tokens) + return "\n".join(n.sentence for n in needles) + + async def count_tokens(**kwargs): + nonlocal count_calls + count_calls += 1 + return ({}, {"input_tokens": counted[min(count_calls - 1, 1)]}, {}, 0) + + async def messages_create(**kwargs): + client.calls.append(kwargs) + prompt = kwargs["messages"][0]["content"] + ids = ANSWER_RE.findall(prompt.upper()) + return ({}, _build_resp("\n".join(ids[:3]), input_tokens=949_000), {}, 0) + + monkeypatch.setattr(anthropic_lc, "assemble_haystack", fake_assemble) + client.count_tokens = count_tokens + client.messages_create = messages_create + + result = await det._probe_tier( + client, "claude-opus-4-7", 950_000, "seed", 1_000_000 + ) + + assert result["status"] == "pass" + assert result["input_tokens_precounted"] == 949_000 + assert result["count_tokens_attempts"] == 2 + assert result["sizing_iterations"] == 1 + assert len(client.calls) == 1 + assert len(assemble_targets) == 2 + assert assemble_targets[1] < assemble_targets[0] + + +@pytest.mark.asyncio +async def test_anthropic_long_context_skips_near_limit_without_count_tokens( + monkeypatch, +): + det = LongContextDetector() + client = _MockClient() + + def fake_assemble(target_tokens, needles, seed, protocol=None): + return "\n".join(n.sentence for n in needles) + + async def count_tokens(**kwargs): + raise RuntimeError("count_tokens unavailable") + + async def messages_create(**kwargs): + raise AssertionError("near-limit prompt must not be sent without count") + + monkeypatch.setattr(anthropic_lc, "assemble_haystack", fake_assemble) + client.count_tokens = count_tokens + client.messages_create = messages_create + + result = await det._probe_tier( + client, "claude-opus-4-7", 950_000, "seed", 1_000_000 + ) + + assert result["status"] == "skip" + assert "count_tokens" in result["skip_reason"] + assert result["count_tokens_attempts"] == 1 + + +@pytest.mark.asyncio +async def test_anthropic_long_context_provider_prompt_overflow_is_skip( + monkeypatch, +): + det = LongContextDetector() + client = _MockClient() + + class PromptOverflow(Exception): + status = 400 + body = "prompt is too long: 1559737 tokens > 1000000 maximum" + + def fake_assemble(target_tokens, needles, seed, protocol=None): + return "\n".join(n.sentence for n in needles) + + async def count_tokens(**kwargs): + return ({}, {"input_tokens": 949_000}, {}, 0) + + async def messages_create(**kwargs): + raise PromptOverflow() + + monkeypatch.setattr(anthropic_lc, "assemble_haystack", fake_assemble) + client.count_tokens = count_tokens + client.messages_create = messages_create + + result = await det._probe_tier( + client, "claude-opus-4-7", 950_000, "seed", 1_000_000 + ) + + assert result["status"] == "skip" + assert "prompt overflow" in result["skip_reason"] + assert "1559737 tokens > 1000000 maximum" in result["error"] diff --git a/tests/test_client.py b/tests/test_client.py index 1afed11..32c0837 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -26,11 +26,12 @@ def test_normalize_base_url_accepts_trailing_v1(): @pytest.mark.asyncio -async def test_messages_create_strips_temperature_for_opus_4_7(): - """Opus 4.7 rejects `temperature` (deprecated). Client must strip it.""" +@pytest.mark.parametrize("model", ["claude-opus-4-7", "claude-opus-4-8"]) +async def test_messages_create_strips_temperature_for_new_opus(model): + """New Opus models reject `temperature`; client must strip it.""" sample = { "id": "msg_x", "type": "message", "role": "assistant", - "model": "claude-opus-4-7", "content": [], + "model": model, "content": [], "stop_reason": "end_turn", "stop_sequence": None, "usage": {"input_tokens": 1, "output_tokens": 1}, } @@ -45,15 +46,15 @@ def handler(request: httpx.Request) -> httpx.Response: router.post("/v1/messages").mock(side_effect=handler) async with AnthropicClient(BASE_URL, "sk-test") as client: await client.messages_create( - model="claude-opus-4-7", + model=model, max_tokens=10, temperature=0, messages=[{"role": "user", "content": "x"}], ) assert len(captured) == 1 sent = captured[0] - assert "temperature" not in sent, "temperature must be stripped for Opus 4.7" - assert sent["model"] == "claude-opus-4-7" + assert "temperature" not in sent, "temperature must be stripped for new Opus" + assert sent["model"] == model assert sent["max_tokens"] == 10 # other fields untouched diff --git a/tests/test_long_context.py b/tests/test_long_context.py index bf11b83..5edc02e 100644 --- a/tests/test_long_context.py +++ b/tests/test_long_context.py @@ -164,6 +164,7 @@ def test_model_context_limit_known_models(): assert model_context_limit("claude-sonnet-4-6") == 1_000_000 assert model_context_limit("claude-opus-4-6") == 1_000_000 assert model_context_limit("claude-opus-4-7") == 1_000_000 + assert model_context_limit("claude-opus-4-8") == 1_000_000 assert model_context_limit("claude-opus-4-5") == 200_000 # Gemini — all 1,048,576 (1MB binary) per ai.google.dev model pages assert model_context_limit("gemini-2.5-pro") == 1_048_576 @@ -176,6 +177,7 @@ def test_model_context_limit_snapshot_suffix(): assert model_context_limit("gpt-4o-mini-2024-07-18") == 128_000 assert model_context_limit("claude-haiku-4-5-20251001") == 200_000 assert model_context_limit("claude-sonnet-4-6-20251101") == 1_000_000 + assert model_context_limit("claude-opus-4-8-20260609") == 1_000_000 def test_model_context_limit_unknown_falls_back_conservatively():