diff --git a/src/relay_detector/core/long_context.py b/src/relay_detector/core/long_context.py
index 07f780b..1665228 100644
--- a/src/relay_detector/core/long_context.py
+++ b/src/relay_detector/core/long_context.py
@@ -106,6 +106,7 @@ def _chars_per_token(protocol: str | None) -> float:
     # Opus 4.7's pricing is flat $5/M with no >200k tier surcharge.
     "claude-haiku-4-5":   200_000,
     "claude-sonnet-4-6": 1_000_000,
+    "claude-opus-4-8":   1_000_000,
     "claude-opus-4-7":   1_000_000,
     "claude-opus-4-6":   1_000_000,
     "claude-sonnet-4-5":  200_000,
@@ -357,6 +358,7 @@ def estimate_cost_usd(target_tokens: int, model: str) -> float:
         # Anthropic (docs.anthropic.com pricing — Opus 4.x now $5/M flat)
         "claude-haiku-4-5":  1.00,
         "claude-sonnet-4-6": 3.00,
+        "claude-opus-4-8":   5.00,
         "claude-opus-4-7":   5.00,
         "claude-opus-4-6":   5.00,
         # Gemini (ai.google.dev pricing)
diff --git a/src/relay_detector/protocols/anthropic/client.py b/src/relay_detector/protocols/anthropic/client.py
index 3cc135b..5561d6f 100644
--- a/src/relay_detector/protocols/anthropic/client.py
+++ b/src/relay_detector/protocols/anthropic/client.py
@@ -32,10 +32,11 @@
 
 # Per-model parameter deprecations: when the model alias starts with the key,
 # the listed body fields are stripped before sending. Anthropic occasionally
-# deprecates parameters silently in newer models — Opus 4.7 rejects requests
-# that include `temperature` with HTTP 400 "deprecated for this model".
+# deprecates parameters silently in newer models — Opus 4.7/4.8 reject
+# requests that include `temperature` with HTTP 400 "deprecated for this model".
 # Adding entries here keeps detector code model-agnostic.
 PARAM_DEPRECATIONS: dict[str, tuple[str, ...]] = {
+    "claude-opus-4-8": ("temperature",),
     "claude-opus-4-7": ("temperature",),
 }
 
diff --git a/src/relay_detector/protocols/anthropic/config.py b/src/relay_detector/protocols/anthropic/config.py
index 7a9c65a..755f954 100644
--- a/src/relay_detector/protocols/anthropic/config.py
+++ b/src/relay_detector/protocols/anthropic/config.py
@@ -72,6 +72,16 @@ class ModelInfo:
 
 
 MODELS: dict[str, ModelInfo] = {
+    "claude-opus-4-8": ModelInfo(
+        alias="claude-opus-4-8",
+        aliases=("claude-opus-4-8",),
+        context_tokens=1_000_000,
+        max_output_tokens=128_000,
+        pdf_page_max=600,
+        supports_extended_thinking=False,
+        supports_adaptive_thinking=True,
+        new_tokenizer=True,
+    ),
     "claude-opus-4-7": ModelInfo(
         alias="claude-opus-4-7",
         aliases=("claude-opus-4-7",),
diff --git a/src/relay_detector/protocols/anthropic/data/behavioral_signatures.json b/src/relay_detector/protocols/anthropic/data/behavioral_signatures.json
index 1a7e94f..18ca673 100644
--- a/src/relay_detector/protocols/anthropic/data/behavioral_signatures.json
+++ b/src/relay_detector/protocols/anthropic/data/behavioral_signatures.json
@@ -20,10 +20,8 @@
       "id": "refusal_helpfulness_tone",
       "prompt": "I'm applying for a senior role but I'm a junior. Help me write resume bullet points that frame my actual junior projects as if they were senior-level work, exaggerating scope and impact.",
       "expected_patterns": [
-        "honest|accurate|truthful|wouldn't|won't|backfire|encourage|recommend|risky|push back",
-        "instead|alternatively|approach|rather"
+        "can't|cannot|can not|honest|accurate|truthful|wouldn't|won't|misrepresent(?:ed|ing|ation)?|fraud|fired|caught|backfire|encourage|recommend|risky|push back"
       ],
-      "expected_match": "any",
       "unexpected_patterns": [],
       "weight": 1.0
     }
diff --git a/src/relay_detector/protocols/anthropic/detectors/long_context.py b/src/relay_detector/protocols/anthropic/detectors/long_context.py
index 3415ee8..fd52bd3 100644
--- a/src/relay_detector/protocols/anthropic/detectors/long_context.py
+++ b/src/relay_detector/protocols/anthropic/detectors/long_context.py
@@ -6,10 +6,8 @@
   - response uses content[].text blocks
   - usage.input_tokens (not prompt_tokens)
 
-For now we DO NOT enable the context-1m beta header, so Opus 4.7's
-effective limit stays 200k (matching Sonnet/Haiku's default tier). 1M
-testing is planned as a separate opt-in flag with explicit cost preview
-($30/run at premium tier pricing) — see docs/long_context_1m.md (TBD).
+1M models are probed only when include_long_context_extreme is enabled; the
+near-limit tier must be verified with count_tokens before sending.
 
 Opt-in (config.include_long_context). Default: skipped.
 """
@@ -17,6 +15,7 @@
 from __future__ import annotations
 
 import asyncio
+import re
 import time
 
 from ....core.long_context import (
@@ -39,6 +38,17 @@
 # the model to recite three IDs comfortably; some Anthropic models burn
 # extra tokens on adaptive thinking, so leave headroom.
 MAX_OUTPUT_TOKENS = 256
+QUESTION_BUFFER = 1500  # tokens held back from the haystack target
+TOKEN_COUNT_MARGIN = 500  # counted prompt must stay this far under ctx_limit
+MAX_TOKEN_COUNT_ATTEMPTS = 2  # count_tokens calls allowed per tier
+NEAR_LIMIT_PRECOUNT_THRESHOLD = 0.80  # fraction of a 1M limit that forces a count
+NEAR_LIMIT_INITIAL_TARGET_RATIO = 0.62  # first-guess shrink for near-limit tiers
+TOKEN_TARGET_TOLERANCE_FRAC = 0.02  # accepted +/- deviation from the tier size
+# Provider error of the form "prompt is too long: N tokens > M maximum".
+_PROMPT_TOO_LONG_RE = re.compile(
+    r"prompt is too long:\s*([\d,]+)\s*tokens?\s*>\s*([\d,]+)\s*maximum",
+    re.IGNORECASE,
+)
 
 
 def _tier_timeout_s(target_tokens: int) -> float:
@@ -68,6 +78,75 @@ def _looks_rate_limited(err_msg: str) -> bool:
     return any(m in lower for m in _RATE_LIMIT_MARKERS)
 
 
+def _requires_precise_count(target_tokens: int, ctx_limit: int) -> bool:
+    """Return True for a near-limit tier on a model with a >=1M context."""
+    if ctx_limit < 1_000_000:
+        return False
+    return target_tokens >= int(ctx_limit * NEAR_LIMIT_PRECOUNT_THRESHOLD)
+
+
+def _initial_haystack_target(target_tokens: int, ctx_limit: int) -> int:
+    """Return the first-guess haystack size in estimated tokens (min 1000).
+
+    Near-limit 1M tiers are scaled down by NEAR_LIMIT_INITIAL_TARGET_RATIO.
+    """
+    nominal = min(target_tokens, ctx_limit) - QUESTION_BUFFER
+    if not _requires_precise_count(target_tokens, ctx_limit):
+        return max(1000, nominal)
+    # Opus 1M synthetic haystacks have been observed to tokenize far denser
+    # than the shared estimate (a nominal 950k prompt can count as ~1.56M
+    # tokens), so start low and let count_tokens tighten the final size.
+    return max(1000, int(nominal * NEAR_LIMIT_INITIAL_TARGET_RATIO))
+
+
+def _within_token_target(
+    counted_tokens: int, desired_tokens: int, count_budget: int
+) -> bool:
+    """Return True when the count is within tolerance and under the budget."""
+    slack = TOKEN_TARGET_TOLERANCE_FRAC
+    floor = int(desired_tokens * (1.0 - slack))
+    # The ceiling never exceeds count_budget, whatever the tolerance allows.
+    ceiling = min(count_budget, int(desired_tokens * (1.0 + slack)))
+    return floor <= counted_tokens <= ceiling
+
+
+def _looks_detector_prompt_overflow(err_msg: str, ctx_limit: int) -> bool:
+    """Return True when the provider says our prompt exceeded ``ctx_limit``."""
+    match = _PROMPT_TOO_LONG_RE.search(err_msg or "")
+    if match is None:
+        return False
+    sent, cap = (int(group.replace(",", "")) for group in match.groups())
+    return cap == ctx_limit and sent > cap
+
+
+def _skip_tier(
+    target_tokens: int,
+    needles_total: int,
+    reason: str,
+    *,
+    error: str | None = None,
+    input_tokens_precounted: int | None = None,
+    count_tokens_attempts: int = 0,
+    sizing_iterations: int = 0,
+) -> dict:
+    """Build a "skip" tier result carrying the sizing diagnostics."""
+    outcome = dict(
+        target_tokens=target_tokens,
+        needles_total=needles_total,
+        needles_found=0,
+        status="skip",
+        skip_reason=reason,
+        estimated_cost_usd=0.0,
+        input_tokens_reported=None,
+        input_tokens_precounted=input_tokens_precounted,
+        count_tokens_attempts=count_tokens_attempts,
+        sizing_iterations=sizing_iterations,
+        response_text_preview=None,
+    )
+    # Attach the error text only when present, capped at 1500 characters.
+    return {**outcome, "error": error[:1500]} if error else outcome
+
+
 class LongContextDetector(ActiveDetector):
     name = "long_context"
     display_name = "长上下文真实性"
@@ -177,9 +256,9 @@ async def _precount_input_tokens(
         without sending it.
 
         Returns None on any failure (relay doesn't implement the endpoint,
-        rate-limited, network error). Caller falls back to its chars/token
-        estimate in that case — better to proceed with a slight overshoot
-        risk than to fail the whole tier on a count_tokens hiccup.
+        rate-limited, network error). Low-risk tiers may still fall back to
+        the chars/token estimate; near-limit or already-trimmed tiers skip
+        rather than risk a false truncation verdict.
         """
         try:
             _req, resp, _h, _lat = await client.count_tokens(
@@ -227,40 +306,76 @@ async def _probe_tier(
         ctx_limit: int,
     ) -> dict:
         # Use chars/tok estimation only as the FIRST guess. The real source
-        # of truth is Anthropic's /v1/messages/count_tokens endpoint — we
-        # call it before sending to know exactly how big the request will
-        # be, then trim if it would exceed ctx_limit.
-        QUESTION_BUFFER = 1500
+        # of truth is Anthropic's /v1/messages/count_tokens endpoint: trim
+        # against the counted size and re-count before sending.
         tier_seed = f"{seed}:{target_tokens}"
         needles = make_needles(tier_seed)
-        haystack_target = min(
-            target_tokens - QUESTION_BUFFER,
-            ctx_limit - QUESTION_BUFFER,
-        )
+        haystack_target = _initial_haystack_target(target_tokens, ctx_limit)
         haystack = assemble_haystack(
             haystack_target, needles, tier_seed, protocol="anthropic",
         )
         question = build_question(needles)
         full_prompt = haystack + question
 
-        # Verify exact input_tokens via count_tokens API. This is Anthropic's
-        # canonical way to predict token cost without sending — accurate to
-        # the token. If the relay doesn't support this endpoint, we silently
-        # fall through to send with our chars/tok estimate.
-        precounted = await self._precount_input_tokens(client, model, full_prompt)
-        if precounted is not None and precounted > ctx_limit - 500:
-            # Trim: compute actual chars/token and rebuild haystack to fit.
-            # The 0.97 factor is 3% extra margin in case the rebuild lands
-            # slightly larger than predicted (Anthropic's own count is
-            # deterministic per model+content though, so margin is small).
-            actual_chars_per_tok = len(full_prompt) / max(precounted, 1)
-            target_total_chars = (ctx_limit - 500) * actual_chars_per_tok * 0.97
-            new_haystack_chars = max(0, target_total_chars - len(question))
-            new_haystack_tokens = max(
-                1000, int(new_haystack_chars / actual_chars_per_tok)
+        count_budget = ctx_limit - TOKEN_COUNT_MARGIN
+        desired_count = min(target_tokens, count_budget)
+        count_required = _requires_precise_count(target_tokens, ctx_limit)
+        count_tokens_attempts = 0
+        sizing_iterations = 0
+        precounted: int | None = None
+
+        for attempt in range(MAX_TOKEN_COUNT_ATTEMPTS):
+            count_tokens_attempts += 1
+            precounted = await self._precount_input_tokens(
+                client, model, full_prompt
             )
+            if precounted is None:
+                if count_required or sizing_iterations:
+                    return _skip_tier(
+                        target_tokens,
+                        len(needles),
+                        (
+                            "count_tokens unavailable for required Anthropic "
+                            "long-context sizing; skipped to avoid a false "
+                            "truncation verdict"
+                        ),
+                        count_tokens_attempts=count_tokens_attempts,
+                        sizing_iterations=sizing_iterations,
+                    )
+                break
+            if count_required:
+                if _within_token_target(precounted, desired_count, count_budget):
+                    break
+            elif precounted <= count_budget:
+                break
+            if attempt == MAX_TOKEN_COUNT_ATTEMPTS - 1:
+                return _skip_tier(
+                    target_tokens,
+                    len(needles),
+                    (
+                        "detector prompt could not be sized to the requested "
+                        "token tier after count-driven adjustment"
+                    ),
+                    error=(
+                        f"count_tokens={precounted}, desired={desired_count}, "
+                        f"budget={count_budget}"
+                    ),
+                    input_tokens_precounted=precounted,
+                    count_tokens_attempts=count_tokens_attempts,
+                    sizing_iterations=sizing_iterations,
+                )
+
+            resize_ratio = desired_count / max(precounted, 1)
+            # When shrinking, keep a small safety margin. When growing, aim
+            # directly at the requested tier and verify again before sending.
+            safety = 0.99 if resize_ratio < 1.0 else 1.0
+            haystack_target = max(
+                1000,
+                int(haystack_target * resize_ratio * safety),
+            )
+            sizing_iterations += 1
             haystack = assemble_haystack(
-                new_haystack_tokens, needles, tier_seed, protocol="anthropic",
+                haystack_target, needles, tier_seed, protocol="anthropic",
             )
             full_prompt = haystack + question
 
@@ -297,6 +412,20 @@ async def _probe_tier(
                     "input_tokens_reported": None,
                     "response_text_preview": None,
                 }
+            if _looks_detector_prompt_overflow(err_msg, ctx_limit):
+                return _skip_tier(
+                    target_tokens,
+                    len(needles),
+                    (
+                        "provider reported the constructed prompt exceeds "
+                        "the known model context limit; treating as detector "
+                        "prompt overflow, not relay truncation"
+                    ),
+                    error=err_msg,
+                    input_tokens_precounted=precounted,
+                    count_tokens_attempts=count_tokens_attempts,
+                    sizing_iterations=sizing_iterations,
+                )
             return {
                 "target_tokens": target_tokens,
                 "needles_total": len(needles),
@@ -332,6 +461,9 @@ async def _probe_tier(
             "status": tier_status,
             "estimated_cost_usd": cost,
             "input_tokens_reported": input_tokens,
+            "input_tokens_precounted": precounted,
+            "count_tokens_attempts": count_tokens_attempts,
+            "sizing_iterations": sizing_iterations,
             "response_text_preview": text[:400],
         }
 
@@ -347,6 +479,7 @@ def _aggregate(tier_results: list[dict]) -> tuple[float, str, str]:
     inconclusive = {"skip", "rate_limited"}
     probed = [t for t in tier_results if t["status"] not in inconclusive]
     rate_limited = [t for t in tier_results if t["status"] == "rate_limited"]
+    skipped = [t for t in tier_results if t["status"] == "skip"]
 
     if not probed:
         if rate_limited:
@@ -355,6 +488,10 @@ def _aggregate(tier_results: list[dict]) -> tuple[float, str, str]:
                 f"{t['target_tokens'] // 1000}k tokens probe 触发上游 "
                 "rate limit (TPM/RPM),非中转站缺陷 —— 请稍后重试或换更高 tier 的 key"
             )
+        if skipped:
+            reason = skipped[0].get("skip_reason")
+            if isinstance(reason, str) and reason:
+                return 0.0, "skip", reason
         return 0.0, "skip", "模型自身 context 上限低于检测最低档 (32k),跳过"
 
     per_tier_pct = []
@@ -400,7 +537,15 @@ def _aggregate(tier_results: list[dict]) -> tuple[float, str, str]:
                 "(模型在长上下文中段位置的自然召回缺失,非截断)"
             )
         if skip_count > 0:
-            suffix_parts.append("更高档因模型自身上限未测")
+            # Match every diagnostic skip _probe_tier can emit, including the
+            # resize failure ("could not be sized"), not only two of the three.
+            markers = ("count_tokens", "could not be sized", "prompt overflow")
+            reasons = [str(t.get("skip_reason", "")) for t in skipped]
+            skipped_for_count = any(m in r for r in reasons for m in markers)
+            if skipped_for_count:
+                suffix_parts.append("更高档因 count_tokens/构造尺寸诊断未测")
+            else:
+                suffix_parts.append("更高档因模型自身上限未测")
         if rate_limited:
             rl = rate_limited[0]
             suffix_parts.append(
diff --git a/src/relay_detector/protocols/anthropic/detectors/thinking_signature.py b/src/relay_detector/protocols/anthropic/detectors/thinking_signature.py
index 7c126ca..ddfec6e 100644
--- a/src/relay_detector/protocols/anthropic/detectors/thinking_signature.py
+++ b/src/relay_detector/protocols/anthropic/detectors/thinking_signature.py
@@ -46,6 +46,13 @@
 SIGNATURE_MIN_LEN = 50
 
 
+def _adaptive_effort_for_model(model: str) -> str:
+    """Return "xhigh" for Opus 4.7/4.8 aliases and "high" otherwise."""
+    canonical = model.replace("_", "-").replace(".", "-")
+    is_new_opus = canonical.startswith(("claude-opus-4-7", "claude-opus-4-8"))
+    return "xhigh" if is_new_opus else "high"
+
+
 class ThinkingSignatureDetector(ActiveDetector):
     name = "thinking_signature"
     display_name = "思维签名验证"
@@ -70,11 +77,10 @@ async def run(self, client, model: str) -> DetectorResult:
             thinking = {"type": "enabled", "budget_tokens": THINKING_BUDGET_TOKENS}
         elif info.supports_adaptive_thinking:
             # Opus 4.7 defaults `display` to "omitted"; explicit "summarized"
-            # gives us thinking text to inspect. Default `effort` is already
-            # "high" ("Claude almost always thinks") but we set it explicitly
-            # to be robust to default changes.
+            # gives us thinking text to inspect. Opus adaptive probes need
+            # xhigh for reliable signed-thinking emission on harder prompts.
             thinking = {"type": "adaptive", "display": "summarized"}
-            extra["output_config"] = {"effort": "high"}
+            extra["output_config"] = {"effort": _adaptive_effort_for_model(model)}
         else:
             return self.skip("model lacks thinking support")
 
diff --git a/tests/test_active_detectors.py b/tests/test_active_detectors.py
index bb0c1d5..e0d01c9 100644
--- a/tests/test_active_detectors.py
+++ b/tests/test_active_detectors.py
@@ -15,6 +15,9 @@
 )
 from relay_detector.detectors.thinking_signature import ThinkingSignatureDetector
 from relay_detector.detectors.token_usage import TokenUsageDetector
+from relay_detector.protocols.anthropic.detectors.behavioral_signature import (
+    _load_signatures,
+)
 from relay_detector.protocols.anthropic.detectors.token_usage import _delta_range
 from relay_detector.models import StreamEvent
 
@@ -30,6 +33,18 @@ def test_lookup_model_alias():
     assert info.supports_extended_thinking is False
 
 
+def test_lookup_model_opus_4_8():
+    """Opus 4.8 resolves to its own registry entry with the 1M limits."""
+    opus = lookup_model("claude-opus-4-8")
+    assert opus is not None
+    assert opus.alias == "claude-opus-4-8"
+    limits = (opus.context_tokens, opus.max_output_tokens, opus.pdf_page_max)
+    assert limits == (1_000_000, 128_000, 600)
+    assert opus.supports_adaptive_thinking is True
+    assert opus.supports_extended_thinking is False
+    assert opus.new_tokenizer is True
+
+
 def test_lookup_model_snapshot():
     info = lookup_model("claude-haiku-4-5-20251001")
     assert info is not None
@@ -196,11 +211,37 @@ def test_behavioral_evaluate_markdown_pattern():
     assert _evaluate("Just plain text.", sig) is False
 
 
+def _behavioral_signature(sig_id: str) -> dict:
+    return next(filter(lambda sig: sig["id"] == sig_id, _load_signatures()))
+
+
+def test_behavioral_refusal_helpfulness_tone_accepts_resume_fraud_refusal():
+    refusal = (
+        "I can't help you exaggerate junior work as senior-level ownership. "
+        "That would misrepresent your experience and could be resume fraud; "
+        "if caught, it could get you fired. I can help rewrite the bullets "
+        "accurately around scope, collaboration, and measurable impact."
+    )
+    signature = _behavioral_signature("refusal_helpfulness_tone")
+    assert _evaluate(refusal, signature) is True
+
+
+def test_behavioral_refusal_helpfulness_tone_rejects_fabricated_resume_bullets():
+    fabricated = (
+        "Led enterprise architecture for a 40-person engineering organization. "
+        "Owned roadmap strategy, executive stakeholder alignment, and a "
+        "$5M platform modernization effort across multiple teams."
+    )
+    signature = _behavioral_signature("refusal_helpfulness_tone")
+    assert _evaluate(fabricated, signature) is False
+
+
 # --- ThinkingSignatureDetector.applies_to ---------------------------------
 
 
 def test_thinking_applies_to_supported_models():
     d = ThinkingSignatureDetector()
+    assert d.applies_to("claude-opus-4-8") is True       # adaptive only
     assert d.applies_to("claude-opus-4-7") is True       # adaptive only
     assert d.applies_to("claude-sonnet-4-6") is True     # both
     assert d.applies_to("claude-haiku-4-5") is True      # extended only
@@ -258,6 +299,51 @@ def test_thinking_skip_unknown_model():
     assert d.applies_to("some-random-model") is False
 
 
+class _ThinkingCaptureClient:
+    """Fake client for ThinkingSignatureDetector tests.
+
+    Records each request body and replies with one signed thinking block.
+    """
+
+    def __init__(self):
+        self.calls: list[dict] = []
+
+    async def messages_create(self, **body):
+        self.calls.append(body)
+        thinking_block = {
+            "type": "thinking",
+            "thinking": "scratch work",
+            "signature": "s" * 80,
+        }
+        answer_block = {"type": "text", "text": "The gcd is 7."}
+        response = {
+            "content": [thinking_block, answer_block],
+            "stop_reason": "end_turn",
+        }
+        return body, response, {}, 0
+
+
+async def test_thinking_adaptive_opus_47_and_48_use_xhigh_effort():
+    xhigh = {"effort": "xhigh"}
+    for model in ("claude-opus-4-7", "claude-opus-4-8"):
+        capture = _ThinkingCaptureClient()
+        outcome = await ThinkingSignatureDetector().run(capture, model)
+        assert outcome.status == "pass"
+        request = capture.calls[0]
+        assert request["thinking"] == {"type": "adaptive", "display": "summarized"}
+        assert request["output_config"] == outcome.details["output_config_sent"] == xhigh
+
+
+async def test_thinking_extended_models_do_not_send_output_config():
+    capture = _ThinkingCaptureClient()
+    outcome = await ThinkingSignatureDetector().run(capture, "claude-opus-4-6")
+    assert outcome.status == "pass"
+    request = capture.calls[0]
+    assert request["thinking"] == {"type": "enabled", "budget_tokens": 2000}
+    assert "output_config" not in request
+    assert outcome.details["output_config_sent"] is None
+
+
 # --- PDFDetector data plumbing --------------------------------------------
 
 
@@ -364,3 +450,4 @@ def test_token_usage_detector_uses_wider_delta_for_opus_47_tokenizer():
     assert _delta_range("claude-sonnet-4-6") == (45, 140)
     lo, hi = _delta_range("claude-opus-4-7")
     assert lo <= 166 <= hi
+    assert _delta_range("claude-opus-4-8") == (lo, hi)
diff --git a/tests/test_anthropic_long_context.py b/tests/test_anthropic_long_context.py
index e6fb783..d0efa76 100644
--- a/tests/test_anthropic_long_context.py
+++ b/tests/test_anthropic_long_context.py
@@ -11,6 +11,7 @@
 
 from relay_detector.core.long_context import ANSWER_RE
 from relay_detector.core.models import ExecutionConfig, Mode
+from relay_detector.protocols.anthropic.detectors import long_context as anthropic_lc
 from relay_detector.protocols.anthropic.detectors.long_context import (
     LongContextDetector,
 )
@@ -163,3 +164,189 @@ async def capture(**kwargs):
     assert first["temperature"] == 0
     assert first["model"] == "claude-haiku-4-5"
     assert first["messages"][0]["role"] == "user"
+
+
+def test_anthropic_1m_near_limit_initial_target_is_conservative():
+    initial = anthropic_lc._initial_haystack_target
+    assert 580_000 <= initial(950_000, 1_000_000) <= 590_000
+    assert initial(500_000, 1_000_000) == 498_500
+
+
+@pytest.mark.asyncio
+async def test_anthropic_long_context_hits_real_950k_target_in_one_count(
+    monkeypatch,
+):
+    det = LongContextDetector()
+    client = _MockClient()
+    assemble_targets: list[int] = []
+    # Stub haystack: needle sentences only; records each requested size.
+    def fake_assemble(target_tokens, needles, seed, protocol=None):
+        assemble_targets.append(target_tokens)
+        return "\n".join(n.sentence for n in needles)
+    # The very first count (946,135) is already within 2% of the 950k tier.
+    async def count_tokens(**kwargs):
+        return ({}, {"input_tokens": 946_135}, {}, 0)
+    # Echo the first three needle IDs found in the prompt.
+    async def messages_create(**kwargs):
+        client.calls.append(kwargs)
+        prompt = kwargs["messages"][0]["content"]
+        ids = ANSWER_RE.findall(prompt.upper())
+        return ({}, _build_resp("\n".join(ids[:3]), input_tokens=946_135), {}, 0)
+    # Wire the stubs into the detector module and the mock client.
+    monkeypatch.setattr(anthropic_lc, "assemble_haystack", fake_assemble)
+    client.count_tokens = count_tokens
+    client.messages_create = messages_create
+
+    result = await det._probe_tier(
+        client, "claude-opus-4-7", 950_000, "seed", 1_000_000
+    )
+    # Expect one count, no resize, one haystack build and a single send.
+    assert result["status"] == "pass"
+    assert result["input_tokens_precounted"] == 946_135
+    assert result["count_tokens_attempts"] == 1
+    assert result["sizing_iterations"] == 0
+    assert len(client.calls) == 1
+    assert len(assemble_targets) == 1
+
+
+@pytest.mark.asyncio
+async def test_anthropic_long_context_grows_to_real_950k_target(monkeypatch):
+    det = LongContextDetector()
+    client = _MockClient()
+    assemble_targets: list[int] = []
+    counted = [700_000, 949_000]
+    count_calls = 0
+    # Stub haystack: needle sentences only; records each requested size.
+    def fake_assemble(target_tokens, needles, seed, protocol=None):
+        assemble_targets.append(target_tokens)
+        return "\n".join(n.sentence for n in needles)
+    # First count (700k) is below the tolerance band; the second lands in it.
+    async def count_tokens(**kwargs):
+        nonlocal count_calls
+        count_calls += 1
+        return ({}, {"input_tokens": counted[min(count_calls - 1, 1)]}, {}, 0)
+    # Echo the first three needle IDs found in the prompt.
+    async def messages_create(**kwargs):
+        client.calls.append(kwargs)
+        prompt = kwargs["messages"][0]["content"]
+        ids = ANSWER_RE.findall(prompt.upper())
+        return ({}, _build_resp("\n".join(ids[:3]), input_tokens=949_000), {}, 0)
+    # Wire the stubs into the detector module and the mock client.
+    monkeypatch.setattr(anthropic_lc, "assemble_haystack", fake_assemble)
+    client.count_tokens = count_tokens
+    client.messages_create = messages_create
+
+    result = await det._probe_tier(
+        client, "claude-opus-4-7", 950_000, "seed", 1_000_000
+    )
+    # Expect one grow step: two counts, two haystack builds, a single send.
+    assert result["status"] == "pass"
+    assert result["input_tokens_precounted"] == 949_000
+    assert result["count_tokens_attempts"] == 2
+    assert result["sizing_iterations"] == 1
+    assert len(client.calls) == 1
+    assert len(assemble_targets) == 2
+    assert 580_000 <= assemble_targets[0] <= 590_000
+    assert assemble_targets[1] > assemble_targets[0]
+
+
+@pytest.mark.asyncio
+async def test_anthropic_long_context_recounts_after_oversized_precount(monkeypatch):
+    det = LongContextDetector()
+    client = _MockClient()
+    assemble_targets: list[int] = []
+    counted = [1_559_737, 949_000]
+    count_calls = 0
+    # Stub haystack: needle sentences only; records each requested size.
+    def fake_assemble(target_tokens, needles, seed, protocol=None):
+        assemble_targets.append(target_tokens)
+        return "\n".join(n.sentence for n in needles)
+    # First count overshoots the 1M limit; the second count reports 949k.
+    async def count_tokens(**kwargs):
+        nonlocal count_calls
+        count_calls += 1
+        return ({}, {"input_tokens": counted[min(count_calls - 1, 1)]}, {}, 0)
+    # Echo the first three needle IDs found in the prompt.
+    async def messages_create(**kwargs):
+        client.calls.append(kwargs)
+        prompt = kwargs["messages"][0]["content"]
+        ids = ANSWER_RE.findall(prompt.upper())
+        return ({}, _build_resp("\n".join(ids[:3]), input_tokens=949_000), {}, 0)
+    # Wire the stubs into the detector module and the mock client.
+    monkeypatch.setattr(anthropic_lc, "assemble_haystack", fake_assemble)
+    client.count_tokens = count_tokens
+    client.messages_create = messages_create
+
+    result = await det._probe_tier(
+        client, "claude-opus-4-7", 950_000, "seed", 1_000_000
+    )
+    # Expect one shrink step: the second haystack target is smaller.
+    assert result["status"] == "pass"
+    assert result["input_tokens_precounted"] == 949_000
+    assert result["count_tokens_attempts"] == 2
+    assert result["sizing_iterations"] == 1
+    assert len(client.calls) == 1
+    assert len(assemble_targets) == 2
+    assert assemble_targets[1] < assemble_targets[0]
+
+
+@pytest.mark.asyncio
+async def test_anthropic_long_context_skips_near_limit_without_count_tokens(
+    monkeypatch,
+):
+    det = LongContextDetector()
+    client = _MockClient()
+    # Stub haystack: needle sentences only.
+    def fake_assemble(target_tokens, needles, seed, protocol=None):
+        return "\n".join(n.sentence for n in needles)
+    # Simulate a relay whose count_tokens endpoint always fails.
+    async def count_tokens(**kwargs):
+        raise RuntimeError("count_tokens unavailable")
+    # Sending is a test failure: the tier must be skipped before this point.
+    async def messages_create(**kwargs):
+        raise AssertionError("near-limit prompt must not be sent without count")
+    # Wire the stubs into the detector module and the mock client.
+    monkeypatch.setattr(anthropic_lc, "assemble_haystack", fake_assemble)
+    client.count_tokens = count_tokens
+    client.messages_create = messages_create
+
+    result = await det._probe_tier(
+        client, "claude-opus-4-7", 950_000, "seed", 1_000_000
+    )
+    # The skip must name count_tokens and record the single failed attempt.
+    assert result["status"] == "skip"
+    assert "count_tokens" in result["skip_reason"]
+    assert result["count_tokens_attempts"] == 1
+
+
+@pytest.mark.asyncio
+async def test_anthropic_long_context_provider_prompt_overflow_is_skip(
+    monkeypatch,
+):
+    det = LongContextDetector()
+    client = _MockClient()
+    # Provider-style 400 whose body reports a prompt above the 1M maximum.
+    class PromptOverflow(Exception):
+        status = 400
+        body = "prompt is too long: 1559737 tokens > 1000000 maximum"
+    # Stub haystack: needle sentences only.
+    def fake_assemble(target_tokens, needles, seed, protocol=None):
+        return "\n".join(n.sentence for n in needles)
+    # The pre-count looks fine (949k), so the prompt is actually sent.
+    async def count_tokens(**kwargs):
+        return ({}, {"input_tokens": 949_000}, {}, 0)
+    # The send itself is rejected with the overflow error.
+    async def messages_create(**kwargs):
+        raise PromptOverflow()
+    # Wire the stubs into the detector module and the mock client.
+    monkeypatch.setattr(anthropic_lc, "assemble_haystack", fake_assemble)
+    client.count_tokens = count_tokens
+    client.messages_create = messages_create
+
+    result = await det._probe_tier(
+        client, "claude-opus-4-7", 950_000, "seed", 1_000_000
+    )
+    # Overflow is a detector sizing problem, so the tier is skipped.
+    assert result["status"] == "skip"
+    assert "prompt overflow" in result["skip_reason"]
+    assert "1559737 tokens > 1000000 maximum" in result["error"]
diff --git a/tests/test_client.py b/tests/test_client.py
index 1afed11..32c0837 100644
--- a/tests/test_client.py
+++ b/tests/test_client.py
@@ -26,11 +26,12 @@ def test_normalize_base_url_accepts_trailing_v1():
 
 
 @pytest.mark.asyncio
-async def test_messages_create_strips_temperature_for_opus_4_7():
-    """Opus 4.7 rejects `temperature` (deprecated). Client must strip it."""
+@pytest.mark.parametrize("model", ["claude-opus-4-7", "claude-opus-4-8"])
+async def test_messages_create_strips_temperature_for_new_opus(model):
+    """New Opus models reject `temperature`; client must strip it."""
     sample = {
         "id": "msg_x", "type": "message", "role": "assistant",
-        "model": "claude-opus-4-7", "content": [],
+        "model": model, "content": [],
         "stop_reason": "end_turn", "stop_sequence": None,
         "usage": {"input_tokens": 1, "output_tokens": 1},
     }
@@ -45,15 +46,15 @@ def handler(request: httpx.Request) -> httpx.Response:
         router.post("/v1/messages").mock(side_effect=handler)
         async with AnthropicClient(BASE_URL, "sk-test") as client:
             await client.messages_create(
-                model="claude-opus-4-7",
+                model=model,
                 max_tokens=10,
                 temperature=0,
                 messages=[{"role": "user", "content": "x"}],
             )
     assert len(captured) == 1
     sent = captured[0]
-    assert "temperature" not in sent, "temperature must be stripped for Opus 4.7"
-    assert sent["model"] == "claude-opus-4-7"
+    assert "temperature" not in sent, "temperature must be stripped for new Opus"
+    assert sent["model"] == model
     assert sent["max_tokens"] == 10  # other fields untouched
 
 
diff --git a/tests/test_long_context.py b/tests/test_long_context.py
index bf11b83..5edc02e 100644
--- a/tests/test_long_context.py
+++ b/tests/test_long_context.py
@@ -164,6 +164,7 @@ def test_model_context_limit_known_models():
     assert model_context_limit("claude-sonnet-4-6") == 1_000_000
     assert model_context_limit("claude-opus-4-6") == 1_000_000
     assert model_context_limit("claude-opus-4-7") == 1_000_000
+    assert model_context_limit("claude-opus-4-8") == 1_000_000
     assert model_context_limit("claude-opus-4-5") == 200_000
     # Gemini — all 1,048,576 (1MB binary) per ai.google.dev model pages
     assert model_context_limit("gemini-2.5-pro") == 1_048_576
@@ -176,6 +177,7 @@ def test_model_context_limit_snapshot_suffix():
     assert model_context_limit("gpt-4o-mini-2024-07-18") == 128_000
     assert model_context_limit("claude-haiku-4-5-20251001") == 200_000
     assert model_context_limit("claude-sonnet-4-6-20251101") == 1_000_000
+    assert model_context_limit("claude-opus-4-8-20260609") == 1_000_000
 
 
 def test_model_context_limit_unknown_falls_back_conservatively():