diff --git a/api-reference/classify.mdx b/api-reference/classify.mdx
index a22db4b..b2b2837 100644
--- a/api-reference/classify.mdx
+++ b/api-reference/classify.mdx
@@ -11,6 +11,17 @@ Classify a comment in one API call. Returns intent labels, sentiment, language,
   Cost: **$0.006** per request (6 credits). Semantic cache hits are free (`X-NAWA-Cache: HIT`) and do not decrement your credit balance.
 </Note>
 
+### Model routing by tier
+
+Classification requests are routed to different models based on your account tier:
+
+| Tier | Classification model | Notes |
+|------|---------------------|-------|
+| Free, Basic, Pro | `claude-haiku-4-5-20251001` | Optimized for cost and speed |
+| Enterprise | `claude-sonnet-4-5-20250929` | Higher accuracy for complex comments |
+
+The `model` field in the response tells you which model produced the result. Prompt caching is enabled on the classification path to reduce latency and token costs for repeated patterns within the same user context.
+
 ## Request
 
 ### Headers
@@ -84,7 +95,7 @@ result = nawa.classify(text="متى الجزء الثاني؟")
       "direction": "rtl"
     },
     "provider": "claude",
-    "model": "claude-sonnet-4-5-20250929",
+    "model": "claude-haiku-4-5-20251001",
     "fallback_used": false,
     "tokens_used": null,
     "cost_usd": 0.006,
@@ -112,7 +123,7 @@ result = nawa.classify(text="متى الجزء الثاني؟")
 | `suggested_reply.text` | string | NAGL's natural-language analysis of the comment (the reasoning behind the classification). This is not a draft reply; use `/v1/comments/reply` for that. |
 | `suggested_reply.direction` | string | `"rtl"` for Arabic, `"ltr"` otherwise. Useful for UI rendering. |
 | `provider` | string | AI provider that produced this result: `claude`, `allam`, or `gemini` |
-| `model` | string | Specific model ID, e.g. `claude-sonnet-4-5-20250929` |
+| `model` | string | Specific model ID. When `provider` is `claude`: `claude-haiku-4-5-20251001` for free/basic/pro tiers, `claude-sonnet-4-5-20250929` for enterprise. |
 | `fallback_used` | boolean | `true` if the primary provider failed and a fallback produced this result |
 | `tokens_used` | number \| null | Currently always `null`. Token accounting is planned; track usage via `cost_usd` for now. |
 | `cost_usd` | number | Always `0.006` for this endpoint |
@@ -179,7 +190,7 @@ See [Errors](/errors) for the full envelope shape and all error codes.
           "direction": "ltr"
         },
         "provider": "claude",
-        "model": "claude-sonnet-4-5-20250929",
+        "model": "claude-haiku-4-5-20251001",
         "fallback_used": false,
         "tokens_used": null,
         "cost_usd": 0.006,
@@ -210,7 +221,7 @@ See [Errors](/errors) for the full envelope shape and all error codes.
           "direction": "ltr"
         },
         "provider": "claude",
-        "model": "claude-sonnet-4-5-20250929",
+        "model": "claude-haiku-4-5-20251001",
         "fallback_used": false,
         "tokens_used": null,
         "cost_usd": 0.006,
@@ -241,7 +252,7 @@ See [Errors](/errors) for the full envelope shape and all error codes.
           "direction": "rtl"
         },
         "provider": "claude",
-        "model": "claude-sonnet-4-5-20250929",
+        "model": "claude-haiku-4-5-20251001",
         "fallback_used": false,
         "tokens_used": null,
         "cost_usd": 0.006,
diff --git a/changelog.mdx b/changelog.mdx
index 6a70c83..9b29280 100644
--- a/changelog.mdx
+++ b/changelog.mdx
@@ -4,6 +4,20 @@ description: "NAWA API platform updates and releases"
 rss: true
 ---
 
+<Update label="April 20, 2026" description="v1.2.1" tags={["API", "Performance", "Reliability"]}>
+## Prompt caching, Haiku classification, and improved retry handling
+
+### Changed
+- **Classification model routing:** Classification requests now use `claude-haiku-4-5-20251001` for free, basic, and pro tiers. Enterprise tier continues to use `claude-sonnet-4-5-20250929`. The `model` field in the response reflects which model produced the result.
+- **Prompt caching:** The classification path now uses Anthropic prompt caching to reduce latency and token costs for repeated patterns within the same user context.
+- **429 retry with backoff:** Rate-limit responses (429) from upstream providers are now retried up to 3 times internally with exponential backoff (1 s / 2 s / 4 s), random jitter (0–500 ms), and `Retry-After` header support. The previous behavior of failing fast on 429 has been replaced with this safer retry strategy.
+- **5xx retry preserved:** Server errors (5xx) from upstream providers continue to be retried up to 3 times with exponential backoff.
+
+### Improved
+- Rate limit and cache observability headers are now logged on every provider response for better debugging.
+- Updated [rate limits](/rate-limits) and [errors](/errors) documentation with new retry guidance and code samples.
+</Update>
+
 <Update label="April 16, 2026" description="/report refresh" tags={["API", "Intelligence"]}>
 ## `/report` structure and tagging update
 
diff --git a/errors.mdx b/errors.mdx
index fdcf365..dacb1e9 100644
--- a/errors.mdx
+++ b/errors.mdx
@@ -202,9 +202,9 @@ You've exceeded the rate limit for your current tier.
 
 <AccordionGroup>
   <Accordion title="rate_limit_exceeded">
-    **Cause:** Too many requests in the current time window.
+    **Cause:** Too many requests in the current time window. The NAWA backend retries 429 responses from upstream providers up to 3 times with exponential backoff and jitter before returning this error to you.
 
-    **Fix:** Wait until the `X-RateLimit-Reset` time, then retry. Consider upgrading your tier.
+    **Fix:** Read the `Retry-After` header (seconds) to know exactly how long to wait. Fall back to `X-RateLimit-Reset` if `Retry-After` is absent. Use exponential backoff with jitter in your client code (see [Handling 429 errors](/rate-limits#handling-429-errors)).
 
     ```json
     {
@@ -215,7 +215,13 @@ You've exceeded the rate limit for your current tier.
     }
     ```
 
-    The `X-NAWA-RateLimit-Reason` response header provides additional context (`minute_limit` or `sandbox_exhausted`).
+    **Response headers on 429:**
+
+    | Header | Description |
+    |--------|-------------|
+    | `Retry-After` | Seconds to wait before retrying. Always present on 429 responses. |
+    | `X-NAWA-RateLimit-Reason` | Which limit was hit: `minute_limit` or `sandbox_exhausted`. |
+    | `X-RateLimit-Reset` | RFC 3339 timestamp when the current window resets. |
   </Accordion>
 
   <Accordion title="sandbox_exhausted">
@@ -255,7 +261,7 @@ An unexpected error occurred on the NAWA side.
   </Accordion>
 
   <Accordion title="provider_failure">
-    **Cause:** The upstream AI provider (ALLaM) experienced a failure.
+    **Cause:** The upstream AI provider (ALLaM or Claude) experienced a failure. NAWA retries 5xx errors from providers up to 3 times with exponential backoff before returning this error. If both the primary provider and its fallback fail, you see this code.
 
     **Fix:** Retry after a brief delay. Check [status.trynawa.com](https://status.trynawa.com) for incidents.
 
@@ -292,7 +298,9 @@ if (error) {
       console.error('Check your API key:', error.message)
       break
     case 'rate_limit_error':
-      console.log('Rate limited, retrying after:', error.retryAfter)
+      // Retry-After header is parsed into error.retryAfter (seconds)
+      const delay = error.retryAfter ?? 60
+      console.log(`Rate limited, retrying after ${delay}s`)
       break
     case 'insufficient_credits':
       console.error('Buy credits:', error.suggested_action)
@@ -315,7 +323,9 @@ if result.error:
     if result.error.type == "authentication_error":
         print(f"Check your API key: {result.error.message}")
     elif result.error.type == "rate_limit_error":
-        print(f"Rate limited, retry after: {result.error.retry_after}")
+        # retry_after comes from the Retry-After header (seconds)
+        delay = result.error.retry_after or 60
+        print(f"Rate limited, retry after {delay}s")
     elif result.error.type == "insufficient_credits":
         print(f"Buy credits: {result.error.suggested_action}")
     else:
diff --git a/openapi.yaml b/openapi.yaml
index 2c6201d..903d7b4 100644
--- a/openapi.yaml
+++ b/openapi.yaml
@@ -60,6 +60,12 @@ paths:
         NAGL. Semantic cache hits are free and return identical shape with
         `X-NAWA-Cache: HIT`.
 
+        Classification uses `claude-haiku-4-5-20251001` for free, basic, and pro
+        tiers and `claude-sonnet-4-5-20250929` for enterprise. Prompt caching is
+        enabled on the classification path to reduce latency and token costs for
+        repeated patterns within the same user context. The `model` response field
+        tells you which model produced the result.
+
         Dialect note: `dialect` is currently hardcoded to `"gulf"` for any Arabic
         text on this endpoint; real dialect detection runs only on `/v1/detect`
         and `/v1/translate`. `dialect_confidence` is the NAGL intent-classifier
@@ -143,7 +149,7 @@ paths:
                         text: "Viewer is asking when the next part drops. Respond with the schedule."
                         direction: rtl
                       provider: claude
-                      model: claude-sonnet-4-5-20250929
+                      model: claude-haiku-4-5-20251001
                       fallback_used: false
                       tokens_used: null
                       cost_usd: 0.006
@@ -1711,7 +1717,13 @@ components:
           description: AI provider that produced this result
         model:
           type: string
-          description: Specific model ID (e.g. `claude-sonnet-4-5-20250929`)
+          description: |
+            Specific model ID. Classification uses `claude-haiku-4-5-20251001`
+            for free, basic, and pro tiers, and `claude-sonnet-4-5-20250929`
+            for enterprise tier.
+          examples:
+            - "claude-haiku-4-5-20251001"
+            - "claude-sonnet-4-5-20250929"
         fallback_used:
           type: boolean
           description: True when the primary provider failed and a fallback produced this result
diff --git a/rate-limits.mdx b/rate-limits.mdx
index 26c3d15..3bcd648 100644
--- a/rate-limits.mdx
+++ b/rate-limits.mdx
@@ -44,25 +44,46 @@ On `429` responses, additional headers are included:
 
 ## Handling 429 errors
 
-Use exponential backoff with jitter to retry rate-limited requests:
+NAWA retries rate-limited (429) responses from upstream providers internally with exponential backoff, jitter, and `Retry-After` header support. If all internal retries are exhausted, the API returns a `429` with a `Retry-After` header indicating how long to wait.
+
+When you handle 429 responses in your own code, follow the same pattern:
+
+1. Use exponential backoff: start with a 1 s base delay and double it on each retry (1 s, 2 s, 4 s, …).
+2. Add random jitter (0–500 ms) to avoid synchronized retries across clients.
+3. Respect the `Retry-After` header. If the header value (in seconds) is longer than your computed backoff, use it instead.
+4. Make at most 3 attempts in total (the initial request plus up to 2 retries), as the samples below do. After that, surface the error to your application.
 
 <CodeGroup>
 
+```bash cURL
+# cURL does not retry by default (see --retry). Read Retry-After (the %header{...} write-out needs curl 7.84.0+) and wait that long before retrying:
+RETRY_AFTER=$(curl -s -o /dev/null -w '%header{retry-after}' \
+  -X POST https://api.trynawa.com/v1/classify \
+  -H "Authorization: Bearer $NAWA_API_KEY" \
+  -H "Content-Type: application/json" \
+  -d '{"text": "test"}')
+
+echo "Retry after $RETRY_AFTER seconds"
+```
+
 ```typescript TypeScript
 async function classifyWithRetry(
   nawa: Nawa,
   params: ClassifyParams,
   maxRetries = 3
 ) {
-  for (let attempt = 0; attempt <= maxRetries; attempt++) {
+  for (let attempt = 0; attempt < maxRetries; attempt++) {
     const { data, error } = await nawa.classify(params)
 
     if (!error) return data
 
-    if (error.type === 'rate_limit_error' && attempt < maxRetries) {
-      const baseDelay = Math.pow(2, attempt) * 1000
-      const jitter = Math.random() * 1000
-      await new Promise(resolve => setTimeout(resolve, baseDelay + jitter))
+    if (error.type === 'rate_limit_error' && attempt < maxRetries - 1) {
+      // Exponential backoff: 1s, 2s, 4s + 0-500ms jitter
+      const computedMs = Math.pow(2, attempt) * 1000 + Math.floor(Math.random() * 500)
+      // Respect Retry-After header when available
+      const retryAfterMs = error.retryAfter ? error.retryAfter * 1000 : 0
+      const delayMs = Math.max(computedMs, retryAfterMs)
+      await new Promise(resolve => setTimeout(resolve, delayMs))
       continue
     }
 
@@ -77,16 +98,19 @@ import random
 from nawa import Nawa
 
 def classify_with_retry(nawa: Nawa, text: str, platform: str, max_retries: int = 3):
-    for attempt in range(max_retries + 1):
+    for attempt in range(max_retries):
         result = nawa.classify(text=text, platform=platform)
 
         if not result.error:
             return result.data
 
-        if result.error.type == "rate_limit_error" and attempt < max_retries:
-            base_delay = (2 ** attempt)
-            jitter = random.uniform(0, 1)
-            time.sleep(base_delay + jitter)
+        if result.error.type == "rate_limit_error" and attempt < max_retries - 1:
+            # Exponential backoff: 1s, 2s, 4s + 0-500ms jitter
+            computed = (2 ** attempt) + random.uniform(0, 0.5)
+            # Respect Retry-After header when available
+            retry_after = getattr(result.error, "retry_after", 0) or 0
+            delay = max(computed, retry_after)
+            time.sleep(delay)
             continue
 
         raise Exception(result.error.message)
@@ -95,5 +119,5 @@ def classify_with_retry(nawa: Nawa, text: str, platform: str, max_retries: int =
 </CodeGroup>
 
 <Warning>
-  Do not retry in a tight loop without backoff. This will extend your rate limit window and may result in longer delays.
+  Do not retry in a tight loop without backoff. This amplifies rate-limit pressure and may result in longer lockout windows. The NAWA backend also retries internally, so your client retries should be a safety net rather than the primary mechanism.
 </Warning>