rh-ai-quickstart · ganeshmurthy · Jun 29, 2026 · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026
diff --git a/README.md b/README.md
@@ -95,8 +95,8 @@ The solution is built on:
 | Function   | Model Name                             | Hardware    | AWS example      |
 |-----------|----------------------------------------|-------------|------------------|
 | Embedding | `all-MiniLM-L6-v2`                     | CPU/GPU/HPU | —                |
-| Generation| `meta-llama/Llama-3.2-3B-Instruct`     | L4/HPU      | g6.2xlarge       |
-| Generation| `meta-llama/Llama-3.1-8B-Instruct`     | L4/HPU      | g6.2xlarge       |
+| Generation| `meta-llama/Llama-3.2-3B-Instruct`     | L4/HPU/Xeon | g6.2xlarge       |
+| Generation| `meta-llama/Llama-3.1-8B-Instruct`     | L4/HPU/Xeon | g6.2xlarge       |
 | Generation| `meta-llama/Meta-Llama-3-70B-Instruct` | A100 x2/HPU | p4d.24xlarge     |
 | Safety    | `meta-llama/Llama-Guard-3-8B`          | L4/HPU      | g6.2xlarge       |
 
@@ -169,6 +169,25 @@ The 70B model is not required for initial testing. Llama-Guard-3-8B is optional.
 
    > **Tip:** You can combine both options—for example, run the embedding model locally while pointing the generation model at a remote server.
 
+   ---
+   #### Option C: Deploy models locally using Intel Xeon processors
+   Best when you have Intel Xeon nodes in your cluster and want to run models entirely on-premises.
+
+   In `rag-values.yaml`, enable one or more local models by setting `enabled: true` and `device: "xeon"`:
+
+   ```yaml
+   global:
+     models:
+       llama-3-2-3b-instruct:
+         id: meta-llama/Llama-3.2-3B-Instruct
+         enabled: true
+         device: "xeon"
+         args:
+         - --max-model-len
+         - "14336"
+         - --max-num-seqs
+         - "32"
+   ```
    ---
 
    Once your values file is ready, deploy:
@@ -181,14 +200,26 @@ The 70B model is not required for initial testing. Llama-Guard-3-8B is optional.
    [SUCCESS] rag installed successfully
    ```
 
-4. **Verify (optional)**  
+4. **Verify (optional)**:
+
+   This step uses port-forwarding to validate the LlamaStack service locally.
+
+   Port-forward the LlamaStack service:
+   ```bash
+   oc port-forward svc/llamastack 8321:8321
+   ```
+
    List models:
    ```bash
-   curl -sS http://llamastack-<NAMESPACE>.<YOUR_OPENSHIFT_CLUSTER>.com/v1/models
+   curl -sS http://localhost:8321/v1/models
    ```
+   Expected result: a JSON response containing one or more model IDs.
+
    Test chat (LlamaStack):
+
+   Replace `<MODEL_ID>` with a model ID returned by the previous command.
    ```bash
-   curl -sS http://llamastack-<NAMESPACE>.<YOUR_OPENSHIFT_CLUSTER>.com/v1/openai/v1/chat/completions \
+   curl -sS http://localhost:8321/v1/chat/completions \
      -H "Content-Type: application/json" \
      -d '{"model": "<MODEL_ID>", "messages": [{"role": "user", "content": "Say hello in one sentence."}], "max_tokens": 64, "temperature": 0}' | jq
    ```

diff --git a/deploy/helm/rag-values.yaml.example b/deploy/helm/rag-values.yaml.example
@@ -141,6 +141,27 @@ global:
     #     operator: Exists
     #     effect: NoSchedule
 
+    # Example Xeon configurations:
+    # llama-3-2-3b-instruct:
+    #   id: meta-llama/Llama-3.2-3B-Instruct
+    #   enabled: true
+    #   device: "xeon"
+    #   args:
+    #   - --max-model-len
+    #   - "14336"
+    #   - --max-num-seqs
+    #   - "32"
+    #
+    # llama-3-1-8b-instruct:
+    #   id: meta-llama/Llama-3.1-8B-Instruct
+    #   enabled: true
+    #   device: "xeon"
+    #   args:
+    #   - --max-model-len
+    #   - "14336"
+    #   - --max-num-seqs
+    #   - "32"
+
 # Database Configuration (pgvector)
 pgvector:
   secret:
@@ -153,7 +174,7 @@ pgvector:
 # Llama Stack Configuration
 llama-stack:
   image:
-    tag: 0.2.23  # Match frontend client version
+    tag: 0.6.1
 
 # Suggested Questions Configuration (Optional)
 # These questions appear in the chat UI when users select a database

diff --git a/deploy/helm/rag/Chart.yaml b/deploy/helm/rag/Chart.yaml
@@ -11,7 +11,7 @@ dependencies:
     version: 0.1.0
     repository: https://rh-ai-quickstart.github.io/ai-architecture-charts
   - name: llm-service
-    version: 0.5.2
+    version: 0.5.10
     repository: https://rh-ai-quickstart.github.io/ai-architecture-charts
   - name: llama-stack
     version: 0.8.6