diff --git a/README.md b/README.md index 18ff95f..03c1620 100644 --- a/README.md +++ b/README.md @@ -95,8 +95,8 @@ The solution is built on: | Function | Model Name | Hardware | AWS example | |-----------|----------------------------------------|-------------|------------------| | Embedding | `all-MiniLM-L6-v2` | CPU/GPU/HPU | — | -| Generation| `meta-llama/Llama-3.2-3B-Instruct` | L4/HPU | g6.2xlarge | -| Generation| `meta-llama/Llama-3.1-8B-Instruct` | L4/HPU | g6.2xlarge | +| Generation| `meta-llama/Llama-3.2-3B-Instruct` | L4/HPU/XEON | g6.2xlarge | +| Generation| `meta-llama/Llama-3.1-8B-Instruct` | L4/HPU/XEON | g6.2xlarge | | Generation| `meta-llama/Meta-Llama-3-70B-Instruct` | A100 x2/HPU | p4d.24xlarge | | Safety | `meta-llama/Llama-Guard-3-8B` | L4/HPU | g6.2xlarge | @@ -169,6 +169,25 @@ The 70B model is not required for initial testing. Llama-Guard-3-8B is optional. > **Tip:** You can combine both options—for example, run the embedding model locally while pointing the generation model at a remote server. + --- + #### Option C: Deploy models locally using Intel Xeon processors + Best when you have Intel Xeon nodes on your cluster and want to run models entirely on-premises. + + In `rag-values.yaml`, enable one or more local models by setting `enabled: true`: + + ```yaml + global: + models: + llama-3-2-3b-instruct: + id: meta-llama/Llama-3.2-3B-Instruct + enabled: true + device: "xeon" + args: + - --max-model-len + - "14336" + - --max-num-seqs + - "32" + ``` --- Once your values file is ready, deploy: @@ -181,14 +200,26 @@ The 70B model is not required for initial testing. Llama-Guard-3-8B is optional. [SUCCESS] rag installed successfully ``` -4. **Verify (optional)** +4. 
**Verify (optional)**: + + This step uses port-forwarding to locally validate the LlamaStack service. + + Port-forward the LlamaStack service: + ```bash + oc port-forward svc/llamastack 8321:8321 + ``` + List models: ```bash - curl -sS http://llamastack-..com/v1/models + curl -sS http://localhost:8321/v1/models ``` + Expected result: a JSON response containing one or more model IDs. + Test chat (LlamaStack): + + Replace the empty `"model"` value in the request below with a model ID returned by the previous command. ```bash - curl -sS http://llamastack-..com/v1/openai/v1/chat/completions \ + curl -sS http://localhost:8321/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{"model": "", "messages": [{"role": "user", "content": "Say hello in one sentence."}], "max_tokens": 64, "temperature": 0}' | jq ``` diff --git a/deploy/helm/rag-values.yaml.example b/deploy/helm/rag-values.yaml.example index 8e40941..cb4b41e 100644 --- a/deploy/helm/rag-values.yaml.example +++ b/deploy/helm/rag-values.yaml.example @@ -141,6 +141,27 @@ global: # operator: Exists # effect: NoSchedule + # Example Xeon configurations: + # llama-3-2-3b-instruct: + # id: meta-llama/Llama-3.2-3B-Instruct + # enabled: true + # device: "xeon" + # args: + # - --max-model-len + # - "14336" + # - --max-num-seqs + # - "32" + # + # llama-3-1-8b-instruct: + # id: meta-llama/Llama-3.1-8B-Instruct + # enabled: true + # device: "xeon" + # args: + # - --max-model-len + # - "14336" + # - --max-num-seqs + # - "32" + # Database Configuration (pgvector) pgvector: secret: @@ -153,7 +174,7 @@ pgvector: # Llama Stack Configuration llama-stack: image: - tag: 0.2.23 # Match frontend client version + tag: 0.6.1 # Suggested Questions Configuration (Optional) # These questions appear in the chat UI when users select a database diff --git a/deploy/helm/rag/Chart.yaml b/deploy/helm/rag/Chart.yaml index 4a72288..14218ed 100644 --- a/deploy/helm/rag/Chart.yaml +++ b/deploy/helm/rag/Chart.yaml @@ -11,7 +11,7 @@ dependencies: version: 0.1.0 repository: 
https://rh-ai-quickstart.github.io/ai-architecture-charts - name: llm-service - version: 0.5.2 + version: 0.5.10 repository: https://rh-ai-quickstart.github.io/ai-architecture-charts - name: llama-stack version: 0.8.6