Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 28 additions & 18 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -78,16 +78,26 @@ RUN chmod +x /usr/local/bin/nemoclaw-start
# Build args for config that varies per deployment.
# nemoclaw onboard passes these at image build time.
ARG NEMOCLAW_MODEL=nvidia/nemotron-3-super-120b-a12b
ARG NEMOCLAW_PROVIDER_KEY=nvidia
ARG NEMOCLAW_PRIMARY_MODEL_REF=nvidia/nemotron-3-super-120b-a12b
ARG CHAT_UI_URL=http://127.0.0.1:18789
ARG NEMOCLAW_INFERENCE_BASE_URL=https://inference.local/v1
ARG NEMOCLAW_INFERENCE_API=openai-completions
ARG NEMOCLAW_INFERENCE_COMPAT_B64=e30=
# Unique per build to ensure each image gets a fresh auth token.
# Pass --build-arg NEMOCLAW_BUILD_ID=$(date +%s) to bust the cache.
ARG NEMOCLAW_BUILD_ID=default

# SECURITY: Promote build-args to env vars so the Python script reads them
# via os.environ, never via string interpolation into Python source code.
# Direct ARG interpolation into python3 -c is a code injection vector (C-2).
# Direct ARG interpolation into python3 -c is a code injection vector.
ENV NEMOCLAW_MODEL=${NEMOCLAW_MODEL} \
CHAT_UI_URL=${CHAT_UI_URL}
NEMOCLAW_PROVIDER_KEY=${NEMOCLAW_PROVIDER_KEY} \
NEMOCLAW_PRIMARY_MODEL_REF=${NEMOCLAW_PRIMARY_MODEL_REF} \
CHAT_UI_URL=${CHAT_UI_URL} \
NEMOCLAW_INFERENCE_BASE_URL=${NEMOCLAW_INFERENCE_BASE_URL} \
NEMOCLAW_INFERENCE_API=${NEMOCLAW_INFERENCE_API} \
NEMOCLAW_INFERENCE_COMPAT_B64=${NEMOCLAW_INFERENCE_COMPAT_B64}

WORKDIR /sandbox
USER sandbox
Expand All @@ -98,30 +108,30 @@ USER sandbox
# Build args (NEMOCLAW_MODEL, CHAT_UI_URL) customize per deployment.
# Auth token is generated per build so each image has a unique token.
RUN python3 -c "\
import json, os, secrets; \
import base64, json, os, secrets; \
from urllib.parse import urlparse; \
model = os.environ['NEMOCLAW_MODEL']; \
provider_key = os.environ['NEMOCLAW_PROVIDER_KEY']; \
primary_model_ref = os.environ['NEMOCLAW_PRIMARY_MODEL_REF']; \
chat_ui_url = os.environ['CHAT_UI_URL']; \
inference_base_url = os.environ['NEMOCLAW_INFERENCE_BASE_URL']; \
inference_api = os.environ['NEMOCLAW_INFERENCE_API']; \
inference_compat = json.loads(base64.b64decode(os.environ['NEMOCLAW_INFERENCE_COMPAT_B64']).decode('utf-8')); \
parsed = urlparse(chat_ui_url); \
chat_origin = f'{parsed.scheme}://{parsed.netloc}' if parsed.scheme and parsed.netloc else 'http://127.0.0.1:18789'; \
origins = ['http://127.0.0.1:18789']; \
origins = list(dict.fromkeys(origins + [chat_origin])); \
providers = { \
provider_key: { \
'baseUrl': inference_base_url, \
'apiKey': 'unused', \
'api': inference_api, \
'models': [{**({'compat': inference_compat} if inference_compat else {}), 'id': model, 'name': primary_model_ref, 'reasoning': False, 'input': ['text'], 'cost': {'input': 0, 'output': 0, 'cacheRead': 0, 'cacheWrite': 0}, 'contextWindow': 131072, 'maxTokens': 4096}] \
} \
}; \
config = { \
'agents': {'defaults': {'model': {'primary': f'inference/{model}'}}}, \
'models': {'mode': 'merge', 'providers': { \
'nvidia': { \
'baseUrl': 'https://inference.local/v1', \
'apiKey': 'openshell-managed', \
'api': 'openai-completions', \
'models': [{'id': model.split('/')[-1], 'name': model, 'reasoning': False, 'input': ['text'], 'cost': {'input': 0, 'output': 0, 'cacheRead': 0, 'cacheWrite': 0}, 'contextWindow': 131072, 'maxTokens': 4096}] \
}, \
'inference': { \
'baseUrl': 'https://inference.local/v1', \
'apiKey': 'unused', \
'api': 'openai-completions', \
'models': [{'id': model, 'name': model, 'reasoning': False, 'input': ['text'], 'cost': {'input': 0, 'output': 0, 'cacheRead': 0, 'cacheWrite': 0}, 'contextWindow': 131072, 'maxTokens': 4096}] \
} \
}}, \
'agents': {'defaults': {'model': {'primary': primary_model_ref}}}, \
'models': {'mode': 'merge', 'providers': providers}, \
'channels': {'defaults': {'configWrites': False}}, \
'gateway': { \
'mode': 'local', \
Expand Down
32 changes: 23 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -162,14 +162,14 @@ curl -fsSL https://raw.githubusercontent.com/NVIDIA/NemoClaw/refs/heads/main/uni

## How It Works

NemoClaw installs the NVIDIA OpenShell runtime and Nemotron models, then uses a versioned blueprint to create a sandboxed environment where every network request, file access, and inference call is governed by declarative policy. The `nemoclaw` CLI orchestrates the full stack: OpenShell gateway, sandbox, inference provider, and network policy.
NemoClaw installs the NVIDIA OpenShell runtime, then creates a sandboxed OpenClaw environment where every network request, file access, and inference call is governed by declarative policy. The `nemoclaw` CLI orchestrates the full stack: OpenShell gateway, sandbox, inference provider, and network policy.

| Component | Role |
|------------------|-------------------------------------------------------------------------------------------|
| **Plugin** | TypeScript CLI commands for launch, connect, status, and logs. |
| **Blueprint** | Versioned Python artifact that orchestrates sandbox creation, policy, and inference setup. |
| **Sandbox** | Isolated OpenShell container running OpenClaw with policy-enforced egress and filesystem. |
| **Inference** | NVIDIA Endpoint model calls, routed through the OpenShell gateway, transparent to the agent. |
| **Inference**    | Model calls routed to your selected provider through the OpenShell gateway, transparent to the agent. |

The blueprint lifecycle follows four stages: resolve the artifact, verify its digest, plan the resources, and apply through the OpenShell CLI.

Expand All @@ -179,15 +179,29 @@ When something goes wrong, errors may originate from either NemoClaw or the Open

## Inference

Inference requests from the agent never leave the sandbox directly. OpenShell intercepts every call and routes it to the NVIDIA Endpoint provider.
Inference requests from the agent never leave the sandbox directly. OpenShell intercepts every call and routes it to the provider you selected during onboarding.

| Provider | Model | Use Case |
|--------------|--------------------------------------|-------------------------------------------------|
| NVIDIA Endpoint | `nvidia/nemotron-3-super-120b-a12b` | Production. Requires an NVIDIA API key. |
Supported non-experimental onboarding paths:

Get an API key from [build.nvidia.com](https://build.nvidia.com). The `nemoclaw onboard` command prompts for this key during setup.
| Provider | Notes |
|---|---|
| NVIDIA Endpoints | Curated hosted models on `integrate.api.nvidia.com`. |
| OpenAI | Curated GPT models plus `Other...` for manual model entry. |
| Other OpenAI-compatible endpoint | For proxies and compatible gateways. |
| Anthropic | Curated Claude models plus `Other...` for manual model entry. |
| Other Anthropic-compatible endpoint | For Claude proxies and compatible gateways. |
| Google Gemini | Google's OpenAI-compatible endpoint. |
| Local Ollama | Local model serving through Ollama with pull, warmup, and validation in onboarding. |

Local inference options such as Ollama and vLLM are still experimental. On macOS, they also depend on OpenShell host-routing support in addition to the local service itself being reachable on the host.
During onboarding, NemoClaw validates the selected provider and model before it creates the sandbox:

- OpenAI-compatible providers: tries `/responses` first, then `/chat/completions`
- Anthropic-compatible providers: tries `/v1/messages`
- If validation fails, the wizard prompts you to fix the selection before continuing

Credentials stay on the host in `~/.nemoclaw/credentials.json`. The sandbox only sees the routed `inference.local` endpoint, not your raw provider key.

Local Ollama is supported in the standard onboarding flow. Local vLLM remains experimental. On macOS, host-routed local inference additionally requires OpenShell host-routing support, and the local service itself must be reachable on the host.

---

Expand Down Expand Up @@ -252,7 +266,7 @@ Refer to the documentation for more information on NemoClaw.
- [Overview](https://docs.nvidia.com/nemoclaw/latest/about/overview.html): Learn what NemoClaw does and how it fits together.
- [How It Works](https://docs.nvidia.com/nemoclaw/latest/about/how-it-works.html): Learn about the plugin, blueprint, and sandbox lifecycle.
- [Architecture](https://docs.nvidia.com/nemoclaw/latest/reference/architecture.html): Learn about the plugin structure, blueprint lifecycle, and sandbox environment.
- [Inference Profiles](https://docs.nvidia.com/nemoclaw/latest/reference/inference-profiles.html): Learn about the NVIDIA Endpoint inference configuration.
- [Inference Profiles](https://docs.nvidia.com/nemoclaw/latest/reference/inference-profiles.html): Learn how NemoClaw configures routed inference providers.
- [Network Policies](https://docs.nvidia.com/nemoclaw/latest/reference/network-policies.html): Learn about egress control and policy customization.
- [CLI Commands](https://docs.nvidia.com/nemoclaw/latest/reference/commands.html): Learn about the full command reference.
- [Troubleshooting](https://docs.nvidia.com/nemoclaw/latest/reference/troubleshooting.html): Troubleshoot common issues and resolution steps.
Expand Down
95 changes: 92 additions & 3 deletions bin/lib/credentials.js
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,97 @@ function getCredential(key) {
return creds[key] || null;
}

function prompt(question) {
/**
 * Reads a secret from stdin without echoing any characters to the terminal.
 *
 * Writes `question` to stderr, switches stdin to raw mode, and buffers
 * keystrokes until Enter is pressed. Resolves with the trimmed answer;
 * rejects with an Error carrying `code: "SIGINT"` when the user hits Ctrl+C.
 *
 * @param {string} question - Prompt text written to stderr.
 * @returns {Promise<string>} The entered secret, trimmed of surrounding whitespace.
 */
function promptSecret(question) {
  return new Promise((resolve, reject) => {
    const input = process.stdin;
    // Prompt on stderr so stdout stays clean for machine-readable output.
    const output = process.stderr;
    let answer = "";
    let rawModeEnabled = false;
    let finished = false;

    // Detach the data listener and restore the terminal to cooked mode.
    function cleanup() {
      input.removeListener("data", onData);
      if (rawModeEnabled && typeof input.setRawMode === "function") {
        input.setRawMode(false);
      }
      if (typeof input.pause === "function") {
        input.pause();
      }
    }

    // Settle the promise exactly once, always restoring the terminal first.
    function finish(fn, value) {
      if (finished) return;
      finished = true;
      cleanup();
      output.write("\n");
      fn(value);
    }

    // Raw-mode keystroke handler: a chunk may contain several characters.
    function onData(chunk) {
      const text = chunk.toString("utf8");
      for (let i = 0; i < text.length; i += 1) {
        const ch = text[i];

        // Ctrl+C (ETX): reject so the caller decides how to handle interrupt.
        if (ch === "\u0003") {
          finish(reject, Object.assign(new Error("Prompt interrupted"), { code: "SIGINT" }));
          return;
        }

        // Enter (CR or LF) submits the buffered secret.
        if (ch === "\r" || ch === "\n") {
          finish(resolve, answer.trim());
          return;
        }

        // Backspace / DEL removes the last buffered character.
        if (ch === "\u0008" || ch === "\u007f") {
          answer = answer.slice(0, -1);
          continue;
        }

        if (ch === "\u001b") {
          // Ignore terminal escape/control sequences such as Delete, arrows,
          // Home/End, etc. while leaving the buffered secret untouched.
          const rest = text.slice(i);
          const match = rest.match(/^\u001b(?:\[[0-9;?]*[~A-Za-z]|\][^\u0007]*\u0007|.)/);
          if (match) {
            // Skip over the whole sequence (loop increment adds the final 1).
            i += match[0].length - 1;
          }
          continue;
        }

        // Buffer printable characters only; remaining control chars are dropped.
        if (ch >= " ") {
          answer += ch;
        }
      }
    }

    output.write(question);
    input.setEncoding("utf8");
    if (typeof input.resume === "function") {
      input.resume();
    }
    // Raw mode delivers keystrokes immediately and disables terminal echo,
    // which is what keeps the secret off the screen.
    if (typeof input.setRawMode === "function") {
      input.setRawMode(true);
      rawModeEnabled = true;
    }
    input.on("data", onData);
  });
}

function prompt(question, opts = {}) {
return new Promise((resolve) => {
const silent = opts.secret === true && process.stdin.isTTY && process.stderr.isTTY;
if (silent) {
promptSecret(question)
.then(resolve)
.catch((err) => {
if (err && err.code === "SIGINT") {
process.kill(process.pid, "SIGINT");
return;
}
throw err;
});
return;
}
Comment on lines +110 to +124
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Verify unresolved Promise paths in prompt() silent branch (read-only).
awk 'NR>=110 && NR<=124 {printf "%4d  %s\n", NR, $0}' bin/lib/credentials.js
rg -nP 'new Promise\(\(resolve\)\s*=>\s*\{' bin/lib/credentials.js -n -A12 -B2

Repository: NVIDIA/NemoClaw

Length of output: 1081


🏁 Script executed:

sed -n '110,200p' bin/lib/credentials.js | cat -n

Repository: NVIDIA/NemoClaw

Length of output: 3624


Settle the outer prompt() Promise on silent-branch failures.

The Promise at line 111 only accepts resolve, but the silent branch (lines 114–123) has error paths that never settle it. When promptSecret() rejects with SIGINT (line 118), process.kill() executes and returns, leaving the outer Promise unresolved. For other errors (line 121), throw creates an unhandled rejection while the outer Promise still hangs. Callers awaiting prompt() can hang indefinitely.

Add reject to the Promise constructor and call reject(err) in both error cases:

Suggested fix
-function prompt(question, opts = {}) {
-  return new Promise((resolve) => {
+function prompt(question, opts = {}) {
+  return new Promise((resolve, reject) => {
     const silent = opts.secret === true && process.stdin.isTTY && process.stderr.isTTY;
     if (silent) {
       promptSecret(question)
         .then(resolve)
         .catch((err) => {
           if (err && err.code === "SIGINT") {
             process.kill(process.pid, "SIGINT");
-            return;
+            reject(err);
+            return;
           }
-          throw err;
+          reject(err);
         });
       return;
     }
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
function prompt(question, opts = {}) {
return new Promise((resolve) => {
const silent = opts.secret === true && process.stdin.isTTY && process.stderr.isTTY;
if (silent) {
promptSecret(question)
.then(resolve)
.catch((err) => {
if (err && err.code === "SIGINT") {
process.kill(process.pid, "SIGINT");
return;
}
throw err;
});
return;
}
function prompt(question, opts = {}) {
return new Promise((resolve, reject) => {
const silent = opts.secret === true && process.stdin.isTTY && process.stderr.isTTY;
if (silent) {
promptSecret(question)
.then(resolve)
.catch((err) => {
if (err && err.code === "SIGINT") {
process.kill(process.pid, "SIGINT");
reject(err);
return;
}
reject(err);
});
return;
}
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@bin/lib/credentials.js` around lines 110 - 124, The outer Promise in prompt()
only accepts resolve so the silent branch's catch never settles callers; update
the Promise constructor to (resolve, reject) and in the promptSecret().catch
handler call reject(err) for both error paths instead of throwing (and for the
SIGINT path call reject(err) then process.kill(process.pid, "SIGINT") to ensure
the Promise is settled before exiting). This change should be made in the prompt
function and touches the promptSecret error handling block so callers awaiting
prompt() won't hang.

const rl = readline.createInterface({ input: process.stdin, output: process.stderr });
rl.question(question, (answer) => {
rl.close();
Expand Down Expand Up @@ -67,7 +156,7 @@ async function ensureApiKey() {
console.log(" └─────────────────────────────────────────────────────────────────┘");
console.log("");

key = await prompt(" NVIDIA API Key: ");
key = await prompt(" NVIDIA API Key: ", { secret: true });

if (!key || !key.startsWith("nvapi-")) {
console.error(" Invalid key. Must start with nvapi-");
Expand Down Expand Up @@ -114,7 +203,7 @@ async function ensureGithubToken() {
console.log(" └──────────────────────────────────────────────────┘");
console.log("");

token = await prompt(" GitHub Token: ");
token = await prompt(" GitHub Token: ", { secret: true });

if (!token) {
console.error(" Token required for deploy (repo is private).");
Expand Down
58 changes: 57 additions & 1 deletion bin/lib/inference-config.js
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ const { DEFAULT_OLLAMA_MODEL } = require("./local-inference");

function getProviderSelectionConfig(provider, model) {
switch (provider) {
case "nvidia-prod":
case "nvidia-nim":
return {
endpointType: "custom",
Expand All @@ -27,7 +28,62 @@ function getProviderSelectionConfig(provider, model) {
profile: DEFAULT_ROUTE_PROFILE,
credentialEnv: DEFAULT_ROUTE_CREDENTIAL_ENV,
provider,
providerLabel: "NVIDIA Endpoint API",
providerLabel: "NVIDIA Endpoints",
};
case "openai-api":
return {
endpointType: "custom",
endpointUrl: INFERENCE_ROUTE_URL,
ncpPartner: null,
model: model || "gpt-5.4",
profile: DEFAULT_ROUTE_PROFILE,
credentialEnv: "OPENAI_API_KEY",
provider,
providerLabel: "OpenAI",
};
case "anthropic-prod":
return {
endpointType: "custom",
endpointUrl: INFERENCE_ROUTE_URL,
ncpPartner: null,
model: model || "claude-sonnet-4-6",
profile: DEFAULT_ROUTE_PROFILE,
credentialEnv: "ANTHROPIC_API_KEY",
provider,
providerLabel: "Anthropic",
};
case "compatible-anthropic-endpoint":
return {
endpointType: "custom",
endpointUrl: INFERENCE_ROUTE_URL,
ncpPartner: null,
model: model || "custom-anthropic-model",
profile: DEFAULT_ROUTE_PROFILE,
credentialEnv: "COMPATIBLE_ANTHROPIC_API_KEY",
provider,
providerLabel: "Other Anthropic-compatible endpoint",
};
case "gemini-api":
return {
endpointType: "custom",
endpointUrl: INFERENCE_ROUTE_URL,
ncpPartner: null,
model: model || "gemini-2.5-flash",
profile: DEFAULT_ROUTE_PROFILE,
credentialEnv: "GEMINI_API_KEY",
provider,
providerLabel: "Google Gemini",
};
case "compatible-endpoint":
return {
endpointType: "custom",
endpointUrl: INFERENCE_ROUTE_URL,
ncpPartner: null,
model: model || "custom-model",
profile: DEFAULT_ROUTE_PROFILE,
credentialEnv: "COMPATIBLE_API_KEY",
provider,
providerLabel: "Other OpenAI-compatible endpoint",
};
case "vllm-local":
return {
Expand Down
34 changes: 30 additions & 4 deletions bin/lib/local-inference.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ const { shellQuote } = require("./runner");
const HOST_GATEWAY_URL = "http://host.openshell.internal";
const CONTAINER_REACHABILITY_IMAGE = "curlimages/curl:8.10.1";
const DEFAULT_OLLAMA_MODEL = "nemotron-3-nano:30b";
const SMALL_OLLAMA_MODEL = "qwen2.5:7b";
const LARGE_OLLAMA_MIN_MEMORY_MB = 32768;

function getLocalProviderBaseUrl(provider) {
switch (provider) {
Expand All @@ -18,6 +20,17 @@ function getLocalProviderBaseUrl(provider) {
}
}

/**
 * Returns the host-side base URL used to validate a local inference
 * provider before onboarding continues, or null for unknown providers.
 *
 * @param {string} provider - Provider key, e.g. "vllm-local" or "ollama-local".
 * @returns {string|null} OpenAI-compatible base URL reachable from the host.
 */
function getLocalProviderValidationBaseUrl(provider) {
  const validationUrls = Object.freeze({
    "vllm-local": "http://localhost:8000/v1",
    "ollama-local": "http://localhost:11434/v1",
  });
  return Object.hasOwn(validationUrls, provider) ? validationUrls[provider] : null;
}

function getLocalProviderHealthCheck(provider) {
switch (provider) {
case "vllm-local":
Expand Down Expand Up @@ -105,14 +118,23 @@ function parseOllamaList(output) {
/**
 * Lists locally installed Ollama models.
 *
 * Defect fixed: this span contained interleaved removed-diff lines (a stray
 * `if (parsed.length > 0) {` with no closing brace and a duplicated
 * `return parsed;`), which is not valid JavaScript. Reconstructed to the
 * intended new-side implementation: return whatever `ollama list` reports,
 * including an empty array when nothing is installed.
 *
 * @param {(cmd: string, opts?: object) => string} runCapture - Shell runner
 *   returning captured stdout; `ignoreError` keeps a missing/failing
 *   `ollama` binary from aborting onboarding.
 * @returns {string[]} Installed model identifiers (possibly empty).
 */
function getOllamaModelOptions(runCapture) {
  const output = runCapture("ollama list 2>/dev/null", { ignoreError: true });
  const parsed = parseOllamaList(output);
  return parsed;
}

/**
 * Models to offer when no Ollama models are installed yet.
 *
 * Defect fixed: a stray removed-diff line (`return [DEFAULT_OLLAMA_MODEL];`)
 * sat before the real return, making the function unconditionally return the
 * large model and leaving the intended logic unreachable. Reconstructed to
 * the intended behavior: always offer the small model, and add the large
 * default only when the GPU has enough memory for it.
 *
 * @param {{ totalMemoryMB: number }|null|undefined} gpu - Detected GPU info,
 *   or a falsy value when no GPU was detected.
 * @returns {string[]} Candidate model identifiers, small model first.
 */
function getBootstrapOllamaModelOptions(gpu) {
  const options = [SMALL_OLLAMA_MODEL];
  if (gpu && gpu.totalMemoryMB >= LARGE_OLLAMA_MIN_MEMORY_MB) {
    options.push(DEFAULT_OLLAMA_MODEL);
  }
  return options;
}

function getDefaultOllamaModel(runCapture) {
/**
 * Picks the default Ollama model to preselect during onboarding.
 *
 * Prefers an already-installed copy of DEFAULT_OLLAMA_MODEL, then the first
 * installed model; when nothing is installed, falls back to the first
 * bootstrap candidate sized for the detected GPU.
 *
 * @param {(cmd: string, opts?: object) => string} runCapture - Shell runner used to query `ollama list`.
 * @param {{ totalMemoryMB: number }|null} [gpu] - Detected GPU info, if any.
 * @returns {string} Model identifier to use as the default.
 */
function getDefaultOllamaModel(runCapture, gpu = null) {
  const installed = getOllamaModelOptions(runCapture);
  if (installed.length > 0) {
    if (installed.includes(DEFAULT_OLLAMA_MODEL)) {
      return DEFAULT_OLLAMA_MODEL;
    }
    return installed[0];
  }
  const [firstBootstrapModel] = getBootstrapOllamaModelOptions(gpu);
  return firstBootstrapModel;
}

Expand Down Expand Up @@ -164,8 +186,12 @@ module.exports = {
CONTAINER_REACHABILITY_IMAGE,
DEFAULT_OLLAMA_MODEL,
HOST_GATEWAY_URL,
LARGE_OLLAMA_MIN_MEMORY_MB,
SMALL_OLLAMA_MODEL,
getDefaultOllamaModel,
getBootstrapOllamaModelOptions,
getLocalProviderBaseUrl,
getLocalProviderValidationBaseUrl,
getLocalProviderContainerReachabilityCheck,
getLocalProviderHealthCheck,
getOllamaModelOptions,
Expand Down
Loading
Loading