Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@ Create a `.env` file in the project root (or export these in your shell):
```bash
ANTHROPIC_API_KEY=<your-anthropic-api-key> # if using anthropic models
OPENAI_API_KEY=<your-openai-api-key> # if using openai models
GEMINI_API_KEY=<your-ai-studio-key> # if using gemini/ (Google AI Studio) models
VERTEXAI_PROJECT=<gcp-project-id> # if using vertex_ai/ models
VERTEXAI_LOCATION=us-central1 # GCP region for vertex_ai/ models
GOOGLE_APPLICATION_CREDENTIALS=/path/to/service-account.json # ADC for vertex_ai/
LOCAL_LLM_BASE_URL=http://localhost:8000 # shared fallback for local model prefixes
LOCAL_LLM_API_KEY=<optional-local-api-key> # optional shared local API key
HF_TOKEN=<your-hugging-face-token>
Expand Down Expand Up @@ -61,6 +65,9 @@ ml-intern "fine-tune llama on my dataset"
```bash
ml-intern --model anthropic/claude-opus-4-7 "your prompt" # requires ANTHROPIC_API_KEY
ml-intern --model openai/gpt-5.5 "your prompt" # requires OPENAI_API_KEY
ml-intern --model vertex_ai/gemini-3.1-pro "your prompt" # GCP Vertex AI (ADC + VERTEXAI_*)
ml-intern --model vertex_ai/gemini-3.5-flash "your prompt" # GCP Vertex AI
ml-intern --model gemini/gemini-3.5-flash "your prompt" # Google AI Studio (GEMINI_API_KEY)
ml-intern --model ollama/llama3.1:8b "your prompt"
ml-intern --model vllm/meta-llama/Llama-3.1-8B-Instruct "your prompt"
ml-intern --sandbox-tools "your prompt" # use HF Space sandbox tools
Expand Down Expand Up @@ -98,6 +105,27 @@ one shared local endpoint, or override a specific provider with its matching
`VLLM_API_KEY`. Provider-specific variables take precedence over the shared
local variables. Base URLs may include or omit `/v1`.

**GCP models (Vertex AI / Gemini):**

Google Cloud models route through LiteLLM, so no extra dependency is needed:

```text
/model vertex_ai/gemini-3.1-pro # Gemini 3.1 Pro on Vertex AI
/model vertex_ai/gemini-3.5-flash # Gemini 3.5 Flash on Vertex AI
/model vertex_ai/claude-opus-4-6 # Anthropic models via Vertex Model Garden
/model gemini/gemini-3.5-flash # Google AI Studio (simpler, key-only)
```

- `vertex_ai/<model>` — full Vertex AI. Set `VERTEXAI_PROJECT` and
`VERTEXAI_LOCATION`, and authenticate with Application Default Credentials:
either `GOOGLE_APPLICATION_CREDENTIALS` pointing at a service-account JSON,
or run `gcloud auth application-default login`. Covers Gemini and the
Anthropic models published in Vertex Model Garden.
- `gemini/<model>` — Google AI Studio. Only needs `GEMINI_API_KEY`.

Reasoning effort is not forwarded to these providers (each has its own
thinking/effort parameter shape), so calls run without an effort level.

**CLI tool runtime:**

By default, the CLI runs `bash`, `read`, `write`, and `edit` on your local
Expand Down
6 changes: 5 additions & 1 deletion agent/core/agent_loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -558,7 +558,11 @@ def _friendly_error_message(error: Exception) -> str | None:
"To fix this, set the API key for your model provider:\n"
" • Anthropic: export ANTHROPIC_API_KEY=sk-...\n"
" • OpenAI: export OPENAI_API_KEY=sk-...\n"
" • HF Router: export HF_TOKEN=hf_...\n\n"
" • HF Router: export HF_TOKEN=hf_...\n"
" • Gemini (AI Studio): export GEMINI_API_KEY=...\n"
" • Vertex AI: export VERTEXAI_PROJECT=... VERTEXAI_LOCATION=...\n"
" then: gcloud auth application-default login\n"
" (or export GOOGLE_APPLICATION_CREDENTIALS=/path/sa.json)\n\n"
"You can also add it to a .env file in the project root.\n"
"To switch models, use the /model command."
)
Expand Down
26 changes: 26 additions & 0 deletions agent/core/llm_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,15 @@ def _resolve_llm_params(
• ``openai/<model>`` — ``reasoning_effort`` forwarded as a top-level
kwarg (GPT-5 / o-series). LiteLLM uses the user's ``OPENAI_API_KEY``.

• ``bedrock/<model>`` — AWS Bedrock via the Converse adapter; creds
come from the standard ``AWS_*`` env vars.

• ``vertex_ai/<model>`` / ``gemini/<model>`` — GCP-hosted models.
``vertex_ai/`` covers Gemini and Anthropic Model Garden models and
reads ``VERTEXAI_PROJECT`` / ``VERTEXAI_LOCATION`` + Application
Default Credentials; ``gemini/`` is Google AI Studio and reads
``GEMINI_API_KEY``. ``reasoning_effort`` is not forwarded.

• ``ollama/<model>``, ``vllm/<model>``, ``lm_studio/<model>``, and
``llamacpp/<model>`` — local OpenAI-compatible endpoints. The id prefix
selects a configurable localhost base URL, and the model suffix is sent
Expand Down Expand Up @@ -231,6 +240,23 @@ def _resolve_llm_params(
# the same way, so we leave it off for now.
return {"model": model_name}

if model_name.startswith(("vertex_ai/", "gemini/")):
# GCP-hosted models via LiteLLM:
# ``vertex_ai/<model>`` — Vertex AI. Covers Gemini
# (``vertex_ai/gemini-3.1-pro``, ``vertex_ai/gemini-3.5-flash``)
# and Anthropic models served through Model Garden
# (``vertex_ai/claude-opus-4-...``).
# LiteLLM reads ``VERTEXAI_PROJECT`` / ``VERTEXAI_LOCATION`` from
# the env and authenticates via Application Default Credentials
# (``GOOGLE_APPLICATION_CREDENTIALS`` service-account JSON, or
# ``gcloud auth application-default login``).
# ``gemini/<model>`` — Google AI Studio. Simpler path: LiteLLM
# reads a single ``GEMINI_API_KEY`` from the env.
# As with ``bedrock/``, each provider has its own thinking/effort
# shape, so we don't forward ``reasoning_effort`` here — the probe
# cascade resolves effort to "off" and real calls run without it.
return {"model": model_name}

if model_name.startswith("openai/"):
params = {"model": model_name}
if reasoning_effort:
Expand Down
11 changes: 10 additions & 1 deletion agent/core/model_switcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@
"id": "bedrock/us.anthropic.claude-opus-4-6-v1",
"label": "Claude Opus 4.6 via Bedrock",
},
{"id": "vertex_ai/gemini-3.1-pro", "label": "Gemini 3.1 Pro via Vertex AI"},
{"id": "vertex_ai/gemini-3.5-flash", "label": "Gemini 3.5 Flash via Vertex AI"},
{"id": "MiniMaxAI/MiniMax-M2.7", "label": "MiniMax M2.7"},
{"id": "moonshotai/Kimi-K2.6", "label": "Kimi K2.6"},
{"id": "zai-org/GLM-5.1", "label": "GLM 5.1"},
Expand All @@ -50,7 +52,14 @@


_ROUTING_POLICIES = {"fastest", "cheapest", "preferred"}
_DIRECT_PREFIXES = ("anthropic/", "openai/", *LOCAL_MODEL_PREFIXES)
_DIRECT_PREFIXES = (
"anthropic/",
"openai/",
"bedrock/",
"vertex_ai/",
"gemini/",
*LOCAL_MODEL_PREFIXES,
)
_LOCAL_PROBE_TIMEOUT = 15.0


Expand Down
6 changes: 6 additions & 0 deletions agent/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -1618,6 +1618,12 @@ def cli():
warnings.filterwarnings("ignore", category=DeprecationWarning, module="litellm")
# Suppress whoosh invalid escape sequence warnings (third-party, unfixed upstream)
warnings.filterwarnings("ignore", category=SyntaxWarning, module="whoosh")
# Suppress the "Pydantic serializer warnings" UserWarning raised during LiteLLM serialization
warnings.filterwarnings(
"ignore",
message="Pydantic serializer warnings",
category=UserWarning,
)

parser = argparse.ArgumentParser(description="Hugging Face Agent CLI")
parser.add_argument(
Expand Down
37 changes: 37 additions & 0 deletions tests/unit/test_llm_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,43 @@ def test_openai_max_effort_is_still_rejected():
raise AssertionError("Expected UnsupportedEffortError for max effort")


def test_vertex_ai_gemini_id_passes_through_untouched():
    """A ``vertex_ai/`` Gemini id resolves to params holding only that id."""
    model_id = "vertex_ai/gemini-3.5-flash"

    assert _resolve_llm_params(model_id) == {"model": model_id}


def test_vertex_ai_anthropic_model_garden_id_passes_through():
    """A ``vertex_ai/`` Claude id resolves to params holding only that id."""
    model_id = "vertex_ai/claude-opus-4-6"

    assert _resolve_llm_params(model_id) == {"model": model_id}


def test_gemini_ai_studio_id_passes_through_untouched():
    """A ``gemini/`` id resolves to params holding only that id."""
    model_id = "gemini/gemini-3.5-flash"

    assert _resolve_llm_params(model_id) == {"model": model_id}


def test_vertex_ai_id_does_not_fall_through_to_hf_router():
    """Resolved ``vertex_ai/`` params carry no ``api_base`` and keep the id."""
    # Regression: before the vertex_ai/ branch existed, this id was mangled
    # into ``openai/vertex_ai/...`` and sent to the HF router.
    model_id = "vertex_ai/gemini-3.5-flash"
    resolved = _resolve_llm_params(model_id)

    assert "api_base" not in resolved
    assert resolved["model"] == model_id


def test_gemini_id_drops_reasoning_effort_in_non_strict_mode():
    """Non-strict resolution omits the requested effort for ``gemini/`` ids."""
    model_id = "gemini/gemini-3.5-flash"

    resolved = _resolve_llm_params(model_id, reasoning_effort="high", strict=False)

    assert resolved == {"model": model_id}


def test_resolve_ollama_params_adds_v1_and_uses_default_key(monkeypatch):
monkeypatch.delenv("OLLAMA_API_KEY", raising=False)
monkeypatch.setenv("OLLAMA_BASE_URL", "http://localhost:11434")
Expand Down
Loading