"""Multimodal vision agent example.
This example demonstrates how to send an image URL to a vision-capable LLM
using the ECS-based agent framework, with full prompt normalization via
SystemPromptRenderSystem and UserPromptNormalizationSystem.
Dual-mode:
- Without LLM_API_KEY: Uses FakeProvider with a mock image description.
- With LLM_API_KEY: Uses OpenAIProvider with Chat Completions API.
Environment variables:
LLM_API_KEY — API key (required for real LLM mode)
LLM_BASE_URL — API base URL (default: https://dashscope.aliyuncs.com/compatible-mode/v1)
LLM_MODEL — Model name (default: qwen3-vl-flash)
IMAGE_URL — Image URL to analyze (default: dog and girl demo image)
"""
import asyncio
import os
from ecs_agent.components import (
ConversationComponent,
LLMComponent,
UserPromptConfigComponent,
)
from ecs_agent.core import Runner, World
from ecs_agent.logging import configure_logging
from ecs_agent.providers import FakeProvider, OpenAIProvider
from ecs_agent.providers.config import ApiFormat, ProviderConfig
from ecs_agent.providers.protocol import LLMProvider
from ecs_agent.prompts.contracts import PromptTemplateSource, SystemPromptConfigSpec
from ecs_agent.systems.error_handling import ErrorHandlingSystem
from ecs_agent.systems.memory import MemorySystem
from ecs_agent.systems.reasoning import ReasoningSystem
from ecs_agent.systems.system_prompt_render_system import SystemPromptRenderSystem
from ecs_agent.systems.user_prompt_normalization_system import (
UserPromptNormalizationSystem,
)
from ecs_agent.types import CompletionResult, ImageUrlPart, Message
def _build_provider(api_key: str, base_url: str, model: str) -> LLMProvider:
    """Select the LLM provider based on whether an API key is available.

    With an API key, returns an OpenAI-compatible Chat Completions provider
    pointed at ``base_url``; without one, returns a FakeProvider with a
    canned image description so the example runs offline.
    """
    if api_key:
        print(f"Using OpenAIProvider (Chat Completions) with model: {model}")
        return OpenAIProvider(
            config=ProviderConfig(
                provider_id="aliyun",
                base_url=base_url,
                api_key=api_key,
                api_format=ApiFormat.OPENAI_CHAT_COMPLETIONS,
            ),
            model=model,
        )
    print("No LLM_API_KEY set. Using FakeProvider for demonstration.")
    return FakeProvider(
        responses=[
            CompletionResult(
                message=Message(
                    role="assistant",
                    content=(
                        "I see a girl playing with a dog outdoors. "
                        "The dog appears happy and the scene is cheerful."
                    ),
                )
            )
        ]
    )


async def main() -> None:
    """Run a multimodal vision agent example.

    Reads configuration from environment variables, selects a provider,
    builds an agent entity with prompt components and a conversation
    containing a multimodal (text + image URL) user message, registers
    the processing systems, runs the world for up to 3 ticks, and prints
    the assistant's reply.
    """
    configure_logging(json_output=False)
    world = World()

    # Configuration from environment (with demo-friendly defaults).
    api_key: str = os.environ.get("LLM_API_KEY", "")
    base_url: str = os.environ.get(
        "LLM_BASE_URL", "https://dashscope.aliyuncs.com/compatible-mode/v1"
    )
    model: str = os.environ.get("LLM_MODEL", "qwen3-vl-flash")
    image_url: str = os.environ.get(
        "IMAGE_URL",
        "https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg",
    )

    provider: LLMProvider = _build_provider(api_key, base_url, model)
    print(f"Analyzing image URL: {image_url}")

    # Assemble the agent entity: LLM config, prompt specs, and the initial
    # multimodal user message.
    agent_id = world.create_entity()
    world.add_component(
        agent_id,
        LLMComponent(
            provider=provider,
            model=model if api_key else "fake",
            system_prompt="",  # rendered later by SystemPromptRenderSystem
        ),
    )
    world.add_component(
        agent_id,
        SystemPromptConfigSpec(
            template_source=PromptTemplateSource(
                inline="You are a helpful vision assistant."
            ),
            placeholders=[],
        ),
    )
    world.add_component(
        agent_id,
        UserPromptConfigComponent(),
    )
    world.add_component(
        agent_id,
        ConversationComponent(
            messages=[
                Message(
                    role="user",
                    content="Describe this image in detail.",
                    parts=[
                        ImageUrlPart(url=image_url),
                    ],
                )
            ]
        ),
    )

    # Systems run in ascending priority: render the system prompt, normalize
    # the user prompt, reason with the LLM, persist memory, handle errors.
    world.register_system(SystemPromptRenderSystem(priority=-20), priority=-20)
    world.register_system(UserPromptNormalizationSystem(priority=-10), priority=-10)
    world.register_system(ReasoningSystem(priority=0), priority=0)
    world.register_system(MemorySystem(), priority=10)
    world.register_system(ErrorHandlingSystem(priority=99), priority=99)

    runner = Runner()
    await runner.run(world, max_ticks=3)

    # The last message in the conversation is the assistant's reply, if any.
    conv = world.get_component(agent_id, ConversationComponent)
    if conv is not None and conv.messages:
        print(f"Assistant response: {conv.messages[-1].content}")
    else:
        print("No conversation found")
# Script entry point: run the async example to completion on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())