# Source: OpenMobile/executor.py (chaursia/OpenMobile, branch "main")
"""
executor.py — Vision Actor model for OpenMobile.

Uses moondream (~1.6B) to:
  1. Analyse the current screenshot
  2. Describe what is visible on screen
  3. Suggest the UI element to interact with and its approximate coordinates

The Planner reads this description before deciding its action.
The Executor also provides a coordinate fallback when Planner confidence is low.
"""
import json
import asyncio
import requests
from config import OLLAMA_URL, VISION_MODEL, log

# Instruction block sent to the vision model with every screenshot.
# ExecutorModel._build_vision_payload places it after a line stating the
# user's goal. It asks for a JSON-only reply with four keys
# (screen_description, ui_elements, suggested_element, suggested_coords),
# where suggested_coords is expressed as percentages of the screen size;
# ExecutorModel.resolve_coords converts those percentages to pixels.
# NOTE: this text is runtime data — changing it changes model behaviour.
VISION_DESCRIBE_PROMPT = """\
You are the Vision Actor for OpenMobile, an Android device agent.

Analyze the Android screenshot and respond ONLY with valid JSON:
{
  "screen_description": "Short description of what is on screen (apps visible, text, icons, current app etc.)",
  "ui_elements": ["list", "of", "visible", "interactive", "elements"],
  "suggested_element": "The element most likely needed for the goal",
  "suggested_coords": [x_percent, y_percent]
}

suggested_coords are percentages (0–100) of width and height.
If the screen is a home screen, describe the icons visible.
If the screen is inside an app, describe the key interactive elements.
Keep screen_description under 100 words.
Respond ONLY with the JSON, no other text.
"""


class ExecutorModel:
    """Vision actor that asks an Ollama-hosted model to describe a screenshot.

    Attributes:
        ollama_url: Base URL of the Ollama server; ``/api/generate`` is
            appended for each request.
        model: Name of the vision model placed in the request payload.
    """

    # Seconds to wait for the HTTP call to the vision model.
    _REQUEST_TIMEOUT_S = 90

    # Screen centre, in percent; used whenever the model supplies no usable
    # coordinates.
    _DEFAULT_COORDS = (50.0, 50.0)

    def __init__(self, ollama_url: str = OLLAMA_URL, model: str = VISION_MODEL):
        """Store the connection settings and log the chosen model."""
        self.ollama_url = ollama_url
        self.model = model
        log(f"Executor initialised →  model={model}", "INFO")

    def _build_vision_payload(self, goal: str, image_b64: str) -> dict:
        """Build the non-streaming ``/api/generate`` request body.

        The prompt is the user's goal followed by ``VISION_DESCRIBE_PROMPT``;
        the screenshot is attached as a single base64 image.
        """
        return {
            "model": self.model,
            "prompt": (
                f"The user's goal is: '{goal}'\n\n"
                f"{VISION_DESCRIBE_PROMPT}"
            ),
            "stream": False,
            "images": [image_b64],
        }

    @staticmethod
    def _parse_response(raw) -> dict:
        """Decode the first JSON object found in the model's text reply.

        Starting at the first ``{`` makes this tolerant of markdown fences,
        a leading "json" tag, or prose before/after the object.

        Raises:
            json.JSONDecodeError: if ``raw`` is not a string, contains no
                ``{``, or the text from the first ``{`` is not valid JSON.
        """
        if not isinstance(raw, str):
            raise json.JSONDecodeError("Response is not text", "", 0)
        start = raw.find("{")
        if start == -1:
            raise json.JSONDecodeError("No JSON object found in response", raw, 0)
        # raw_decode stops at the end of the first document, so trailing
        # fences or commentary after the object are ignored.
        data, _end = json.JSONDecoder().raw_decode(raw, start)
        return data

    @classmethod
    def _coerce_coords(cls, value) -> list:
        """Return ``[x, y]`` as floats, or the default centre if unusable."""
        if isinstance(value, list) and len(value) >= 2:
            try:
                return [float(value[0]), float(value[1])]
            except (TypeError, ValueError, OverflowError):
                pass
        return list(cls._DEFAULT_COORDS)

    @classmethod
    def _normalise(cls, data: dict) -> dict:
        """Replace missing, null or wrongly-typed fields with safe defaults."""
        if not isinstance(data.get("screen_description"), str):
            data["screen_description"] = "Unknown screen state."
        if not isinstance(data.get("ui_elements"), list):
            data["ui_elements"] = []
        if not isinstance(data.get("suggested_element"), str):
            data["suggested_element"] = ""
        data["suggested_coords"] = cls._coerce_coords(data.get("suggested_coords"))
        return data

    async def analyse(self, goal: str, image_b64: str) -> dict | None:
        """
        Sends the screenshot to the configured vision model and returns an
        ExecutorResponse dict:
          {
            "screen_description": str,
            "ui_elements": list,
            "suggested_element": str,
            "suggested_coords": [float, float]   # percentages, as reported
          }
        If the reply cannot be parsed as a JSON object, a minimal fallback
        dict with the same keys is returned so the planner can still proceed.
        Returns None on any other failure (e.g. network or HTTP error).
        """
        payload = self._build_vision_payload(goal, image_b64)
        url = f"{self.ollama_url.rstrip('/')}/api/generate"
        try:
            # requests is blocking; run it in a worker thread so the event
            # loop stays responsive while the model is thinking.
            resp = await asyncio.to_thread(
                requests.post,
                url,
                json=payload,
                timeout=self._REQUEST_TIMEOUT_S,
            )
            resp.raise_for_status()
            raw = resp.json().get("response", "{}")

            data = self._normalise(self._parse_response(raw))

            log(f"Vision → {data['screen_description'][:80]}…", "VISION")
            log(f"Vision suggested: {data['suggested_element']} @ {data['suggested_coords']}", "DEBUG")
            return data

        except json.JSONDecodeError as e:
            log(f"Executor JSON parse error: {e}", "WARN")
            # Return minimal fallback so the planner can still proceed
            return {
                "screen_description": "Unable to parse screen description.",
                "ui_elements": [],
                "suggested_element": "",
                "suggested_coords": list(self._DEFAULT_COORDS),
            }
        except Exception as e:
            # Boundary handler: network, HTTP and unexpected errors are
            # logged and reported to the caller as None.
            log(f"Executor LLM error: {e}", "ERROR")
            return None

    @staticmethod
    def _percent_to_pixel(percent: float, extent) -> int:
        """Map a percentage onto a pixel index within ``[0, extent - 1]``.

        Percentages outside 0–100 are clamped, and 100% maps to the last
        valid pixel rather than one past the edge.

        Raises:
            ValueError: if ``percent`` is NaN.
        """
        if percent != percent:  # NaN is the only value unequal to itself
            raise ValueError("coordinate is NaN")
        clamped = min(max(percent, 0.0), 100.0)
        last_index = max(int(extent) - 1, 0)
        return min(int(clamped / 100.0 * extent), last_index)

    def resolve_coords(self, suggested_coords: list, resolution: tuple) -> list[int]:
        """
        Converts [x%, y%] from vision model → absolute pixel [x, y].
        resolution = (width, height) in pixels.

        The result is always on screen: out-of-range percentages are clamped
        and each axis is limited to ``extent - 1``. If the coordinates cannot
        be interpreted, the centre of the screen is returned instead.
        """
        try:
            rx, ry = float(suggested_coords[0]), float(suggested_coords[1])
            return [
                self._percent_to_pixel(rx, resolution[0]),
                self._percent_to_pixel(ry, resolution[1]),
            ]
        except (TypeError, ValueError, IndexError, KeyError, OverflowError) as e:
            log(f"Coord resolution failed: {e}", "WARN")
            w, h = resolution
            return [w // 2, h // 2]