AI45Lab · k4ngzy · May 28, 2026
diff --git a/env/osgym/.gitignore b/env/osgym/.gitignore
@@ -10,7 +10,9 @@ env_risk_utils/induced_texts/
 env_risk_utils/popup_logos/
 .DS_Store
 *.png
+__pycache__/
+*.py[cod]
 
 # Credentials (sensitive info)
 credentials.yaml
-client_secrets.json
+client_secrets.json
diff --git a/env/osgym/Dockerfile b/env/osgym/Dockerfile
@@ -47,7 +47,7 @@ ENV CONDA_DIR=/home/ray/anaconda3
 ENV PATH=$CONDA_DIR/bin:$PATH
 
 # 4. 配置 Python 环境
-WORKDIR /root/AIEvoBox
+WORKDIR /root/Safactory
 
 # 复制项目依赖
 COPY requirements.txt ./
@@ -62,11 +62,11 @@ RUN pip install --no-cache-dir \
         fastapi \
         uvicorn \
         pytesseract \
-        -r /root/AIEvoBox/requirements.txt \
-        -r /root/AIEvoBox/env/osgym/requirements.txt
+        -r /root/Safactory/requirements.txt \
+        -r /root/Safactory/env/osgym/requirements.txt
 
 # 5. 配置 Shell 环境
 RUN echo "export PATH=$CONDA_DIR/bin:\$PATH" >> /root/.bashrc && \
-    echo "cd /root/AIEvoBox" >> /root/.bashrc
+    echo "cd /root/Safactory" >> /root/.bashrc
 
 CMD ["/bin/bash"]
diff --git a/env/osgym/README.md b/env/osgym/README.md
@@ -32,16 +32,15 @@ Configure through `os_config.yaml` or constructor arguments:
 | `eval_mode` | Evaluation mode (`standard` / `safety`) | `standard` |
 | `provider_name` | Backend provider (`docker` / `containerd`) | `docker` |
 | `vm_path` | Explicit VM image path; supports paths relative to `env/osgym` | `None` |
-| `capture_observation_type` | Environment capture modality (`screenshot`, `a11y_tree`, `screenshot_a11y_tree`) | `screenshot_a11y_tree` |
 | `prompt_observation_type` | Prompt modality (`screenshot`, `a11y_tree`, `screenshot_a11y_tree`) | `screenshot` |
-| `prompt_format` | Prompt protocol format (`kimi`, `qwen`) | `kimi` |
+| `prompt_format` | Prompt protocol format (`kimi`, `qwen`) | `qwen` |
 | `action_space` | Action space | `pyautogui` |
 | `screen_width/height` | Screen resolution | `1920x1080` |
 | `max_steps` | Maximum allowed steps per task | `30` |
-| `message_cut` | Message history truncation for OOM protection, keeping the latest N dialogue turns | `-1` (no truncation) |
-| `result_dir` | Result directory; relative paths are resolved from `env/osgym` | `results` |
+| `repeated_click_distance_threshold` | Pixel distance below which consecutive click actions are treated as the same click | `10.0` |
+| `repeated_click_limit` | Consecutive repeated click count that truncates the task; set `0` to disable | `2` |
 
-When `capture_observation_type` and `prompt_observation_type` differ, the result directory uses `capture_<capture>__prompt_<prompt>` as the observation label to avoid mixing experiments.
+OSGym selects accessibility-tree capture automatically: it is enabled for a11y prompt modes and for safety `popup` / `induced_text` tasks.
 
 ## 4. Run Examples
 
@@ -79,9 +78,3 @@ python launcher.py \
   --llm-model model_name \
   --pool-size 2
 ```
-
-### Aggregate Results
-
-```bash
-python aggregate_results.py --result-dir /path/to/results
-```
diff --git a/env/osgym/README_CN.md b/env/osgym/README_CN.md
@@ -32,17 +32,15 @@ cd env/osgym && pip install -r requirements.txt
 | `eval_mode` | 评估模式 (`standard` / `safety`) | `standard` |
 | `provider_name` | 后端提供商 (`docker` / `containerd`) | `docker` |
 | `vm_path` | 显式指定 VM 镜像路径；支持相对 `env/osgym` 的路径 | `None` |
-| `capture_observation_type` | 环境采集模态 (`screenshot`, `a11y_tree`, `screenshot_a11y_tree`) | `screenshot_a11y_tree` |
 | `prompt_observation_type` | 提示词模态 (`screenshot`, `a11y_tree`, `screenshot_a11y_tree`) | `screenshot` |
-| `prompt_format` | 提示词协议格式 (`kimi`, `qwen`) | `kimi` |
+| `prompt_format` | 提示词协议格式 (`kimi`, `qwen`) | `qwen` |
 | `action_space` | 动作空间 | `pyautogui` |
 | `screen_width/height` | 屏幕分辨率 | `1920x1080` |
 | `max_steps` | 每个任务的最大允许步数 | `30` |
-| `message_cut` | 消息历史裁剪 (OOM 保护)，保留最近 N 轮对话 | `-1` (不裁剪) |
-| `result_dir` | 结果目录（支持相对路径，相对于 `env/osgym` 目录） | `results` |
+| `repeated_click_distance_threshold` | 连续 click 坐标距离小于该像素阈值时视为同一点击 | `10.0` |
+| `repeated_click_limit` | 连续重复 click 达到该次数后截断任务；设为 `0` 可关闭 | `2` |
 
-当 `capture_observation_type` 和 `prompt_observation_type` 不同时，结果目录会使用
-`capture_<capture>__prompt_<prompt>` 作为 observation 标签，避免不同实验混淆。
+OSGym 自动决定是否采集 accessibility tree：a11y 提示词模式会开启，safety 的 `popup` / `induced_text` 任务也会开启。
 
 ## 4. 运行示例
 
@@ -80,9 +78,3 @@ python launcher.py \
   --llm-model model_name \
   --pool-size 2
 ```
-
-### 统计结果：
-
-```bash
-python aggregate_results.py --result-dir /path/to/results
-```
diff --git a/env/osgym/core/__init__.py b/env/osgym/core/__init__.py
@@ -5,14 +5,14 @@
 for better code organization and maintainability.
 """
 
-from .action_parser import ActionParser
+from .action_flow import ActionFlow
 from .observation_processor import ObservationProcessor
-from .result_persistence import ResultPersistence
-from .prompt_builder import PromptBuilder
+from .prompt_session import PromptSession
+from .repeated_action_detector import RepeatedActionDetector
 
 __all__ = [
-    "ActionParser",
+    "ActionFlow",
     "ObservationProcessor",
-    "ResultPersistence",
-    "PromptBuilder",
+    "PromptSession",
+    "RepeatedActionDetector",
 ]
diff --git a/env/osgym/core/action_flow.py b/env/osgym/core/action_flow.py
@@ -0,0 +1,231 @@
+"""Action execution flow for OSGym."""
+
+import logging
+import time
+from typing import Any, Dict, List, Optional, Tuple
+
+logger = logging.getLogger("osgym")
+
+
+class ActionFlow:
+    """Drive a single agent step for an OSGym environment.
+
+    The flow keeps only a reference to the owning environment and reads its
+    collaborators (``model_protocol``, ``prompt_session``,
+    ``repeated_action_detector``, ``evaluator``, ``env``) on each call.
+    """
+
+    def __init__(self, osgym):
+        """Store the owning environment.
+
+        Args:
+            osgym: Environment object whose attributes and ``_finish_task`` /
+                ``_step_output`` helpers this flow uses.
+        """
+        self.osgym = osgym
+
+    def step(self, action: str):
+        """Parse the agent response, execute it, and return the step result.
+
+        Args:
+            action: Raw agent response text.
+
+        Returns:
+            The value produced by the environment's ``_finish_task`` when the
+            task ends on this step, otherwise by ``_step_output``.
+
+        Raises:
+            RuntimeError: If the task has already finished.
+        """
+        env = self.osgym
+        if env.task_finished:
+            raise RuntimeError("Task already finished. Cannot step on a finished task.")
+
+        env.current_step_in_task += 1
+
+        parsed_actions, special_cmd = self._parse_agent_action(action)
+        env.prompt_session.add_assistant_message(action, parsed_actions)
+        logger.debug(
+            "Step %s/%s, Parsed Actions: %s",
+            env.current_step_in_task,
+            env.max_steps,
+            parsed_actions,
+        )
+
+        if special_cmd == "WAIT":
+            return self._handle_wait_signal()
+
+        # A bare DONE/FAIL has no actions to run, so finish right away.
+        if special_cmd in {"DONE", "FAIL"} and not parsed_actions:
+            return self._finish_after_terminal_signal(special_cmd, executed_actions=[])
+
+        # A repeated action is not executed; the task is truncated instead.
+        repeat_result = env.repeated_action_detector.check(parsed_actions)
+        if repeat_result.repeated:
+            info = {
+                "executed_actions": [],
+                "truncated_reason": repeat_result.reason,
+                "repeated_action": repeat_result.action,
+                "repeat_count": repeat_result.repeat_count,
+            }
+            logger.info(
+                "Detected repeated action %s for %s consecutive steps; truncating task.",
+                repeat_result.action,
+                repeat_result.repeat_count,
+            )
+            return env._finish_task(
+                info=info,
+                agent_signal="TRUNCATED",
+                skip_evaluation=False,
+            )
+
+        if env.eval_mode == "safety":
+            env.evaluator.capture_pre_action_state()
+
+        executed_actions, reward, done, info = self._execute_actions(parsed_actions)
+        self._record_trajectory(executed_actions)
+        info = self._attach_step_risk(info, executed_actions)
+
+        info.setdefault("executed_actions", executed_actions)
+
+        if done:
+            return env._finish_task(info=info)
+
+        # DONE/FAIL that came with actions: finish only after they have run.
+        if special_cmd in {"DONE", "FAIL"}:
+            return self._finish_after_terminal_signal(special_cmd, executed_actions, info)
+
+        if env.current_step_in_task >= env.max_steps:
+            logger.info("Reached max steps (%s), truncating task.", env.max_steps)
+            info["truncated_reason"] = "max_steps_reached"
+            return env._finish_task(
+                info=info,
+                agent_signal="TRUNCATED",
+                skip_evaluation=False,
+            )
+
+        return env._step_output(reward=reward, info=info)
+
+    def _parse_agent_action(self, action: str) -> Tuple[List[str], Optional[str]]:
+        """Split the raw response into actions and an optional special command.
+
+        When the protocol parser returns nothing, the response is treated as
+        ``["WAIT"]`` before special commands are stripped.
+        """
+        env = self.osgym
+        parsed_actions = env.model_protocol.parse_actions(action) or ["WAIT"]
+        return env.model_protocol.strip_special_command(parsed_actions)
+
+    def _execute_actions(self, actions: List[str]) -> Tuple[List[Any], float, bool, Dict[str, Any]]:
+        """Run actions in order on the backend, stopping once one reports done.
+
+        Returns:
+            ``(executed_actions, reward, done, info)`` where the last three come
+            from the final executed action (defaults when ``actions`` is empty).
+        """
+        env = self.osgym
+        executed_actions: List[Any] = []
+        info: Dict[str, Any] = {}
+        reward = 0.0
+        done = False
+
+        for act in actions:
+            _, reward, done, info = env.env.step(
+                act,
+                pause=env.sleep_after_execution,
+                return_observation=False,
+            )
+            executed_actions.append(act)
+            env._mark_observation_stale()
+            if done:
+                break
+
+        # Normalize a falsy info (e.g. None) to an empty dict.
+        return executed_actions, reward, done, info or {}
+
+    def _finish_after_terminal_signal(
+        self,
+        signal: str,
+        executed_actions: List[Any],
+        info: Optional[Dict[str, Any]] = None,
+    ):
+        """Forward DONE/FAIL to the backend, record it, and finish the task.
+
+        Args:
+            signal: Terminal command issued by the agent.
+            executed_actions: Actions already executed in this step.
+            info: Step info to carry into the result; copied, not mutated.
+        """
+        env = self.osgym
+        env.env.step(signal, pause=env.sleep_after_execution, return_observation=False)
+        env._mark_observation_stale()
+        self._record_trajectory([signal])
+
+        finish_info = dict(info) if info else {}
+        finish_info.setdefault("executed_actions", executed_actions or [signal])
+        # In safety mode an agent-declared FAIL pins the task score to 0.0.
+        task_score_override = 0.0 if signal == "FAIL" and env.eval_mode == "safety" else None
+        return env._finish_task(
+            info=finish_info,
+            agent_signal=signal,
+            task_score_override=task_score_override,
+        )
+
+    def _attach_step_risk(
+        self,
+        info: Optional[Dict[str, Any]],
+        executed_actions: List[Any],
+    ) -> Dict[str, Any]:
+        """Return a copy of ``info`` with the step risk result added.
+
+        Outside safety mode the copy is returned unchanged. A non-``None``
+        risk result is also appended to ``env.risk_results``.
+        """
+        env = self.osgym
+        info = dict(info) if info else {}
+        if env.eval_mode != "safety":
+            return info
+
+        risk_result = env.evaluator.evaluate_step_risk(
+            executed_actions,
+            env.task_id,
+            env.obs_processor.get_attack_params(),
+            risk_evaluator=env.task_config.get("risk_evaluator"),
+        )
+        if risk_result is not None:
+            info["risk_result"] = risk_result
+            env.risk_results.append(risk_result)
+        return info
+
+    def _record_trajectory(self, actions: List[Any]) -> None:
+        """Append each action, as a string, to the evaluator trajectory."""
+        for action in actions:
+            self.osgym.evaluator.add_trajectory_step(str(action))
+
+    def _handle_wait_signal(self):
+        """Handle WAIT: record it, pause, refresh the observation, continue.
+
+        The task is truncated instead when the step budget is already spent.
+        """
+        env = self.osgym
+        self._record_trajectory(["WAIT"])
+        if env.current_step_in_task >= env.max_steps:
+            logger.info("Reached max steps (%s) during WAIT, truncating task.", env.max_steps)
+            return env._finish_task(
+                info={"executed_actions": [], "truncated_reason": "max_steps_reached"},
+                agent_signal="TRUNCATED",
+                skip_evaluation=False,
+            )
+
+        # Use one second when the configured post-action pause is not positive.
+        wait_time = env.sleep_after_execution if env.sleep_after_execution > 0 else 1.0
+        logger.debug("WAIT signal received, waiting %ss...", wait_time)
+        time.sleep(wait_time)
+
+        # Best effort: a failed refresh is logged and the step still returns.
+        try:
+            env._refresh_observation()
+        except Exception as e:
+            logger.warning("Failed to get new observation after WAIT: %s", e)
+
+        return env._step_output(info={"agent_signal": "WAIT", "executed_actions": []})