Skip to content

Commit a126ffa

Browse files
authored
Merge pull request #549 from tamoghnokandar/main
[AGENT] - New model UI-Ins added
2 parents 0fb61ba + e109b42 commit a126ffa

File tree

2 files changed

+177
-0
lines changed

2 files changed

+177
-0
lines changed

libs/python/agent/agent/loops/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
openai,
1818
opencua,
1919
qwen,
20+
uiins,
2021
uitars,
2122
)
2223

@@ -34,5 +35,6 @@
3435
"moondream3",
3536
"gemini",
3637
"qwen",
38+
"uiins",
3739
"gelato",
3840
]
Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
"""
2+
UI-Ins agent loop implementation for click prediction using litellm.acompletion
3+
Paper: https://arxiv.org/pdf/2510.20286
4+
Code: https://github.com/alibaba/UI-Ins
5+
"""
6+
7+
import asyncio
8+
import base64
9+
import json
10+
import math
11+
import re
12+
import uuid
13+
from io import BytesIO
14+
from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
15+
16+
import litellm
17+
from PIL import Image
18+
19+
from ..decorators import register_agent
20+
from ..loops.base import AsyncAgentConfig
21+
from ..types import AgentCapability, AgentResponse, Messages, Tools
22+
23+
SYSTEM_PROMPT = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.\n\n## Output Format\nReturn a json object with a reasoning process in tags, a function name and arguments within XML tags:\n```\n\n...\n\n\n{"name": "grounding", "arguments": }\n\n```\n represents the following item of the action space:\n## Action Space{"action": "click", "coordinate": [x, y]}\nYour task is to accurately locate a UI element based on the instruction. You should first analyze instruction in tags and finally output the function in tags.\n"""
24+
25+
26+
def parse_coordinates(raw_string: str) -> tuple[int, int]:
    """Extract the first ``[x, y]`` integer pair from a model response.

    Args:
        raw_string: Raw text emitted by the model, expected to contain a
            coordinate formatted like ``[123, 456]``.

    Returns:
        The first ``(x, y)`` pair found, or ``(-1, -1)`` when no pair is present.
    """
    match = re.search(r"\[(\d+),\s*(\d+)\]", raw_string)
    if match is None:
        return -1, -1
    return int(match.group(1)), int(match.group(2))
31+
32+
33+
def smart_resize(
    height: int,
    width: int,
    factor: int = 28,
    min_pixels: int = 3136,
    max_pixels: int = 8847360,
) -> Tuple[int, int]:
    """Smart resize function similar to qwen_vl_utils.

    Scales ``(height, width)`` so the total pixel count lies within
    ``[min_pixels, max_pixels]``, then floors each dimension to a multiple of
    ``factor`` (the vision patch size), never returning a dimension below
    ``factor``.

    Args:
        height: Original image height in pixels.
        width: Original image width in pixels.
        factor: Dimension granularity; each returned side is a multiple of it.
        min_pixels: Lower bound on total pixels before scaling up.
        max_pixels: Upper bound on total pixels before scaling down.

    Returns:
        Tuple ``(new_height, new_width)``.
    """
    total_pixels = height * width

    if min_pixels <= total_pixels <= max_pixels:
        # Already within bounds: only snap down to the nearest factor multiple.
        new_height = (height // factor) * factor
        new_width = (width // factor) * factor
    else:
        # Scale uniformly toward the violated bound, then snap to the factor.
        if total_pixels > max_pixels:
            scale = (max_pixels / total_pixels) ** 0.5
        else:
            scale = (min_pixels / total_pixels) ** 0.5
        new_height = (int(height * scale) // factor) * factor
        new_width = (int(width * scale) // factor) * factor

    # Bug fix: clamp on BOTH paths. The original skipped this clamp on the
    # within-bounds branch, so a thin image (e.g. 20x200) produced a zero
    # dimension, which would crash the subsequent Image.resize call.
    new_height = max(new_height, factor)
    new_width = max(new_width, factor)

    return new_height, new_width
70+
71+
72+
@register_agent(models=r".*UI-Ins.*")
class UIInsConfig(AsyncAgentConfig):
    """UI-Ins agent configuration implementing AsyncAgentConfig protocol for click prediction."""

    def __init__(self):
        # Name of the model most recently used; not read within this class.
        self.current_model = None
        # Base64 of the last screenshot processed; not read within this class.
        self.last_screenshot_b64 = None

    async def predict_step(
        self,
        messages: List[Dict[str, Any]],
        model: str,
        tools: Optional[List[Dict[str, Any]]] = None,
        max_retries: Optional[int] = None,
        stream: bool = False,
        computer_handler=None,
        _on_api_start=None,
        _on_api_end=None,
        _on_usage=None,
        _on_screenshot=None,
        **kwargs,
    ) -> Dict[str, Any]:
        """Full agent steps are not supported; UI-Ins is a grounding-only model."""
        raise NotImplementedError()

    async def predict_click(
        self, model: str, image_b64: str, instruction: str, **kwargs
    ) -> Optional[Tuple[float, float]]:
        """
        Predict click coordinates using UI-Ins model via litellm.acompletion.

        Args:
            model: The UI-Ins model name
            image_b64: Base64 encoded image
            instruction: Instruction for where to click

        Returns:
            Tuple of (x, y) coordinates in the ORIGINAL image space, or None
            if the model produced no parsable "[x, y]" coordinate.
        """
        # Decode the base64 screenshot to measure its native resolution.
        image_data = base64.b64decode(image_b64)
        image = Image.open(BytesIO(image_data))
        width, height = image.width, image.height

        # Smart resize the image (similar to qwen_vl_utils) so it fits the
        # model's expected pixel budget.
        resized_height, resized_width = smart_resize(
            height,
            width,
            factor=28,  # Default factor for Qwen models
            min_pixels=3136,
            max_pixels=4096 * 2160,
        )
        resized_image = image.resize((resized_width, resized_height))
        # Factors to map model-space coordinates back onto the original image.
        scale_x, scale_y = width / resized_width, height / resized_height

        # Convert resized image back to base64 for the API payload.
        buffered = BytesIO()
        resized_image.save(buffered, format="PNG")
        resized_image_b64 = base64.b64encode(buffered.getvalue()).decode()

        # System prompt carries the grounding output format contract.
        system_message = {
            "role": "system",
            "content": [
                {"type": "text", "text": "You are a helpful assistant."},
                {"type": "text", "text": SYSTEM_PROMPT},
            ],
        }

        user_message = {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{resized_image_b64}"},
                },
                {"type": "text", "text": instruction},
            ],
        }

        api_kwargs = {
            "model": model,
            "messages": [system_message, user_message],
            "max_tokens": 2056,
            "temperature": 0.0,
            **kwargs,
        }

        # Use liteLLM acompletion
        response = await litellm.acompletion(**api_kwargs)

        output_text = response.choices[0].message.content  # type: ignore
        if not output_text:
            # Bug fix: empty/None content previously crashed parse_coordinates.
            return None

        pred_x, pred_y = parse_coordinates(output_text)
        if (pred_x, pred_y) == (-1, -1):
            # Bug fix: the (-1, -1) parse-failure sentinel was previously
            # scaled and returned as bogus negative coordinates instead of
            # the documented None.
            return None

        # Rescale from the resized (model) space back to original pixels.
        return (math.floor(pred_x * scale_x), math.floor(pred_y * scale_y))

    def get_capabilities(self) -> List[AgentCapability]:
        """Return the capabilities supported by this agent."""
        return ["click"]

0 commit comments

Comments
 (0)