Skip to content

Commit 0d91fe6

Browse files
authored
Merge pull request #502 from trycua/lint/fix-qwen-lint
Run `uv run pre-commit run --all-files` on qwen.py.
2 parents bdb5f89 + 2142734 commit 0d91fe6

File tree

3 files changed: 65 additions (+65) and 30 deletions (-30).

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ With the Agent SDK, you can:
4848
| `openai/computer-use-preview` | `huggingface-local/HelloKKMe/GTA1-{7B,32B,72B}` | any VLM (using liteLLM, requires `tools` parameter) |
4949
| `openrouter/z-ai/glm-4.5v` | `huggingface-local/Hcompany/Holo1.5-{3B,7B,72B}` | any LLM (using liteLLM, requires `moondream3+` prefix) |
5050
| `gemini-2.5-computer-use-preview-10-2025` | any-all-in-one CUA | |
51-
| `openrouter/qwen/qwen3-vl-235b-a22b-instruct` | | |
51+
| `openrouter/qwen/qwen3-vl-235b-a22b-instruct` | | |
5252
| `huggingface-local/OpenGVLab/InternVL3_5-{1B,2B,4B,8B,...}` | | |
5353
| `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` | |
5454
| `moondream3+{ui planning}` (supports text-only models) | |

libs/python/agent/agent/loops/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@
1515
omniparser,
1616
openai,
1717
opencua,
18-
uitars,
1918
qwen,
19+
uitars,
2020
)
2121

2222
__all__ = [

libs/python/agent/agent/loops/qwen.py

Lines changed: 63 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -3,25 +3,25 @@
33
- Passes a ComputerUse tool schema to acompletion
44
- Converts between Responses items and completion messages using helpers
55
"""
6-
from __future__ import annotations
76

8-
from typing import Any, Dict, List, Optional, Tuple
7+
from __future__ import annotations
98

109
import json
1110
import re
11+
from typing import Any, Dict, List, Optional, Tuple
12+
1213
import litellm
1314
from litellm.responses.litellm_completion_transformation.transformation import (
1415
LiteLLMCompletionResponsesConfig,
1516
)
1617

1718
from ..decorators import register_agent
1819
from ..loops.base import AsyncAgentConfig
19-
from ..types import AgentCapability
2020
from ..responses import (
21-
convert_responses_items_to_completion_messages,
2221
convert_completion_messages_to_responses_items,
22+
convert_responses_items_to_completion_messages,
2323
)
24-
24+
from ..types import AgentCapability
2525

2626
# ComputerUse tool schema (OpenAI function tool format)
2727
QWEN3_COMPUTER_TOOL: Dict[str, Any] = {
@@ -96,18 +96,29 @@
9696
},
9797
}
9898

99+
99100
def _build_nous_system(functions: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
100101
"""Use qwen-agent NousFnCallPrompt to generate a system message embedding tool schema."""
101102
try:
102103
from qwen_agent.llm.fncall_prompts.nous_fncall_prompt import (
103-
NousFnCallPrompt,
104-
Message as NousMessage,
105104
ContentItem as NousContentItem,
106105
)
106+
from qwen_agent.llm.fncall_prompts.nous_fncall_prompt import (
107+
Message as NousMessage,
108+
)
109+
from qwen_agent.llm.fncall_prompts.nous_fncall_prompt import (
110+
NousFnCallPrompt,
111+
)
107112
except ImportError:
108-
raise ImportError("qwen-agent not installed. Please install it with `pip install cua-agent[qwen]`.")
113+
raise ImportError(
114+
"qwen-agent not installed. Please install it with `pip install cua-agent[qwen]`."
115+
)
109116
msgs = NousFnCallPrompt().preprocess_fncall_messages(
110-
messages=[NousMessage(role="system", content=[NousContentItem(text="You are a helpful assistant.")])],
117+
messages=[
118+
NousMessage(
119+
role="system", content=[NousContentItem(text="You are a helpful assistant.")]
120+
)
121+
],
111122
functions=functions,
112123
lang="en",
113124
)
@@ -116,6 +127,7 @@ def _build_nous_system(functions: List[Dict[str, Any]]) -> Optional[Dict[str, An
116127
content = [{"type": "text", "text": c["text"]} for c in sys.get("content", [])]
117128
return {"role": "system", "content": content}
118129

130+
119131
def _parse_tool_call_from_text(text: str) -> Optional[Dict[str, Any]]:
120132
"""Extract JSON object within <tool_call>...</tool_call> from model text."""
121133
m = re.search(r"<tool_call>\s*(\{[\s\S]*?\})\s*</tool_call>", text)
@@ -126,6 +138,7 @@ def _parse_tool_call_from_text(text: str) -> Optional[Dict[str, Any]]:
126138
except Exception:
127139
return None
128140

141+
129142
async def _unnormalize_coordinate(args: Dict[str, Any], dims: Tuple[int, int]) -> Dict[str, Any]:
130143
"""Coordinates appear in 0..1000 space, scale to actual screen size using dims if provided."""
131144
coord = args.get("coordinate")
@@ -262,7 +275,9 @@ def _has_any_image(msgs: List[Dict[str, Any]]) -> bool:
262275
pre_output_items: List[Dict[str, Any]] = []
263276
if not _has_any_image(completion_messages):
264277
if computer_handler is None or not hasattr(computer_handler, "screenshot"):
265-
raise RuntimeError("No screenshots present and computer_handler.screenshot is not available.")
278+
raise RuntimeError(
279+
"No screenshots present and computer_handler.screenshot is not available."
280+
)
266281
screenshot_b64 = await computer_handler.screenshot()
267282
if not screenshot_b64:
268283
raise RuntimeError("Failed to capture screenshot from computer_handler.")
@@ -271,7 +286,10 @@ def _has_any_image(msgs: List[Dict[str, Any]]) -> bool:
271286
{
272287
"role": "user",
273288
"content": [
274-
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{screenshot_b64}"}},
289+
{
290+
"type": "image_url",
291+
"image_url": {"url": f"data:image/png;base64,{screenshot_b64}"},
292+
},
275293
{"type": "text", "text": "Current screen"},
276294
],
277295
}
@@ -282,7 +300,10 @@ def _has_any_image(msgs: List[Dict[str, Any]]) -> bool:
282300
"type": "message",
283301
"role": "assistant",
284302
"content": [
285-
{"type": "text", "text": "Taking a screenshot to see the current computer screen."}
303+
{
304+
"type": "text",
305+
"text": "Taking a screenshot to see the current computer screen.",
306+
}
286307
],
287308
}
288309
)
@@ -294,26 +315,32 @@ def _has_any_image(msgs: List[Dict[str, Any]]) -> bool:
294315
MIN_PIXELS = 3136
295316
MAX_PIXELS = 12845056
296317
try:
297-
from qwen_vl_utils import smart_resize # type: ignore
318+
import base64
319+
import io
320+
298321
from PIL import Image # type: ignore
299-
import base64, io
322+
from qwen_vl_utils import smart_resize # type: ignore
300323
except Exception:
301-
raise ImportError("qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`.")
324+
raise ImportError(
325+
"qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`."
326+
)
302327

303328
for msg in completion_messages:
304329
content = msg.get("content")
305330
if not isinstance(content, list):
306331
continue
307332
for part in content:
308333
if isinstance(part, dict) and part.get("type") == "image_url":
309-
url = (((part.get("image_url") or {}).get("url")) or "")
334+
url = ((part.get("image_url") or {}).get("url")) or ""
310335
# Expect data URL like data:image/png;base64,<b64>
311336
if url.startswith("data:") and "," in url:
312337
b64 = url.split(",", 1)[1]
313338
img_bytes = base64.b64decode(b64)
314339
im = Image.open(io.BytesIO(img_bytes))
315340
h, w = im.height, im.width
316-
rh, rw = smart_resize(h, w, factor=32, min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS)
341+
rh, rw = smart_resize(
342+
h, w, factor=32, min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS
343+
)
317344
# Attach hints on this image block
318345
part["min_pixels"] = MIN_PIXELS
319346
part["max_pixels"] = MAX_PIXELS
@@ -349,7 +376,7 @@ def _has_any_image(msgs: List[Dict[str, Any]]) -> bool:
349376
# Parse tool call from text; then convert to responses items via fake tool_calls
350377
resp_dict = response.model_dump() # type: ignore
351378
choice = (resp_dict.get("choices") or [{}])[0]
352-
content_text = (((choice.get("message") or {}).get("content")) or "")
379+
content_text = ((choice.get("message") or {}).get("content")) or ""
353380
tool_call = _parse_tool_call_from_text(content_text)
354381

355382
output_items: List[Dict[str, Any]] = []
@@ -358,7 +385,9 @@ def _has_any_image(msgs: List[Dict[str, Any]]) -> bool:
358385
raw_args = tool_call.get("arguments") or {}
359386
# Unnormalize coordinates to actual screen size using last resized dims
360387
if last_rw is None or last_rh is None:
361-
raise RuntimeError("No screenshots found to derive dimensions for coordinate unnormalization.")
388+
raise RuntimeError(
389+
"No screenshots found to derive dimensions for coordinate unnormalization."
390+
)
362391
args = await _unnormalize_coordinate(raw_args, (last_rw, last_rh))
363392

364393
# Build an OpenAI-style tool call so we can reuse the converter
@@ -426,27 +455,29 @@ async def predict_click(
426455
max_pixels = 12845056
427456
try:
428457
# Lazy import to avoid hard dependency
429-
from qwen_vl_utils import smart_resize # type: ignore
458+
import base64
459+
import io
460+
430461
# If PIL is available, estimate size from image to derive smart bounds
431462
from PIL import Image
432-
import io, base64
463+
from qwen_vl_utils import smart_resize # type: ignore
433464

434465
img_bytes = base64.b64decode(image_b64)
435466
im = Image.open(io.BytesIO(img_bytes))
436467
h, w = im.height, im.width
437468
# Qwen notebook suggests factor=32 and a wide min/max range
438469
rh, rw = smart_resize(h, w, factor=32, min_pixels=min_pixels, max_pixels=max_pixels)
439470
except Exception:
440-
raise ImportError("qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`.")
471+
raise ImportError(
472+
"qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`."
473+
)
441474

442475
messages = []
443476
if nous_system:
444477
messages.append(nous_system)
445478
image_block: Dict[str, Any] = {
446-
"type": "image_url",
447-
"image_url": {
448-
"url": f"data:image/png;base64,{image_b64}"
449-
},
479+
"type": "image_url",
480+
"image_url": {"url": f"data:image/png;base64,{image_b64}"},
450481
"min_pixels": min_pixels,
451482
"max_pixels": max_pixels,
452483
}
@@ -461,11 +492,15 @@ async def predict_click(
461492
}
462493
)
463494

464-
api_kwargs: Dict[str, Any] = {"model": model, "messages": messages, **{k: v for k, v in kwargs.items()}}
495+
api_kwargs: Dict[str, Any] = {
496+
"model": model,
497+
"messages": messages,
498+
**{k: v for k, v in kwargs.items()},
499+
}
465500
response = await litellm.acompletion(**api_kwargs)
466501
resp = response.model_dump() # type: ignore
467502
choice = (resp.get("choices") or [{}])[0]
468-
content_text = (((choice.get("message") or {}).get("content")) or "")
503+
content_text = ((choice.get("message") or {}).get("content")) or ""
469504
tool_call = _parse_tool_call_from_text(content_text) or {}
470505
args = tool_call.get("arguments") or {}
471506
args = await _unnormalize_coordinate(args, (rh, rw))

0 commit comments

Comments
 (0)