33- Passes a ComputerUse tool schema to acompletion
44- Converts between Responses items and completion messages using helpers
55"""
6- from __future__ import annotations
76
8- from typing import Any , Dict , List , Optional , Tuple
7+ from __future__ import annotations
98
109import json
1110import re
11+ from typing import Any , Dict , List , Optional , Tuple
12+
1213import litellm
1314from litellm .responses .litellm_completion_transformation .transformation import (
1415 LiteLLMCompletionResponsesConfig ,
1516)
1617
1718from ..decorators import register_agent
1819from ..loops .base import AsyncAgentConfig
19- from ..types import AgentCapability
2020from ..responses import (
21- convert_responses_items_to_completion_messages ,
2221 convert_completion_messages_to_responses_items ,
22+ convert_responses_items_to_completion_messages ,
2323)
24-
24+ from .. types import AgentCapability
2525
2626# ComputerUse tool schema (OpenAI function tool format)
2727QWEN3_COMPUTER_TOOL : Dict [str , Any ] = {
9696 },
9797}
9898
99+
99100def _build_nous_system (functions : List [Dict [str , Any ]]) -> Optional [Dict [str , Any ]]:
100101 """Use qwen-agent NousFnCallPrompt to generate a system message embedding tool schema."""
101102 try :
102103 from qwen_agent .llm .fncall_prompts .nous_fncall_prompt import (
103- NousFnCallPrompt ,
104- Message as NousMessage ,
105104 ContentItem as NousContentItem ,
106105 )
106+ from qwen_agent .llm .fncall_prompts .nous_fncall_prompt import (
107+ Message as NousMessage ,
108+ )
109+ from qwen_agent .llm .fncall_prompts .nous_fncall_prompt import (
110+ NousFnCallPrompt ,
111+ )
107112 except ImportError :
108- raise ImportError ("qwen-agent not installed. Please install it with `pip install cua-agent[qwen]`." )
113+ raise ImportError (
114+ "qwen-agent not installed. Please install it with `pip install cua-agent[qwen]`."
115+ )
109116 msgs = NousFnCallPrompt ().preprocess_fncall_messages (
110- messages = [NousMessage (role = "system" , content = [NousContentItem (text = "You are a helpful assistant." )])],
117+ messages = [
118+ NousMessage (
119+ role = "system" , content = [NousContentItem (text = "You are a helpful assistant." )]
120+ )
121+ ],
111122 functions = functions ,
112123 lang = "en" ,
113124 )
@@ -116,6 +127,7 @@ def _build_nous_system(functions: List[Dict[str, Any]]) -> Optional[Dict[str, An
116127 content = [{"type" : "text" , "text" : c ["text" ]} for c in sys .get ("content" , [])]
117128 return {"role" : "system" , "content" : content }
118129
130+
119131def _parse_tool_call_from_text (text : str ) -> Optional [Dict [str , Any ]]:
120132 """Extract JSON object within <tool_call>...</tool_call> from model text."""
121133 m = re .search (r"<tool_call>\s*(\{[\s\S]*?\})\s*</tool_call>" , text )
@@ -126,6 +138,7 @@ def _parse_tool_call_from_text(text: str) -> Optional[Dict[str, Any]]:
126138 except Exception :
127139 return None
128140
141+
129142async def _unnormalize_coordinate (args : Dict [str , Any ], dims : Tuple [int , int ]) -> Dict [str , Any ]:
130143 """Coordinates appear in 0..1000 space, scale to actual screen size using dims if provided."""
131144 coord = args .get ("coordinate" )
@@ -262,7 +275,9 @@ def _has_any_image(msgs: List[Dict[str, Any]]) -> bool:
262275 pre_output_items : List [Dict [str , Any ]] = []
263276 if not _has_any_image (completion_messages ):
264277 if computer_handler is None or not hasattr (computer_handler , "screenshot" ):
265- raise RuntimeError ("No screenshots present and computer_handler.screenshot is not available." )
278+ raise RuntimeError (
279+ "No screenshots present and computer_handler.screenshot is not available."
280+ )
266281 screenshot_b64 = await computer_handler .screenshot ()
267282 if not screenshot_b64 :
268283 raise RuntimeError ("Failed to capture screenshot from computer_handler." )
@@ -271,7 +286,10 @@ def _has_any_image(msgs: List[Dict[str, Any]]) -> bool:
271286 {
272287 "role" : "user" ,
273288 "content" : [
274- {"type" : "image_url" , "image_url" : {"url" : f"data:image/png;base64,{ screenshot_b64 } " }},
289+ {
290+ "type" : "image_url" ,
291+ "image_url" : {"url" : f"data:image/png;base64,{ screenshot_b64 } " },
292+ },
275293 {"type" : "text" , "text" : "Current screen" },
276294 ],
277295 }
@@ -282,7 +300,10 @@ def _has_any_image(msgs: List[Dict[str, Any]]) -> bool:
282300 "type" : "message" ,
283301 "role" : "assistant" ,
284302 "content" : [
285- {"type" : "text" , "text" : "Taking a screenshot to see the current computer screen." }
303+ {
304+ "type" : "text" ,
305+ "text" : "Taking a screenshot to see the current computer screen." ,
306+ }
286307 ],
287308 }
288309 )
@@ -294,26 +315,32 @@ def _has_any_image(msgs: List[Dict[str, Any]]) -> bool:
294315 MIN_PIXELS = 3136
295316 MAX_PIXELS = 12845056
296317 try :
297- from qwen_vl_utils import smart_resize # type: ignore
318+ import base64
319+ import io
320+
298321 from PIL import Image # type: ignore
299- import base64 , io
322+ from qwen_vl_utils import smart_resize # type: ignore
300323 except Exception :
301- raise ImportError ("qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`." )
324+ raise ImportError (
325+ "qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`."
326+ )
302327
303328 for msg in completion_messages :
304329 content = msg .get ("content" )
305330 if not isinstance (content , list ):
306331 continue
307332 for part in content :
308333 if isinstance (part , dict ) and part .get ("type" ) == "image_url" :
309- url = ((( part .get ("image_url" ) or {}).get ("url" )) or "" )
334+ url = ((part .get ("image_url" ) or {}).get ("url" )) or ""
310335 # Expect data URL like data:image/png;base64,<b64>
311336 if url .startswith ("data:" ) and "," in url :
312337 b64 = url .split ("," , 1 )[1 ]
313338 img_bytes = base64 .b64decode (b64 )
314339 im = Image .open (io .BytesIO (img_bytes ))
315340 h , w = im .height , im .width
316- rh , rw = smart_resize (h , w , factor = 32 , min_pixels = MIN_PIXELS , max_pixels = MAX_PIXELS )
341+ rh , rw = smart_resize (
342+ h , w , factor = 32 , min_pixels = MIN_PIXELS , max_pixels = MAX_PIXELS
343+ )
317344 # Attach hints on this image block
318345 part ["min_pixels" ] = MIN_PIXELS
319346 part ["max_pixels" ] = MAX_PIXELS
@@ -349,7 +376,7 @@ def _has_any_image(msgs: List[Dict[str, Any]]) -> bool:
349376 # Parse tool call from text; then convert to responses items via fake tool_calls
350377 resp_dict = response .model_dump () # type: ignore
351378 choice = (resp_dict .get ("choices" ) or [{}])[0 ]
352- content_text = ((( choice .get ("message" ) or {}).get ("content" )) or "" )
379+ content_text = ((choice .get ("message" ) or {}).get ("content" )) or ""
353380 tool_call = _parse_tool_call_from_text (content_text )
354381
355382 output_items : List [Dict [str , Any ]] = []
@@ -358,7 +385,9 @@ def _has_any_image(msgs: List[Dict[str, Any]]) -> bool:
358385 raw_args = tool_call .get ("arguments" ) or {}
359386 # Unnormalize coordinates to actual screen size using last resized dims
360387 if last_rw is None or last_rh is None :
361- raise RuntimeError ("No screenshots found to derive dimensions for coordinate unnormalization." )
388+ raise RuntimeError (
389+ "No screenshots found to derive dimensions for coordinate unnormalization."
390+ )
362391 args = await _unnormalize_coordinate (raw_args , (last_rw , last_rh ))
363392
364393 # Build an OpenAI-style tool call so we can reuse the converter
@@ -426,27 +455,29 @@ async def predict_click(
426455 max_pixels = 12845056
427456 try :
428457 # Lazy import to avoid hard dependency
429- from qwen_vl_utils import smart_resize # type: ignore
458+ import base64
459+ import io
460+
430461 # If PIL is available, estimate size from image to derive smart bounds
431462 from PIL import Image
432- import io , base64
463+ from qwen_vl_utils import smart_resize # type: ignore
433464
434465 img_bytes = base64 .b64decode (image_b64 )
435466 im = Image .open (io .BytesIO (img_bytes ))
436467 h , w = im .height , im .width
437468 # Qwen notebook suggests factor=32 and a wide min/max range
438469 rh , rw = smart_resize (h , w , factor = 32 , min_pixels = min_pixels , max_pixels = max_pixels )
439470 except Exception :
440- raise ImportError ("qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`." )
471+ raise ImportError (
472+ "qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`."
473+ )
441474
442475 messages = []
443476 if nous_system :
444477 messages .append (nous_system )
445478 image_block : Dict [str , Any ] = {
446- "type" : "image_url" ,
447- "image_url" : {
448- "url" : f"data:image/png;base64,{ image_b64 } "
449- },
479+ "type" : "image_url" ,
480+ "image_url" : {"url" : f"data:image/png;base64,{ image_b64 } " },
450481 "min_pixels" : min_pixels ,
451482 "max_pixels" : max_pixels ,
452483 }
@@ -461,11 +492,15 @@ async def predict_click(
461492 }
462493 )
463494
464- api_kwargs : Dict [str , Any ] = {"model" : model , "messages" : messages , ** {k : v for k , v in kwargs .items ()}}
495+ api_kwargs : Dict [str , Any ] = {
496+ "model" : model ,
497+ "messages" : messages ,
498+ ** {k : v for k , v in kwargs .items ()},
499+ }
465500 response = await litellm .acompletion (** api_kwargs )
466501 resp = response .model_dump () # type: ignore
467502 choice = (resp .get ("choices" ) or [{}])[0 ]
468- content_text = ((( choice .get ("message" ) or {}).get ("content" )) or "" )
503+ content_text = ((choice .get ("message" ) or {}).get ("content" )) or ""
469504 tool_call = _parse_tool_call_from_text (content_text ) or {}
470505 args = tool_call .get ("arguments" ) or {}
471506 args = await _unnormalize_coordinate (args , (rh , rw ))
0 commit comments