Merge pull request #502 from trycua/lint/fix-qwen-lint

ddupont808 · web-flow · commit 0d91fe6f389f · 2025-10-22T16:41:43.000-07:00
Run `uv run pre-commit run --all-files` on qwen.py
diff --git a/README.md b/README.md
@@ -48,7 +48,7 @@ With the Agent SDK, you can:
 | `openai/computer-use-preview`                                                                  | `huggingface-local/HelloKKMe/GTA1-{7B,32B,72B}`                                                | any VLM (using liteLLM, requires `tools` parameter)                                           |
 | `openrouter/z-ai/glm-4.5v`                                                                     | `huggingface-local/Hcompany/Holo1.5-{3B,7B,72B}`                                               | any LLM (using liteLLM, requires `moondream3+` prefix )                                       |
 | `gemini-2.5-computer-use-preview-10-2025`                                                      | any-all-in-one CUA                                                                             |                                                                                               |
-| `openrouter/qwen/qwen3-vl-235b-a22b-instruct`                                                   |                                                                                                |                                                                                               |
+| `openrouter/qwen/qwen3-vl-235b-a22b-instruct`                                                  |                                                                                                |                                                                                               |
 | `huggingface-local/OpenGVLab/InternVL3_5-{1B,2B,4B,8B,...}`                                    |                                                                                                |                                                                                               |
 | `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B`                                              |                                                                                                |
 | `moondream3+{ui planning}` (supports text-only models)                                         |                                                                                                |
diff --git a/libs/python/agent/agent/loops/__init__.py b/libs/python/agent/agent/loops/__init__.py
@@ -15,8 +15,8 @@
     omniparser,
     openai,
     opencua,
-    uitars,
     qwen,
+    uitars,
 )
 
 __all__ = [
diff --git a/libs/python/agent/agent/loops/qwen.py b/libs/python/agent/agent/loops/qwen.py
@@ -3,25 +3,25 @@
 - Passes a ComputerUse tool schema to acompletion
 - Converts between Responses items and completion messages using helpers
 """
-from __future__ import annotations
 
-from typing import Any, Dict, List, Optional, Tuple
+from __future__ import annotations
 
 import json
 import re
+from typing import Any, Dict, List, Optional, Tuple
+
 import litellm
 from litellm.responses.litellm_completion_transformation.transformation import (
     LiteLLMCompletionResponsesConfig,
 )
 
 from ..decorators import register_agent
 from ..loops.base import AsyncAgentConfig
-from ..types import AgentCapability
 from ..responses import (
-    convert_responses_items_to_completion_messages,
     convert_completion_messages_to_responses_items,
+    convert_responses_items_to_completion_messages,
 )
-
+from ..types import AgentCapability
 
 # ComputerUse tool schema (OpenAI function tool format)
 QWEN3_COMPUTER_TOOL: Dict[str, Any] = {
@@ -96,18 +96,29 @@
     },
 }
 
+
 def _build_nous_system(functions: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
     """Use qwen-agent NousFnCallPrompt to generate a system message embedding tool schema."""
     try:
         from qwen_agent.llm.fncall_prompts.nous_fncall_prompt import (
-            NousFnCallPrompt,
-            Message as NousMessage,
             ContentItem as NousContentItem,
         )
+        from qwen_agent.llm.fncall_prompts.nous_fncall_prompt import (
+            Message as NousMessage,
+        )
+        from qwen_agent.llm.fncall_prompts.nous_fncall_prompt import (
+            NousFnCallPrompt,
+        )
     except ImportError:
-        raise ImportError("qwen-agent not installed. Please install it with `pip install cua-agent[qwen]`.")
+        raise ImportError(
+            "qwen-agent not installed. Please install it with `pip install cua-agent[qwen]`."
+        )
     msgs = NousFnCallPrompt().preprocess_fncall_messages(
-        messages=[NousMessage(role="system", content=[NousContentItem(text="You are a helpful assistant.")])],
+        messages=[
+            NousMessage(
+                role="system", content=[NousContentItem(text="You are a helpful assistant.")]
+            )
+        ],
         functions=functions,
         lang="en",
     )
@@ -116,6 +127,7 @@ def _build_nous_system(functions: List[Dict[str, Any]]) -> Optional[Dict[str, An
     content = [{"type": "text", "text": c["text"]} for c in sys.get("content", [])]
     return {"role": "system", "content": content}
 
+
 def _parse_tool_call_from_text(text: str) -> Optional[Dict[str, Any]]:
     """Extract JSON object within <tool_call>...</tool_call> from model text."""
     m = re.search(r"<tool_call>\s*(\{[\s\S]*?\})\s*</tool_call>", text)
@@ -126,6 +138,7 @@ def _parse_tool_call_from_text(text: str) -> Optional[Dict[str, Any]]:
     except Exception:
         return None
 
+
 async def _unnormalize_coordinate(args: Dict[str, Any], dims: Tuple[int, int]) -> Dict[str, Any]:
     """Coordinates appear in 0..1000 space, scale to actual screen size using dims if provided."""
     coord = args.get("coordinate")
@@ -262,7 +275,9 @@ def _has_any_image(msgs: List[Dict[str, Any]]) -> bool:
         pre_output_items: List[Dict[str, Any]] = []
         if not _has_any_image(completion_messages):
             if computer_handler is None or not hasattr(computer_handler, "screenshot"):
-                raise RuntimeError("No screenshots present and computer_handler.screenshot is not available.")
+                raise RuntimeError(
+                    "No screenshots present and computer_handler.screenshot is not available."
+                )
             screenshot_b64 = await computer_handler.screenshot()
             if not screenshot_b64:
                 raise RuntimeError("Failed to capture screenshot from computer_handler.")
@@ -271,7 +286,10 @@ def _has_any_image(msgs: List[Dict[str, Any]]) -> bool:
                 {
                     "role": "user",
                     "content": [
-                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{screenshot_b64}"}},
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/png;base64,{screenshot_b64}"},
+                        },
                         {"type": "text", "text": "Current screen"},
                     ],
                 }
@@ -282,7 +300,10 @@ def _has_any_image(msgs: List[Dict[str, Any]]) -> bool:
                     "type": "message",
                     "role": "assistant",
                     "content": [
-                        {"type": "text", "text": "Taking a screenshot to see the current computer screen."}
+                        {
+                            "type": "text",
+                            "text": "Taking a screenshot to see the current computer screen.",
+                        }
                     ],
                 }
             )
@@ -294,26 +315,32 @@ def _has_any_image(msgs: List[Dict[str, Any]]) -> bool:
         MIN_PIXELS = 3136
         MAX_PIXELS = 12845056
         try:
-            from qwen_vl_utils import smart_resize  # type: ignore
+            import base64
+            import io
+
             from PIL import Image  # type: ignore
-            import base64, io
+            from qwen_vl_utils import smart_resize  # type: ignore
         except Exception:
-            raise ImportError("qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`.")
+            raise ImportError(
+                "qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`."
+            )
 
         for msg in completion_messages:
             content = msg.get("content")
             if not isinstance(content, list):
                 continue
             for part in content:
                 if isinstance(part, dict) and part.get("type") == "image_url":
-                    url = (((part.get("image_url") or {}).get("url")) or "")
+                    url = ((part.get("image_url") or {}).get("url")) or ""
                     # Expect data URL like data:image/png;base64,<b64>
                     if url.startswith("data:") and "," in url:
                         b64 = url.split(",", 1)[1]
                         img_bytes = base64.b64decode(b64)
                         im = Image.open(io.BytesIO(img_bytes))
                         h, w = im.height, im.width
-                        rh, rw = smart_resize(h, w, factor=32, min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS)
+                        rh, rw = smart_resize(
+                            h, w, factor=32, min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS
+                        )
                         # Attach hints on this image block
                         part["min_pixels"] = MIN_PIXELS
                         part["max_pixels"] = MAX_PIXELS
@@ -349,7 +376,7 @@ def _has_any_image(msgs: List[Dict[str, Any]]) -> bool:
         # Parse tool call from text; then convert to responses items via fake tool_calls
         resp_dict = response.model_dump()  # type: ignore
         choice = (resp_dict.get("choices") or [{}])[0]
-        content_text = (((choice.get("message") or {}).get("content")) or "")
+        content_text = ((choice.get("message") or {}).get("content")) or ""
         tool_call = _parse_tool_call_from_text(content_text)
 
         output_items: List[Dict[str, Any]] = []
@@ -358,7 +385,9 @@ def _has_any_image(msgs: List[Dict[str, Any]]) -> bool:
             raw_args = tool_call.get("arguments") or {}
             # Unnormalize coordinates to actual screen size using last resized dims
             if last_rw is None or last_rh is None:
-                raise RuntimeError("No screenshots found to derive dimensions for coordinate unnormalization.")
+                raise RuntimeError(
+                    "No screenshots found to derive dimensions for coordinate unnormalization."
+                )
             args = await _unnormalize_coordinate(raw_args, (last_rw, last_rh))
 
             # Build an OpenAI-style tool call so we can reuse the converter
@@ -426,27 +455,29 @@ async def predict_click(
         max_pixels = 12845056
         try:
             # Lazy import to avoid hard dependency
-            from qwen_vl_utils import smart_resize  # type: ignore
+            import base64
+            import io
+
             # If PIL is available, estimate size from image to derive smart bounds
             from PIL import Image
-            import io, base64
+            from qwen_vl_utils import smart_resize  # type: ignore
 
             img_bytes = base64.b64decode(image_b64)
             im = Image.open(io.BytesIO(img_bytes))
             h, w = im.height, im.width
             # Qwen notebook suggests factor=32 and a wide min/max range
             rh, rw = smart_resize(h, w, factor=32, min_pixels=min_pixels, max_pixels=max_pixels)
         except Exception:
-            raise ImportError("qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`.")
+            raise ImportError(
+                "qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`."
+            )
 
         messages = []
         if nous_system:
             messages.append(nous_system)
         image_block: Dict[str, Any] = {
-            "type": "image_url", 
-            "image_url": {
-                "url": f"data:image/png;base64,{image_b64}"
-            },
+            "type": "image_url",
+            "image_url": {"url": f"data:image/png;base64,{image_b64}"},
             "min_pixels": min_pixels,
             "max_pixels": max_pixels,
         }
@@ -461,11 +492,15 @@ async def predict_click(
             }
         )
 
-        api_kwargs: Dict[str, Any] = {"model": model, "messages": messages, **{k: v for k, v in kwargs.items()}}
+        api_kwargs: Dict[str, Any] = {
+            "model": model,
+            "messages": messages,
+            **{k: v for k, v in kwargs.items()},
+        }
         response = await litellm.acompletion(**api_kwargs)
         resp = response.model_dump()  # type: ignore
         choice = (resp.get("choices") or [{}])[0]
-        content_text = (((choice.get("message") or {}).get("content")) or "")
+        content_text = ((choice.get("message") or {}).get("content")) or ""
         tool_call = _parse_tool_call_from_text(content_text) or {}
         args = tool_call.get("arguments") or {}
         args = await _unnormalize_coordinate(args, (rh, rw))

Original file line number	Diff line number	Diff line change
`@@ -15,8 +15,8 @@`
`15`	`15`	`omniparser,`
`16`	`16`	`openai,`
`17`	`17`	`opencua,`
`18`		`- uitars,`
`19`	`18`	`qwen,`
	`19`	`+ uitars,`
`20`	`20`	`)`
`21`	`21`
`22`	`22`	`__all__ = [`