diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index d11c5fdf3a9..05ddf47feab 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -163,6 +163,7 @@ ddtrace/contrib/internal/litellm @DataDog/ml-observ ddtrace/contrib/internal/pydantic_ai @DataDog/ml-observability ddtrace/contrib/internal/ray @DataDog/ml-observability ddtrace/contrib/internal/mcp @DataDog/ml-observability +ddtrace/contrib/internal/vllm @DataDog/ml-observability tests/llmobs @DataDog/ml-observability tests/contrib/openai @DataDog/ml-observability tests/contrib/langchain @DataDog/ml-observability @@ -184,6 +185,7 @@ tests/contrib/litellm @DataDog/ml-observ tests/contrib/pydantic_ai @DataDog/ml-observability tests/contrib/ray @DataDog/ml-observability tests/contrib/mcp @DataDog/ml-observability +tests/contrib/vllm @DataDog/ml-observability .gitlab/tests/llmobs.yml @DataDog/ml-observability # MLObs snapshot tests tests/snapshots/tests.contrib.anthropic.* @DataDog/ml-observability @@ -198,6 +200,7 @@ tests/snapshots/tests.contrib.langgraph.* @DataDog/ml-observ tests/snapshots/tests.contrib.crewai.* @DataDog/ml-observability tests/snapshots/tests.contrib.openai_agents.* @DataDog/ml-observability tests/snapshots/tests.contrib.litellm.* @DataDog/ml-observability +tests/snapshots/tests.contrib.vllm.* @DataDog/ml-observability # Remote Config ddtrace/internal/remoteconfig @DataDog/remote-config @DataDog/apm-core-python diff --git a/.gitlab/testrunner.yml b/.gitlab/testrunner.yml index e605b74e229..4fa3d40fc74 100644 --- a/.gitlab/testrunner.yml +++ b/.gitlab/testrunner.yml @@ -1,5 +1,6 @@ variables: TESTRUNNER_IMAGE: registry.ddbuild.io/dd-trace-py:v81799905-0f9bd8b-testrunner@sha256:f29e49e817cb3fdcc188414b9e94bd9244df7db00b72205738a752ab23953cc5 + TESTRUNNER_GPU_IMAGE: ${TESTRUNNER_IMAGE} .testrunner: image: @@ -18,3 +19,14 @@ variables: artifacts: reports: junit: test-results/junit*.xml + +.testrunner_gpu: + extends: .testrunner + image: + name: ${TESTRUNNER_GPU_IMAGE} + tags: [ "gpu:a10-amd64" ] + timeout: 40m + before_script: + - !reference [.testrunner, before_script] + - export NVIDIA_VISIBLE_DEVICES=all + - export NVIDIA_DRIVER_CAPABILITIES=compute,utility diff --git a/.gitlab/tests.yml b/.gitlab/tests.yml index 1c8d9365271..3e2078b9bed 100644 --- a/.gitlab/tests.yml +++ b/.gitlab/tests.yml @@ -43,7 +43,6 @@ include: DD_FAST_BUILD = "1" - .test_base_hatch_snapshot: extends: .test_base_hatch services: @@ -101,4 +100,31 @@ include: - export DD_TRACE_AGENT_URL="http://testagent:9126" - ln -s "${CI_PROJECT_DIR}" "/home/bits/project" +.test_base_riot_gpu: + extends: .test_base_riot + image: + name: ${TESTRUNNER_GPU_IMAGE} + tags: [ "gpu:a10-amd64" ] + timeout: 40m + parallel: 2 + variables: + KUBERNETES_MEMORY_REQUEST: "12Gi" + KUBERNETES_MEMORY_LIMIT: "12Gi" + KUBERNETES_CPU_REQUEST: "2" + KUBERNETES_CPU_LIMIT: "2" + before_script: + - !reference [.test_base_riot, before_script] + - export NVIDIA_VISIBLE_DEVICES=all + - export NVIDIA_DRIVER_CAPABILITIES=compute,utility + +.test_base_riot_gpu_snapshot: + extends: .test_base_riot_gpu + services: + - !reference [.test_base_riot_gpu, services] + - !reference [.services, testagent] + before_script: + - !reference [.test_base_riot_gpu, before_script] + - export DD_TRACE_AGENT_URL="http://testagent:9126" + - ln -s "${CI_PROJECT_DIR}" "/home/bits/project" + # Required jobs will appear here diff --git a/.riot/requirements/2043c14.txt b/.riot/requirements/2043c14.txt new file mode 100644 index 00000000000..d72b1a57e39 --- /dev/null +++ b/.riot/requirements/2043c14.txt @@ -0,0 
+1,163 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --allow-unsafe --cert=None --client-cert=None --index-url=None --no-annotate --pip-args=None .riot/requirements/2043c14.in +# +aiohappyeyeballs==2.6.1 +aiohttp==3.12.15 +aiosignal==1.4.0 +annotated-types==0.7.0 +anyio==4.11.0 +astor==0.8.1 +async-timeout==5.0.1 +attrs==25.3.0 +blake3==1.0.7 +cachetools==6.2.0 +cbor2==5.7.0 +certifi==2025.8.3 +cffi==2.0.0 +charset-normalizer==3.4.3 +click==8.3.0 +cloudpickle==3.1.1 +compressed-tensors==0.11.0 +coverage[toml]==7.10.7 +cupy-cuda12x==13.6.0 +depyf==0.19.0 +dill==0.4.0 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.8.0 +einops==0.8.1 +email-validator==2.3.0 +exceptiongroup==1.3.0 +fastapi[standard]==0.118.0 +fastapi-cli[standard]==0.0.13 +fastapi-cloud-cli==0.2.1 +fastrlock==0.8.3 +filelock==3.19.1 +frozendict==2.4.6 +frozenlist==1.7.0 +fsspec==2025.9.0 +gguf==0.17.1 +h11==0.16.0 +hf-xet==1.1.10 +httpcore==1.0.9 +httptools==0.6.4 +httpx==0.28.1 +huggingface-hub==0.35.3 +hypothesis==6.45.0 +idna==3.10 +iniconfig==2.1.0 +interegular==0.3.3 +jinja2==3.1.6 +jiter==0.11.0 +jsonschema==4.25.1 +jsonschema-specifications==2025.9.1 +lark==1.2.2 +llguidance==0.7.30 +llvmlite==0.44.0 +lm-format-enforcer==0.11.3 +markdown-it-py==4.0.0 +markupsafe==3.0.3 +mdurl==0.1.2 +mistral-common[audio,image,opencv,soundfile,soxr]==1.8.5 +mock==5.2.0 +mpmath==1.3.0 +msgpack==1.1.1 +msgspec==0.19.0 +multidict==6.6.4 +networkx==3.4.2 +ninja==1.13.0 +numba==0.61.2 +numpy==2.2.6 +nvidia-cublas-cu12==12.8.4.1 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +nvidia-cuda-runtime-cu12==12.8.90 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cufft-cu12==11.3.3.83 +nvidia-cufile-cu12==1.13.1.3 +nvidia-curand-cu12==10.3.9.90 +nvidia-cusolver-cu12==11.7.3.90 +nvidia-cusparse-cu12==12.5.8.93 +nvidia-cusparselt-cu12==0.7.1 +nvidia-nccl-cu12==2.27.3 +nvidia-nvjitlink-cu12==12.8.93 +nvidia-nvtx-cu12==12.8.90 +openai==2.0.0 +openai-harmony==0.0.4 +opencv-python-headless==4.12.0.88 +opentracing==2.4.0 +outlines-core==0.2.11 +packaging==25.0 +partial-json-parser==0.2.1.1.post6 +pillow==11.3.0 +pluggy==1.6.0 +prometheus-client==0.23.1 +prometheus-fastapi-instrumentator==7.1.0 +propcache==0.3.2 +protobuf==6.32.1 +psutil==7.1.0 +py-cpuinfo==9.0.0 +pybase64==1.4.2 +pycountry==24.6.1 +pycparser==2.23 +pydantic[email]==2.11.9 +pydantic-core==2.33.2 +pydantic-extra-types[pycountry]==2.10.5 +pygments==2.19.2 +pytest==8.4.2 +pytest-asyncio==0.21.1 +pytest-cov==7.0.0 +pytest-mock==3.15.1 +pytest-randomly==4.0.1 +python-dotenv==1.1.1 +python-json-logger==3.3.0 +python-multipart==0.0.20 +pyyaml==6.0.3 +pyzmq==27.1.0 +ray[cgraph]==2.49.2 +referencing==0.36.2 +regex==2025.9.18 +requests==2.32.5 +rich==14.1.0 +rich-toolkit==0.15.1 +rignore==0.6.4 +rpds-py==0.27.1 +safetensors==0.6.2 +scipy==1.15.3 +sentencepiece==0.2.1 +sentry-sdk==2.39.0 +setproctitle==1.3.7 +shellingham==1.5.4 +sniffio==1.3.1 +sortedcontainers==2.4.0 +soundfile==0.13.1 +soxr==1.0.0 +starlette==0.48.0 +sympy==1.14.0 +tiktoken==0.11.0 +tokenizers==0.22.1 +tomli==2.2.1 +torch==2.8.0 +torchaudio==2.8.0 +torchvision==0.23.0 +tqdm==4.67.1 +transformers==4.56.2 +triton==3.4.0 +typer==0.19.2 +typing-extensions==4.15.0 +typing-inspection==0.4.2 +urllib3==2.5.0 +uvicorn[standard]==0.37.0 +uvloop==0.21.0 +vllm==0.10.2 +watchfiles==1.1.0 +websockets==15.0.1 +xformers==0.0.32.post1 +xgrammar==0.1.23 +yarl==1.20.1 + +# The following packages are considered to be unsafe in a requirements file: +setuptools==80.9.0 diff 
--git a/.riot/requirements/460aab7.txt b/.riot/requirements/460aab7.txt new file mode 100644 index 00000000000..f7bc03da935 --- /dev/null +++ b/.riot/requirements/460aab7.txt @@ -0,0 +1,161 @@ +# +# This file is autogenerated by pip-compile with Python 3.12 +# by the following command: +# +# pip-compile --allow-unsafe --no-annotate .riot/requirements/460aab7.in +# +aiohappyeyeballs==2.6.1 +aiohttp==3.12.15 +aiosignal==1.4.0 +annotated-types==0.7.0 +anyio==4.11.0 +astor==0.8.1 +attrs==25.3.0 +blake3==1.0.7 +cachetools==6.2.0 +cbor2==5.7.0 +certifi==2025.8.3 +cffi==2.0.0 +charset-normalizer==3.4.3 +click==8.3.0 +cloudpickle==3.1.1 +compressed-tensors==0.11.0 +coverage[toml]==7.10.7 +cupy-cuda12x==13.6.0 +depyf==0.19.0 +dill==0.4.0 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.8.0 +einops==0.8.1 +email-validator==2.3.0 +fastapi[standard]==0.118.0 +fastapi-cli[standard]==0.0.13 +fastapi-cloud-cli==0.2.1 +fastrlock==0.8.3 +filelock==3.19.1 +frozendict==2.4.6 +frozenlist==1.7.0 +fsspec==2025.9.0 +gguf==0.17.1 +h11==0.16.0 +hf-xet==1.1.10 +httpcore==1.0.9 +httptools==0.6.4 +httpx==0.28.1 +huggingface-hub==0.35.3 +hypothesis==6.45.0 +idna==3.10 +iniconfig==2.1.0 +interegular==0.3.3 +jinja2==3.1.6 +jiter==0.11.0 +jsonschema==4.25.1 +jsonschema-specifications==2025.9.1 +lark==1.2.2 +llguidance==0.7.30 +llvmlite==0.44.0 +lm-format-enforcer==0.11.3 +markdown-it-py==4.0.0 +markupsafe==3.0.3 +mdurl==0.1.2 +mistral-common[audio,image,opencv,soundfile,soxr]==1.8.5 +mock==5.2.0 +mpmath==1.3.0 +msgpack==1.1.1 +msgspec==0.19.0 +multidict==6.6.4 +networkx==3.5 +ninja==1.13.0 +numba==0.61.2 +numpy==2.2.6 +nvidia-cublas-cu12==12.8.4.1 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +nvidia-cuda-runtime-cu12==12.8.90 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cufft-cu12==11.3.3.83 +nvidia-cufile-cu12==1.13.1.3 +nvidia-curand-cu12==10.3.9.90 +nvidia-cusolver-cu12==11.7.3.90 +nvidia-cusparse-cu12==12.5.8.93 +nvidia-cusparselt-cu12==0.7.1 +nvidia-nccl-cu12==2.27.3 +nvidia-nvjitlink-cu12==12.8.93 +nvidia-nvtx-cu12==12.8.90 +openai==2.0.0 +openai-harmony==0.0.4 +opencv-python-headless==4.12.0.88 +opentracing==2.4.0 +outlines-core==0.2.11 +packaging==25.0 +partial-json-parser==0.2.1.1.post6 +pillow==11.3.0 +pluggy==1.6.0 +prometheus-client==0.23.1 +prometheus-fastapi-instrumentator==7.1.0 +propcache==0.3.2 +protobuf==6.32.1 +psutil==7.1.0 +py-cpuinfo==9.0.0 +pybase64==1.4.2 +pycountry==24.6.1 +pycparser==2.23 +pydantic[email]==2.11.9 +pydantic-core==2.33.2 +pydantic-extra-types[pycountry]==2.10.5 +pygments==2.19.2 +pytest==8.4.2 +pytest-asyncio==0.21.1 +pytest-cov==7.0.0 +pytest-mock==3.15.1 +pytest-randomly==4.0.1 +python-dotenv==1.1.1 +python-json-logger==3.3.0 +python-multipart==0.0.20 +pyyaml==6.0.3 +pyzmq==27.1.0 +ray[cgraph]==2.49.2 +referencing==0.36.2 +regex==2025.9.18 +requests==2.32.5 +rich==14.1.0 +rich-toolkit==0.15.1 +rignore==0.6.4 +rpds-py==0.27.1 +safetensors==0.6.2 +scipy==1.16.2 +sentencepiece==0.2.1 +sentry-sdk==2.39.0 +setproctitle==1.3.7 +shellingham==1.5.4 +six==1.17.0 +sniffio==1.3.1 +sortedcontainers==2.4.0 +soundfile==0.13.1 +soxr==1.0.0 +starlette==0.48.0 +sympy==1.14.0 +tiktoken==0.11.0 +tokenizers==0.22.1 +torch==2.8.0 +torchaudio==2.8.0 +torchvision==0.23.0 +tqdm==4.67.1 +transformers==4.56.2 +triton==3.4.0 +typer==0.19.2 +typing-extensions==4.15.0 +typing-inspection==0.4.2 +urllib3==2.5.0 +uvicorn[standard]==0.37.0 +uvloop==0.21.0 +vllm==0.10.2 +watchfiles==1.1.0 +websockets==15.0.1 +xformers==0.0.32.post1 +xgrammar==0.1.23 +yarl==1.20.1 + +# The following packages are 
considered to be unsafe in a requirements file: +setuptools==79.0.1 diff --git a/.riot/requirements/494e77a.txt b/.riot/requirements/494e77a.txt new file mode 100644 index 00000000000..c288c7061a0 --- /dev/null +++ b/.riot/requirements/494e77a.txt @@ -0,0 +1,160 @@ +# +# This file is autogenerated by pip-compile with Python 3.11 +# by the following command: +# +# pip-compile --allow-unsafe --cert=None --client-cert=None --index-url=None --no-annotate --pip-args=None .riot/requirements/494e77a.in +# +aiohappyeyeballs==2.6.1 +aiohttp==3.12.15 +aiosignal==1.4.0 +annotated-types==0.7.0 +anyio==4.11.0 +astor==0.8.1 +attrs==25.3.0 +blake3==1.0.7 +cachetools==6.2.0 +cbor2==5.7.0 +certifi==2025.8.3 +cffi==2.0.0 +charset-normalizer==3.4.3 +click==8.3.0 +cloudpickle==3.1.1 +compressed-tensors==0.11.0 +coverage[toml]==7.10.7 +cupy-cuda12x==13.6.0 +depyf==0.19.0 +dill==0.4.0 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.8.0 +einops==0.8.1 +email-validator==2.3.0 +fastapi[standard]==0.118.0 +fastapi-cli[standard]==0.0.13 +fastapi-cloud-cli==0.2.1 +fastrlock==0.8.3 +filelock==3.19.1 +frozendict==2.4.6 +frozenlist==1.7.0 +fsspec==2025.9.0 +gguf==0.17.1 +h11==0.16.0 +hf-xet==1.1.10 +httpcore==1.0.9 +httptools==0.6.4 +httpx==0.28.1 +huggingface-hub==0.35.3 +hypothesis==6.45.0 +idna==3.10 +iniconfig==2.1.0 +interegular==0.3.3 +jinja2==3.1.6 +jiter==0.11.0 +jsonschema==4.25.1 +jsonschema-specifications==2025.9.1 +lark==1.2.2 +llguidance==0.7.30 +llvmlite==0.44.0 +lm-format-enforcer==0.11.3 +markdown-it-py==4.0.0 +markupsafe==3.0.3 +mdurl==0.1.2 +mistral-common[audio,image,opencv,soundfile,soxr]==1.8.5 +mock==5.2.0 +mpmath==1.3.0 +msgpack==1.1.1 +msgspec==0.19.0 +multidict==6.6.4 +networkx==3.5 +ninja==1.13.0 +numba==0.61.2 +numpy==2.2.6 +nvidia-cublas-cu12==12.8.4.1 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +nvidia-cuda-runtime-cu12==12.8.90 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cufft-cu12==11.3.3.83 +nvidia-cufile-cu12==1.13.1.3 +nvidia-curand-cu12==10.3.9.90 +nvidia-cusolver-cu12==11.7.3.90 +nvidia-cusparse-cu12==12.5.8.93 +nvidia-cusparselt-cu12==0.7.1 +nvidia-nccl-cu12==2.27.3 +nvidia-nvjitlink-cu12==12.8.93 +nvidia-nvtx-cu12==12.8.90 +openai==2.0.0 +openai-harmony==0.0.4 +opencv-python-headless==4.12.0.88 +opentracing==2.4.0 +outlines-core==0.2.11 +packaging==25.0 +partial-json-parser==0.2.1.1.post6 +pillow==11.3.0 +pluggy==1.6.0 +prometheus-client==0.23.1 +prometheus-fastapi-instrumentator==7.1.0 +propcache==0.3.2 +protobuf==6.32.1 +psutil==7.1.0 +py-cpuinfo==9.0.0 +pybase64==1.4.2 +pycountry==24.6.1 +pycparser==2.23 +pydantic[email]==2.11.9 +pydantic-core==2.33.2 +pydantic-extra-types[pycountry]==2.10.5 +pygments==2.19.2 +pytest==8.4.2 +pytest-asyncio==0.21.1 +pytest-cov==7.0.0 +pytest-mock==3.15.1 +pytest-randomly==4.0.1 +python-dotenv==1.1.1 +python-json-logger==3.3.0 +python-multipart==0.0.20 +pyyaml==6.0.3 +pyzmq==27.1.0 +ray[cgraph]==2.49.2 +referencing==0.36.2 +regex==2025.9.18 +requests==2.32.5 +rich==14.1.0 +rich-toolkit==0.15.1 +rignore==0.6.4 +rpds-py==0.27.1 +safetensors==0.6.2 +scipy==1.16.2 +sentencepiece==0.2.1 +sentry-sdk==2.39.0 +setproctitle==1.3.7 +shellingham==1.5.4 +sniffio==1.3.1 +sortedcontainers==2.4.0 +soundfile==0.13.1 +soxr==1.0.0 +starlette==0.48.0 +sympy==1.14.0 +tiktoken==0.11.0 +tokenizers==0.22.1 +torch==2.8.0 +torchaudio==2.8.0 +torchvision==0.23.0 +tqdm==4.67.1 +transformers==4.56.2 +triton==3.4.0 +typer==0.19.2 +typing-extensions==4.15.0 +typing-inspection==0.4.2 +urllib3==2.5.0 +uvicorn[standard]==0.37.0 +uvloop==0.21.0 +vllm==0.10.2 
+watchfiles==1.1.0 +websockets==15.0.1 +xformers==0.0.32.post1 +xgrammar==0.1.23 +yarl==1.20.1 + +# The following packages are considered to be unsafe in a requirements file: +setuptools==80.9.0 diff --git a/ddtrace/_monkey.py b/ddtrace/_monkey.py index f437674fe34..dc3253c8bc0 100644 --- a/ddtrace/_monkey.py +++ b/ddtrace/_monkey.py @@ -28,6 +28,7 @@ log = get_logger(__name__) + # Default set of modules to automatically patch or not PATCH_MODULES = { "aiokafka": True, @@ -109,6 +110,7 @@ "anthropic": True, "crewai": True, "pydantic_ai": True, + "vllm": True, "subprocess": True, "unittest": True, "coverage": False, diff --git a/ddtrace/contrib/integration_registry/registry.yaml b/ddtrace/contrib/integration_registry/registry.yaml index 145df088cc6..d94a35cc5ab 100644 --- a/ddtrace/contrib/integration_registry/registry.yaml +++ b/ddtrace/contrib/integration_registry/registry.yaml @@ -929,6 +929,16 @@ integrations: min: 0.6.14 max: 0.7.4 +- integration_name: vllm + is_external_package: true + is_tested: true + dependency_names: + - vllm + tested_versions_by_dependency: + vllm: + min: 0.10.2 + max: 0.10.2 + - integration_name: webbrowser is_external_package: false is_tested: true diff --git a/ddtrace/contrib/internal/fastapi/patch.py b/ddtrace/contrib/internal/fastapi/patch.py index ad624febab5..68e6174fa23 100644 --- a/ddtrace/contrib/internal/fastapi/patch.py +++ b/ddtrace/contrib/internal/fastapi/patch.py @@ -1,8 +1,10 @@ +import copyreg import os from typing import Dict import fastapi import fastapi.routing +import wrapt from wrapt import wrap_function_wrapper as _w from ddtrace import config @@ -22,6 +24,37 @@ log = get_logger(__name__) +_WRAPT_REDUCERS_REGISTERED = False + + +def _identity(x): + """Identity function for pickle reconstruction - returns unwrapped object.""" + return x + + +def _reduce_wrapt_proxy(proxy): + """Pickle reducer for wrapt proxies. + + Returns (callable, args) tuple for pickle reconstruction. + Using _identity(proxy.__wrapped__) strips the wrapper. + """ + return (_identity, (proxy.__wrapped__,)) + + +def _register_wrapt_pickle_reducers(): + """Register pickle reducers for wrapt proxy types. + + Must be called before FastAPI app is pickled (e.g., by Ray/vLLM). + """ + global _WRAPT_REDUCERS_REGISTERED + if _WRAPT_REDUCERS_REGISTERED: + return + for cls in [wrapt.ObjectProxy, wrapt.FunctionWrapper, wrapt.BoundFunctionWrapper]: + if cls not in copyreg.dispatch_table: + copyreg.dispatch_table[cls] = _reduce_wrapt_proxy + _WRAPT_REDUCERS_REGISTERED = True + + config._add( "fastapi", dict( @@ -88,6 +121,8 @@ def patch(): if getattr(fastapi, "_datadog_patch", False): return + _register_wrapt_pickle_reducers() + fastapi._datadog_patch = True Pin().onto(fastapi) _w("fastapi.applications", "FastAPI.build_middleware_stack", wrap_middleware_stack) diff --git a/ddtrace/contrib/internal/vllm/__init__.py b/ddtrace/contrib/internal/vllm/__init__.py new file mode 100644 index 00000000000..7dfaf2d3b53 --- /dev/null +++ b/ddtrace/contrib/internal/vllm/__init__.py @@ -0,0 +1,124 @@ +""" +The vLLM integration traces requests through the vLLM V1 engine. + +**Note**: This integration **only supports vLLM V1** (VLLM_USE_V1=1). V0 engine support has been +removed as V0 is deprecated and will be removed in a future vLLM release. + + +Enabling +~~~~~~~~ + +The vLLM integration is enabled automatically when using +:ref:`ddtrace-run` or :func:`patch_all()`. 
+ +Alternatively, use :func:`patch()` to manually enable the integration:: + + from ddtrace import patch + patch(vllm=True) + + +Global Configuration +~~~~~~~~~~~~~~~~~~~~ + +.. py:data:: ddtrace.config.vllm["service"] + + The service name reported by default for vLLM requests. + + This option can also be set with the ``DD_VLLM_SERVICE`` environment variable. + + Default: ``"vllm"`` + + +Instance Configuration +~~~~~~~~~~~~~~~~~~~~~~ + +To configure particular vLLM instances, use the ``Pin`` API:: + + import vllm + from ddtrace import Pin + + Pin.override(vllm, service="my-vllm-service") + + +Architecture +~~~~~~~~~~~~ + +The integration uses **engine-side tracing** to capture all requests regardless of API entry point: + +1. **Model Name Injection** (``LLMEngine.__init__`` / ``AsyncLLM.__init__``): + - Extracts and stores model name for span tagging + - Forces ``log_stats=True`` to enable latency and token metrics collection + +2. **Context Injection** (``Processor.process_inputs``): + - Injects Datadog trace context into ``trace_headers`` + - Context propagates through the engine automatically + +3. **Span Creation** (``OutputProcessor.process_outputs``): + - Creates spans when requests finish + - Extracts data from ``RequestState`` and ``EngineCoreOutput`` + - Decodes prompt from token IDs for chat requests when text is unavailable + - Works for all operations: completion, chat, embedding, cross-encoding + +This design ensures: +- All requests are traced (AsyncLLM, LLM, API server, chat) +- Complete timing and token metrics from engine stats +- Full prompt text capture (including chat conversations) +- Minimal performance overhead + + +Span Tags +~~~~~~~~~ + +All spans are tagged with: + +**Request Information**: +- ``vllm.request.model``: Model name +- ``vllm.request.provider``: ``"vllm"`` + +**Latency Metrics**: +- ``vllm.latency.ttft``: Time to first token (seconds) +- ``vllm.latency.queue``: Queue wait time (seconds) +- ``vllm.latency.prefill``: Prefill phase time (seconds) +- ``vllm.latency.decode``: Decode phase time (seconds) +- ``vllm.latency.inference``: Total inference time (seconds) + +**LLMObs Tags** (when LLMObs is enabled): + +For completion/chat operations: +- ``input_messages``: Prompt text (auto-decoded for chat requests) +- ``output_messages``: Generated text +- ``input_tokens``: Number of input tokens +- ``output_tokens``: Number of generated tokens +- ``temperature``, ``max_tokens``, ``top_p``, ``n``: Sampling parameters +- ``num_cached_tokens``: Number of KV cache hits + +For embedding operations: +- ``input_documents``: Input text or token IDs +- ``output_value``: Embedding shape description +- ``embedding_dim``: Embedding dimension +- ``num_embeddings``: Number of embeddings returned + + +Supported Operations +~~~~~~~~~~~~~~~~~~~~ + +**Async Streaming** (``AsyncLLM``): +- ``generate()``: Text completion +- ``encode()``: Text embedding + +**Offline Batch** (``LLM``): +- ``generate()``: Text completion +- ``chat()``: Multi-turn conversations +- ``encode()``: Text embedding +- ``_cross_encoding_score()``: Cross-encoding scores + +**API Server**: +- All OpenAI-compatible endpoints (automatically traced through engine) + + +Requirements +~~~~~~~~~~~~ + +- vLLM V1 (``VLLM_USE_V1=1``) +- vLLM >= 0.10.2 (for V1 trace header propagation support) +""" diff --git a/ddtrace/contrib/internal/vllm/_constants.py b/ddtrace/contrib/internal/vllm/_constants.py new file mode 100644 index 00000000000..6a64b6ef31d --- /dev/null +++ b/ddtrace/contrib/internal/vllm/_constants.py @@ -0,0 
+1,21 @@ +SPAN_NAME = "vllm.request" +OPERATION_ID = "vllm.request" + +TAG_MODEL = "vllm.request.model" +TAG_PROVIDER = "vllm.request.provider" +PROVIDER_NAME = "vllm" + +METRIC_LATENCY_TTFT = "vllm.latency.ttft" +METRIC_LATENCY_QUEUE = "vllm.latency.queue" +METRIC_LATENCY_PREFILL = "vllm.latency.prefill" +METRIC_LATENCY_DECODE = "vllm.latency.decode" +METRIC_LATENCY_INFERENCE = "vllm.latency.inference" + +MIN_VERSION = "0.10.2" + +ATTR_MODEL_NAME = "_dd_model_name" +ATTR_DATADOG_PATCH = "_datadog_patch" +ATTR_DATADOG_INTEGRATION = "_datadog_integration" + +ARG_POSITION_LOG_STATS = 2 +ARG_POSITION_TRACE_HEADERS = 6 diff --git a/ddtrace/contrib/internal/vllm/extractors.py b/ddtrace/contrib/internal/vllm/extractors.py new file mode 100644 index 00000000000..42d5d60636f --- /dev/null +++ b/ddtrace/contrib/internal/vllm/extractors.py @@ -0,0 +1,236 @@ +from __future__ import annotations + +from dataclasses import dataclass +import re +from typing import TYPE_CHECKING +from typing import List +from typing import Optional + +from ddtrace.llmobs.types import Message + +from ._constants import ATTR_MODEL_NAME + + +if TYPE_CHECKING: + from vllm.v1.engine.core import EngineCoreOutput + from vllm.v1.engine.output_processor import RequestState + from vllm.v1.stats import RequestStateStats + + +@dataclass +class RequestData: + """Container for vLLM request data extracted from engine outputs.""" + + prompt: Optional[str] = None + input_tokens: int = 0 + output_tokens: int = 0 + output_text: str = "" + finish_reason: Optional[str] = None + embedding_dim: Optional[int] = None + num_embeddings: int = 1 + lora_name: Optional[str] = None + num_cached_tokens: int = 0 + temperature: Optional[float] = None + top_p: Optional[float] = None + n: Optional[int] = None + max_tokens: Optional[int] = None + input_: Optional[list[int]] = None + + +@dataclass +class LatencyMetrics: + """Computed latency metrics from vLLM RequestStateStats.""" + + time_to_first_token: Optional[float] = None + time_in_queue: Optional[float] = None + time_in_model_prefill: Optional[float] = None + time_in_model_decode: Optional[float] = None + time_in_model_inference: Optional[float] = None + + +def get_embedding_shape(tensor) -> tuple[int, Optional[int]]: + """Extract (num_embeddings, embedding_dim) from torch.Tensor.""" + if tensor is None or len(tensor.shape) == 0: + return 1, None + + if len(tensor.shape) == 1: + return 1, int(tensor.shape[0]) + + first, last = int(tensor.shape[0]), int(tensor.shape[-1]) + if last == 1: + return 1, first + return first, last + + +def extract_request_data(req_state: "RequestState", engine_core_output: "EngineCoreOutput") -> RequestData: + """Extract request data from engine-side structures. 
+ + Args: + req_state: RequestState from OutputProcessor.request_states + engine_core_output: EngineCoreOutput from engine_core + + Returns: + RequestData for LLMObs tagging + """ + is_embedding = engine_core_output.pooling_output is not None + + # Get prompt text - if not available, decode from token IDs (but not for embeddings) + # skip_special_tokens=False preserves chat template markers for parsing + prompt_text = req_state.prompt + if not is_embedding and prompt_text is None and req_state.prompt_token_ids and req_state.detokenizer: + tokenizer = getattr(req_state.detokenizer, "tokenizer", None) + if tokenizer: + prompt_text = tokenizer.decode(req_state.prompt_token_ids, skip_special_tokens=False) + + data = RequestData( + prompt=prompt_text, + input_tokens=req_state.prompt_len or 0, + lora_name=req_state.lora_name, + num_cached_tokens=engine_core_output.num_cached_tokens, + temperature=req_state.temperature, + top_p=req_state.top_p, + n=req_state.n, + max_tokens=req_state.max_tokens_param, + ) + + if engine_core_output.finish_reason: + data.finish_reason = str(engine_core_output.finish_reason) + + if is_embedding: + num_emb, emb_dim = get_embedding_shape(engine_core_output.pooling_output) + data.num_embeddings = num_emb + data.embedding_dim = emb_dim + data.input_ = req_state.prompt_token_ids + else: + # Don't extract output_tokens here - stats not updated yet + # Will be extracted later from captured stats reference + + if req_state.detokenizer: + data.output_text = req_state.detokenizer.output_text + + return data + + +def get_model_name(instance) -> Optional[str]: + """Extract injected model name (set by traced_engine_init)""" + return getattr(instance, ATTR_MODEL_NAME, None) + + +def extract_latency_metrics(stats: Optional["RequestStateStats"]) -> Optional[LatencyMetrics]: + """Extract latency metrics from vLLM RequestStateStats. + + Single source of truth for latency calculation logic.
""" + if not stats: + return None + + metrics = LatencyMetrics() + + if stats.first_token_latency: + metrics.time_to_first_token = float(stats.first_token_latency) + + queued = stats.queued_ts + scheduled = stats.scheduled_ts + first_token = stats.first_token_ts + last_token = stats.last_token_ts + + if queued and scheduled: + metrics.time_in_queue = float(scheduled - queued) + + if scheduled and first_token: + metrics.time_in_model_prefill = float(first_token - scheduled) + + if first_token and last_token and last_token > first_token: + metrics.time_in_model_decode = float(last_token - first_token) + + if scheduled and last_token: + metrics.time_in_model_inference = float(last_token - scheduled) + + return metrics + + +# Role patterns: (quick_check_marker, regex_pattern) +# Quick check avoids running all regexes - we first check if marker exists in prompt +_ROLE_PATTERNS = [ + # Llama 3: <|start_header_id|>role<|end_header_id|> + ("<|start_header_id|>", re.compile(r"<\|start_header_id\|>(system|user|assistant)<\|end_header_id\|>")), + # Llama 4: <|header_start|>role<|header_end|> + ("<|header_start|>", re.compile(r"<\|header_start\|>(system|user|assistant)<\|header_end\|>")), + # Granite: <|start_of_role|>role<|end_of_role|> (includes document/documents roles) + ("<|start_of_role|>", re.compile(r"<\|start_of_role\|>(system|user|assistant|documents?)<\|end_of_role\|>")), + # Gemma: <start_of_turn>role (uses "model" for assistant) + ("<start_of_turn>", re.compile(r"<start_of_turn>(system|user|model)")), + # MiniMax: <beginning_of_sentence>role + ("<beginning_of_sentence>", re.compile(r"<beginning_of_sentence>(system|user|ai)")), + # ChatML/Qwen/Hermes: <|im_start|>role + ("<|im_start|>", re.compile(r"<\|im_start\|>(system|user|assistant)")), + # DeepSeek VL2: <|User|>: / <|Assistant|>: (normal | with colon) + ("<|User|>:", re.compile(r"<\|(User|Assistant)\|>:")), + # Phi: <|system|> / <|user|> / <|assistant|> + ("<|system|>", re.compile(r"<\|(system|user|assistant)\|>")), + ("<|user|>", re.compile(r"<\|(system|user|assistant)\|>")), + # DeepSeek V3 (fullwidth ｜ - U+FF5C, not the same as |): <｜User｜> + ("<｜", re.compile(r"<｜(User|Assistant)｜>")), + # TeleFLM: <_role> + ("<_user>", re.compile(r"<_(system|user|bot)>")), + ("<_system>", re.compile(r"<_(system|user|bot)>")), + # Inkbot: <#role#> + ("<#user#>", re.compile(r"<#(system|user|bot)#>")), + ("<#system#>", re.compile(r"<#(system|user|bot)#>")), + # Alpaca: ### Instruction: / ### Response: + ("### Instruction", re.compile(r"### (Instruction|Response|Input):")), + # Falcon: Role: prefix (capitalized) + ("User:", re.compile(r"^(System|User|Assistant|Falcon): ?", re.MULTILINE)), +] + + +# End-of-turn markers to strip (case-sensitive) +_END_MARKERS = re.compile( + r"<\|im_end\|>|<\|eot_id\|>|<\|end\|>|<\|eot\|>|<\|eom\|>|" + r"<\|end_of_text\|>|<end_of_turn>|</s>|<\|eos\|>|" + r"<｜end▁of▁sentence｜>", +) + +# Roles that represent the "assistant" in various templates +_ASSISTANT_ROLES = ("assistant", "model", "ai", "bot", "response", "falcon") + + +def parse_prompt_to_messages(prompt: Optional[str]) -> List[Message]: + """Parse a formatted prompt into structured messages.""" + if not prompt: + return [] + + # Quick check: find first matching marker, then use its pattern + for marker, pattern in _ROLE_PATTERNS: + if marker in prompt: + messages = _parse_with_pattern(prompt, pattern) + if messages: + return messages + + # No recognized format - return raw prompt as single message + return [Message(role="", content=prompt)] + + +def _parse_with_pattern(prompt: str, role_pattern) -> List[Message]: + """Parse prompt using a specific role pattern.""" + matches = 
list(role_pattern.finditer(prompt)) + if not matches: + return [] + + messages: List[Message] = [] + for i, match in enumerate(matches): + role_match = match.group(1) + if not role_match: + continue + role = role_match.lower() + start = match.end() + end = matches[i + 1].start() if i + 1 < len(matches) else len(prompt) + content = _END_MARKERS.sub("", prompt[start:end]).lstrip(":").strip() + + # Skip empty trailing assistant-like roles (generation prompt marker) + if role in _ASSISTANT_ROLES and not content and i == len(matches) - 1: + continue + + messages.append(Message(role=role, content=content)) + + return messages diff --git a/ddtrace/contrib/internal/vllm/patch.py b/ddtrace/contrib/internal/vllm/patch.py new file mode 100644 index 00000000000..ffc9f78b10a --- /dev/null +++ b/ddtrace/contrib/internal/vllm/patch.py @@ -0,0 +1,227 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING +from typing import Any +from typing import Dict +from typing import Optional + +import vllm + +from ddtrace import config +from ddtrace._trace.pin import Pin +from ddtrace.contrib.trace_utils import unwrap +from ddtrace.contrib.trace_utils import with_traced_module +from ddtrace.contrib.trace_utils import wrap +from ddtrace.internal.logger import get_logger +from ddtrace.llmobs._integrations.vllm import VLLMIntegration + + +if TYPE_CHECKING: + from vllm.v1.engine.output_processor import OutputProcessor + +from ._constants import ARG_POSITION_LOG_STATS +from ._constants import ARG_POSITION_TRACE_HEADERS +from ._constants import ATTR_DATADOG_INTEGRATION +from ._constants import ATTR_DATADOG_PATCH +from ._constants import ATTR_MODEL_NAME +from ._constants import MIN_VERSION +from .extractors import extract_latency_metrics +from .extractors import extract_request_data +from .extractors import get_model_name +from .utils import create_span +from .utils import inject_trace_context +from .utils import set_latency_metrics + + +logger = get_logger(__name__) + +config._add("vllm", {}) + + +@with_traced_module +def traced_engine_init(vllm, pin, func, instance, args, kwargs): + """Inject model name into OutputProcessor and force-enable stats for tracing.""" + if len(args) > ARG_POSITION_LOG_STATS: + args = args[:ARG_POSITION_LOG_STATS] + (True,) + args[ARG_POSITION_LOG_STATS + 1 :] + else: + kwargs["log_stats"] = True + + result = func(*args, **kwargs) + + if hasattr(instance, "model_config") and hasattr(instance, "output_processor"): + model_name = getattr(instance.model_config, "model", None) + if model_name: + setattr(instance.output_processor, ATTR_MODEL_NAME, model_name) + + return result + + +@with_traced_module +def traced_processor_process_inputs(vllm, pin, func, instance, args, kwargs): + """Inject Datadog trace context into trace_headers for propagation.""" + tracer = pin.tracer + + if len(args) > ARG_POSITION_TRACE_HEADERS: + trace_headers = args[ARG_POSITION_TRACE_HEADERS] + injected_headers = inject_trace_context(tracer, trace_headers) + args = args[:ARG_POSITION_TRACE_HEADERS] + (injected_headers,) + args[ARG_POSITION_TRACE_HEADERS + 1 :] + else: + trace_headers = kwargs.get("trace_headers") + kwargs["trace_headers"] = inject_trace_context(tracer, trace_headers) + + return func(*args, **kwargs) + + +def _capture_request_states( + instance: "OutputProcessor", + engine_core_outputs: Any, + iteration_stats: Optional[Any], +) -> Dict[str, Dict[str, Any]]: + """Capture request state data before original function removes them. + + Returns dict mapping request_id -> captured_data. 
+ """ + spans_data = {} + for engine_core_output in engine_core_outputs: + req_id = engine_core_output.request_id + req_state = instance.request_states.get(req_id) + + if not req_state: + continue + + spans_data[req_id] = { + "trace_headers": engine_core_output.trace_headers, + "arrival_time": req_state.stats.arrival_time if req_state.stats else None, + "data": extract_request_data(req_state, engine_core_output), + "stats": req_state.stats, + "iteration_stats": iteration_stats, + } + + return spans_data + + +def _create_finished_spans( + pin: Pin, + integration: VLLMIntegration, + model_name: Optional[str], + instance: "OutputProcessor", + spans_data: Dict[str, Dict[str, Any]], +) -> None: + """Create and finish spans for completed requests.""" + for req_id, span_info in spans_data.items(): + if req_id in instance.request_states: + continue + + span = create_span( + pin=pin, + integration=integration, + model_name=model_name, + trace_headers=span_info["trace_headers"], + arrival_time=span_info["arrival_time"], + ) + + data = span_info["data"] + operation = "embedding" if data.embedding_dim is not None else "completion" + + if operation == "completion" and span_info["stats"]: + data.output_tokens = span_info["stats"].num_generation_tokens + + latency_metrics = extract_latency_metrics(span_info["stats"]) + + integration.llmobs_set_tags( + span, + args=[], + kwargs={ + "request_data": data, + "latency_metrics": latency_metrics, + }, + response=None, + operation=operation, + ) + + set_latency_metrics(span, latency_metrics) + span.finish() + + +@with_traced_module +def traced_output_processor_process_outputs(vllm, pin, func, instance, args, kwargs): + """Create Datadog spans for finished requests.""" + integration = getattr(vllm, ATTR_DATADOG_INTEGRATION) + + engine_core_outputs = args[0] if args else kwargs.get("engine_core_outputs") + iteration_stats = kwargs.get("iteration_stats") if kwargs else (args[2] if len(args) > 2 else None) + + if not engine_core_outputs: + return func(*args, **kwargs) + + model_name = get_model_name(instance) + spans_data = _capture_request_states(instance, engine_core_outputs, iteration_stats) + + result = func(*args, **kwargs) + + _create_finished_spans(pin, integration, model_name, instance, spans_data) + + return result + + +def patch(): + """Patch vLLM V1 library for Datadog tracing.""" + if getattr(vllm, ATTR_DATADOG_PATCH, False): + return + + try: + from packaging.version import parse as parse_version + + version_str = getattr(vllm, "__version__", "0.0.0") + base_version = parse_version(version_str).base_version + if parse_version(base_version) < parse_version(MIN_VERSION): + logger.warning( + "vLLM integration requires vLLM >= %s for V1 engine support. " + "Found version %s. Skipping instrumentation.", + MIN_VERSION, + version_str, + ) + return + except (ImportError, AttributeError) as e: + logger.debug( + "Could not verify vLLM version (missing packaging library or __version__): %s. " + "Proceeding with instrumentation - may fail if version < %s", + e, + MIN_VERSION, + ) + except Exception as e: + logger.warning( + "Unexpected error verifying vLLM version: %s. 
" + "Proceeding with instrumentation, but compatibility issues may occur.", + e, + exc_info=True, + ) + + setattr(vllm, ATTR_DATADOG_PATCH, True) + + Pin().onto(vllm) + integration = VLLMIntegration(integration_config=config.vllm) + setattr(vllm, ATTR_DATADOG_INTEGRATION, integration) + + wrap("vllm.v1.engine.llm_engine", "LLMEngine.__init__", traced_engine_init(vllm)) + wrap("vllm.v1.engine.async_llm", "AsyncLLM.__init__", traced_engine_init(vllm)) + wrap("vllm.v1.engine.processor", "Processor.process_inputs", traced_processor_process_inputs(vllm)) + wrap( + "vllm.v1.engine.output_processor", + "OutputProcessor.process_outputs", + traced_output_processor_process_outputs(vllm), + ) + + +def unpatch(): + if not getattr(vllm, ATTR_DATADOG_PATCH, False): + return + + setattr(vllm, ATTR_DATADOG_PATCH, False) + + unwrap(vllm.v1.engine.llm_engine.LLMEngine, "__init__") + unwrap(vllm.v1.engine.async_llm.AsyncLLM, "__init__") + unwrap(vllm.v1.engine.processor.Processor, "process_inputs") + unwrap(vllm.v1.engine.output_processor.OutputProcessor, "process_outputs") + + delattr(vllm, ATTR_DATADOG_INTEGRATION) diff --git a/ddtrace/contrib/internal/vllm/utils.py b/ddtrace/contrib/internal/vllm/utils.py new file mode 100644 index 00000000000..179ce16ff74 --- /dev/null +++ b/ddtrace/contrib/internal/vllm/utils.py @@ -0,0 +1,75 @@ +from __future__ import annotations + +from typing import Optional + +from ddtrace._trace.pin import Pin +from ddtrace.propagation.http import HTTPPropagator +from ddtrace.trace import Context +from ddtrace.trace import Span + +from ._constants import METRIC_LATENCY_DECODE +from ._constants import METRIC_LATENCY_INFERENCE +from ._constants import METRIC_LATENCY_PREFILL +from ._constants import METRIC_LATENCY_QUEUE +from ._constants import METRIC_LATENCY_TTFT +from ._constants import OPERATION_ID +from .extractors import LatencyMetrics + + +def create_span( + pin: Pin, + integration, + model_name: Optional[str], + trace_headers: Optional[dict[str, str]], + arrival_time: Optional[float] = None, +): + """Create a vLLM span with parent context from trace headers.""" + parent_ctx = None + if trace_headers: + parent_ctx = HTTPPropagator.extract(trace_headers) + + span = integration.trace( + pin=pin, + operation_id=OPERATION_ID, + submit_to_llmobs=True, + parent_context=parent_ctx, + model_name=model_name, + ) + + if arrival_time: + span.start_ns = int(arrival_time * 1e9) + + return span + + +def set_latency_metrics(span: Span, latency_metrics: Optional[LatencyMetrics]) -> None: + """Set latency span tags from pre-computed metrics.""" + if not latency_metrics: + return + + metric_map = { + "time_to_first_token": METRIC_LATENCY_TTFT, + "time_in_queue": METRIC_LATENCY_QUEUE, + "time_in_model_prefill": METRIC_LATENCY_PREFILL, + "time_in_model_decode": METRIC_LATENCY_DECODE, + "time_in_model_inference": METRIC_LATENCY_INFERENCE, + } + + for attr, tag_name in metric_map.items(): + value = getattr(latency_metrics, attr, None) + if value is not None: + span.set_metric(tag_name, value) + + +def inject_trace_context(tracer, trace_headers: Optional[dict[str, str]]) -> dict[str, str]: + """Inject current trace context into headers for propagation.""" + headers = dict(trace_headers) if trace_headers else {} + + active = tracer.context_provider.active() + if active: + if isinstance(active, Span): + HTTPPropagator.inject(active.context, headers) + elif isinstance(active, Context): + HTTPPropagator.inject(active, headers) + + return headers diff --git a/ddtrace/internal/settings/_config.py 
b/ddtrace/internal/settings/_config.py index ddeb6e347ae..a8d31385ab5 100644 --- a/ddtrace/internal/settings/_config.py +++ b/ddtrace/internal/settings/_config.py @@ -173,6 +173,7 @@ "openai", "crewai", "pydantic_ai", + "vllm", "logging", "boto", "mariadb", diff --git a/ddtrace/llmobs/_constants.py b/ddtrace/llmobs/_constants.py index 8202d1a77cc..557fde68d2f 100644 --- a/ddtrace/llmobs/_constants.py +++ b/ddtrace/llmobs/_constants.py @@ -44,6 +44,14 @@ CACHE_READ_INPUT_TOKENS_METRIC_KEY = "cache_read_input_tokens" BILLABLE_CHARACTER_COUNT_METRIC_KEY = "billable_character_count" +TIME_TO_FIRST_TOKEN_METRIC_KEY = "time_to_first_token" # nosec B105 +TIME_IN_QUEUE_METRIC_KEY = "time_in_queue" +TIME_IN_MODEL_PREFILL_METRIC_KEY = "time_in_model_prefill" +TIME_IN_MODEL_DECODE_METRIC_KEY = "time_in_model_decode" +TIME_IN_MODEL_INFERENCE_METRIC_KEY = "time_in_model_inference" +# TIME_E2E_METRIC_KEY = "time_e2e" + +EVP_PROXY_AGENT_BASE_PATH = "/evp_proxy/v2" EVAL_ENDPOINT = "/api/intake/llm-obs/v2/eval-metric" SPAN_ENDPOINT = "/api/v2/llmobs" SPAN_SUBDOMAIN_NAME = "llmobs-intake" diff --git a/ddtrace/llmobs/_integrations/base.py b/ddtrace/llmobs/_integrations/base.py index 1321773b9e8..85f0e23943f 100644 --- a/ddtrace/llmobs/_integrations/base.py +++ b/ddtrace/llmobs/_integrations/base.py @@ -51,12 +51,18 @@ def trace(self, pin: Pin, operation_id: str, submit_to_llmobs: bool = False, **k Eventually those should also be internal service spans once peer.service is implemented. """ span_name = kwargs.get("span_name", None) or "{}.request".format(self._integration_name) - span = pin.tracer.trace( + span_type = SpanTypes.LLM if (submit_to_llmobs and self.llmobs_enabled) else None + parent_context = kwargs.get("parent_context") or pin.tracer.context_provider.active() + + span = pin.tracer.start_span( span_name, - resource=operation_id, + child_of=parent_context, service=int_service(pin, self.integration_config), - span_type=SpanTypes.LLM if (submit_to_llmobs and self.llmobs_enabled) else None, + resource=operation_id, + span_type=span_type, + activate=True, ) + log.debug("Creating LLM span with type %s", span.span_type) # determine if the span represents a proxy request base_url = self._get_base_url(**kwargs) diff --git a/ddtrace/llmobs/_integrations/vllm.py b/ddtrace/llmobs/_integrations/vllm.py new file mode 100644 index 00000000000..7b052753ce4 --- /dev/null +++ b/ddtrace/llmobs/_integrations/vllm.py @@ -0,0 +1,165 @@ +"""LLMObs integration for vLLM V1 library.""" + +from __future__ import annotations + +from typing import Any +from typing import Dict +from typing import List +from typing import Optional + +from ddtrace.contrib.internal.vllm._constants import PROVIDER_NAME +from ddtrace.contrib.internal.vllm._constants import TAG_MODEL +from ddtrace.contrib.internal.vllm._constants import TAG_PROVIDER +from ddtrace.contrib.internal.vllm.extractors import LatencyMetrics +from ddtrace.contrib.internal.vllm.extractors import RequestData +from ddtrace.contrib.internal.vllm.extractors import parse_prompt_to_messages +from ddtrace.llmobs._constants import INPUT_DOCUMENTS +from ddtrace.llmobs._constants import INPUT_MESSAGES +from ddtrace.llmobs._constants import INPUT_TOKENS_METRIC_KEY +from ddtrace.llmobs._constants import METADATA +from ddtrace.llmobs._constants import METRICS +from ddtrace.llmobs._constants import MODEL_NAME +from ddtrace.llmobs._constants import MODEL_PROVIDER +from ddtrace.llmobs._constants import OUTPUT_MESSAGES +from ddtrace.llmobs._constants import OUTPUT_TOKENS_METRIC_KEY +from 
ddtrace.llmobs._constants import OUTPUT_VALUE +from ddtrace.llmobs._constants import SPAN_KIND +from ddtrace.llmobs._constants import TIME_IN_MODEL_DECODE_METRIC_KEY +from ddtrace.llmobs._constants import TIME_IN_MODEL_INFERENCE_METRIC_KEY +from ddtrace.llmobs._constants import TIME_IN_MODEL_PREFILL_METRIC_KEY +from ddtrace.llmobs._constants import TIME_IN_QUEUE_METRIC_KEY +from ddtrace.llmobs._constants import TIME_TO_FIRST_TOKEN_METRIC_KEY +from ddtrace.llmobs._constants import TOTAL_TOKENS_METRIC_KEY +from ddtrace.llmobs._integrations.base import BaseLLMIntegration +from ddtrace.llmobs.types import Message +from ddtrace.llmobs.utils import Document +from ddtrace.trace import Span + + +class VLLMIntegration(BaseLLMIntegration): + """LLMObs integration for vLLM V1 library.""" + + _integration_name = "vllm" + + _METADATA_FIELDS = { + "temperature", + "max_tokens", + "top_p", + "n", + "num_cached_tokens", + "embedding_dim", + "finish_reason", + "lora_name", + } + + def _set_base_span_tags(self, span: Span, **kwargs: Any) -> None: + """Set base tags on vLLM spans.""" + model_name = kwargs.get("model_name") + if model_name: + span._set_tag_str(TAG_MODEL, model_name) + span._set_tag_str(TAG_PROVIDER, PROVIDER_NAME) + + def _build_metadata(self, data: RequestData) -> Dict[str, Any]: + """Extract metadata from request data.""" + md: Dict[str, Any] = {} + + for key in self._METADATA_FIELDS: + val = getattr(data, key, None) + if val is not None: + md[key] = val + + return md + + def _build_metrics(self, data: RequestData, latency_metrics: Optional[LatencyMetrics] = None) -> Dict[str, Any]: + """Build token and latency metrics from request data.""" + it = int(data.input_tokens or 0) + ot = int(data.output_tokens or 0) + metrics: Dict[str, Any] = { + INPUT_TOKENS_METRIC_KEY: it, + OUTPUT_TOKENS_METRIC_KEY: ot, + TOTAL_TOKENS_METRIC_KEY: it + ot, + } + + if latency_metrics: + metric_map = { + "time_to_first_token": TIME_TO_FIRST_TOKEN_METRIC_KEY, + "time_in_queue": TIME_IN_QUEUE_METRIC_KEY, + "time_in_model_prefill": TIME_IN_MODEL_PREFILL_METRIC_KEY, + "time_in_model_decode": TIME_IN_MODEL_DECODE_METRIC_KEY, + "time_in_model_inference": TIME_IN_MODEL_INFERENCE_METRIC_KEY, + } + + for attr, constant_key in metric_map.items(): + value = getattr(latency_metrics, attr, None) + if value is not None: + metrics[constant_key] = value + + return metrics + + def _build_embedding_context( + self, data: RequestData, latency_metrics: Optional[LatencyMetrics] = None + ) -> Dict[str, Any]: + """Build LLMObs context for embedding operations.""" + ctx: Dict[str, Any] = { + SPAN_KIND: "embedding", + METADATA: self._build_metadata(data), + METRICS: self._build_metrics(data, latency_metrics), + } + + docs: List[Document] = [] + if data.prompt: + docs = [Document(text=data.prompt)] + elif data.input_: + docs = [Document(text=str(data.input_))] + + if docs: + ctx[INPUT_DOCUMENTS] = docs + + num_emb = data.num_embeddings + dim = data.embedding_dim + ctx[OUTPUT_VALUE] = ( + f"[{num_emb} embedding(s) returned with size {dim}]" if dim else f"[{num_emb} embedding(s) returned]" + ) + + return ctx + + def _build_completion_context( + self, data: RequestData, latency_metrics: Optional[LatencyMetrics] = None + ) -> Dict[str, Any]: + """Build LLMObs context for completion operations.""" + ctx: Dict[str, Any] = { + SPAN_KIND: "llm", + METADATA: self._build_metadata(data), + METRICS: self._build_metrics(data, latency_metrics), + } + + if data.prompt: + ctx[INPUT_MESSAGES] = parse_prompt_to_messages(data.prompt) + + if 
data.output_text: + ctx[OUTPUT_MESSAGES] = [Message(content=data.output_text)] + + return ctx + + def _llmobs_set_tags( + self, + span: Span, + args: List[Any], + kwargs: Dict[str, Any], + response: Optional[Any] = None, + operation: str = "", + ) -> None: + """Set LLMObs tags on span.""" + data: Optional[RequestData] = kwargs.get("request_data") + if data is None: + return + + latency_metrics = kwargs.get("latency_metrics") + ctx = ( + self._build_embedding_context(data, latency_metrics) + if operation == "embedding" + else self._build_completion_context(data, latency_metrics) + ) + ctx[MODEL_NAME] = span.get_tag(TAG_MODEL) or "" + ctx[MODEL_PROVIDER] = span.get_tag(TAG_PROVIDER) or "" + span._set_ctx_items(ctx) diff --git a/docker-compose.gpu.yml b/docker-compose.gpu.yml new file mode 100644 index 00000000000..a423ce83a5f --- /dev/null +++ b/docker-compose.gpu.yml @@ -0,0 +1,9 @@ +services: + testrunner: + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] diff --git a/docs/integrations.rst b/docs/integrations.rst index 9f57ccb5c3f..26fec625ca7 100644 --- a/docs/integrations.rst +++ b/docs/integrations.rst @@ -612,6 +612,13 @@ Vertica +.. _vllm: + +vLLM +^^^^ +.. automodule:: ddtrace.contrib.internal.vllm + + .. _webbrowser: Webbrowser ^^^^^^^^^^ .. automodule:: ddtrace.contrib.internal.webbrowser diff --git a/docs/spelling_wordlist.txt b/docs/spelling_wordlist.txt index 71b4d4f6414..13c8384e5ad 100644 --- a/docs/spelling_wordlist.txt +++ b/docs/spelling_wordlist.txt @@ -228,6 +228,7 @@ posix postgres pre preconfigured +Prefill preload prepend prepended @@ -336,6 +337,7 @@ vendored versioned vertexai vertica +vLLM w3c Webbrowser websocket diff --git a/releasenotes/notes/add-vllm-integration-b93a517daeb45f61.yaml b/releasenotes/notes/add-vllm-integration-b93a517daeb45f61.yaml new file mode 100644 index 00000000000..64b362db1c9 --- /dev/null +++ b/releasenotes/notes/add-vllm-integration-b93a517daeb45f61.yaml @@ -0,0 +1,6 @@ +--- +features: + - | + vllm: Introduces tracing and LLM Observability support for the vLLM V1 engine. + Requires vLLM >= 0.10.2. See `the docs <https://ddtrace.readthedocs.io/en/stable/integrations.html#vllm>`_ + for more information. 
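A minimal usage sketch of what this release note describes, assuming a GPU host with vLLM >= 0.10.2 installed (the model name below is illustrative, not part of this change)::

    from ddtrace import patch

    patch(vllm=True)  # or run the process under ddtrace-run

    import vllm

    # Any V1-supported model works; opt-125m is just a small example.
    llm = vllm.LLM(model="facebook/opt-125m")
    outputs = llm.generate(["Hello"], vllm.SamplingParams(max_tokens=8))
    # Each finished request is emitted engine-side as a "vllm.request" span
    # from OutputProcessor.process_outputs, so offline LLM usage and the
    # OpenAI-compatible API server are traced the same way.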
diff --git a/riotfile.py b/riotfile.py index b62742a6885..19408704057 100644 --- a/riotfile.py +++ b/riotfile.py @@ -3243,6 +3243,17 @@ def select_pys(min_version: str = MIN_PYTHON_VERSION, max_version: str = MAX_PYT }, pys=select_pys(min_version="3.9", max_version="3.13"), ), + Venv( + name="vllm", + command="pytest {cmdargs} tests/contrib/vllm", + pkgs={ + "pytest-asyncio": "==0.21.1", + "pytest-randomly": latest, + "torch": latest, + "vllm": latest, + }, + pys=select_pys(min_version="3.10", max_version="3.12"), + ), Venv( name="valkey", command="pytest {cmdargs} tests/contrib/valkey", diff --git a/scripts/ddtest b/scripts/ddtest index 44c2be6e103..e05779053a1 100755 --- a/scripts/ddtest +++ b/scripts/ddtest @@ -32,7 +32,12 @@ fi # Ensure the directories are writable chmod -R u+w "$CACHE_DIR" 2>/dev/null || true -docker compose run \ +COMPOSE_FILES=("-f" "$PROJECT_ROOT/docker-compose.yml") +if command -v nvidia-smi >/dev/null 2>&1; then + COMPOSE_FILES+=("-f" "$PROJECT_ROOT/docker-compose.gpu.yml") +fi + +docker compose ${COMPOSE_FILES[@]} run \ -e DD_TRACE_AGENT_URL \ --rm \ -i \ diff --git a/scripts/gen_gitlab_config.py b/scripts/gen_gitlab_config.py index c754bd1172d..7e139a3bdb4 100755 --- a/scripts/gen_gitlab_config.py +++ b/scripts/gen_gitlab_config.py @@ -42,10 +42,13 @@ class JobSpec: allow_failure: bool = False paths: t.Optional[t.Set[str]] = None # ignored only: t.Optional[t.Set[str]] = None # ignored + gpu: bool = False def __str__(self) -> str: lines = [] base = f".test_base_{self.runner}" + if self.gpu: + base += "_gpu" if self.snapshot: base += "_snapshot" diff --git a/supported_versions_output.json b/supported_versions_output.json index ede501d5c8c..57d94d8d799 100644 --- a/supported_versions_output.json +++ b/supported_versions_output.json @@ -693,6 +693,13 @@ "pinned": "true", "auto-instrumented": true }, + { + "dependency": "vllm", + "integration": "vllm", + "minimum_tracer_supported": "0.10.2", + "max_tracer_supported": "0.10.2", + "auto-instrumented": true + }, { "dependency": "yaaredis", "integration": "yaaredis", diff --git a/supported_versions_table.csv b/supported_versions_table.csv index 116cb467333..8d8ea4d5093 100644 --- a/supported_versions_table.csv +++ b/supported_versions_table.csv @@ -96,4 +96,5 @@ valkey,valkey,6.0.2,6.1.1,True google-cloud-aiplatform,vertexai,1.71.1,1.71.1,True vertexai,vertexai,1.71.1,1.71.1,True vertica-python,vertica *,0.6.14,0.7.4,True +vllm,vllm,0.10.2,0.10.2,True yaaredis,yaaredis,2.0.4,3.0.0,True diff --git a/tests/contrib/vllm/__init__.py b/tests/contrib/vllm/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/contrib/vllm/_utils.py b/tests/contrib/vllm/_utils.py new file mode 100644 index 00000000000..52ab989c4c1 --- /dev/null +++ b/tests/contrib/vllm/_utils.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +import gc + +import torch +from vllm.engine.arg_utils import AsyncEngineArgs + + +def create_async_engine(model: str, *, engine_mode: str = "0", **kwargs): + """Create an async engine (V0 or V1) with auto-tuned GPU memory utilization.""" + gpu_util = kwargs.pop("gpu_memory_utilization", None) + gpu_util_candidates = [gpu_util] if gpu_util else [0.1, 0.2, 0.3, 0.5] + + for util in gpu_util_candidates: + try: + args = AsyncEngineArgs(model=model, gpu_memory_utilization=util, **kwargs) + if engine_mode == "1": + from vllm.v1.engine.async_llm import AsyncLLM + + return AsyncLLM.from_engine_args(args) + else: + from vllm.engine.async_llm_engine import AsyncLLMEngine + + return 
AsyncLLMEngine.from_engine_args(args) + except Exception as exc: + last_error = exc + continue + raise last_error # type: ignore[possibly-unbound] + + +def get_simple_chat_template() -> str: + """Return a ChatML-style template for testing.""" + return ( + "{% for message in messages %}" + "<|im_start|>{{ message['role'] }}\n{{ message['content'] }}<|im_end|>\n" + "{% endfor %}" + "<|im_start|>assistant\n" + ) + + +def shutdown_cached_llms() -> None: + """Free GPU memory after tests.""" + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() diff --git a/tests/contrib/vllm/api_app.py b/tests/contrib/vllm/api_app.py new file mode 100644 index 00000000000..cada0042442 --- /dev/null +++ b/tests/contrib/vllm/api_app.py @@ -0,0 +1,121 @@ +from __future__ import annotations + +from typing import List + +from fastapi import FastAPI +from fastapi import Request +from pydantic import BaseModel +import torch +import torch.nn.functional as F +import vllm + +from ddtrace import tracer as ddtracer +from ddtrace.propagation.http import HTTPPropagator + +from ._utils import create_async_engine + + +class RagRequest(BaseModel): + query: str + documents: List[str] + + +app = FastAPI() + +# Common engine parameters (V1 only) +EMBED_PARAMS = { + "model": "intfloat/e5-small-v2", + "enforce_eager": True, + "max_model_len": 256, + "compilation_config": {"use_inductor": False}, + "trust_remote_code": True, + "gpu_memory_utilization": 0.1, + "runner": "pooling", +} + +GEN_PARAMS = { + "model": "facebook/opt-125m", + "enforce_eager": True, + "max_model_len": 256, + "compilation_config": {"use_inductor": False}, + "trust_remote_code": True, + "gpu_memory_utilization": 0.1, +} + + +async def embed_texts(engine, texts: List[str], base_request_id: str) -> List[torch.Tensor]: + """Embed a list of texts and return their vector representations.""" + pooling_params = vllm.PoolingParams(task="encode") + vectors: List[torch.Tensor] = [] + + for i, text in enumerate(texts): + request_id = f"{base_request_id}-{i}" if len(texts) > 1 else base_request_id + last = None + async for out in engine.encode( + prompt=text, + pooling_params=pooling_params, + request_id=request_id, + ): + last = out + if out.finished: + break + + if last and last.outputs is not None and hasattr(last.outputs, "data"): + emb = last.outputs.data + if emb.dim() > 1: + emb = emb.mean(dim=0) + vectors.append(emb.detach().to("cpu", copy=True).float()) + + return vectors + + +async def generate_text(engine, prompt: str, sampling_params: vllm.SamplingParams, request_id: str) -> str: + """Generate text using the given prompt and sampling parameters.""" + last = None + async for out in engine.generate( + prompt=prompt, + sampling_params=sampling_params, + request_id=request_id, + ): + last = out + if out.finished: + break + + if last and last.outputs: + sample = last.outputs[0] if isinstance(last.outputs, list) and last.outputs else None + if sample and hasattr(sample, "text") and sample.text: + return sample.text + return "" + + +@app.post("/rag") +async def rag(req: RagRequest, request: Request): + """RAG endpoint using vLLM V1 for embedding and text generation.""" + # Activate trace context from client headers if provided + headers = dict(request.headers) + ctx = HTTPPropagator.extract(headers) + if ctx: + ddtracer.context_provider.activate(ctx) + + # Create V1 embedding engine + embed_engine = create_async_engine(**EMBED_PARAMS) + doc_vecs = await embed_texts(embed_engine, req.documents, "embed") + query_vecs = await embed_texts(embed_engine, 
[req.query], "embed-query") + query_vec = query_vecs[0] if query_vecs else None + + # Find most similar document + top_doc = req.documents[0] + if query_vec is not None and doc_vecs: + sims = [F.cosine_similarity(query_vec.unsqueeze(0), d.unsqueeze(0)).item() for d in doc_vecs] + top_idx = int(max(range(len(sims)), key=lambda i: sims[i])) + top_doc = req.documents[top_idx] + + torch.cuda.empty_cache() + + # Create V1 generation engine + gen_engine = create_async_engine(**GEN_PARAMS) + sampling = vllm.SamplingParams(temperature=0.8, top_p=0.95, max_tokens=64, seed=42) + prompt = f"Context: {top_doc}\nQuestion: {req.query}\nAnswer:" + generated_text = await generate_text(gen_engine, prompt, sampling, "gen-0") + + return {"generated_text": generated_text, "retrieved_document": top_doc} diff --git a/tests/contrib/vllm/conftest.py b/tests/contrib/vllm/conftest.py new file mode 100644 index 00000000000..9f1f5bf69e2 --- /dev/null +++ b/tests/contrib/vllm/conftest.py @@ -0,0 +1,148 @@ +import gc +import weakref + +import pytest +import torch + +from ddtrace._trace.pin import Pin +from ddtrace.contrib.internal.vllm.patch import patch +from ddtrace.contrib.internal.vllm.patch import unpatch +from ddtrace.llmobs import LLMObs as llmobs_service +from tests.llmobs._utils import TestLLMObsSpanWriter +from tests.utils import DummyTracer +from tests.utils import DummyWriter +from tests.utils import override_global_config + +from ._utils import shutdown_cached_llms + + +@pytest.fixture(scope="session", autouse=True) +def _shutdown_cached_llms_session(): + yield + shutdown_cached_llms() + + +@pytest.fixture(autouse=True) +def _per_test_llm_cleanup(): + """Free CUDA memory after each test.""" + yield + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + +@pytest.fixture(autouse=True, scope="session") +def require_gpu(): + """Skip vLLM tests if GPU is not available.""" + if not (hasattr(torch, "cuda") and torch.cuda.is_available()): + pytest.skip("Skipping vLLM tests: GPU not available") + + +@pytest.fixture() +def vllm(): + patch() + import vllm + + yield vllm + unpatch() + + +@pytest.fixture +def mock_tracer(vllm): + pin = Pin.get_from(vllm) + mock_tracer = DummyTracer(writer=DummyWriter(trace_flush_enabled=False)) + pin._override(vllm, tracer=mock_tracer) + yield mock_tracer + + +@pytest.fixture +def llmobs_span_writer(): + yield TestLLMObsSpanWriter(1.0, 5.0, is_agentless=True, _site="datad0g.com") + + +@pytest.fixture +def vllm_llmobs(mock_tracer, llmobs_span_writer): + llmobs_service.disable() + with override_global_config({"_llmobs_ml_app": "", "service": "tests.contrib.vllm"}): + llmobs_service.enable(_tracer=mock_tracer, integrations_enabled=False) + llmobs_service._instance._llmobs_span_writer = llmobs_span_writer + yield llmobs_service + llmobs_service.disable() + + +@pytest.fixture +def llmobs_events(vllm_llmobs, llmobs_span_writer): + return llmobs_span_writer.events + + +@pytest.fixture(scope="module") +def opt_125m_llm(): + """Cached facebook/opt-125m LLM for text generation tests.""" + # Ensure patching happens before LLM creation + from ddtrace.contrib.internal.vllm.patch import patch + + patch() + + import vllm + from vllm.distributed import cleanup_dist_env_and_memory + + llm = vllm.LLM( + model="facebook/opt-125m", + max_model_len=256, + enforce_eager=True, + compilation_config={"use_inductor": False}, + gpu_memory_utilization=0.1, + ) + yield weakref.proxy(llm) + del llm + cleanup_dist_env_and_memory() + + +@pytest.fixture(scope="module") +def e5_small_llm(): + 
"""Cached intfloat/e5-small LLM for embedding tests.""" + # Ensure patching happens before LLM creation + from ddtrace.contrib.internal.vllm.patch import patch + + patch() + + import vllm + from vllm.distributed import cleanup_dist_env_and_memory + + llm = vllm.LLM( + model="intfloat/e5-small", + runner="pooling", + max_model_len=256, + enforce_eager=True, + compilation_config={"use_inductor": False}, + trust_remote_code=True, + gpu_memory_utilization=0.1, + ) + yield weakref.proxy(llm) + del llm + cleanup_dist_env_and_memory() + + +@pytest.fixture(scope="module") +def bge_reranker_llm(): + """Cached BAAI/bge-reranker-v2-m3 LLM for classification/ranking tests.""" + # Ensure patching happens before LLM creation + from ddtrace.contrib.internal.vllm.patch import patch + + patch() + + import vllm + from vllm.distributed import cleanup_dist_env_and_memory + + llm = vllm.LLM( + model="BAAI/bge-reranker-v2-m3", + runner="pooling", + max_model_len=256, + enforce_eager=True, + compilation_config={"use_inductor": False}, + trust_remote_code=True, + gpu_memory_utilization=0.1, + ) + yield weakref.proxy(llm) + del llm + cleanup_dist_env_and_memory() diff --git a/tests/contrib/vllm/test_api_app.py b/tests/contrib/vllm/test_api_app.py new file mode 100644 index 00000000000..74e1b97250d --- /dev/null +++ b/tests/contrib/vllm/test_api_app.py @@ -0,0 +1,114 @@ +from __future__ import annotations + +from fastapi.testclient import TestClient +import pytest + +from ddtrace import tracer as ddtracer +from ddtrace._trace.pin import Pin +from ddtrace.llmobs import LLMObs as llmobs_service +from ddtrace.propagation.http import HTTPPropagator +from tests.utils import override_global_config + +from .api_app import app + + +IGNORE_FIELDS = [ + "meta._dd.p.llmobs_trace_id", + "metrics.vllm.latency.ttft", + "metrics.vllm.latency.queue", + "metrics.vllm.latency.prefill", + "metrics.vllm.latency.decode", + "metrics.vllm.latency.inference", +] + + +@pytest.mark.snapshot(ignores=IGNORE_FIELDS) +def test_rag_parent_child(vllm, llmobs_span_writer): + """Test RAG endpoint with parent-child span relationships and LLMObs event capture.""" + # Ensure snapshot writer receives traces: use global tracer for vLLM Pin + pin = Pin.get_from(vllm) + if pin is not None: + pin._override(vllm, tracer=ddtracer) + + # Enable LLMObs on ddtracer with integrations enabled and use test writer + llmobs_service.disable() + with override_global_config({"_llmobs_ml_app": "", "service": "tests.contrib.vllm"}): + llmobs_service.enable(_tracer=ddtracer, integrations_enabled=False) + llmobs_service._instance._llmobs_span_writer = llmobs_span_writer + + # Create a parent span and inject context into headers + with ddtracer.trace("api.rag") as parent_span: + headers = {} + HTTPPropagator.inject(parent_span.context, headers) + + client = TestClient(app) + payload = { + "query": "What is the capital of France?", + "documents": [ + "Paris is the capital and most populous city of France.", + "Berlin is Germany's capital.", + ], + } + + res = client.post("/rag", json=payload, headers=headers) + assert res.status_code == 200 + + llmobs_service.disable() + + # Verify LLMObs events were captured + # Should have events for: embed doc1, embed doc2, embed query, generate text + llmobs_events = llmobs_span_writer.events + assert len(llmobs_events) == 4 + + # Verify we have both embedding and completion operations + span_kinds = [event["meta"]["span"]["kind"] for event in llmobs_events] + assert span_kinds.count("embedding") == 3 # 2 docs + 1 query + assert 
span_kinds.count("llm") == 1 # 1 generation + + # Check embedding events (order may vary) + embedding_docs = { + "Paris is the capital and most populous city of France.", + "Berlin is Germany's capital.", + "What is the capital of France?", + } + + embedding_events = [e for e in llmobs_events if e["meta"]["span"]["kind"] == "embedding"] + generation_events = [e for e in llmobs_events if e["meta"]["span"]["kind"] == "llm"] + + captured_docs = {e["meta"]["input"]["documents"][0]["text"] for e in embedding_events} + assert captured_docs == embedding_docs + + # Verify all embedding events have correct structure + for event in embedding_events: + assert event["meta"]["model_name"] == "intfloat/e5-small-v2" + assert event["meta"]["model_provider"] == "vllm" + assert event["meta"]["metadata"]["embedding_dim"] == 384 + assert event["meta"]["metadata"]["num_cached_tokens"] == 0 + assert event["metrics"]["input_tokens"] > 0 + assert event["metrics"]["output_tokens"] == 0 + assert "time_to_first_token" in event["metrics"] + assert "time_in_queue" in event["metrics"] + assert "time_in_model_prefill" in event["metrics"] + assert "time_in_model_inference" in event["metrics"] + assert "ml_app:" in event["tags"] + assert "service:tests.contrib.vllm" in event["tags"] + + # Verify generation event has correct structure + assert len(generation_events) == 1 + gen_event = generation_events[0] + assert gen_event["meta"]["model_name"] == "facebook/opt-125m" + assert gen_event["meta"]["model_provider"] == "vllm" + assert gen_event["meta"]["metadata"]["temperature"] == 0.8 + assert gen_event["meta"]["metadata"]["top_p"] == 0.95 + assert gen_event["meta"]["metadata"]["max_tokens"] == 64 + assert gen_event["meta"]["metadata"]["n"] == 1 + assert gen_event["meta"]["metadata"]["num_cached_tokens"] == 0 + assert gen_event["metrics"]["input_tokens"] == 27 + assert gen_event["metrics"]["output_tokens"] > 0 + assert "time_to_first_token" in gen_event["metrics"] + assert "time_in_queue" in gen_event["metrics"] + assert "time_in_model_prefill" in gen_event["metrics"] + assert "time_in_model_decode" in gen_event["metrics"] + assert "time_in_model_inference" in gen_event["metrics"] + assert "ml_app:" in gen_event["tags"] + assert "service:tests.contrib.vllm" in gen_event["tags"] diff --git a/tests/contrib/vllm/test_extractors.py b/tests/contrib/vllm/test_extractors.py new file mode 100644 index 00000000000..bd2d85d9cc5 --- /dev/null +++ b/tests/contrib/vllm/test_extractors.py @@ -0,0 +1,245 @@ +from ddtrace.contrib.internal.vllm.extractors import parse_prompt_to_messages + + +class TestParsePromptToMessages: + """Tests for parse_prompt_to_messages function.""" + + def test_empty_prompt(self): + assert parse_prompt_to_messages("") == [] + assert parse_prompt_to_messages(None) == [] + + def test_plain_text_fallback(self): + """Unrecognized format returns raw prompt with empty role.""" + result = parse_prompt_to_messages("Hello world") + assert len(result) == 1 + assert result[0] == {"role": "", "content": "Hello world"} + + def test_llama3_format(self): + # https://www.llama.com/docs/model-cards-and-prompt-formats/meta-llama-3/ + prompt = ( + "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n" + "You are a helpful AI assistant for travel tips and recommendations<|eot_id|>" + "<|start_header_id|>user<|end_header_id|>\n\n" + "What is France's capital?<|eot_id|>" + "<|start_header_id|>assistant<|end_header_id|>\n\n" + "Bonjour! 
The capital of France is Paris!<|eot_id|>" + "<|start_header_id|>user<|end_header_id|>\n\n" + "What can I do there?<|eot_id|>" + "<|start_header_id|>assistant<|end_header_id|>\n\n" + ) + result = parse_prompt_to_messages(prompt) + assert len(result) == 4 + assert result[0] == { + "role": "system", + "content": "You are a helpful AI assistant for travel tips and recommendations", + } + assert result[1] == {"role": "user", "content": "What is France's capital?"} + assert result[2] == {"role": "assistant", "content": "Bonjour! The capital of France is Paris!"} + assert result[3] == {"role": "user", "content": "What can I do there?"} + + def test_llama4_format(self): + # https://www.llama.com/docs/model-cards-and-prompt-formats/llama4/ + prompt = ( + "<|begin_of_text|><|header_start|>system<|header_end|>\n\n" + "You are a helpful assistant<|eot|>" + "<|header_start|>user<|header_end|>\n\n" + "Answer who are you in the form of jeopardy?<|eot|>" + "<|header_start|>assistant<|header_end|>\n\n" + ) + result = parse_prompt_to_messages(prompt) + assert len(result) == 2 + assert result[0] == {"role": "system", "content": "You are a helpful assistant"} + assert result[1] == {"role": "user", "content": "Answer who are you in the form of jeopardy?"} + + def test_granite_format(self): + # https://www.ibm.com/docs/en/watsonx/saas?topic=solutions-prompt-lab + prompt = ( + "<|start_of_role|>system<|end_of_role|>\n" + "You are Granite, an AI assistant developed by IBM.<|end_of_text|>\n" + "<|start_of_role|>user<|end_of_role|>\n" + "What can you help me with?<|end_of_text|>\n" + "<|start_of_role|>assistant<|end_of_role|>\n" + ) + result = parse_prompt_to_messages(prompt) + assert len(result) == 2 + assert result[0] == {"role": "system", "content": "You are Granite, an AI assistant developed by IBM."} + assert result[1] == {"role": "user", "content": "What can you help me with?"} + + def test_granite_format_with_documents(self): + # https://www.ibm.com/docs/en/watsonx/saas?topic=solutions-prompt-lab + prompt = ( + "<|start_of_role|>system<|end_of_role|>\n" + "You are a helpful assistant.<|end_of_text|>\n" + "<|start_of_role|>document<|end_of_role|>\n" + "Document content here.<|end_of_text|>\n" + "<|start_of_role|>user<|end_of_role|>\n" + "Summarize the document.<|end_of_text|>\n" + "<|start_of_role|>assistant<|end_of_role|>\n" + ) + result = parse_prompt_to_messages(prompt) + assert len(result) == 3 + assert result[0] == {"role": "system", "content": "You are a helpful assistant."} + assert result[1] == {"role": "document", "content": "Document content here."} + assert result[2] == {"role": "user", "content": "Summarize the document."} + + def test_gemma_format(self): + # https://ai.google.dev/gemma/docs/core/prompt-structure + prompt = ( + "<start_of_turn>user\n" + "knock knock<end_of_turn>\n" + "<start_of_turn>model\n" + "who is there<end_of_turn>\n" + "<start_of_turn>user\n" + "Gemma<end_of_turn>\n" + "<start_of_turn>model\n" + "Gemma who?<end_of_turn>\n" + ) + result = parse_prompt_to_messages(prompt) + assert len(result) == 4 + assert result[0] == {"role": "user", "content": "knock knock"} + assert result[1] == {"role": "model", "content": "who is there"} + assert result[2] == {"role": "user", "content": "Gemma"} + assert result[3] == {"role": "model", "content": "Gemma who?"} + + def test_minimax_format(self): + # https://platform.minimax.io/docs/guides/text-function-call + prompt = ( + "system\n" + "You are a helpful assistant.\n" + "user\n" + "What is the weather today?\n" + "ai\n" + ) + result = parse_prompt_to_messages(prompt) + assert len(result) == 2 + assert result[0] == {"role": "system", "content": "You 
are a helpful assistant."} + assert result[1] == {"role": "user", "content": "What is the weather today?"} + # Trailing empty "ai" is skipped (generation prompt marker) + + def test_chatml_format(self): + # https://qwen.readthedocs.io/en/latest/getting_started/concepts.html + prompt = ( + "<|im_start|>system\n" + "You are a helpful assistant.<|im_end|>\n" + "<|im_start|>user\n" + "Hello, how are you?<|im_end|>\n" + "<|im_start|>assistant\n" + "I'm doing well, thank you! How can I help you today?<|im_end|>\n" + "<|im_start|>user\n" + "Explain large language models.<|im_end|>\n" + "<|im_start|>assistant\n" + ) + result = parse_prompt_to_messages(prompt) + assert len(result) == 4 + assert result[0] == {"role": "system", "content": "You are a helpful assistant."} + assert result[1] == {"role": "user", "content": "Hello, how are you?"} + assert result[2] == {"role": "assistant", "content": "I'm doing well, thank you! How can I help you today?"} + assert result[3] == {"role": "user", "content": "Explain large language models."} + + def test_deepseek_vl2_format(self): + # https://github.com/vllm-project/vllm/blob/main/vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja + prompt = ( + "You are a helpful assistant.\n" + "<|User|>: What is 2+2?\n\n" + "<|Assistant|>: The answer is 4.\n\n" + "<|User|>: And 3+3?\n\n" + "<|Assistant|>: " + ) + result = parse_prompt_to_messages(prompt) + assert len(result) == 3 + assert result[0] == {"role": "user", "content": "What is 2+2?"} + assert result[1] == {"role": "assistant", "content": "The answer is 4."} + assert result[2] == {"role": "user", "content": "And 3+3?"} + + def test_phi_format_with_system(self): + # https://github.com/vllm-project/vllm/blob/main/examples/tool_chat_template_phi4_mini.jinja + prompt = ( + "<|system|>\n" + "You are a helpful assistant.<|end|>\n" + "<|user|>\n" + "What is the meaning of life?<|end|>\n" + "<|assistant|>\n" + ) + result = parse_prompt_to_messages(prompt) + assert len(result) == 2 + assert result[0] == {"role": "system", "content": "You are a helpful assistant."} + assert result[1] == {"role": "user", "content": "What is the meaning of life?"} + + def test_phi_format_with_user(self): + # https://github.com/vllm-project/vllm/blob/main/examples/tool_chat_template_phi4_mini.jinja + prompt = "<|user|>\nHello there!<|end|>\n<|assistant|>\n" + result = parse_prompt_to_messages(prompt) + assert len(result) == 1 + assert result[0] == {"role": "user", "content": "Hello there!"} + + def test_deepseek_v3_format(self): + # https://huggingface.co/deepseek-ai/DeepSeek-V3.1 + # Uses fullwidth | (U+FF5C) + prompt = "<|begin▁of▁sentence|>You are a helpful assistant<|User|>Who are you?<|Assistant|>" + result = parse_prompt_to_messages(prompt) + assert len(result) == 1 + assert result[0] == {"role": "user", "content": "Who are you?"} + + def test_teleflm_format_with_user(self): + # https://github.com/vllm-project/vllm/blob/main/examples/template_teleflm.jinja + prompt = "<_user>What time is it?<_bot>" + result = parse_prompt_to_messages(prompt) + assert len(result) == 1 + assert result[0] == {"role": "user", "content": "What time is it?"} + + def test_teleflm_format_with_system(self): + # https://github.com/vllm-project/vllm/blob/main/examples/template_teleflm.jinja + prompt = "<_system>You are a helpful assistant.<_user>What time is it?<_bot>" + result = parse_prompt_to_messages(prompt) + assert len(result) == 2 + assert result[0] == {"role": "system", "content": "You are a helpful assistant."} + assert result[1] == 
{"role": "user", "content": "What time is it?"} + + def test_inkbot_format_with_user(self): + # https://github.com/vllm-project/vllm/blob/main/examples/template_inkbot.jinja + prompt = "<#user#>\nHow can I help you today?\n<#bot#>\n" + result = parse_prompt_to_messages(prompt) + assert len(result) == 1 + assert result[0] == {"role": "user", "content": "How can I help you today?"} + + def test_inkbot_format_with_system(self): + # https://github.com/vllm-project/vllm/blob/main/examples/template_inkbot.jinja + prompt = "<#system#>\nYou are a helpful assistant.\n<#user#>\nHow can I help you today?\n<#bot#>\n" + result = parse_prompt_to_messages(prompt) + assert len(result) == 2 + assert result[0] == {"role": "system", "content": "You are a helpful assistant."} + assert result[1] == {"role": "user", "content": "How can I help you today?"} + + def test_alpaca_format(self): + # https://github.com/vllm-project/vllm/blob/main/examples/template_alpaca.jinja + prompt = "### Instruction:\nWrite a poem about the sea.\n\n### Response:\n" + result = parse_prompt_to_messages(prompt) + assert len(result) == 1 + assert result[0] == {"role": "instruction", "content": "Write a poem about the sea."} + + def test_falcon_format(self): + # https://github.com/vllm-project/vllm/blob/main/examples/template_falcon.jinja + prompt = "User: What is the capital of France?\nAssistant:" + result = parse_prompt_to_messages(prompt) + assert len(result) == 1 + assert result[0] == {"role": "user", "content": "What is the capital of France?"} + + def test_skips_empty_trailing_assistant(self): + """Empty trailing assistant message should be skipped.""" + prompt = "<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\n<|im_end|>" + result = parse_prompt_to_messages(prompt) + assert len(result) == 1 + assert result[0] == {"role": "user", "content": "Hello"} + + def test_preserves_multiline_content(self): + """Content with newlines should be preserved.""" + prompt = ( + "<|im_start|>system\n" + "Line 1\nLine 2\nLine 3<|im_end|>\n" + "<|im_start|>user\n" + "Question<|im_end|>\n" + "<|im_start|>assistant\n" + ) + result = parse_prompt_to_messages(prompt) + assert len(result) == 2 + assert result[0] == {"role": "system", "content": "Line 1\nLine 2\nLine 3"} diff --git a/tests/contrib/vllm/test_vllm_llmobs.py b/tests/contrib/vllm/test_vllm_llmobs.py new file mode 100644 index 00000000000..d215c9bcef6 --- /dev/null +++ b/tests/contrib/vllm/test_vllm_llmobs.py @@ -0,0 +1,286 @@ +import mock +import pytest + +from ddtrace.llmobs.types import Message +from tests.llmobs._utils import _expected_llmobs_llm_span_event + +from ._utils import get_simple_chat_template + + +IGNORE_FIELDS = [ + "metrics.vllm.latency.ttft", + "metrics.vllm.latency.queue", + "metrics.vllm.latency.prefill", + "metrics.vllm.latency.decode", + "metrics.vllm.latency.inference", +] + + +@pytest.mark.snapshot(ignores=IGNORE_FIELDS) +def test_llmobs_basic(llmobs_events, mock_tracer, opt_125m_llm): + from vllm import SamplingParams + + llm = opt_125m_llm + sampling = SamplingParams(temperature=0.1, top_p=0.9, max_tokens=8, seed=42) + llm.generate("The future of AI is", sampling) + span = mock_tracer.pop_traces()[0][0] + + assert len(llmobs_events) == 1 + expected = _expected_llmobs_llm_span_event( + span, + model_name="facebook/opt-125m", + model_provider="vllm", + input_messages=[Message(content="The future of AI is")], + output_messages=[Message(content=" in the hands of the people.")], + metadata={ + "max_tokens": 8, + "n": 1, + "temperature": 0.1, + "top_p": 0.9, + 
"finish_reason": "length", + "num_cached_tokens": 0, + }, + token_metrics={ + "input_tokens": 6, + "output_tokens": 8, + "total_tokens": 14, + "time_to_first_token": mock.ANY, + "time_in_queue": mock.ANY, + "time_in_model_prefill": mock.ANY, + "time_in_model_decode": mock.ANY, + "time_in_model_inference": mock.ANY, + }, + tags={"ml_app": "", "service": "tests.contrib.vllm"}, + ) + assert llmobs_events[0] == expected + + +@pytest.mark.snapshot(ignores=IGNORE_FIELDS) +def test_llmobs_chat(llmobs_events, mock_tracer, opt_125m_llm): + from vllm import SamplingParams + + llm = opt_125m_llm + sampling_params = SamplingParams(seed=42) + + conversation = [ + {"role": "system", "content": "You are a helpful assistant"}, + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hello! How can I assist you today?"}, + {"role": "user", "content": "Write an essay about the importance of higher education."}, + ] + + llm.chat(conversation, sampling_params, chat_template=get_simple_chat_template(), use_tqdm=False) + span = mock_tracer.pop_traces()[0][0] + + assert len(llmobs_events) == 1 + expected = _expected_llmobs_llm_span_event( + span, + model_name="facebook/opt-125m", + model_provider="vllm", + input_messages=[ + Message(role="system", content="You are a helpful assistant"), + Message(role="user", content="Hello"), + Message(role="assistant", content="Hello! How can I assist you today?"), + Message(role="user", content="Write an essay about the importance of higher education."), + ], + output_messages=[Message(content=mock.ANY)], + metadata={ + "max_tokens": 16, + "temperature": 1.0, + "top_p": 1.0, + "n": 1, + "finish_reason": "length", + "num_cached_tokens": mock.ANY, + }, + token_metrics={ + "input_tokens": mock.ANY, + "output_tokens": 16, + "total_tokens": mock.ANY, + "time_to_first_token": mock.ANY, + "time_in_queue": mock.ANY, + "time_in_model_prefill": mock.ANY, + "time_in_model_decode": mock.ANY, + "time_in_model_inference": mock.ANY, + }, + tags={"ml_app": "", "service": "tests.contrib.vllm"}, + ) + assert llmobs_events[0] == expected + + +@pytest.mark.snapshot(ignores=IGNORE_FIELDS) +def test_llmobs_classify(llmobs_events, mock_tracer, bge_reranker_llm): + llm = bge_reranker_llm + + prompts = [ + "Hello, my name is", + "The capital of France is", + ] + + llm.classify(prompts) + traces = mock_tracer.pop_traces() + spans = [s for t in traces for s in t] + + # Expect one event per input prompt + assert len(llmobs_events) == len(prompts) == len(spans) + span_by_id = {s.span_id: s for s in spans} + + for prompt, event in zip(prompts, llmobs_events): + span = span_by_id[int(event["span_id"])] + expected = _expected_llmobs_llm_span_event( + span, + span_kind="embedding", + model_name="BAAI/bge-reranker-v2-m3", + model_provider="vllm", + input_documents=[{"text": prompt}], + output_value="[1 embedding(s) returned with size 1]", + metadata={"embedding_dim": 1, "num_cached_tokens": 0}, + token_metrics={ + "input_tokens": 7, + "output_tokens": 0, + "total_tokens": 7, + "time_to_first_token": mock.ANY, + "time_in_queue": mock.ANY, + "time_in_model_prefill": mock.ANY, + "time_in_model_inference": mock.ANY, + }, + tags={"ml_app": "", "service": "tests.contrib.vllm"}, + ) + assert event == expected + + +@pytest.mark.snapshot(ignores=IGNORE_FIELDS) +def test_llmobs_embed(llmobs_events, mock_tracer, e5_small_llm): + llm = e5_small_llm + + prompts = [ + "Hello, my name is", + "The capital of France is", + ] + + llm.embed(prompts) + traces = mock_tracer.pop_traces() + spans = [s for t in 
traces for s in t] + + # Expect one event per input prompt + assert len(llmobs_events) == len(prompts) == len(spans) + span_by_id = {s.span_id: s for s in spans} + + for prompt, event in zip(prompts, llmobs_events): + span = span_by_id[int(event["span_id"])] + expected = _expected_llmobs_llm_span_event( + span, + span_kind="embedding", + model_name="intfloat/e5-small", + model_provider="vllm", + input_documents=[{"text": prompt}], + output_value="[1 embedding(s) returned with size 384]", + metadata={"embedding_dim": 384, "num_cached_tokens": 0}, + token_metrics={ + "input_tokens": 7, + "output_tokens": 0, + "total_tokens": 7, + "time_to_first_token": mock.ANY, + "time_in_queue": mock.ANY, + "time_in_model_prefill": mock.ANY, + "time_in_model_inference": mock.ANY, + }, + tags={"ml_app": "", "service": "tests.contrib.vllm"}, + ) + assert event == expected + + +@pytest.mark.snapshot(ignores=IGNORE_FIELDS) +def test_llmobs_reward(llmobs_events, mock_tracer, bge_reranker_llm): + llm = bge_reranker_llm + + prompts = [ + "Hello, my name is", + "The capital of France is", + ] + + llm.reward(prompts) + traces = mock_tracer.pop_traces() + spans = [s for t in traces for s in t] + + # Expect one event per input prompt + assert len(llmobs_events) == len(prompts) == len(spans) + span_by_id = {s.span_id: s for s in spans} + + for prompt, event in zip(prompts, llmobs_events): + span = span_by_id[int(event["span_id"])] + expected = _expected_llmobs_llm_span_event( + span, + span_kind="embedding", + model_name="BAAI/bge-reranker-v2-m3", + model_provider="vllm", + input_documents=[{"text": prompt}], + output_value="[7 embedding(s) returned with size 1024]", + metadata={"embedding_dim": 1024, "num_cached_tokens": 0}, + token_metrics={ + "input_tokens": 7, + "output_tokens": 0, + "total_tokens": 7, + "time_to_first_token": mock.ANY, + "time_in_queue": mock.ANY, + "time_in_model_prefill": mock.ANY, + "time_in_model_inference": mock.ANY, + }, + tags={"ml_app": "", "service": "tests.contrib.vllm"}, + ) + assert event == expected + + +@pytest.mark.snapshot(ignores=IGNORE_FIELDS) +def test_llmobs_score(llmobs_events, mock_tracer, bge_reranker_llm): + llm = bge_reranker_llm + + text_1 = "What is the capital of France?" 
+ texts_2 = [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris.", + ] + + llm.score(text_1, texts_2) + traces = mock_tracer.pop_traces() + spans = [s for t in traces for s in t] + + # Expect one event per candidate document + assert len(llmobs_events) == len(texts_2) == len(spans) + span_by_id = {s.span_id: s for s in spans} + + expected_token_metrics_by_text = { + "[0, 4865, 83, 70, 10323, 111, 9942, 32, 2, 2, 581, 10323, 111, 30089, 83, 8233, 399, 5, 2]": { + "input_tokens": 19, + "output_tokens": 0, + "total_tokens": 19, + "time_to_first_token": mock.ANY, + "time_in_queue": mock.ANY, + "time_in_model_prefill": mock.ANY, + "time_in_model_inference": mock.ANY, + }, + "[0, 4865, 83, 70, 10323, 111, 9942, 32, 2, 2, 581, 10323, 111, 9942, 83, 7270, 5, 2]": { + "input_tokens": 18, + "output_tokens": 0, + "total_tokens": 18, + "time_to_first_token": mock.ANY, + "time_in_queue": mock.ANY, + "time_in_model_prefill": mock.ANY, + "time_in_model_inference": mock.ANY, + }, + } + + for event in llmobs_events: + span = span_by_id[int(event["span_id"])] + token_text = event["meta"]["input"]["documents"][0]["text"] + expected = _expected_llmobs_llm_span_event( + span, + span_kind="embedding", + model_name="BAAI/bge-reranker-v2-m3", + model_provider="vllm", + input_documents=[{"text": token_text}], + output_value="[1 embedding(s) returned with size 1]", + metadata={"embedding_dim": 1, "num_cached_tokens": 0}, + token_metrics=expected_token_metrics_by_text[token_text], + tags={"ml_app": "", "service": "tests.contrib.vllm"}, + ) + assert event == expected diff --git a/tests/llmobs/suitespec.yml b/tests/llmobs/suitespec.yml index b224a938e40..676a2b2dfae 100644 --- a/tests/llmobs/suitespec.yml +++ b/tests/llmobs/suitespec.yml @@ -26,6 +26,8 @@ components: - ddtrace/contrib/internal/openai_agents/* pydantic_ai: - ddtrace/contrib/internal/pydantic_ai/* + vllm: + - ddtrace/contrib/internal/vllm/* suites: anthropic: parallelism: 2 @@ -197,3 +199,18 @@ suites: runner: riot snapshot: true venvs_per_job: 2 + vllm: + parallelism: 1 + paths: + - '@bootstrap' + - '@core' + - '@tracing' + - '@contrib' + - '@vllm' + - '@llmobs' + - tests/contrib/vllm/* + - tests/snapshots/tests.contrib.vllm.* + runner: riot + gpu: true + snapshot: true + diff --git a/tests/snapshots/tests.contrib.vllm.test_api_app.test_rag_parent_child.json b/tests/snapshots/tests.contrib.vllm.test_api_app.test_rag_parent_child.json new file mode 100644 index 00000000000..c3f984752cb --- /dev/null +++ b/tests/snapshots/tests.contrib.vllm.test_api_app.test_rag_parent_child.json @@ -0,0 +1,149 @@ +[[ + { + "name": "api.rag", + "service": "tests.contrib.vllm", + "resource": "api.rag", + "trace_id": 0, + "span_id": 1, + "parent_id": 0, + "type": "", + "error": 0, + "meta": { + "_dd.p.dm": "-0", + "_dd.p.llmobs_ml_app": "", + "_dd.p.llmobs_parent_id": "undefined", + "_dd.p.llmobs_trace_id": "139852621697726191151886226690907377488", + "_dd.p.tid": "6936a29c00000000", + "language": "python", + "runtime-id": "3ba78b6a59db4075ba8116247bfff62c" + }, + "metrics": { + "_dd.top_level": 1.0, + "_dd.tracer_kr": 1.0, + "_sampling_priority_v1": 1.0, + "process_id": 573.0 + }, + "duration": 12281970976, + "start": 1765188252014480515 + }, + { + "name": "vllm.request", + "service": "tests.contrib.vllm", + "resource": "vllm.request", + "trace_id": 0, + "span_id": 2, + "parent_id": 1, + "type": "llm", + "error": 0, + "meta": { + "_dd.p.llmobs_ml_app": "", + "_dd.p.llmobs_parent_id": "undefined", + "_dd.p.llmobs_trace_id": 
"139852622014638841213851944042997448706", + "_dd.p.tid": "6936a29c00000000", + "runtime-id": "3ba78b6a59db4075ba8116247bfff62c", + "vllm.request.model": "intfloat/e5-small-v2", + "vllm.request.provider": "vllm" + }, + "metrics": { + "_dd.measured": 1.0, + "_dd.top_level": 1.0, + "process_id": 573.0, + "vllm.latency.inference": 0.16328352299751714, + "vllm.latency.prefill": 0.16328352299751714, + "vllm.latency.queue": 4.4741027522832155e-05, + "vllm.latency.ttft": 0.35298991203308105 + }, + "duration": 354100637, + "start": 1765188256991254016 + }, + { + "name": "vllm.request", + "service": "tests.contrib.vllm", + "resource": "vllm.request", + "trace_id": 0, + "span_id": 3, + "parent_id": 1, + "type": "llm", + "error": 0, + "meta": { + "_dd.p.llmobs_ml_app": "", + "_dd.p.llmobs_parent_id": "undefined", + "_dd.p.llmobs_trace_id": "139852622093867003732216588897463149330", + "_dd.p.tid": "6936a29c00000000", + "runtime-id": "3ba78b6a59db4075ba8116247bfff62c", + "vllm.request.model": "intfloat/e5-small-v2", + "vllm.request.provider": "vllm" + }, + "metrics": { + "_dd.measured": 1.0, + "_dd.top_level": 1.0, + "process_id": 573.0, + "vllm.latency.inference": 0.007165413990151137, + "vllm.latency.prefill": 0.007165413990151137, + "vllm.latency.queue": 0.00012883200543001294, + "vllm.latency.ttft": 0.009274721145629883 + }, + "duration": 9848402, + "start": 1765188257346675200 + }, + { + "name": "vllm.request", + "service": "tests.contrib.vllm", + "resource": "vllm.request", + "trace_id": 0, + "span_id": 4, + "parent_id": 1, + "type": "llm", + "error": 0, + "meta": { + "_dd.p.llmobs_ml_app": "", + "_dd.p.llmobs_parent_id": "undefined", + "_dd.p.llmobs_trace_id": "139852622093867003721424398273548191883", + "_dd.p.tid": "6936a29c00000000", + "runtime-id": "3ba78b6a59db4075ba8116247bfff62c", + "vllm.request.model": "intfloat/e5-small-v2", + "vllm.request.provider": "vllm" + }, + "metrics": { + "_dd.measured": 1.0, + "_dd.top_level": 1.0, + "process_id": 573.0, + "vllm.latency.inference": 0.00657972099725157, + "vllm.latency.prefill": 0.00657972099725157, + "vllm.latency.queue": 5.130103090777993e-05, + "vllm.latency.ttft": 0.008209466934204102 + }, + "duration": 8775021, + "start": 1765188257357449728 + }, + { + "name": "vllm.request", + "service": "tests.contrib.vllm", + "resource": "vllm.request", + "trace_id": 0, + "span_id": 5, + "parent_id": 1, + "type": "llm", + "error": 0, + "meta": { + "_dd.p.llmobs_ml_app": "", + "_dd.p.llmobs_parent_id": "undefined", + "_dd.p.llmobs_trace_id": "139852622569235978801466275868881827569", + "_dd.p.tid": "6936a29c00000000", + "runtime-id": "3ba78b6a59db4075ba8116247bfff62c", + "vllm.request.model": "facebook/opt-125m", + "vllm.request.provider": "vllm" + }, + "metrics": { + "_dd.measured": 1.0, + "_dd.top_level": 1.0, + "process_id": 573.0, + "vllm.latency.decode": 0.14062083698809147, + "vllm.latency.inference": 0.2075997189967893, + "vllm.latency.prefill": 0.06697888200869784, + "vllm.latency.queue": 1.7681042663753033e-05, + "vllm.latency.ttft": 0.22455477714538574 + }, + "duration": 365513241, + "start": 1765188263600904960 + }]] diff --git a/tests/snapshots/tests.contrib.vllm.test_vllm_llmobs.test_llmobs_basic.json b/tests/snapshots/tests.contrib.vllm.test_vllm_llmobs.test_llmobs_basic.json new file mode 100644 index 00000000000..346e91afbcd --- /dev/null +++ b/tests/snapshots/tests.contrib.vllm.test_vllm_llmobs.test_llmobs_basic.json @@ -0,0 +1,33 @@ +[[ + { + "name": "vllm.request", + "service": "tests.contrib.vllm", + "resource": "vllm.request", + 
"trace_id": 0, + "span_id": 1, + "parent_id": 0, + "type": "llm", + "error": 0, + "meta": { + "_dd.p.dm": "-0", + "_dd.p.tid": "6936a28d00000000", + "language": "python", + "runtime-id": "3ba78b6a59db4075ba8116247bfff62c", + "vllm.request.model": "facebook/opt-125m", + "vllm.request.provider": "vllm" + }, + "metrics": { + "_dd.measured": 1.0, + "_dd.top_level": 1.0, + "_dd.tracer_kr": 1.0, + "_sampling_priority_v1": 1.0, + "process_id": 573.0, + "vllm.latency.decode": 0.04719014198053628, + "vllm.latency.inference": 0.07701651600655168, + "vllm.latency.prefill": 0.0298263740260154, + "vllm.latency.queue": 0.00018273299792781472, + "vllm.latency.ttft": 0.03212332725524902 + }, + "duration": 79799280, + "start": 1765188236988379392 + }]] diff --git a/tests/snapshots/tests.contrib.vllm.test_vllm_llmobs.test_llmobs_chat.json b/tests/snapshots/tests.contrib.vllm.test_vllm_llmobs.test_llmobs_chat.json new file mode 100644 index 00000000000..eb25b188792 --- /dev/null +++ b/tests/snapshots/tests.contrib.vllm.test_vllm_llmobs.test_llmobs_chat.json @@ -0,0 +1,33 @@ +[[ + { + "name": "vllm.request", + "service": "tests.contrib.vllm", + "resource": "vllm.request", + "trace_id": 0, + "span_id": 1, + "parent_id": 0, + "type": "llm", + "error": 0, + "meta": { + "_dd.p.dm": "-0", + "_dd.p.tid": "6936a28c00000000", + "language": "python", + "runtime-id": "3ba78b6a59db4075ba8116247bfff62c", + "vllm.request.model": "facebook/opt-125m", + "vllm.request.provider": "vllm" + }, + "metrics": { + "_dd.measured": 1.0, + "_dd.top_level": 1.0, + "_dd.tracer_kr": 1.0, + "_sampling_priority_v1": 1.0, + "process_id": 573.0, + "vllm.latency.decode": 0.10228108201408759, + "vllm.latency.inference": 0.1432632320211269, + "vllm.latency.prefill": 0.04098215000703931, + "vllm.latency.queue": 8.597195846959949e-05, + "vllm.latency.ttft": 0.04277944564819336 + }, + "duration": 145692608, + "start": 1765188235957388032 + }]] diff --git a/tests/snapshots/tests.contrib.vllm.test_vllm_llmobs.test_llmobs_classify.json b/tests/snapshots/tests.contrib.vllm.test_vllm_llmobs.test_llmobs_classify.json new file mode 100644 index 00000000000..9d86d331c24 --- /dev/null +++ b/tests/snapshots/tests.contrib.vllm.test_vllm_llmobs.test_llmobs_classify.json @@ -0,0 +1,64 @@ +[[ + { + "name": "vllm.request", + "service": "tests.contrib.vllm", + "resource": "vllm.request", + "trace_id": 0, + "span_id": 1, + "parent_id": 0, + "type": "llm", + "error": 0, + "meta": { + "_dd.p.dm": "-0", + "_dd.p.tid": "6936a28d00000000", + "language": "python", + "runtime-id": "3ba78b6a59db4075ba8116247bfff62c", + "vllm.request.model": "BAAI/bge-reranker-v2-m3", + "vllm.request.provider": "vllm" + }, + "metrics": { + "_dd.measured": 1.0, + "_dd.top_level": 1.0, + "_dd.tracer_kr": 1.0, + "_sampling_priority_v1": 1.0, + "process_id": 573.0, + "vllm.latency.inference": 0.013529117044527084, + "vllm.latency.prefill": 0.013529117044527084, + "vllm.latency.queue": 0.0001674939994700253, + "vllm.latency.ttft": 0.015713214874267578 + }, + "duration": 16190542, + "start": 1765188237392694784 + }], +[ + { + "name": "vllm.request", + "service": "tests.contrib.vllm", + "resource": "vllm.request", + "trace_id": 1, + "span_id": 1, + "parent_id": 0, + "type": "llm", + "error": 0, + "meta": { + "_dd.p.dm": "-0", + "_dd.p.tid": "6936a28d00000000", + "language": "python", + "runtime-id": "3ba78b6a59db4075ba8116247bfff62c", + "vllm.request.model": "BAAI/bge-reranker-v2-m3", + "vllm.request.provider": "vllm" + }, + "metrics": { + "_dd.measured": 1.0, + "_dd.top_level": 1.0, + 
"_dd.tracer_kr": 1.0, + "_sampling_priority_v1": 1.0, + "process_id": 573.0, + "vllm.latency.inference": 0.012352541962172836, + "vllm.latency.prefill": 0.012352541962172836, + "vllm.latency.queue": 7.889990229159594e-06, + "vllm.latency.ttft": 0.02725076675415039 + }, + "duration": 27648805, + "start": 1765188237393234176 + }]] diff --git a/tests/snapshots/tests.contrib.vllm.test_vllm_llmobs.test_llmobs_embed.json b/tests/snapshots/tests.contrib.vllm.test_vllm_llmobs.test_llmobs_embed.json new file mode 100644 index 00000000000..aeedd8b8f12 --- /dev/null +++ b/tests/snapshots/tests.contrib.vllm.test_vllm_llmobs.test_llmobs_embed.json @@ -0,0 +1,64 @@ +[[ + { + "name": "vllm.request", + "service": "tests.contrib.vllm", + "resource": "vllm.request", + "trace_id": 0, + "span_id": 1, + "parent_id": 0, + "type": "llm", + "error": 0, + "meta": { + "_dd.p.dm": "-0", + "_dd.p.tid": "6936a29a00000000", + "language": "python", + "runtime-id": "3ba78b6a59db4075ba8116247bfff62c", + "vllm.request.model": "intfloat/e5-small", + "vllm.request.provider": "vllm" + }, + "metrics": { + "_dd.measured": 1.0, + "_dd.top_level": 1.0, + "_dd.tracer_kr": 1.0, + "_sampling_priority_v1": 1.0, + "process_id": 573.0, + "vllm.latency.inference": 0.21128384995972738, + "vllm.latency.prefill": 0.21128384995972738, + "vllm.latency.queue": 9.00209997780621e-05, + "vllm.latency.ttft": 0.21466588973999023 + }, + "duration": 215546539, + "start": 1765188249802107904 + }], +[ + { + "name": "vllm.request", + "service": "tests.contrib.vllm", + "resource": "vllm.request", + "trace_id": 1, + "span_id": 1, + "parent_id": 0, + "type": "llm", + "error": 0, + "meta": { + "_dd.p.dm": "-0", + "_dd.p.tid": "6936a29a00000000", + "language": "python", + "runtime-id": "3ba78b6a59db4075ba8116247bfff62c", + "vllm.request.model": "intfloat/e5-small", + "vllm.request.provider": "vllm" + }, + "metrics": { + "_dd.measured": 1.0, + "_dd.top_level": 1.0, + "_dd.tracer_kr": 1.0, + "_sampling_priority_v1": 1.0, + "process_id": 573.0, + "vllm.latency.inference": 0.00829247699584812, + "vllm.latency.prefill": 0.00829247699584812, + "vllm.latency.queue": 8.840986993163824e-06, + "vllm.latency.ttft": 0.22136187553405762 + }, + "duration": 221856576, + "start": 1765188249802959360 + }]] diff --git a/tests/snapshots/tests.contrib.vllm.test_vllm_llmobs.test_llmobs_reward.json b/tests/snapshots/tests.contrib.vllm.test_vllm_llmobs.test_llmobs_reward.json new file mode 100644 index 00000000000..4b2a1e0dc04 --- /dev/null +++ b/tests/snapshots/tests.contrib.vllm.test_vllm_llmobs.test_llmobs_reward.json @@ -0,0 +1,64 @@ +[[ + { + "name": "vllm.request", + "service": "tests.contrib.vllm", + "resource": "vllm.request", + "trace_id": 0, + "span_id": 1, + "parent_id": 0, + "type": "llm", + "error": 0, + "meta": { + "_dd.p.dm": "-0", + "_dd.p.tid": "6936a28c00000000", + "language": "python", + "runtime-id": "3ba78b6a59db4075ba8116247bfff62c", + "vllm.request.model": "BAAI/bge-reranker-v2-m3", + "vllm.request.provider": "vllm" + }, + "metrics": { + "_dd.measured": 1.0, + "_dd.top_level": 1.0, + "_dd.tracer_kr": 1.0, + "_sampling_priority_v1": 1.0, + "process_id": 573.0, + "vllm.latency.inference": 0.18620535300578922, + "vllm.latency.prefill": 0.18620535300578922, + "vllm.latency.queue": 0.00014735397417098284, + "vllm.latency.ttft": 0.18919801712036133 + }, + "duration": 189851489, + "start": 1765188236472516096 + }], +[ + { + "name": "vllm.request", + "service": "tests.contrib.vllm", + "resource": "vllm.request", + "trace_id": 1, + "span_id": 1, + "parent_id": 0, + 
"type": "llm", + "error": 0, + "meta": { + "_dd.p.dm": "-0", + "_dd.p.tid": "6936a28c00000000", + "language": "python", + "runtime-id": "3ba78b6a59db4075ba8116247bfff62c", + "vllm.request.model": "BAAI/bge-reranker-v2-m3", + "vllm.request.provider": "vllm" + }, + "metrics": { + "_dd.measured": 1.0, + "_dd.top_level": 1.0, + "_dd.tracer_kr": 1.0, + "_sampling_priority_v1": 1.0, + "process_id": 573.0, + "vllm.latency.inference": 0.013218810025136918, + "vllm.latency.prefill": 0.013218810025136918, + "vllm.latency.queue": 8.311006240546703e-06, + "vllm.latency.ttft": 0.20083880424499512 + }, + "duration": 201263133, + "start": 1765188236473232640 + }]] diff --git a/tests/snapshots/tests.contrib.vllm.test_vllm_llmobs.test_llmobs_score.json b/tests/snapshots/tests.contrib.vllm.test_vllm_llmobs.test_llmobs_score.json new file mode 100644 index 00000000000..6b1c8b9160e --- /dev/null +++ b/tests/snapshots/tests.contrib.vllm.test_vllm_llmobs.test_llmobs_score.json @@ -0,0 +1,64 @@ +[[ + { + "name": "vllm.request", + "service": "tests.contrib.vllm", + "resource": "vllm.request", + "trace_id": 0, + "span_id": 1, + "parent_id": 0, + "type": "llm", + "error": 0, + "meta": { + "_dd.p.dm": "-0", + "_dd.p.tid": "6936a27e00000000", + "language": "python", + "runtime-id": "3ba78b6a59db4075ba8116247bfff62c", + "vllm.request.model": "BAAI/bge-reranker-v2-m3", + "vllm.request.provider": "vllm" + }, + "metrics": { + "_dd.measured": 1.0, + "_dd.top_level": 1.0, + "_dd.tracer_kr": 1.0, + "_sampling_priority_v1": 1.0, + "process_id": 573.0, + "vllm.latency.inference": 0.045261238992679864, + "vllm.latency.prefill": 0.045261238992679864, + "vllm.latency.queue": 0.00014211301458999515, + "vllm.latency.ttft": 0.04782295227050781 + }, + "duration": 48812318, + "start": 1765188222587318528 + }], +[ + { + "name": "vllm.request", + "service": "tests.contrib.vllm", + "resource": "vllm.request", + "trace_id": 1, + "span_id": 1, + "parent_id": 0, + "type": "llm", + "error": 0, + "meta": { + "_dd.p.dm": "-0", + "_dd.p.tid": "6936a27e00000000", + "language": "python", + "runtime-id": "3ba78b6a59db4075ba8116247bfff62c", + "vllm.request.model": "BAAI/bge-reranker-v2-m3", + "vllm.request.provider": "vllm" + }, + "metrics": { + "_dd.measured": 1.0, + "_dd.top_level": 1.0, + "_dd.tracer_kr": 1.0, + "_sampling_priority_v1": 1.0, + "process_id": 573.0, + "vllm.latency.inference": 0.012816611037123948, + "vllm.latency.prefill": 0.012816611037123948, + "vllm.latency.queue": 7.789989467710257e-06, + "vllm.latency.ttft": 0.05951714515686035 + }, + "duration": 61092922, + "start": 1765188222587809024 + }]]