-
Notifications
You must be signed in to change notification settings - Fork 262
Description
Describe the bug
The following error occurs when converting the Qwen model to a QNN-supported ONNX model:
2025-10-11 20:19:31.4721216 [E:onnxruntime:, sequential_executor.cc:572 onnxruntime::ExecuteKernel] Non-zero status code returned while running Mul node. Name:'/model/layers.14/mlp/act_fn/Mul' Status Message: E:\_work\1\s\onnxruntime\core\framework\bfc_arena.cc:359 onnxruntime::BFCArena::AllocateRawInternal Failed to allocate memory for requested buffer of size 159383552
Tested models: Qwen2.5-7B-Instruct and Qwen3-4B-Instruct-2507
To Reproduce
Configure the environment and run according to the guidelines in olive-recipes/Qwen-Qwen2.5-7B-Instruct/QNN at main · microsoft/olive-recipes
Expected behavior
Output a QNN-supported ONNX model
Olive config
{
"input_model": { "type": "HfModel", "model_path": "Qwen/Qwen3-4B-Instruct-2507" },
"systems": {
"qnn_system": {
"type": "PythonEnvironment",
"python_environment_path": "F:\\AI\\gaotong\\olive\\qnn\\.venv\\Scripts",
"accelerators": [ { "execution_providers": [ "QNNExecutionProvider" ] } ]
}
},
"data_configs": [
{
"name": "wikitext2_train_joined",
"type": "HuggingfaceContainer",
"load_dataset_config": { "data_name": "wikitext", "subset": "wikitext-2-raw-v1", "split": "train" },
"pre_process_data_config": {
"strategy": "join",
"add_special_tokens": false,
"max_seq_len": 4096,
"max_samples": 128
}
},
{
"name": "wikitext2_train_act",
"type": "HuggingfaceContainer",
"load_dataset_config": { "data_name": "wikitext", "subset": "wikitext-2-raw-v1", "split": "train" },
"pre_process_data_config": {
"strategy": "line-by-line",
"add_special_tokens": true,
"max_samples": 256,
"max_seq_len": 4096
}
}
],
"passes": {
"q": { "type": "QuaRot" },
"g": {
"type": "GptqModel",
"bits": 4,
"sym": true,
"group_size": -1,
"lm_head": false,
"device": "cuda",
"data_config": "wikitext2_train_joined"
},
"cs": { "type": "CaptureSplitInfo", "num_splits": 4, "unique_embeds_lm_head_splits": true },
"mb": {
"type": "ModelBuilder",
"precision": "int4",
"int4_block_size": 32,
"int4_accuracy_level": 4,
"int4_op_types_to_quantize": [ "MatMul", "Gather" ]
},
"mq": {
"type": "MatMulNBitsToQDQ",
"use_int4": true,
"add_zero_point": true,
"nodes_to_exclude": [ "/lm_head/MatMul_Q4" ],
"save_as_external_data": true
},
"gs": {
"type": "GraphSurgeries",
"surgeries": [
{ "surgeon": "RemoveRopeMultiCache" },
{ "surgeon": "AttentionMaskToSequenceLengths" },
{ "surgeon": "SimplifiedLayerNormToL2Norm" }
],
"save_as_external_data": true
},
"sq": {
"type": "OnnxStaticQuantization",
"data_config": "wikitext2_train_act",
"activation_type": "uint16",
"precision": "uint8",
"calibration_providers": [ "CUDAExecutionProvider" ],
"quant_preprocess": true,
"op_types_to_exclude": [ "GatherBlockQuantized", "GroupQueryAttention", "MatMulNBits" ],
"save_as_external_data": true,
"extra_option": { "CalibStridedMinMax": 4 }
},
"sp": { "type": "SplitModel" },
"st": { "type": "StaticLLM", "batch_size": 1, "context_length": 64 },
"cb": {
"type": "EPContextBinaryGenerator",
"provider_options": {
"htp_performance_mode": "burst",
"htp_graph_finalization_optimization_mode": "3",
"soc_model": "60"
},
"weight_sharing": true
},
"cp": { "type": "ComposeOnnxModels" }
},
"target": "qnn_system",
"log_severity_level": 1,
"output_dir": "models/qwen_2.5_7b_Instruct",
"cache_dir": "cache",
"no_artifacts": true
}
Olive logs
(.venv) PS F:\AI\gaotong\olive\olive-recipes\Qwen-Qwen2.5-7B-Instruct\QNN> olive run --config config.json
[2025-10-11 20:18:08,696] [INFO] [run.py:99:run_engine] Running workflow default_workflow
[2025-10-11 20:18:08,801] [INFO] [cache.py:138:__init__] Using cache directory: F:\AI\gaotong\olive\olive-recipes\Qwen-Qwen2.5-7B-Instruct\QNN\cache\default_workflow
[2025-10-11 20:18:10,205] [INFO] [accelerator_creator.py:79:_fill_accelerators] the accelerator device is not specified. Inferred device: npu.
[2025-10-11 20:18:10,205] [INFO] [accelerator_creator.py:204:create_accelerators] Running workflow on accelerator specs: npu-qnn
[2025-10-11 20:18:10,239] [INFO] [engine.py:217:run] Running Olive on accelerator: npu-qnn
[2025-10-11 20:18:10,239] [INFO] [engine.py:853:_create_system] Creating target system ...
[2025-10-11 20:18:10,240] [INFO] [engine.py:856:_create_system] Target system created in 0.000999 seconds
[2025-10-11 20:18:10,241] [INFO] [engine.py:859:_create_system] Creating host system ...
[2025-10-11 20:18:10,251] [INFO] [engine.py:862:_create_system] Host system created in 0.009694 seconds
[2025-10-11 20:18:10,998] [WARNING] [config_utils.py:347:validate_config] Keys {'extra_option'} are not part of OnnxStaticQuantizationConfig. Ignoring them.
[2025-10-11 20:18:11,034] [INFO] [engine.py:685:_run_pass] Running pass q:quarot
[2025-10-11 20:18:11,035] [INFO] [cache.py:235:load_run_from_model_id] Loading run d18e18b5 from cache.
[2025-10-11 20:18:11,035] [INFO] [cache.py:195:load_model] Loading model d18e18b5 from cache.
[2025-10-11 20:18:11,036] [INFO] [engine.py:722:_run_pass] Loaded model from cache: d18e18b5
[2025-10-11 20:18:11,036] [INFO] [engine.py:685:_run_pass] Running pass g:gptqmodel
[2025-10-11 20:18:11,059] [INFO] [cache.py:235:load_run_from_model_id] Loading run 8d8de7f5 from cache.
[2025-10-11 20:18:11,078] [INFO] [cache.py:195:load_model] Loading model 8d8de7f5 from cache.
[2025-10-11 20:18:11,086] [INFO] [engine.py:722:_run_pass] Loaded model from cache: 8d8de7f5
[2025-10-11 20:18:11,086] [INFO] [engine.py:685:_run_pass] Running pass cs:capturesplitinfo
[2025-10-11 20:18:11,088] [INFO] [cache.py:235:load_run_from_model_id] Loading run 0cd3e38e from cache.
[2025-10-11 20:18:11,095] [INFO] [cache.py:195:load_model] Loading model 0cd3e38e from cache.
[2025-10-11 20:18:11,114] [INFO] [engine.py:722:_run_pass] Loaded model from cache: 0cd3e38e
[2025-10-11 20:18:11,115] [INFO] [engine.py:685:_run_pass] Running pass mb:modelbuilder
[2025-10-11 20:18:11,116] [INFO] [cache.py:235:load_run_from_model_id] Loading run 26fab61a from cache.
[2025-10-11 20:18:11,133] [INFO] [cache.py:195:load_model] Loading model 26fab61a from cache.
[2025-10-11 20:18:11,156] [INFO] [engine.py:722:_run_pass] Loaded model from cache: 26fab61a
[2025-10-11 20:18:11,156] [INFO] [engine.py:685:_run_pass] Running pass mq:matmulnbitstoqdq
[2025-10-11 20:18:11,157] [INFO] [cache.py:235:load_run_from_model_id] Loading run 6fefae8a from cache.
[2025-10-11 20:18:11,165] [INFO] [cache.py:195:load_model] Loading model 6fefae8a from cache.
[2025-10-11 20:18:11,182] [INFO] [engine.py:722:_run_pass] Loaded model from cache: 6fefae8a
[2025-10-11 20:18:11,183] [INFO] [engine.py:685:_run_pass] Running pass gs:graphsurgeries
[2025-10-11 20:18:11,185] [INFO] [cache.py:235:load_run_from_model_id] Loading run b7e6105a from cache.
[2025-10-11 20:18:11,192] [INFO] [cache.py:195:load_model] Loading model b7e6105a from cache.
[2025-10-11 20:18:11,203] [INFO] [engine.py:722:_run_pass] Loaded model from cache: b7e6105a
[2025-10-11 20:18:11,204] [INFO] [engine.py:685:_run_pass] Running pass sq:onnxstaticquantization
[2025-10-11 20:18:11,422] [INFO] [quantization.py:402:_run_for_config] Preprocessing model for quantization
2025-10-11 20:19:31.4721216 [E:onnxruntime:, sequential_executor.cc:572 onnxruntime::ExecuteKernel] Non-zero status code returned while running Mul node. Name:'/model/layers.14/mlp/act_fn/Mul' Status Message: E:\_work\1\s\onnxruntime\core\framework\bfc_arena.cc:359 onnxruntime::BFCArena::AllocateRawInternal Failed to allocate memory for requested buffer of size 159383552
[2025-10-11 20:19:31,579] [ERROR] [engine.py:748:_run_pass] Pass run failed.
Traceback (most recent call last):
File "f:\AI\gaotong\olive\.venv\Lib\site-packages\olive\engine\engine.py", line 736, in _run_pass
output_model_config = host.run_pass(p, input_model_config, output_model_path)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "f:\AI\gaotong\olive\.venv\Lib\site-packages\olive\systems\local.py", line 45, in run_pass
output_model = the_pass.run(model, output_model_path)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "f:\AI\gaotong\olive\.venv\Lib\site-packages\olive\passes\olive_pass.py", line 242, in run
output_model = self._run_for_config(model, self.config, output_model_path)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "f:\AI\gaotong\olive\.venv\Lib\site-packages\olive\passes\onnx\quantization.py", line 462, in _run_for_config
quantize_static(
File "f:\AI\gaotong\olive\.venv\Lib\site-packages\onnxruntime\quantization\quantize.py", line 742, in quantize_static
calibrator.collect_data(calibration_data_reader)
File "f:\AI\gaotong\olive\.venv\Lib\site-packages\onnxruntime\quantization\calibrate.py", line 420, in collect_data
self.intermediate_outputs.append(self.infer_session.run(None, inputs))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "f:\AI\gaotong\olive\.venv\Lib\site-packages\onnxruntime\capi\onnxruntime_inference_collection.py", line 275, in run
return self._sess.run(output_names, input_feed, run_options)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
onnxruntime.capi.onnxruntime_pybind11_state.RuntimeException: [ONNXRuntimeError] : 6 : RUNTIME_EXCEPTION : Non-zero status code returned while running Mul node. Name:'/model/layers.14/mlp/act_fn/Mul' Status Message: E:\_work\1\s\onnxruntime\core\framework\bfc_arena.cc:359 onnxruntime::BFCArena::AllocateRawInternal Failed to allocate memory for requested buffer of size 159383552
[2025-10-11 20:19:31,631] [WARNING] [engine.py:310:run_accelerator] Failed to run Olive on npu-qnn.
Traceback (most recent call last):
File "f:\AI\gaotong\olive\.venv\Lib\site-packages\olive\engine\engine.py", line 306, in run_accelerator
output_footprint = self._run_no_search(input_model_config, input_model_id, accelerator_spec, output_dir)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "f:\AI\gaotong\olive\.venv\Lib\site-packages\olive\engine\engine.py", line 350, in _run_no_search
should_prune, signal, model_ids = self._run_passes(input_model_config, input_model_id, accelerator_spec)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "f:\AI\gaotong\olive\.venv\Lib\site-packages\olive\engine\engine.py", line 641, in _run_passes
model_config, model_id = self._run_pass(
^^^^^^^^^^^^^^^
File "f:\AI\gaotong\olive\.venv\Lib\site-packages\olive\engine\engine.py", line 736, in _run_pass
output_model_config = host.run_pass(p, input_model_config, output_model_path)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "f:\AI\gaotong\olive\.venv\Lib\site-packages\olive\systems\local.py", line 45, in run_pass
output_model = the_pass.run(model, output_model_path)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "f:\AI\gaotong\olive\.venv\Lib\site-packages\olive\passes\olive_pass.py", line 242, in run
output_model = self._run_for_config(model, self.config, output_model_path)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "f:\AI\gaotong\olive\.venv\Lib\site-packages\olive\passes\onnx\quantization.py", line 462, in _run_for_config
quantize_static(
File "f:\AI\gaotong\olive\.venv\Lib\site-packages\onnxruntime\quantization\quantize.py", line 742, in quantize_static
calibrator.collect_data(calibration_data_reader)
File "f:\AI\gaotong\olive\.venv\Lib\site-packages\onnxruntime\quantization\calibrate.py", line 420, in collect_data
self.intermediate_outputs.append(self.infer_session.run(None, inputs))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "f:\AI\gaotong\olive\.venv\Lib\site-packages\onnxruntime\capi\onnxruntime_inference_collection.py", line 275, in run
return self._sess.run(output_names, input_feed, run_options)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
onnxruntime.capi.onnxruntime_pybind11_state.RuntimeException: [ONNXRuntimeError] : 6 : RUNTIME_EXCEPTION : Non-zero status code returned while running Mul node. Name:'/model/layers.14/mlp/act_fn/Mul' Status Message: E:\_work\1\s\onnxruntime\core\framework\bfc_arena.cc:359 onnxruntime::BFCArena::AllocateRawInternal Failed to allocate memory for requested buffer of size 159383552
[2025-10-11 20:19:36,148] [INFO] [engine.py:235:run] Run history for npu-qnn:
[2025-10-11 20:19:36,175] [INFO] [engine.py:499:dump_run_history] run history:
+------------+-------------------+------------------+----------------+-----------+
| model_id | parent_model_id | from_pass | duration_sec | metrics |
+============+===================+==================+================+===========+
| d4993402 | | | | |
+------------+-------------------+------------------+----------------+-----------+
| d18e18b5 | d4993402 | quarot | 0.0019691 | |
+------------+-------------------+------------------+----------------+-----------+
| 8d8de7f5 | d18e18b5 | gptqmodel | 0.0498719 | |
+------------+-------------------+------------------+----------------+-----------+
| 0cd3e38e | 8d8de7f5 | capturesplitinfo | 0.027631 | |
+------------+-------------------+------------------+----------------+-----------+
| 26fab61a | 0cd3e38e | modelbuilder | 0.0411398 | |
+------------+-------------------+------------------+----------------+-----------+
| 6fefae8a | 26fab61a | matmulnbitstoqdq | 0.0255711 | |
+------------+-------------------+------------------+----------------+-----------+
| b7e6105a | 6fefae8a | graphsurgeries | 0.0201991 | |
+------------+-------------------+------------------+----------------+-----------+
[2025-10-11 20:19:36,175] [WARNING] [engine.py:259:run] No output model
No output model produced. Please check the log for details.
Other information
- OS: Windows 11 Professional Edition 25H2 26220.6772 x64
Package Version Editable project location
---------------------- ------------ -----------------------------
accelerate 1.10.1
aiohappyeyeballs 2.6.1
aiohttp 3.13.0
aiosignal 1.4.0
alembic 1.16.5
annotated-types 0.7.0
anyio 4.11.0
attrs 25.4.0
auto_gptq 0.8.0.dev0 F:\AI\gaotong\olive\AutoGPTQ
autopep8 2.3.2
certifi 2025.10.5
cffi 2.0.0
charset-normalizer 3.4.3
colorama 0.4.6
coloredlogs 15.0.1
colorlog 6.9.0
datasets 4.2.0
device-smi 0.4.1
dill 0.4.0
filelock 3.20.0
flatbuffers 25.9.23
frozenlist 1.8.0
fsspec 2025.9.0
gekko 1.3.0
gptqmodel 4.0.0.dev0 F:\AI\gaotong\olive\GPTQModel
greenlet 3.2.4
h11 0.16.0
hf_transfer 0.1.9
httpcore 1.0.9
httpx 0.28.1
huggingface-hub 0.35.3
humanfriendly 10.0
idna 3.10
iniconfig 2.1.0
Jinja2 3.1.6
lightning-utilities 0.15.2
logbar 0.0.4
Mako 1.3.10
MarkupSafe 3.0.3
ml_dtypes 0.5.3
mpmath 1.3.0
multidict 6.7.0
multiprocess 0.70.16
networkx 3.5
numpy 2.3.3
olive-ai 0.9.3
onnx 1.19.1
onnx-ir 0.1.10
onnxruntime-genai-cuda 0.9.2
onnxruntime-gpu 1.23.0
onnxscript 0.5.3
optimum 2.0.0
optuna 4.5.0
packaging 25.0
pandas 2.3.3
peft 0.17.1
pillow 11.3.0
pip 25.2
pluggy 1.6.0
propcache 0.4.1
protobuf 6.32.1
psutil 7.1.0
pyarrow 21.0.0
pycodestyle 2.14.0
pycparser 2.23
pydantic 2.12.0
pydantic_core 2.41.1
Pygments 2.19.2
pyreadline3 3.5.4
pytest 8.4.2
python-dateutil 2.9.0.post0
pytz 2025.2
PyYAML 6.0.3
random_word 1.0.13
regex 2025.9.18
requests 2.32.5
rouge 1.0.1
safetensors 0.6.2
sentencepiece 0.2.1
setuptools 80.9.0
six 1.17.0
sniffio 1.3.1
soundfile 0.13.1
SQLAlchemy 2.0.43
sympy 1.13.1
tabulate 0.9.0
threadpoolctl 3.6.0
tokenicer 0.0.4
tokenizers 0.21.4
torch 2.5.1+cu121
torchaudio 2.5.1+cu121
torchmetrics 1.8.2
torchvision 0.20.1+cu121
tqdm 4.67.1
transformers 4.53.2
typing_extensions 4.15.0
typing-inspection 0.4.2
tzdata 2025.2
urllib3 2.5.0
xxhash 3.6.0
yarl 1.22.0
QNN Environment Package List:
Package Version
------------------- -----------
alembic 1.16.5
annotated-types 0.7.0
certifi 2025.10.5
charset-normalizer 3.4.3
colorama 0.4.6
coloredlogs 15.0.1
colorlog 6.9.0
filelock 3.20.0
flatbuffers 25.9.23
fsspec 2025.9.0
greenlet 3.2.4
huggingface-hub 0.35.3
humanfriendly 10.0
idna 3.10
Jinja2 3.1.6
lightning-utilities 0.15.2
Mako 1.3.10
MarkupSafe 3.0.3
ml_dtypes 0.5.3
mpmath 1.3.0
networkx 3.5
numpy 2.3.3
olive-ai 0.9.3
onnx 1.19.1
onnx-ir 0.1.10
onnxruntime-qnn 1.22.2
onnxscript 0.5.3
optuna 4.5.0
packaging 25.0
pandas 2.3.3
pip 25.2
protobuf 6.32.1
pydantic 2.12.0
pydantic_core 2.41.1
pyreadline3 3.5.4
python-dateutil 2.9.0.post0
pytz 2025.2
PyYAML 6.0.3
regex 2025.9.18
requests 2.32.5
safetensors 0.6.2
setuptools 80.9.0
six 1.17.0
SQLAlchemy 2.0.43
sympy 1.14.0
tokenizers 0.22.1
torch 2.8.0
torchmetrics 1.8.2
tqdm 4.67.1
transformers 4.57.0
typing_extensions 4.15.0
typing-inspection 0.4.2
tzdata 2025.2
urllib3 2.5.0
Additional context
Computer Configuration 1:
CPU: Intel(R) Core(TM) i7-8700K CPU @ 3.70GHz (3.70 GHz)
RAM: 32GB
GPU: NVIDIA GeForce GTX 1080 Ti 11G
CUDA: 12.1
Computer Configuration 2:
CPU: Intel(R) Xeon(R) Platinum 8352V CPU @ 2.10GHz (2.10 GHz)
RAM: 256GB
GPU: NVIDIA GeForce RTX 4090 24G
CUDA: 12.8