Skip to content

Bug Report: Memory Allocation Error in Qwen2.5-7B/Qwen3-4B Conversion to QNN-Supported ONNX #2210

@nnbw-liu

Description

@nnbw-liu

Describe the bug
The following error occurs when converting the Qwen model to a QNN-supported ONNX model:

2025-10-11 20:19:31.4721216 [E:onnxruntime:, sequential_executor.cc:572 onnxruntime::ExecuteKernel] Non-zero status code returned while running Mul node. Name:'/model/layers.14/mlp/act_fn/Mul' Status Message: E:\_work\1\s\onnxruntime\core\framework\bfc_arena.cc:359 onnxruntime::BFCArena::AllocateRawInternal Failed to allocate memory for requested buffer of size 159383552

Tested models: Qwen2.5-7B-Instruct and Qwen3-4B-Instruct-2507

To Reproduce
Configure the environment and run according to the guidelines in the microsoft/olive-recipes repository, under Qwen-Qwen2.5-7B-Instruct/QNN (main branch).

Expected behavior
Output a QNN-supported ONNX model

Olive config

{
    "input_model": { "type": "HfModel", "model_path": "Qwen/Qwen3-4B-Instruct-2507" },
    "systems": {
        "qnn_system": {
            "type": "PythonEnvironment",
            "python_environment_path": "F:\\AI\\gaotong\\olive\\qnn\\.venv\\Scripts",
            "accelerators": [ { "execution_providers": [ "QNNExecutionProvider" ] } ]
        }
    },
    "data_configs": [
        {
            "name": "wikitext2_train_joined",
            "type": "HuggingfaceContainer",
            "load_dataset_config": { "data_name": "wikitext", "subset": "wikitext-2-raw-v1", "split": "train" },
            "pre_process_data_config": {
                "strategy": "join",
                "add_special_tokens": false,
                "max_seq_len": 4096,
                "max_samples": 128
            }
        },
        {
            "name": "wikitext2_train_act",
            "type": "HuggingfaceContainer",
            "load_dataset_config": { "data_name": "wikitext", "subset": "wikitext-2-raw-v1", "split": "train" },
            "pre_process_data_config": {
                "strategy": "line-by-line",
                "add_special_tokens": true,
                "max_samples": 256,
                "max_seq_len": 4096
            }
        }
    ],
    "passes": {
        "q": { "type": "QuaRot" },
        "g": {
            "type": "GptqModel",
            "bits": 4,
            "sym": true,
            "group_size": -1,
            "lm_head": false,
            "device": "cuda",
            "data_config": "wikitext2_train_joined"
        },
        "cs": { "type": "CaptureSplitInfo", "num_splits": 4, "unique_embeds_lm_head_splits": true },
        "mb": {
            "type": "ModelBuilder",
            "precision": "int4",
            "int4_block_size": 32,
            "int4_accuracy_level": 4,
            "int4_op_types_to_quantize": [ "MatMul", "Gather" ]
        },
        "mq": {
            "type": "MatMulNBitsToQDQ",
            "use_int4": true,
            "add_zero_point": true,
            "nodes_to_exclude": [ "/lm_head/MatMul_Q4" ],
            "save_as_external_data": true
        },
        "gs": {
            "type": "GraphSurgeries",
            "surgeries": [
                { "surgeon": "RemoveRopeMultiCache" },
                { "surgeon": "AttentionMaskToSequenceLengths" },
                { "surgeon": "SimplifiedLayerNormToL2Norm" }
            ],
            "save_as_external_data": true
        },
        "sq": {
            "type": "OnnxStaticQuantization",
            "data_config": "wikitext2_train_act",
            "activation_type": "uint16",
            "precision": "uint8",
            "calibration_providers": [ "CUDAExecutionProvider" ],
            "quant_preprocess": true,
            "op_types_to_exclude": [ "GatherBlockQuantized", "GroupQueryAttention", "MatMulNBits" ],
            "save_as_external_data": true,
            "extra_option": { "CalibStridedMinMax": 4 }
        },
        "sp": { "type": "SplitModel" },
        "st": { "type": "StaticLLM", "batch_size": 1, "context_length": 64 },
        "cb": {
            "type": "EPContextBinaryGenerator",
            "provider_options": {
                "htp_performance_mode": "burst",
                "htp_graph_finalization_optimization_mode": "3",
                "soc_model": "60"
            },
            "weight_sharing": true
        },
        "cp": { "type": "ComposeOnnxModels" }
    },
    "target": "qnn_system",
    "log_severity_level": 1,
    "output_dir": "models/qwen_2.5_7b_Instruct",
    "cache_dir": "cache",
    "no_artifacts": true
}

Olive logs

(.venv) PS F:\AI\gaotong\olive\olive-recipes\Qwen-Qwen2.5-7B-Instruct\QNN> olive run --config config.json
[2025-10-11 20:18:08,696] [INFO] [run.py:99:run_engine] Running workflow default_workflow
[2025-10-11 20:18:08,801] [INFO] [cache.py:138:__init__] Using cache directory: F:\AI\gaotong\olive\olive-recipes\Qwen-Qwen2.5-7B-Instruct\QNN\cache\default_workflow
[2025-10-11 20:18:10,205] [INFO] [accelerator_creator.py:79:_fill_accelerators] the accelerator device is not specified. Inferred device: npu.
[2025-10-11 20:18:10,205] [INFO] [accelerator_creator.py:204:create_accelerators] Running workflow on accelerator specs: npu-qnn
[2025-10-11 20:18:10,239] [INFO] [engine.py:217:run] Running Olive on accelerator: npu-qnn
[2025-10-11 20:18:10,239] [INFO] [engine.py:853:_create_system] Creating target system ...
[2025-10-11 20:18:10,240] [INFO] [engine.py:856:_create_system] Target system created in 0.000999 seconds
[2025-10-11 20:18:10,241] [INFO] [engine.py:859:_create_system] Creating host system ...
[2025-10-11 20:18:10,251] [INFO] [engine.py:862:_create_system] Host system created in 0.009694 seconds
[2025-10-11 20:18:10,998] [WARNING] [config_utils.py:347:validate_config] Keys {'extra_option'} are not part of OnnxStaticQuantizationConfig. Ignoring them.
[2025-10-11 20:18:11,034] [INFO] [engine.py:685:_run_pass] Running pass q:quarot
[2025-10-11 20:18:11,035] [INFO] [cache.py:235:load_run_from_model_id] Loading run d18e18b5 from cache.
[2025-10-11 20:18:11,035] [INFO] [cache.py:195:load_model] Loading model d18e18b5 from cache.
[2025-10-11 20:18:11,036] [INFO] [engine.py:722:_run_pass] Loaded model from cache: d18e18b5
[2025-10-11 20:18:11,036] [INFO] [engine.py:685:_run_pass] Running pass g:gptqmodel
[2025-10-11 20:18:11,059] [INFO] [cache.py:235:load_run_from_model_id] Loading run 8d8de7f5 from cache.
[2025-10-11 20:18:11,078] [INFO] [cache.py:195:load_model] Loading model 8d8de7f5 from cache.
[2025-10-11 20:18:11,086] [INFO] [engine.py:722:_run_pass] Loaded model from cache: 8d8de7f5
[2025-10-11 20:18:11,086] [INFO] [engine.py:685:_run_pass] Running pass cs:capturesplitinfo
[2025-10-11 20:18:11,088] [INFO] [cache.py:235:load_run_from_model_id] Loading run 0cd3e38e from cache.
[2025-10-11 20:18:11,095] [INFO] [cache.py:195:load_model] Loading model 0cd3e38e from cache.
[2025-10-11 20:18:11,114] [INFO] [engine.py:722:_run_pass] Loaded model from cache: 0cd3e38e
[2025-10-11 20:18:11,115] [INFO] [engine.py:685:_run_pass] Running pass mb:modelbuilder
[2025-10-11 20:18:11,116] [INFO] [cache.py:235:load_run_from_model_id] Loading run 26fab61a from cache.
[2025-10-11 20:18:11,133] [INFO] [cache.py:195:load_model] Loading model 26fab61a from cache.
[2025-10-11 20:18:11,156] [INFO] [engine.py:722:_run_pass] Loaded model from cache: 26fab61a
[2025-10-11 20:18:11,156] [INFO] [engine.py:685:_run_pass] Running pass mq:matmulnbitstoqdq
[2025-10-11 20:18:11,157] [INFO] [cache.py:235:load_run_from_model_id] Loading run 6fefae8a from cache.
[2025-10-11 20:18:11,165] [INFO] [cache.py:195:load_model] Loading model 6fefae8a from cache.
[2025-10-11 20:18:11,182] [INFO] [engine.py:722:_run_pass] Loaded model from cache: 6fefae8a
[2025-10-11 20:18:11,183] [INFO] [engine.py:685:_run_pass] Running pass gs:graphsurgeries
[2025-10-11 20:18:11,185] [INFO] [cache.py:235:load_run_from_model_id] Loading run b7e6105a from cache.
[2025-10-11 20:18:11,192] [INFO] [cache.py:195:load_model] Loading model b7e6105a from cache.
[2025-10-11 20:18:11,203] [INFO] [engine.py:722:_run_pass] Loaded model from cache: b7e6105a
[2025-10-11 20:18:11,204] [INFO] [engine.py:685:_run_pass] Running pass sq:onnxstaticquantization
[2025-10-11 20:18:11,422] [INFO] [quantization.py:402:_run_for_config] Preprocessing model for quantization
2025-10-11 20:19:31.4721216 [E:onnxruntime:, sequential_executor.cc:572 onnxruntime::ExecuteKernel] Non-zero status code returned while running Mul node. Name:'/model/layers.14/mlp/act_fn/Mul' Status Message: E:\_work\1\s\onnxruntime\core\framework\bfc_arena.cc:359 onnxruntime::BFCArena::AllocateRawInternal Failed to allocate memory for requested buffer of size 159383552

[2025-10-11 20:19:31,579] [ERROR] [engine.py:748:_run_pass] Pass run failed.
Traceback (most recent call last):
  File "f:\AI\gaotong\olive\.venv\Lib\site-packages\olive\engine\engine.py", line 736, in _run_pass
    output_model_config = host.run_pass(p, input_model_config, output_model_path)
                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "f:\AI\gaotong\olive\.venv\Lib\site-packages\olive\systems\local.py", line 45, in run_pass
    output_model = the_pass.run(model, output_model_path)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "f:\AI\gaotong\olive\.venv\Lib\site-packages\olive\passes\olive_pass.py", line 242, in run
    output_model = self._run_for_config(model, self.config, output_model_path)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "f:\AI\gaotong\olive\.venv\Lib\site-packages\olive\passes\onnx\quantization.py", line 462, in _run_for_config
    quantize_static(
  File "f:\AI\gaotong\olive\.venv\Lib\site-packages\onnxruntime\quantization\quantize.py", line 742, in quantize_static
    calibrator.collect_data(calibration_data_reader)
  File "f:\AI\gaotong\olive\.venv\Lib\site-packages\onnxruntime\quantization\calibrate.py", line 420, in collect_data
    self.intermediate_outputs.append(self.infer_session.run(None, inputs))
                                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "f:\AI\gaotong\olive\.venv\Lib\site-packages\onnxruntime\capi\onnxruntime_inference_collection.py", line 275, in run
    return self._sess.run(output_names, input_feed, run_options)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
onnxruntime.capi.onnxruntime_pybind11_state.RuntimeException: [ONNXRuntimeError] : 6 : RUNTIME_EXCEPTION : Non-zero status code returned while running Mul node. Name:'/model/layers.14/mlp/act_fn/Mul' Status Message: E:\_work\1\s\onnxruntime\core\framework\bfc_arena.cc:359 onnxruntime::BFCArena::AllocateRawInternal Failed to allocate memory for requested buffer of size 159383552

[2025-10-11 20:19:31,631] [WARNING] [engine.py:310:run_accelerator] Failed to run Olive on npu-qnn.
Traceback (most recent call last):
  File "f:\AI\gaotong\olive\.venv\Lib\site-packages\olive\engine\engine.py", line 306, in run_accelerator
    output_footprint = self._run_no_search(input_model_config, input_model_id, accelerator_spec, output_dir)
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "f:\AI\gaotong\olive\.venv\Lib\site-packages\olive\engine\engine.py", line 350, in _run_no_search
    should_prune, signal, model_ids = self._run_passes(input_model_config, input_model_id, accelerator_spec)
                                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "f:\AI\gaotong\olive\.venv\Lib\site-packages\olive\engine\engine.py", line 641, in _run_passes
    model_config, model_id = self._run_pass(
                             ^^^^^^^^^^^^^^^
  File "f:\AI\gaotong\olive\.venv\Lib\site-packages\olive\engine\engine.py", line 736, in _run_pass
    output_model_config = host.run_pass(p, input_model_config, output_model_path)
                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "f:\AI\gaotong\olive\.venv\Lib\site-packages\olive\systems\local.py", line 45, in run_pass
    output_model = the_pass.run(model, output_model_path)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "f:\AI\gaotong\olive\.venv\Lib\site-packages\olive\passes\olive_pass.py", line 242, in run
    output_model = self._run_for_config(model, self.config, output_model_path)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "f:\AI\gaotong\olive\.venv\Lib\site-packages\olive\passes\onnx\quantization.py", line 462, in _run_for_config
    quantize_static(
  File "f:\AI\gaotong\olive\.venv\Lib\site-packages\onnxruntime\quantization\quantize.py", line 742, in quantize_static
    calibrator.collect_data(calibration_data_reader)
  File "f:\AI\gaotong\olive\.venv\Lib\site-packages\onnxruntime\quantization\calibrate.py", line 420, in collect_data
    self.intermediate_outputs.append(self.infer_session.run(None, inputs))
                                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "f:\AI\gaotong\olive\.venv\Lib\site-packages\onnxruntime\capi\onnxruntime_inference_collection.py", line 275, in run
    return self._sess.run(output_names, input_feed, run_options)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
onnxruntime.capi.onnxruntime_pybind11_state.RuntimeException: [ONNXRuntimeError] : 6 : RUNTIME_EXCEPTION : Non-zero status code returned while running Mul node. Name:'/model/layers.14/mlp/act_fn/Mul' Status Message: E:\_work\1\s\onnxruntime\core\framework\bfc_arena.cc:359 onnxruntime::BFCArena::AllocateRawInternal Failed to allocate memory for requested buffer of size 159383552

[2025-10-11 20:19:36,148] [INFO] [engine.py:235:run] Run history for npu-qnn:
[2025-10-11 20:19:36,175] [INFO] [engine.py:499:dump_run_history] run history:
+------------+-------------------+------------------+----------------+-----------+
| model_id   | parent_model_id   | from_pass        |   duration_sec | metrics   |
+============+===================+==================+================+===========+
| d4993402   |                   |                  |                |           |
+------------+-------------------+------------------+----------------+-----------+
| d18e18b5   | d4993402          | quarot           |      0.0019691 |           |
+------------+-------------------+------------------+----------------+-----------+
| 8d8de7f5   | d18e18b5          | gptqmodel        |      0.0498719 |           |
+------------+-------------------+------------------+----------------+-----------+
| 0cd3e38e   | 8d8de7f5          | capturesplitinfo |      0.027631  |           |
+------------+-------------------+------------------+----------------+-----------+
| 26fab61a   | 0cd3e38e          | modelbuilder     |      0.0411398 |           |
+------------+-------------------+------------------+----------------+-----------+
| 6fefae8a   | 26fab61a          | matmulnbitstoqdq |      0.0255711 |           |
+------------+-------------------+------------------+----------------+-----------+
| b7e6105a   | 6fefae8a          | graphsurgeries   |      0.0201991 |           |
+------------+-------------------+------------------+----------------+-----------+
[2025-10-11 20:19:36,175] [WARNING] [engine.py:259:run] No output model
No output model produced. Please check the log for details.

Other information

  • OS: Windows 11 Professional Edition 25H2 26220.6772 x64
Package                Version      Editable project location
---------------------- ------------ -----------------------------
accelerate             1.10.1
aiohappyeyeballs       2.6.1
aiohttp                3.13.0
aiosignal              1.4.0
alembic                1.16.5
annotated-types        0.7.0
anyio                  4.11.0
attrs                  25.4.0
auto_gptq              0.8.0.dev0   F:\AI\gaotong\olive\AutoGPTQ
autopep8               2.3.2
certifi                2025.10.5
cffi                   2.0.0
charset-normalizer     3.4.3
colorama               0.4.6
coloredlogs            15.0.1
colorlog               6.9.0
datasets               4.2.0
device-smi             0.4.1
dill                   0.4.0
filelock               3.20.0
flatbuffers            25.9.23
frozenlist             1.8.0
fsspec                 2025.9.0
gekko                  1.3.0
gptqmodel              4.0.0.dev0   F:\AI\gaotong\olive\GPTQModel
greenlet               3.2.4
h11                    0.16.0
hf_transfer            0.1.9
httpcore               1.0.9
httpx                  0.28.1
huggingface-hub        0.35.3
humanfriendly          10.0
idna                   3.10
iniconfig              2.1.0
Jinja2                 3.1.6
lightning-utilities    0.15.2
logbar                 0.0.4
Mako                   1.3.10
MarkupSafe             3.0.3
ml_dtypes              0.5.3
mpmath                 1.3.0
multidict              6.7.0
multiprocess           0.70.16
networkx               3.5
numpy                  2.3.3
olive-ai               0.9.3
onnx                   1.19.1
onnx-ir                0.1.10
onnxruntime-genai-cuda 0.9.2
onnxruntime-gpu        1.23.0
onnxscript             0.5.3
optimum                2.0.0
optuna                 4.5.0
packaging              25.0
pandas                 2.3.3
peft                   0.17.1
pillow                 11.3.0
pip                    25.2
pluggy                 1.6.0
propcache              0.4.1
protobuf               6.32.1
psutil                 7.1.0
pyarrow                21.0.0
pycodestyle            2.14.0
pycparser              2.23
pydantic               2.12.0
pydantic_core          2.41.1
Pygments               2.19.2
pyreadline3            3.5.4
pytest                 8.4.2
python-dateutil        2.9.0.post0
pytz                   2025.2
PyYAML                 6.0.3
random_word            1.0.13
regex                  2025.9.18
requests               2.32.5
rouge                  1.0.1
safetensors            0.6.2
sentencepiece          0.2.1
setuptools             80.9.0
six                    1.17.0
sniffio                1.3.1
soundfile              0.13.1
SQLAlchemy             2.0.43
sympy                  1.13.1
tabulate               0.9.0
threadpoolctl          3.6.0
tokenicer              0.0.4
tokenizers             0.21.4
torch                  2.5.1+cu121
torchaudio             2.5.1+cu121
torchmetrics           1.8.2
torchvision            0.20.1+cu121
tqdm                   4.67.1
transformers           4.53.2
typing_extensions      4.15.0
typing-inspection      0.4.2
tzdata                 2025.2
urllib3                2.5.0
xxhash                 3.6.0
yarl                   1.22.0

QNN Environment Package List:

Package             Version
------------------- -----------
alembic             1.16.5
annotated-types     0.7.0
certifi             2025.10.5
charset-normalizer  3.4.3
colorama            0.4.6
coloredlogs         15.0.1
colorlog            6.9.0
filelock            3.20.0
flatbuffers         25.9.23
fsspec              2025.9.0
greenlet            3.2.4
huggingface-hub     0.35.3
humanfriendly       10.0
idna                3.10
Jinja2              3.1.6
lightning-utilities 0.15.2
Mako                1.3.10
MarkupSafe          3.0.3
ml_dtypes           0.5.3
mpmath              1.3.0
networkx            3.5
numpy               2.3.3
olive-ai            0.9.3
onnx                1.19.1
onnx-ir             0.1.10
onnxruntime-qnn     1.22.2
onnxscript          0.5.3
optuna              4.5.0
packaging           25.0
pandas              2.3.3
pip                 25.2
protobuf            6.32.1
pydantic            2.12.0
pydantic_core       2.41.1
pyreadline3         3.5.4
python-dateutil     2.9.0.post0
pytz                2025.2
PyYAML              6.0.3
regex               2025.9.18
requests            2.32.5
safetensors         0.6.2
setuptools          80.9.0
six                 1.17.0
SQLAlchemy          2.0.43
sympy               1.14.0
tokenizers          0.22.1
torch               2.8.0
torchmetrics        1.8.2
tqdm                4.67.1
transformers        4.57.0
typing_extensions   4.15.0
typing-inspection   0.4.2
tzdata              2025.2
urllib3             2.5.0

Additional context
Computer Configuration 1:
CPU: Intel(R) Core(TM) i7-8700K CPU @ 3.70GHz (3.70 GHz)
RAM: 32GB
GPU: NVIDIA GeForce GTX 1080 Ti 11G
CUDA: 12.1

Computer Configuration 2:
CPU: Intel(R) Xeon(R) Platinum 8352V CPU @ 2.10GHz (2.10 GHz)
RAM: 256GB
GPU: NVIDIA GeForce RTX 4090 24G
CUDA: 12.8

Metadata

Assignees

No one assigned

    Labels

    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions