Skip to content

Commit ee5addf

Browse files
committed
Repalce cudaMemcpyAsync with cudaMemcpyBatchAsync to get rid of a driver locking bug
1 parent b8ef0bf commit ee5addf

File tree

4 files changed

+87
-15
lines changed

4 files changed

+87
-15
lines changed

include/cuco/detail/hyperloglog/hyperloglog_impl.cuh

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include <cuco/detail/error.hpp>
2020
#include <cuco/detail/hyperloglog/finalizer.cuh>
2121
#include <cuco/detail/hyperloglog/kernels.cuh>
22+
#include <cuco/detail/utility/memcpy_async.cuh>
2223
#include <cuco/detail/utils.hpp>
2324
#include <cuco/hash_functions.cuh>
2425
#include <cuco/types.cuh>
@@ -420,11 +421,11 @@ class hyperloglog_impl {
420421
std::vector<register_type> host_sketch(num_regs);
421422

422423
// TODO check if storage is host accessible
423-
CUCO_CUDA_TRY(cudaMemcpyAsync(host_sketch.data(),
424-
this->sketch_.data(),
425-
sizeof(register_type) * num_regs,
426-
cudaMemcpyDefault,
427-
stream.get()));
424+
cuco::detail::memcpy_async(host_sketch.data(),
425+
this->sketch_.data(),
426+
sizeof(register_type) * num_regs,
427+
cudaMemcpyDefault,
428+
stream);
428429
#if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1)
429430
stream.sync();
430431
#else

include/cuco/detail/open_addressing/open_addressing_impl.cuh

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include <cuco/detail/open_addressing/kernels.cuh>
2222
#include <cuco/detail/storage/counter_storage.cuh>
2323
#include <cuco/detail/utility/cuda.hpp>
24+
#include <cuco/detail/utility/memcpy_async.cuh>
2425
#include <cuco/detail/utils.hpp>
2526
#include <cuco/extent.cuh>
2627
#include <cuco/operator.hpp>
@@ -882,8 +883,8 @@ class open_addressing_impl {
882883
stream.get()));
883884

884885
size_type temp_count;
885-
CUCO_CUDA_TRY(cudaMemcpyAsync(
886-
&temp_count, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream.get()));
886+
cuco::detail::memcpy_async(
887+
&temp_count, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream);
887888
#if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1)
888889
stream.sync();
889890
#else

include/cuco/detail/static_map.inl

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
#include <cuco/detail/bitwise_compare.cuh>
1818
#include <cuco/detail/error.hpp>
19+
#include <cuco/detail/utility/memcpy_async.cuh>
1920
#include <cuco/detail/utils.cuh>
2021
#include <cuco/detail/utils.hpp>
2122

@@ -108,8 +109,11 @@ void static_map<Key, Value, Scope, Allocator>::insert(
108109

109110
detail::insert<block_size, tile_size>
110111
<<<grid_size, block_size, 0, stream>>>(first, num_keys, num_successes_, view, hash, key_equal);
111-
CUCO_CUDA_TRY(cudaMemcpyAsync(
112-
&h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
112+
cuco::detail::memcpy_async(&h_num_successes,
113+
num_successes_,
114+
sizeof(atomic_ctr_type),
115+
cudaMemcpyDeviceToHost,
116+
cuda::stream_ref{stream});
113117

114118
CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); // stream sync to ensure h_num_successes is updated
115119

@@ -146,8 +150,11 @@ void static_map<Key, Value, Scope, Allocator>::insert_if(InputIt first,
146150

147151
detail::insert_if_n<block_size, tile_size><<<grid_size, block_size, 0, stream>>>(
148152
first, num_keys, num_successes_, view, stencil, pred, hash, key_equal);
149-
CUCO_CUDA_TRY(cudaMemcpyAsync(
150-
&h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
153+
cuco::detail::memcpy_async(&h_num_successes,
154+
num_successes_,
155+
sizeof(atomic_ctr_type),
156+
cudaMemcpyDeviceToHost,
157+
cuda::stream_ref{stream});
151158
CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
152159

153160
size_ += h_num_successes;
@@ -178,8 +185,11 @@ void static_map<Key, Value, Scope, Allocator>::erase(
178185

179186
detail::erase<block_size, tile_size>
180187
<<<grid_size, block_size, 0, stream>>>(first, num_keys, num_successes_, view, hash, key_equal);
181-
CUCO_CUDA_TRY(cudaMemcpyAsync(
182-
&h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
188+
cuco::detail::memcpy_async(&h_num_successes,
189+
num_successes_,
190+
sizeof(atomic_ctr_type),
191+
cudaMemcpyDeviceToHost,
192+
cuda::stream_ref{stream});
183193

184194
CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); // stream sync to ensure h_num_successes is updated
185195

@@ -249,8 +259,8 @@ std::pair<KeyOut, ValueOut> static_map<Key, Value, Scope, Allocator>::retrieve_a
249259
stream);
250260

251261
std::size_t h_num_out;
252-
CUCO_CUDA_TRY(
253-
cudaMemcpyAsync(&h_num_out, d_num_out, sizeof(std::size_t), cudaMemcpyDeviceToHost, stream));
262+
cuco::detail::memcpy_async(
263+
&h_num_out, d_num_out, sizeof(std::size_t), cudaMemcpyDeviceToHost, cuda::stream_ref{stream});
254264
CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
255265
temp_allocator.deallocate(
256266
reinterpret_cast<char*>(d_num_out), sizeof(std::size_t), cuda::stream_ref{stream});
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
/*
2+
* Copyright (c) 2025, NVIDIA CORPORATION.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#pragma once
18+
19+
#include <cuco/detail/error.hpp>
20+
21+
#include <cuda/stream_ref>
22+
23+
namespace cuco::detail {
24+
25+
/**
26+
* @brief Asynchronous memory copy utility that works around cudaMemcpyAsync bugs
27+
*
28+
* This function provides a drop-in replacement for cudaMemcpyAsync that uses
29+
* cudaMemcpyBatchAsync internally to work around known issues with cudaMemcpyAsync.
30+
* The function automatically handles the different API signatures between CUDA
31+
* runtime versions.
32+
*
33+
* @param dst Destination memory address
34+
* @param src Source memory address
35+
* @param count Number of bytes to copy
36+
* @param kind Type of memory copy (cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost, etc.)
37+
* @param stream CUDA stream for the asynchronous operation
38+
*/
39+
inline void memcpy_async(
40+
void* dst, const void* src, size_t count, cudaMemcpyKind kind, cuda::stream_ref stream)
41+
{
42+
// Use cudaMemcpyBatchAsync as a workaround for cudaMemcpyAsync bugs
43+
void* dsts[1] = {dst};
44+
void* srcs[1] = {const_cast<void*>(src)};
45+
size_t sizes[1] = {count};
46+
cudaMemcpyAttributes attrs[1] = {{.srcAccessOrder = cudaMemcpySrcAccessOrderStream}};
47+
size_t attrsIdxs[1] = {0};
48+
49+
#if CUDART_VERSION >= 13000
50+
// CUDA 13.0+ API - no failIdx parameter
51+
CUCO_CUDA_TRY(cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrsIdxs, 1, stream.get()));
52+
#else
53+
// CUDA 12.x API - requires failIdx parameter
54+
size_t failIdx;
55+
CUCO_CUDA_TRY(
56+
cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrsIdxs, 1, &failIdx, stream.get()));
57+
#endif
58+
}
59+
60+
} // namespace cuco::detail

0 commit comments

Comments
 (0)