Replace cudaMemcpyAsync with cudaMemcpyBatchAsync to get rid of a driver locking bug

PointKernel · PointKernel · commit ee5addf697f4 · 2025-12-05T10:54:12.000-08:00
diff --git a/include/cuco/detail/hyperloglog/hyperloglog_impl.cuh b/include/cuco/detail/hyperloglog/hyperloglog_impl.cuh
@@ -19,6 +19,7 @@
 #include <cuco/detail/error.hpp>
 #include <cuco/detail/hyperloglog/finalizer.cuh>
 #include <cuco/detail/hyperloglog/kernels.cuh>
+#include <cuco/detail/utility/memcpy_async.cuh>
 #include <cuco/detail/utils.hpp>
 #include <cuco/hash_functions.cuh>
 #include <cuco/types.cuh>
@@ -420,11 +421,11 @@ class hyperloglog_impl {
     std::vector<register_type> host_sketch(num_regs);
 
     // TODO check if storage is host accessible
-    CUCO_CUDA_TRY(cudaMemcpyAsync(host_sketch.data(),
-                                  this->sketch_.data(),
-                                  sizeof(register_type) * num_regs,
-                                  cudaMemcpyDefault,
-                                  stream.get()));
+    cuco::detail::memcpy_async(host_sketch.data(),
+                               this->sketch_.data(),
+                               sizeof(register_type) * num_regs,
+                               cudaMemcpyDefault,
+                               stream);
 #if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1)
     stream.sync();
 #else
diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh
@@ -21,6 +21,7 @@
 #include <cuco/detail/open_addressing/kernels.cuh>
 #include <cuco/detail/storage/counter_storage.cuh>
 #include <cuco/detail/utility/cuda.hpp>
+#include <cuco/detail/utility/memcpy_async.cuh>
 #include <cuco/detail/utils.hpp>
 #include <cuco/extent.cuh>
 #include <cuco/operator.hpp>
@@ -882,8 +883,8 @@ class open_addressing_impl {
                                           stream.get()));
 
       size_type temp_count;
-      CUCO_CUDA_TRY(cudaMemcpyAsync(
-        &temp_count, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream.get()));
+      cuco::detail::memcpy_async(
+        &temp_count, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream);
 #if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1)
       stream.sync();
 #else
diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl
@@ -16,6 +16,7 @@
 
 #include <cuco/detail/bitwise_compare.cuh>
 #include <cuco/detail/error.hpp>
+#include <cuco/detail/utility/memcpy_async.cuh>
 #include <cuco/detail/utils.cuh>
 #include <cuco/detail/utils.hpp>
 
@@ -108,8 +109,11 @@ void static_map<Key, Value, Scope, Allocator>::insert(
 
   detail::insert<block_size, tile_size>
     <<<grid_size, block_size, 0, stream>>>(first, num_keys, num_successes_, view, hash, key_equal);
-  CUCO_CUDA_TRY(cudaMemcpyAsync(
-    &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
+  cuco::detail::memcpy_async(&h_num_successes,
+                             num_successes_,
+                             sizeof(atomic_ctr_type),
+                             cudaMemcpyDeviceToHost,
+                             cuda::stream_ref{stream});
 
   CUCO_CUDA_TRY(cudaStreamSynchronize(stream));  // stream sync to ensure h_num_successes is updated
 
@@ -146,8 +150,11 @@ void static_map<Key, Value, Scope, Allocator>::insert_if(InputIt first,
 
   detail::insert_if_n<block_size, tile_size><<<grid_size, block_size, 0, stream>>>(
     first, num_keys, num_successes_, view, stencil, pred, hash, key_equal);
-  CUCO_CUDA_TRY(cudaMemcpyAsync(
-    &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
+  cuco::detail::memcpy_async(&h_num_successes,
+                             num_successes_,
+                             sizeof(atomic_ctr_type),
+                             cudaMemcpyDeviceToHost,
+                             cuda::stream_ref{stream});
   CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
 
   size_ += h_num_successes;
@@ -178,8 +185,11 @@ void static_map<Key, Value, Scope, Allocator>::erase(
 
   detail::erase<block_size, tile_size>
     <<<grid_size, block_size, 0, stream>>>(first, num_keys, num_successes_, view, hash, key_equal);
-  CUCO_CUDA_TRY(cudaMemcpyAsync(
-    &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
+  cuco::detail::memcpy_async(&h_num_successes,
+                             num_successes_,
+                             sizeof(atomic_ctr_type),
+                             cudaMemcpyDeviceToHost,
+                             cuda::stream_ref{stream});
 
   CUCO_CUDA_TRY(cudaStreamSynchronize(stream));  // stream sync to ensure h_num_successes is updated
 
@@ -249,8 +259,8 @@ std::pair<KeyOut, ValueOut> static_map<Key, Value, Scope, Allocator>::retrieve_a
                         stream);
 
   std::size_t h_num_out;
-  CUCO_CUDA_TRY(
-    cudaMemcpyAsync(&h_num_out, d_num_out, sizeof(std::size_t), cudaMemcpyDeviceToHost, stream));
+  cuco::detail::memcpy_async(
+    &h_num_out, d_num_out, sizeof(std::size_t), cudaMemcpyDeviceToHost, cuda::stream_ref{stream});
   CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
   temp_allocator.deallocate(
     reinterpret_cast<char*>(d_num_out), sizeof(std::size_t), cuda::stream_ref{stream});
diff --git a/include/cuco/detail/utility/memcpy_async.cuh b/include/cuco/detail/utility/memcpy_async.cuh
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuco/detail/error.hpp>
+
+#include <cuda/stream_ref>
+
+namespace cuco::detail {
+
+/**
+ * @brief Asynchronous memory copy utility that works around cudaMemcpyAsync bugs
+ *
+ * Drop-in replacement for cudaMemcpyAsync. Where possible the copy is issued through
+ * cudaMemcpyBatchAsync (a batch of one) to work around known issues with cudaMemcpyAsync;
+ * the differing API signatures of CUDA 12.x and CUDA 13.0+ are handled internally.
+ *
+ * The function falls back to plain cudaMemcpyAsync when cudaMemcpyBatchAsync cannot be used:
+ *  - the CUDA runtime is older than 12.8, which does not provide cudaMemcpyBatchAsync
+ *  - `stream` is the legacy default stream, which cudaMemcpyBatchAsync does not accept
+ *
+ * A zero-byte copy returns immediately without calling into the CUDA runtime.
+ * Errors from the underlying CUDA runtime call are reported through CUCO_CUDA_TRY.
+ *
+ * @param dst Destination memory address
+ * @param src Source memory address
+ * @param count Number of bytes to copy
+ * @param kind Type of memory copy (cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost, etc.);
+ * only consulted by the cudaMemcpyAsync fallback
+ * @param stream CUDA stream for the asynchronous operation
+ */
+inline void memcpy_async(
+  void* dst, const void* src, size_t count, cudaMemcpyKind kind, cuda::stream_ref stream)
+{
+  // Nothing to transfer
+  if (count == 0) { return; }
+
+  cudaStream_t const raw_stream = stream.get();
+
+#if CUDART_VERSION >= 12080
+  // cudaMemcpyBatchAsync must not be given the legacy default stream; such copies take the
+  // cudaMemcpyAsync fallback below
+  if (raw_stream != nullptr && raw_stream != cudaStreamLegacy) {
+    // Use cudaMemcpyBatchAsync with a batch of one as a workaround for cudaMemcpyAsync bugs
+    void* dsts[1]       = {dst};
+    void* srcs[1]       = {const_cast<void*>(src)};
+    size_t sizes[1]     = {count};
+    size_t attrsIdxs[1] = {0};
+
+    // Value-initialize, then assign: designated initializers are not standard before C++20
+    cudaMemcpyAttributes attrs[1] = {};
+    attrs[0].srcAccessOrder       = cudaMemcpySrcAccessOrderStream;
+
+#if CUDART_VERSION >= 13000
+    // CUDA 13.0+ API - no failIdx parameter
+    CUCO_CUDA_TRY(cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrsIdxs, 1, raw_stream));
+#else
+    // CUDA 12.x API - requires failIdx parameter
+    size_t failIdx = 0;
+    CUCO_CUDA_TRY(
+      cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrsIdxs, 1, &failIdx, raw_stream));
+#endif
+    return;
+  }
+#endif
+
+  CUCO_CUDA_TRY(cudaMemcpyAsync(dst, src, count, kind, raw_stream));
+}
+
+}  // namespace cuco::detail