2 changes: 1 addition & 1 deletion vortex-cuda/benches/dict_cuda.rs
@@ -101,7 +101,7 @@ fn launch_dict_kernel_timed<V: cudarc::driver::DeviceRepr, I: cudarc::driver::De
let events = vortex_cuda::launch_cuda_kernel!(
execution_ctx: cuda_ctx,
module: "dict",
ptypes: &[value_ptype.to_string().as_str(), code_ptype.to_string().as_str()],
ptypes: &[value_ptype, code_ptype],
launch_args: [codes_view, codes_len_u64, values_view, output_view],
event_recording: CU_EVENT_BLOCKING_SYNC,
array_len: codes_len
8 changes: 4 additions & 4 deletions vortex-cuda/benches/for_cuda.rs
@@ -89,7 +89,7 @@ fn launch_for_kernel_timed_u8(
let events = vortex_cuda::launch_cuda_kernel!(
execution_ctx: cuda_ctx,
module: "for",
ptypes: &[for_array.ptype().to_string().as_str()],
ptypes: &[for_array.ptype()],
launch_args: [device_data, reference, array_len_u64],
event_recording: CU_EVENT_BLOCKING_SYNC,
array_len: for_array.len()
@@ -110,7 +110,7 @@ fn launch_for_kernel_timed_u16(
let events = vortex_cuda::launch_cuda_kernel!(
execution_ctx: cuda_ctx,
module: "for",
ptypes: &[for_array.ptype().to_string().as_str()],
ptypes: &[for_array.ptype()],
launch_args: [device_data, reference, array_len_u64],
event_recording: CU_EVENT_BLOCKING_SYNC,
array_len: for_array.len()
@@ -131,7 +131,7 @@ fn launch_for_kernel_timed_u32(
let events = vortex_cuda::launch_cuda_kernel!(
execution_ctx: cuda_ctx,
module: "for",
ptypes: &[for_array.ptype().to_string().as_str()],
ptypes: &[for_array.ptype()],
launch_args: [device_data, reference, array_len_u64],
event_recording: CU_EVENT_BLOCKING_SYNC,
array_len: for_array.len()
@@ -152,7 +152,7 @@ fn launch_for_kernel_timed_u64(
let events = vortex_cuda::launch_cuda_kernel!(
execution_ctx: cuda_ctx,
module: "for",
ptypes: &[for_array.ptype().to_string().as_str()],
ptypes: &[for_array.ptype()],
launch_args: [device_data, reference, array_len_u64],
event_recording: CU_EVENT_BLOCKING_SYNC,
array_len: for_array.len()
55 changes: 55 additions & 0 deletions vortex-cuda/kernels/src/patches.cu
@@ -0,0 +1,55 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

#include <stdint.h>

// Apply patches to a source array
template<typename ValueT, typename IndexT>
__device__ void patches(
ValueT *const values,
const IndexT *const patchIndices,
const ValueT *const patchValues,
uint64_t patchesLen
) {
const uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;

if (idx >= patchesLen) {
return;
}

const IndexT patchIdx = patchIndices[idx];
const ValueT patchVal = patchValues[idx];

const size_t valueIdx = static_cast<size_t>(patchIdx);
values[valueIdx] = patchVal;
}

#define GENERATE_PATCHES_KERNEL(ValueT, value_suffix, IndexT, index_suffix) \
extern "C" __global__ void patches_##value_suffix##_##index_suffix( \
ValueT *const values, \
const IndexT *const patchIndices, \
const ValueT *const patchValues, \
uint64_t patchesLen \
) { \
patches(values, patchIndices, patchValues, patchesLen); \
}

#define GENERATE_PATCHES_KERNEL_FOR_VALUE(ValueT, value_suffix) \
GENERATE_PATCHES_KERNEL(ValueT, value_suffix, uint8_t, u8) \
GENERATE_PATCHES_KERNEL(ValueT, value_suffix, uint16_t, u16) \
GENERATE_PATCHES_KERNEL(ValueT, value_suffix, uint32_t, u32) \
GENERATE_PATCHES_KERNEL(ValueT, value_suffix, uint64_t, u64)


GENERATE_PATCHES_KERNEL_FOR_VALUE(uint8_t, u8)
GENERATE_PATCHES_KERNEL_FOR_VALUE(uint16_t, u16)
GENERATE_PATCHES_KERNEL_FOR_VALUE(uint32_t, u32)
GENERATE_PATCHES_KERNEL_FOR_VALUE(uint64_t, u64)

GENERATE_PATCHES_KERNEL_FOR_VALUE(int8_t, i8)
GENERATE_PATCHES_KERNEL_FOR_VALUE(int16_t, i16)
GENERATE_PATCHES_KERNEL_FOR_VALUE(int32_t, i32)
GENERATE_PATCHES_KERNEL_FOR_VALUE(int64_t, i64)

GENERATE_PATCHES_KERNEL_FOR_VALUE(float, f32)
GENERATE_PATCHES_KERNEL_FOR_VALUE(double, f64)
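
For reference, the scatter these kernels implement is equivalent to the host-side loop below. This is a minimal illustrative sketch, not part of the PR; the function name and generic bounds are invented for the example.

```rust
/// Host-side reference for the generated `patches_*` kernels: scatter each
/// patch value into `values` at its corresponding patch index.
fn apply_patches_host<V: Copy, I: Copy + Into<u64>>(
    values: &mut [V],
    patch_indices: &[I],
    patch_values: &[V],
) {
    assert_eq!(patch_indices.len(), patch_values.len());
    for (idx, val) in patch_indices.iter().zip(patch_values) {
        // One GPU thread handles one (index, value) pair; here we just loop.
        let idx: u64 = (*idx).into();
        values[idx as usize] = *val;
    }
}
```

On the GPU the per-thread bounds check (`idx >= patchesLen`) takes the place of the loop bound, since the last block can launch more threads than there are remaining patches.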
3 changes: 1 addition & 2 deletions vortex-cuda/src/device_buffer.rs
@@ -27,6 +27,7 @@ use crate::stream::await_stream_callback;
/// A [`DeviceBuffer`] wrapping a CUDA GPU allocation.
///
/// Like the host `BufferHandle` variant, all slicing/referencing works in terms of byte units.
#[derive(Clone)]
pub struct CudaDeviceBuffer {
allocation: Arc<dyn private::DeviceAllocation>,
/// Offset in bytes from the start of the allocation
@@ -39,8 +40,6 @@ pub struct CudaDeviceBuffer {
alignment: Alignment,
}

// We can call the sys methods, it's just a lot of extra code...fuck that lol

mod private {
use std::fmt::Debug;
use std::sync::Arc;
2 changes: 1 addition & 1 deletion vortex-cuda/src/kernel/arrays/dict.rs
@@ -129,7 +129,7 @@ async fn execute_dict_prim_typed<V: DeviceRepr + NativePType, I: DeviceRepr + Na
let _cuda_events = crate::launch_cuda_kernel!(
execution_ctx: ctx,
module: "dict",
ptypes: &[value_ptype.to_string().as_str(), I::PTYPE.to_string().as_str()],
ptypes: &[value_ptype, I::PTYPE],
launch_args: [codes_view, codes_len_u64, values_view, output_view],
event_recording: cudarc::driver::sys::CUevent_flags::CU_EVENT_DISABLE_TIMING,
array_len: codes_len
60 changes: 44 additions & 16 deletions vortex-cuda/src/kernel/encodings/alp.rs
@@ -20,6 +20,7 @@ use vortex_array::arrays::PrimitiveArrayParts;
use vortex_array::buffer::BufferHandle;
use vortex_cuda_macros::cuda_tests;
use vortex_dtype::NativePType;
use vortex_dtype::match_each_unsigned_integer_ptype;
use vortex_error::VortexResult;
use vortex_error::vortex_err;

@@ -28,6 +29,7 @@ use crate::CudaDeviceBuffer;
use crate::executor::CudaArrayExt;
use crate::executor::CudaExecute;
use crate::executor::CudaExecutionCtx;
use crate::kernel::patches::execute_patches;
use crate::launch_cuda_kernel_impl;

/// CUDA decoder for ALP (Adaptive Lossless floating-Point) decompression.
@@ -88,20 +90,33 @@
// Load kernel function
let kernel_ptypes = [A::ALPInt::PTYPE, A::PTYPE];
let cuda_function = ctx.load_function_ptype("alp", &kernel_ptypes)?;
let mut launch_builder = ctx.launch_builder(&cuda_function);
{
let mut launch_builder = ctx.launch_builder(&cuda_function);

// Build launch args: input, output, f, e, length
launch_builder.arg(&input_view);
launch_builder.arg(&output_view);
launch_builder.arg(&f);
launch_builder.arg(&e);
launch_builder.arg(&array_len_u64);

// Launch kernel
let _cuda_events =
launch_cuda_kernel_impl(&mut launch_builder, CU_EVENT_DISABLE_TIMING, array_len)?;
}

// Build launch args: input, output, f, e, length
launch_builder.arg(&input_view);
launch_builder.arg(&output_view);
launch_builder.arg(&f);
launch_builder.arg(&e);
launch_builder.arg(&array_len_u64);
// Check if there are any patches to decode here
let output_buf = if let Some(patches) = array.patches() {
match_each_unsigned_integer_ptype!(patches.indices_ptype()?, |I| {
execute_patches::<A, I>(patches.clone(), output_buf, ctx).await?
})
} else {
output_buf
};

// Launch kernel
let _cuda_events =
launch_cuda_kernel_impl(&mut launch_builder, CU_EVENT_DISABLE_TIMING, array_len)?;
// TODO(aduffy): scatter patch values validity. There are several places we'll need to start
// handling validity.

// Build result with newly allocated buffer
let output_handle = BufferHandle::new_device(Arc::new(output_buf));
Ok(Canonical::Primitive(PrimitiveArray::from_buffer_handle(
output_handle,
@@ -117,8 +132,10 @@ mod tests {
use vortex_array::IntoArray;
use vortex_array::arrays::PrimitiveArray;
use vortex_array::assert_arrays_eq;
use vortex_array::validity::Validity::NonNullable;
use vortex_array::patches::Patches;
use vortex_array::validity::Validity;
use vortex_buffer::Buffer;
use vortex_buffer::buffer;
use vortex_error::VortexExpect;
use vortex_session::VortexSession;

@@ -138,13 +155,24 @@
let encoded_data: Vec<i32> = vec![100, 200, 300, 400, 500];
let exponents = Exponents { e: 0, f: 2 }; // multiply by 100

// Patches
let patches = Patches::new(
5,
0,
PrimitiveArray::new(buffer![0u32, 4u32], Validity::NonNullable).into_array(),
PrimitiveArray::new(buffer![0.0f32, 999f32], Validity::NonNullable).into_array(),
None,
)
.unwrap();

let alp_array = ALPArray::try_new(
PrimitiveArray::new(Buffer::from(encoded_data.clone()), NonNullable).into_array(),
PrimitiveArray::new(Buffer::from(encoded_data.clone()), Validity::NonNullable)
.into_array(),
exponents,
None,
Some(patches),
)?;

let cpu_result = alp_array.to_canonical()?;
let cpu_result = alp_array.to_canonical()?.into_array();

let gpu_result = ALPExecutor
.execute(alp_array.to_array(), &mut cuda_ctx)
@@ -154,7 +182,7 @@
.await?
.into_array();

assert_arrays_eq!(cpu_result.into_array(), gpu_result);
assert_arrays_eq!(cpu_result, gpu_result);

Ok(())
}
82 changes: 66 additions & 16 deletions vortex-cuda/src/kernel/encodings/bitpacked.rs
@@ -2,6 +2,7 @@
// SPDX-FileCopyrightText: Copyright the Vortex contributors

use std::fmt::Debug;
use std::sync::Arc;

use async_trait::async_trait;
use cudarc::driver::DeviceRepr;
@@ -16,6 +17,8 @@ use vortex_array::buffer::DeviceBufferExt;
use vortex_cuda_macros::cuda_tests;
use vortex_dtype::NativePType;
use vortex_dtype::match_each_integer_ptype;
use vortex_dtype::match_each_unsigned_integer_ptype;
use vortex_error::VortexExpect;
use vortex_error::VortexResult;
use vortex_error::vortex_ensure;
use vortex_error::vortex_err;
@@ -29,6 +32,7 @@ use crate::CudaDeviceBuffer;
use crate::executor::CudaExecute;
use crate::executor::CudaExecutionCtx;
use crate::kernel::launch_cuda_kernel_with_config;
use crate::kernel::patches::execute_patches;

/// CUDA decoder for bit-packed array decompression.
#[derive(Debug)]
@@ -74,7 +78,6 @@
} = array.into_parts();

vortex_ensure!(len > 0, "Non empty array");
vortex_ensure!(patches.is_none(), "Patches not supported");
let offset = offset as usize;

let device_input: BufferHandle = if packed.is_on_device() {
@@ -97,27 +100,46 @@
let thread_count = if bits == 64 { 16 } else { 32 };
let suffixes: [&str; _] = [&format!("{bit_width}bw"), &format!("{thread_count}t")];
let cuda_function = ctx.load_function(&format!("bit_unpack_{}", bits), &suffixes)?;
let mut launch_builder = ctx.launch_builder(&cuda_function);

// Build launch args: input, output, f, e, length
launch_builder.arg(&input_view);
launch_builder.arg(&output_view);
{
[Review comment from @a10y (Jan 30, 2026): "I had to move all the launch builder stuff into scope so it wasn't held across the await point for execute_patches." See the sketch after this file's diff.]
let mut launch_builder = ctx.launch_builder(&cuda_function);

let num_blocks = u32::try_from(len.div_ceil(1024))?;
// Build launch args: input, output, f, e, length
launch_builder.arg(&input_view);
launch_builder.arg(&output_view);

let config = LaunchConfig {
grid_dim: (num_blocks, 1, 1),
block_dim: (thread_count, 1, 1),
shared_mem_bytes: 0,
};
let num_blocks = u32::try_from(len.div_ceil(1024))?;

let config = LaunchConfig {
grid_dim: (num_blocks, 1, 1),
block_dim: (thread_count, 1, 1),
shared_mem_bytes: 0,
};

// Launch kernel
let _cuda_events =
launch_cuda_kernel_with_config(&mut launch_builder, config, CU_EVENT_DISABLE_TIMING)?;
// Launch kernel
let _cuda_events =
launch_cuda_kernel_with_config(&mut launch_builder, config, CU_EVENT_DISABLE_TIMING)?;
}

let output_handle = match patches {
None => BufferHandle::new_device(output_buf.slice_typed::<A>(offset..(offset + len))),
Some(p) => {
let output_buf = output_buf.slice_typed::<A>(offset..(offset + len));
let buf = output_buf
.as_any()
.downcast_ref::<CudaDeviceBuffer>()
.vortex_expect("we created this as CudaDeviceBuffer")
.clone();

let patched_buf = match_each_unsigned_integer_ptype!(p.indices_ptype()?, |I| {
execute_patches::<A, I>(p, buf, ctx).await?
});

BufferHandle::new_device(Arc::new(patched_buf))
}
};

// Build result with newly allocated buffer
let output_handle =
BufferHandle::new_device(output_buf.slice_typed::<A>(offset..(offset + len)));
Ok(Canonical::Primitive(PrimitiveArray::from_buffer_handle(
output_handle,
A::PTYPE,
Expand All @@ -141,6 +163,34 @@ mod tests {
use crate::CanonicalCudaExt;
use crate::session::CudaSession;

#[test]
fn test_patches() -> VortexResult<()> {
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
.vortex_expect("failed to create execution context");

let array = PrimitiveArray::new((0u16..=513).collect::<Buffer<_>>(), NonNullable);

// Last two items should be patched
let bp_with_patches = BitPackedArray::encode(array.as_ref(), 9)?;
assert!(bp_with_patches.patches().is_some());

let cpu_result = bp_with_patches.to_canonical()?.into_array();

let gpu_result = block_on(async {
BitPackedExecutor
.execute(bp_with_patches.to_array(), &mut cuda_ctx)
.await
.vortex_expect("GPU decompression failed")
.into_host()
.await
.map(|a| a.into_array())
})?;

assert_arrays_eq!(cpu_result, gpu_result);

Ok(())
}

#[rstest]
#[case::bw_1(1)]
#[case::bw_2(2)]
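The review comment above about moving the launch-builder setup into its own scope reflects a general async-Rust constraint: CUDA launch builders and event handles are typically not `Send`, and holding a non-`Send` value across an `.await` (here, the `execute_patches(...).await` call) makes the whole future non-`Send`. A minimal sketch of the pattern, using `Rc` as a stand-in for the non-`Send` launch state; the names are illustrative, not the crate's API.

```rust
use std::rc::Rc;

// Rc is !Send; it stands in for a CUDA launch builder / event handle.
async fn decode_then_patch() {
    {
        // The non-Send launch state lives only inside this block...
        let launch_builder = Rc::new("kernel args");
        println!("launching with {launch_builder}");
    } // ...and is dropped here, before any await point.

    // Nothing !Send is held across this await, so the future stays Send.
    async { /* stand-in for execute_patches(...).await */ }.await;
}

// Compile-time check: this would fail to build if the Rc were still
// alive across the await above.
fn _assert_future_is_send() {
    fn is_send<T: Send>(_: T) {}
    is_send(decode_then_patch());
}
```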
3 changes: 2 additions & 1 deletion vortex-cuda/src/kernel/mod.rs
@@ -23,6 +23,7 @@ use vortex_utils::aliases::dash_map::DashMap;
mod arrays;
mod encodings;
mod filter;
mod patches;
mod slice;

pub use arrays::DictExecutor;
@@ -65,7 +66,7 @@ macro_rules! launch_cuda_kernel {
array_len: $len:expr
) => {{
use ::cudarc::driver::PushKernelArg as _;
let cuda_function = $ctx.load_function($module, $ptypes)?;
let cuda_function = $ctx.load_function_ptype($module, $ptypes)?;
let mut launch_builder = $ctx.launch_builder(&cuda_function);

$(
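Relatedly, the bench and dict call sites in this PR now pass `PType` values straight through to the macro, and the macro resolves them via `load_function_ptype` instead of `load_function`. The kernel symbols generated in `patches.cu` follow a `<name>_<value suffix>_<index suffix>` scheme, so presumably the ptype-aware loader derives the suffixes itself. A hypothetical sketch of that mapping; the enum, `suffix` helper, and `kernel_symbol` function are invented for illustration and are not the crate's actual API.

```rust
// Stand-in for vortex_dtype::PType, reduced to a few variants.
#[derive(Clone, Copy)]
enum PType {
    U16,
    U32,
    F32,
}

impl PType {
    fn suffix(self) -> &'static str {
        match self {
            PType::U16 => "u16",
            PType::U32 => "u32",
            PType::F32 => "f32",
        }
    }
}

/// e.g. ("dict", &[PType::F32, PType::U16]) -> "dict_f32_u16", matching the
/// `patches_##value_suffix##_##index_suffix` convention in patches.cu.
fn kernel_symbol(module: &str, ptypes: &[PType]) -> String {
    std::iter::once(module.to_string())
        .chain(ptypes.iter().map(|p| p.suffix().to_string()))
        .collect::<Vec<_>>()
        .join("_")
}
```

Letting the loader own this formatting is what removes the `to_string().as_str()` noise from the call sites.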