2 changes: 1 addition & 1 deletion vortex-cuda/benches/dict_cuda.rs
@@ -101,7 +101,7 @@ fn launch_dict_kernel_timed<V: cudarc::driver::DeviceRepr, I: cudarc::driver::De
let events = vortex_cuda::launch_cuda_kernel!(
execution_ctx: cuda_ctx,
module: "dict",
ptypes: &[value_ptype.to_string().as_str(), code_ptype.to_string().as_str()],
ptypes: &[value_ptype, code_ptype],
launch_args: [codes_view, codes_len_u64, values_view, output_view],
event_recording: CU_EVENT_BLOCKING_SYNC,
array_len: codes_len
8 changes: 4 additions & 4 deletions vortex-cuda/benches/for_cuda.rs
@@ -89,7 +89,7 @@ fn launch_for_kernel_timed_u8(
let events = vortex_cuda::launch_cuda_kernel!(
execution_ctx: cuda_ctx,
module: "for",
ptypes: &[for_array.ptype().to_string().as_str()],
ptypes: &[for_array.ptype()],
launch_args: [device_data, reference, array_len_u64],
event_recording: CU_EVENT_BLOCKING_SYNC,
array_len: for_array.len()
@@ -110,7 +110,7 @@ fn launch_for_kernel_timed_u16(
let events = vortex_cuda::launch_cuda_kernel!(
execution_ctx: cuda_ctx,
module: "for",
ptypes: &[for_array.ptype().to_string().as_str()],
ptypes: &[for_array.ptype()],
launch_args: [device_data, reference, array_len_u64],
event_recording: CU_EVENT_BLOCKING_SYNC,
array_len: for_array.len()
@@ -131,7 +131,7 @@ fn launch_for_kernel_timed_u32(
let events = vortex_cuda::launch_cuda_kernel!(
execution_ctx: cuda_ctx,
module: "for",
ptypes: &[for_array.ptype().to_string().as_str()],
ptypes: &[for_array.ptype()],
launch_args: [device_data, reference, array_len_u64],
event_recording: CU_EVENT_BLOCKING_SYNC,
array_len: for_array.len()
@@ -152,7 +152,7 @@ fn launch_for_kernel_timed_u64(
let events = vortex_cuda::launch_cuda_kernel!(
execution_ctx: cuda_ctx,
module: "for",
ptypes: &[for_array.ptype().to_string().as_str()],
ptypes: &[for_array.ptype()],
launch_args: [device_data, reference, array_len_u64],
event_recording: CU_EVENT_BLOCKING_SYNC,
array_len: for_array.len()
55 changes: 55 additions & 0 deletions vortex-cuda/kernels/src/patches.cu
@@ -0,0 +1,55 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

#include <stdint.h>

// Apply patches to a source array
template<typename ValueT, typename IndexT>
__device__ void patches(
ValueT *const values,
const IndexT *const patchIndices,
const ValueT *const patchValues,
uint64_t patchesLen
) {
const uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;

if (idx >= patchesLen) {
return;
}

const IndexT patchIdx = patchIndices[idx];
const ValueT patchVal = patchValues[idx];

const size_t valueIdx = static_cast<size_t>(patchIdx);
values[valueIdx] = patchVal;
}

#define GENERATE_PATCHES_KERNEL(ValueT, value_suffix, IndexT, index_suffix) \
extern "C" __global__ void patches_##value_suffix##_##index_suffix( \
ValueT *const values, \
const IndexT *const patchIndices, \
const ValueT *const patchValues, \
uint64_t patchesLen \
) { \
patches(values, patchIndices, patchValues, patchesLen); \
}

#define GENERATE_PATCHES_KERNEL_FOR_VALUE(ValueT, value_suffix) \
GENERATE_PATCHES_KERNEL(ValueT, value_suffix, uint8_t, u8) \
GENERATE_PATCHES_KERNEL(ValueT, value_suffix, uint16_t, u16) \
GENERATE_PATCHES_KERNEL(ValueT, value_suffix, uint32_t, u32) \
GENERATE_PATCHES_KERNEL(ValueT, value_suffix, uint64_t, u64)


GENERATE_PATCHES_KERNEL_FOR_VALUE(uint8_t, u8)
GENERATE_PATCHES_KERNEL_FOR_VALUE(uint16_t, u16)
GENERATE_PATCHES_KERNEL_FOR_VALUE(uint32_t, u32)
GENERATE_PATCHES_KERNEL_FOR_VALUE(uint64_t, u64)

GENERATE_PATCHES_KERNEL_FOR_VALUE(int8_t, i8)
GENERATE_PATCHES_KERNEL_FOR_VALUE(int16_t, i16)
GENERATE_PATCHES_KERNEL_FOR_VALUE(int32_t, i32)
GENERATE_PATCHES_KERNEL_FOR_VALUE(int64_t, i64)

GENERATE_PATCHES_KERNEL_FOR_VALUE(float, f32)
GENERATE_PATCHES_KERNEL_FOR_VALUE(double, f64)
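
For reference, the scatter these kernels implement is equivalent to the host-side loop below. This is a minimal illustrative sketch, not part of the PR; the function name and generic bounds are invented for the example.

```rust
/// Host-side reference for the generated `patches_*` kernels: scatter each
/// patch value into `values` at its corresponding patch index.
fn apply_patches_host<V: Copy, I: Copy + Into<u64>>(
    values: &mut [V],
    patch_indices: &[I],
    patch_values: &[V],
) {
    assert_eq!(patch_indices.len(), patch_values.len());
    for (idx, val) in patch_indices.iter().zip(patch_values) {
        // One GPU thread handles one (index, value) pair; here we just loop.
        let idx: u64 = (*idx).into();
        values[idx as usize] = *val;
    }
}
```

On the GPU the per-thread bounds check (`idx >= patchesLen`) takes the place of the loop bound, since the last block can launch more threads than there are remaining patches.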
3 changes: 1 addition & 2 deletions vortex-cuda/src/device_buffer.rs
@@ -27,6 +27,7 @@ use crate::stream::await_stream_callback;
/// A [`DeviceBuffer`] wrapping a CUDA GPU allocation.
///
/// Like the host `BufferHandle` variant, all slicing/referencing works in terms of byte units.
#[derive(Clone)]
pub struct CudaDeviceBuffer {
allocation: Arc<dyn private::DeviceAllocation>,
/// Offset in bytes from the start of the allocation
@@ -39,8 +40,6 @@ pub struct CudaDeviceBuffer {
alignment: Alignment,
}

// We can call the sys methods, it's just a lot of extra code...fuck that lol

mod private {
use std::fmt::Debug;
use std::sync::Arc;
2 changes: 1 addition & 1 deletion vortex-cuda/src/kernel/arrays/dict.rs
@@ -129,7 +129,7 @@ async fn execute_dict_prim_typed<V: DeviceRepr + NativePType, I: DeviceRepr + Na
let _cuda_events = crate::launch_cuda_kernel!(
execution_ctx: ctx,
module: "dict",
ptypes: &[value_ptype.to_string().as_str(), I::PTYPE.to_string().as_str()],
ptypes: &[value_ptype, I::PTYPE],
launch_args: [codes_view, codes_len_u64, values_view, output_view],
event_recording: cudarc::driver::sys::CUevent_flags::CU_EVENT_DISABLE_TIMING,
array_len: codes_len
60 changes: 44 additions & 16 deletions vortex-cuda/src/kernel/encodings/alp.rs
@@ -20,6 +20,7 @@ use vortex_array::arrays::PrimitiveArrayParts;
use vortex_array::buffer::BufferHandle;
use vortex_cuda_macros::cuda_tests;
use vortex_dtype::NativePType;
use vortex_dtype::match_each_unsigned_integer_ptype;
use vortex_error::VortexResult;
use vortex_error::vortex_err;

@@ -28,6 +29,7 @@ use crate::CudaDeviceBuffer;
use crate::executor::CudaArrayExt;
use crate::executor::CudaExecute;
use crate::executor::CudaExecutionCtx;
use crate::kernel::patches::execute_patches;
use crate::launch_cuda_kernel_impl;

/// CUDA decoder for ALP (Adaptive Lossless floating-Point) decompression.
@@ -88,20 +90,33 @@
// Load kernel function
let kernel_ptypes = [A::ALPInt::PTYPE, A::PTYPE];
let cuda_function = ctx.load_function_ptype("alp", &kernel_ptypes)?;
let mut launch_builder = ctx.launch_builder(&cuda_function);
{
let mut launch_builder = ctx.launch_builder(&cuda_function);

// Build launch args: input, output, f, e, length
launch_builder.arg(&input_view);
launch_builder.arg(&output_view);
launch_builder.arg(&f);
launch_builder.arg(&e);
launch_builder.arg(&array_len_u64);

// Launch kernel
let _cuda_events =
launch_cuda_kernel_impl(&mut launch_builder, CU_EVENT_DISABLE_TIMING, array_len)?;
}

// Build launch args: input, output, f, e, length
launch_builder.arg(&input_view);
launch_builder.arg(&output_view);
launch_builder.arg(&f);
launch_builder.arg(&e);
launch_builder.arg(&array_len_u64);
// Check if there are any patches to decode here
let output_buf = if let Some(patches) = array.patches() {
match_each_unsigned_integer_ptype!(patches.indices_ptype()?, |I| {
execute_patches::<A, I>(patches.clone(), output_buf, ctx).await?
})
} else {
output_buf
};

// Launch kernel
let _cuda_events =
launch_cuda_kernel_impl(&mut launch_builder, CU_EVENT_DISABLE_TIMING, array_len)?;
// TODO(aduffy): scatter patch values validity. There are several places we'll need to start
// handling validity.

// Build result with newly allocated buffer
let output_handle = BufferHandle::new_device(Arc::new(output_buf));
Ok(Canonical::Primitive(PrimitiveArray::from_buffer_handle(
output_handle,
@@ -117,8 +132,10 @@ mod tests {
use vortex_array::IntoArray;
use vortex_array::arrays::PrimitiveArray;
use vortex_array::assert_arrays_eq;
use vortex_array::validity::Validity::NonNullable;
use vortex_array::patches::Patches;
use vortex_array::validity::Validity;
use vortex_buffer::Buffer;
use vortex_buffer::buffer;
use vortex_error::VortexExpect;
use vortex_session::VortexSession;

@@ -138,13 +155,24 @@
let encoded_data: Vec<i32> = vec![100, 200, 300, 400, 500];
let exponents = Exponents { e: 0, f: 2 }; // multiply by 100

// Patches
let patches = Patches::new(
5,
0,
PrimitiveArray::new(buffer![0u32, 4u32], Validity::NonNullable).into_array(),
PrimitiveArray::new(buffer![0.0f32, 999f32], Validity::NonNullable).into_array(),
None,
)
.unwrap();

let alp_array = ALPArray::try_new(
PrimitiveArray::new(Buffer::from(encoded_data.clone()), NonNullable).into_array(),
PrimitiveArray::new(Buffer::from(encoded_data.clone()), Validity::NonNullable)
.into_array(),
exponents,
None,
Some(patches),
)?;

let cpu_result = alp_array.to_canonical()?;
let cpu_result = alp_array.to_canonical()?.into_array();

let gpu_result = ALPExecutor
.execute(alp_array.to_array(), &mut cuda_ctx)
@@ -154,7 +182,7 @@
.await?
.into_array();

assert_arrays_eq!(cpu_result.into_array(), gpu_result);
assert_arrays_eq!(cpu_result, gpu_result);

Ok(())
}
82 changes: 66 additions & 16 deletions vortex-cuda/src/kernel/encodings/bitpacked.rs
@@ -2,6 +2,7 @@
// SPDX-FileCopyrightText: Copyright the Vortex contributors

use std::fmt::Debug;
use std::sync::Arc;

use async_trait::async_trait;
use cudarc::driver::DeviceRepr;
@@ -16,6 +17,8 @@ use vortex_array::buffer::DeviceBufferExt;
use vortex_cuda_macros::cuda_tests;
use vortex_dtype::NativePType;
use vortex_dtype::match_each_integer_ptype;
use vortex_dtype::match_each_unsigned_integer_ptype;
use vortex_error::VortexExpect;
use vortex_error::VortexResult;
use vortex_error::vortex_ensure;
use vortex_error::vortex_err;
@@ -29,6 +32,7 @@ use crate::CudaDeviceBuffer;
use crate::executor::CudaExecute;
use crate::executor::CudaExecutionCtx;
use crate::kernel::launch_cuda_kernel_with_config;
use crate::kernel::patches::execute_patches;

/// CUDA decoder for bit-packed array decompression.
#[derive(Debug)]
@@ -74,7 +78,6 @@
} = array.into_parts();

vortex_ensure!(len > 0, "Non empty array");
vortex_ensure!(patches.is_none(), "Patches not supported");
let offset = offset as usize;

let device_input: BufferHandle = if packed.is_on_device() {
@@ -97,27 +100,46 @@
let thread_count = if bits == 64 { 16 } else { 32 };
let suffixes: [&str; _] = [&format!("{bit_width}bw"), &format!("{thread_count}t")];
let cuda_function = ctx.load_function(&format!("bit_unpack_{}", bits), &suffixes)?;
let mut launch_builder = ctx.launch_builder(&cuda_function);

// Build launch args: input, output, f, e, length
launch_builder.arg(&input_view);
launch_builder.arg(&output_view);
{
[Review comment from @a10y (Jan 30, 2026): "I had to move all the launch builder stuff into scope so it wasn't held across the await point for execute_patches." See the sketch after this file's diff.]
let mut launch_builder = ctx.launch_builder(&cuda_function);

let num_blocks = u32::try_from(len.div_ceil(1024))?;
// Build launch args: input, output, f, e, length
launch_builder.arg(&input_view);
launch_builder.arg(&output_view);

let config = LaunchConfig {
grid_dim: (num_blocks, 1, 1),
block_dim: (thread_count, 1, 1),
shared_mem_bytes: 0,
};
let num_blocks = u32::try_from(len.div_ceil(1024))?;

let config = LaunchConfig {
grid_dim: (num_blocks, 1, 1),
block_dim: (thread_count, 1, 1),
shared_mem_bytes: 0,
};

// Launch kernel
let _cuda_events =
launch_cuda_kernel_with_config(&mut launch_builder, config, CU_EVENT_DISABLE_TIMING)?;
// Launch kernel
let _cuda_events =
launch_cuda_kernel_with_config(&mut launch_builder, config, CU_EVENT_DISABLE_TIMING)?;
}

let output_handle = match patches {
None => BufferHandle::new_device(output_buf.slice_typed::<A>(offset..(offset + len))),
Some(p) => {
let output_buf = output_buf.slice_typed::<A>(offset..(offset + len));
let buf = output_buf
.as_any()
.downcast_ref::<CudaDeviceBuffer>()
.vortex_expect("we created this as CudaDeviceBuffer")
.clone();

let patched_buf = match_each_unsigned_integer_ptype!(p.indices_ptype()?, |I| {
execute_patches::<A, I>(p, buf, ctx).await?
});

BufferHandle::new_device(Arc::new(patched_buf))
}
};

// Build result with newly allocated buffer
let output_handle =
BufferHandle::new_device(output_buf.slice_typed::<A>(offset..(offset + len)));
Ok(Canonical::Primitive(PrimitiveArray::from_buffer_handle(
output_handle,
A::PTYPE,
Expand All @@ -141,6 +163,34 @@ mod tests {
use crate::CanonicalCudaExt;
use crate::session::CudaSession;

#[test]
fn test_patches() -> VortexResult<()> {
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
.vortex_expect("failed to create execution context");

let array = PrimitiveArray::new((0u16..=513).collect::<Buffer<_>>(), NonNullable);

// Last two items should be patched
let bp_with_patches = BitPackedArray::encode(array.as_ref(), 9)?;
assert!(bp_with_patches.patches().is_some());

let cpu_result = bp_with_patches.to_canonical()?.into_array();

let gpu_result = block_on(async {
BitPackedExecutor
.execute(bp_with_patches.to_array(), &mut cuda_ctx)
.await
.vortex_expect("GPU decompression failed")
.into_host()
.await
.map(|a| a.into_array())
})?;

assert_arrays_eq!(cpu_result, gpu_result);

Ok(())
}

#[rstest]
#[case::bw_1(1)]
#[case::bw_2(2)]
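The review comment above about moving the launch-builder setup into its own scope reflects a general async-Rust constraint: CUDA launch builders and event handles are typically not `Send`, and holding a non-`Send` value across an `.await` (here, the `execute_patches(...).await` call) makes the whole future non-`Send`. A minimal sketch of the pattern, using `Rc` as a stand-in for the non-`Send` launch state; the names are illustrative, not the crate's API.

```rust
use std::rc::Rc;

// Rc is !Send; it stands in for a CUDA launch builder / event handle.
async fn decode_then_patch() {
    {
        // The non-Send launch state lives only inside this block...
        let launch_builder = Rc::new("kernel args");
        println!("launching with {launch_builder}");
    } // ...and is dropped here, before any await point.

    // Nothing !Send is held across this await, so the future stays Send.
    async { /* stand-in for execute_patches(...).await */ }.await;
}

// Compile-time check: this would fail to build if the Rc were still
// alive across the await above.
fn _assert_future_is_send() {
    fn is_send<T: Send>(_: T) {}
    is_send(decode_then_patch());
}
```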
3 changes: 2 additions & 1 deletion vortex-cuda/src/kernel/mod.rs
@@ -23,6 +23,7 @@ use vortex_utils::aliases::dash_map::DashMap;
mod arrays;
mod encodings;
mod filter;
mod patches;
mod slice;

pub use arrays::DictExecutor;
@@ -65,7 +66,7 @@ macro_rules! launch_cuda_kernel {
array_len: $len:expr
) => {{
use ::cudarc::driver::PushKernelArg as _;
let cuda_function = $ctx.load_function($module, $ptypes)?;
let cuda_function = $ctx.load_function_ptype($module, $ptypes)?;
let mut launch_builder = $ctx.launch_builder(&cuda_function);

$(
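Relatedly, the bench and dict call sites in this PR now pass `PType` values straight through to the macro, and the macro resolves them via `load_function_ptype` instead of `load_function`. The kernel symbols generated in `patches.cu` follow a `<name>_<value suffix>_<index suffix>` scheme, so presumably the ptype-aware loader derives the suffixes itself. A hypothetical sketch of that mapping; the enum, `suffix` helper, and `kernel_symbol` function are invented for illustration and are not the crate's actual API.

```rust
// Stand-in for vortex_dtype::PType, reduced to a few variants.
#[derive(Clone, Copy)]
enum PType {
    U16,
    U32,
    F32,
}

impl PType {
    fn suffix(self) -> &'static str {
        match self {
            PType::U16 => "u16",
            PType::U32 => "u32",
            PType::F32 => "f32",
        }
    }
}

/// e.g. ("dict", &[PType::F32, PType::U16]) -> "dict_f32_u16", matching the
/// `patches_##value_suffix##_##index_suffix` convention in patches.cu.
fn kernel_symbol(module: &str, ptypes: &[PType]) -> String {
    std::iter::once(module.to_string())
        .chain(ptypes.iter().map(|p| p.suffix().to_string()))
        .collect::<Vec<_>>()
        .join("_")
}
```

Letting the loader own this formatting is what removes the `to_string().as_str()` noise from the call sites.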