diff --git a/Cargo.lock b/Cargo.lock index 9cc10fe4415..2b9d93911a4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2270,6 +2270,7 @@ version = "0.18.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3aa12038120eb13347a6ae2ffab1d34efe78150125108627fd85044dd4d6ff1e" dependencies = [ + "half", "libloading 0.8.9", ] @@ -4441,6 +4442,8 @@ dependencies = [ "cfg-if", "crunchy", "num-traits", + "rand 0.9.2", + "rand_distr 0.5.1", "zerocopy", ] @@ -10409,6 +10412,7 @@ dependencies = [ "vortex-mask", "vortex-nvcomp", "vortex-scalar", + "vortex-sequence", "vortex-session", "vortex-utils", "vortex-zigzag", diff --git a/encodings/sequence/src/array.rs b/encodings/sequence/src/array.rs index f99e2d812ab..36ad094d3be 100644 --- a/encodings/sequence/src/array.rs +++ b/encodings/sequence/src/array.rs @@ -59,13 +59,22 @@ pub struct SequenceMetadata { multiplier: Option, } +/// Components of [`SequenceArray`]. +pub struct SequenceArrayParts { + pub base: PValue, + pub multiplier: PValue, + pub len: usize, + pub ptype: PType, + pub nullability: Nullability, +} + #[derive(Clone, Debug)] /// An array representing the equation `A[i] = base + i * multiplier`. pub struct SequenceArray { base: PValue, multiplier: PValue, dtype: DType, - pub(crate) length: usize, + pub(crate) len: usize, stats_set: ArrayStats, } @@ -124,7 +133,7 @@ impl SequenceArray { base, multiplier, dtype, - length, + len: length, // TODO(joe): add stats, on construct or on use? stats_set: Default::default(), } @@ -164,7 +173,7 @@ impl SequenceArray { } pub(crate) fn index_value(&self, idx: usize) -> PValue { - assert!(idx < self.length, "index_value({idx}): index out of bounds"); + assert!(idx < self.len, "index_value({idx}): index out of bounds"); match_each_native_ptype!(self.ptype(), |P| { let base = self.base.cast::

(); @@ -177,9 +186,19 @@ impl SequenceArray { /// Returns the validated final value of a sequence array pub fn last(&self) -> PValue { - Self::try_last(self.base, self.multiplier, self.ptype(), self.length) + Self::try_last(self.base, self.multiplier, self.ptype(), self.len) .vortex_expect("validated array") } + + pub fn into_parts(self) -> SequenceArrayParts { + SequenceArrayParts { + base: self.base, + multiplier: self.multiplier, + len: self.len, + ptype: self.dtype.as_ptype(), + nullability: self.dtype.nullability(), + } + } } impl VTable for SequenceVTable { @@ -355,7 +374,7 @@ fn execute_iter>( impl BaseArrayVTable for SequenceVTable { fn len(array: &SequenceArray) -> usize { - array.length + array.len } fn dtype(array: &SequenceArray) -> &DType { @@ -374,14 +393,14 @@ impl BaseArrayVTable for SequenceVTable { array.base.hash(state); array.multiplier.hash(state); array.dtype.hash(state); - array.length.hash(state); + array.len.hash(state); } fn array_eq(array: &SequenceArray, other: &SequenceArray, _precision: Precision) -> bool { array.base == other.base && array.multiplier == other.multiplier && array.dtype == other.dtype - && array.length == other.length + && array.len == other.len } } diff --git a/encodings/sequence/src/kernel.rs b/encodings/sequence/src/kernel.rs index b13d5d491e7..586cfe629f7 100644 --- a/encodings/sequence/src/kernel.rs +++ b/encodings/sequence/src/kernel.rs @@ -89,7 +89,7 @@ impl ExecuteParentKernel for SequenceCompareKernel { // Constant is null - result is all null for comparisons let nullability = array.dtype().nullability() | constant.dtype().nullability(); let result_array = - ConstantArray::new(Scalar::null(DType::Bool(nullability)), array.length).to_array(); + ConstantArray::new(Scalar::null(DType::Bool(nullability)), array.len).to_array(); return Ok(Some(result_array.execute(ctx)?)); }; @@ -125,22 +125,22 @@ fn compare_eq_neq( // Check if there exists an integer solution to const = base + idx * multiplier let Some(set_idx) = - find_intersection_scalar(array.base(), array.multiplier(), array.length, constant) + find_intersection_scalar(array.base(), array.multiplier(), array.len, constant) else { let result_array = ConstantArray::new( Scalar::new(DType::Bool(nullability), not_match_val.into()), - array.length, + array.len, ) .to_array(); return Ok(Some(result_array.execute(ctx)?)); }; let idx = set_idx as u64; - let len = array.length as u64; + let len = array.len as u64; if len == 1 && set_idx == 0 { let result_array = ConstantArray::new( Scalar::new(DType::Bool(nullability), match_val.into()), - array.length, + array.len, ) .to_array(); return Ok(Some(result_array.execute(ctx)?)); @@ -179,7 +179,7 @@ fn compare_ordering( let transition = find_transition_point( array.base(), array.multiplier(), - array.length, + array.len, constant, operator, ); @@ -187,23 +187,23 @@ fn compare_ordering( let result_array = match transition { Transition::AllTrue => ConstantArray::new( Scalar::new(DType::Bool(nullability), true.into()), - array.length, + array.len, ) .to_array(), Transition::AllFalse => ConstantArray::new( Scalar::new(DType::Bool(nullability), false.into()), - array.length, + array.len, ) .to_array(), Transition::FalseToTrue(idx) => { // [0..idx) is false, [idx..len) is true - let ends = buffer![idx as u64, array.length as u64].into_array(); + let ends = buffer![idx as u64, array.len as u64].into_array(); let values = BoolArray::new(bitbuffer![false, true], nullability.into()).into_array(); RunEndArray::try_new(ends, values)?.into_array() } Transition::TrueToFalse(idx) => { // [0..idx) is true, [idx..len) is false - let ends = buffer![idx as u64, array.length as u64].into_array(); + let ends = buffer![idx as u64, array.len as u64].into_array(); let values = BoolArray::new(bitbuffer![true, false], nullability.into()).into_array(); RunEndArray::try_new(ends, values)?.into_array() } diff --git a/encodings/sequence/src/lib.rs b/encodings/sequence/src/lib.rs index 1b099edcc9b..7dabc831b99 100644 --- a/encodings/sequence/src/lib.rs +++ b/encodings/sequence/src/lib.rs @@ -9,6 +9,7 @@ mod kernel; /// Represents the equation A\[i\] = a * i + b. /// This can be used for compression, fast comparisons and also for row ids. pub use array::SequenceArray; +pub use array::SequenceArrayParts; /// Represents the equation A\[i\] = a * i + b. /// This can be used for compression, fast comparisons and also for row ids. pub use array::SequenceVTable; diff --git a/vortex-cuda/Cargo.toml b/vortex-cuda/Cargo.toml index b2f5a0bc1cf..99a481e1720 100644 --- a/vortex-cuda/Cargo.toml +++ b/vortex-cuda/Cargo.toml @@ -23,7 +23,7 @@ _test-harness = [] [dependencies] arc-swap = { workspace = true } async-trait = { workspace = true } -cudarc = { workspace = true } +cudarc = { workspace = true, features = ["f16"] } fastlanes = { workspace = true } futures = { workspace = true, features = ["executor"] } kanal = { workspace = true } @@ -41,6 +41,7 @@ vortex-fastlanes = { workspace = true } vortex-io = { workspace = true } vortex-mask = { workspace = true } vortex-nvcomp = { path = "nvcomp" } +vortex-sequence = { workspace = true } vortex-session = { workspace = true } vortex-utils = { workspace = true } vortex-zigzag = { workspace = true } diff --git a/vortex-cuda/kernels/src/sequence.cu b/vortex-cuda/kernels/src/sequence.cu new file mode 100644 index 00000000000..1b58d62fa93 --- /dev/null +++ b/vortex-cuda/kernels/src/sequence.cu @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +#include "config.cuh" +#include + +#define MIN(a, b) (((a) < (b)) : (a) : (b)) + +template +__device__ void sequence( + ValueT *const output, + ValueT base, + ValueT multiplier, + uint64_t len +) { + const uint64_t worker = blockIdx.x * blockDim.x + threadIdx.x; + + const uint64_t elemStart = MIN(worker * ELEMENTS_PER_THREAD, len); + const uint64_t elemEnd = MIN(elemStart + ELEMENTS_PER_THREAD, len); + + for (uint64_t idx = elemStart; idx < elemEnd; idx++) { + output[idx] = static_cast(idx) * multiplier + base; + } +} + +#define GENERATE_KERNEL(ValueT, suffix) \ +extern "C" __global__ void sequence_##suffix( \ + ValueT *const output, \ + ValueT base, \ + ValueT multiplier, \ + uint64_t len \ +) { \ + sequence(output, base, multiplier, len); \ +} + +GENERATE_KERNEL(uint8_t, u8); +GENERATE_KERNEL(uint16_t, u16); +GENERATE_KERNEL(uint32_t, u32); +GENERATE_KERNEL(uint64_t, u64); +GENERATE_KERNEL(int8_t, i8); +GENERATE_KERNEL(int16_t, i16); +GENERATE_KERNEL(int32_t, i32); +GENERATE_KERNEL(int64_t, i64); +GENERATE_KERNEL(float, f32); +GENERATE_KERNEL(double, f64); diff --git a/vortex-cuda/src/kernel/encodings/mod.rs b/vortex-cuda/src/kernel/encodings/mod.rs index addb877543c..ed38a71b023 100644 --- a/vortex-cuda/src/kernel/encodings/mod.rs +++ b/vortex-cuda/src/kernel/encodings/mod.rs @@ -5,6 +5,7 @@ mod alp; mod bitpacked; mod decimal_byte_parts; mod for_; +mod sequence; mod zigzag; mod zstd; @@ -12,6 +13,7 @@ pub use alp::ALPExecutor; pub use bitpacked::BitPackedExecutor; pub use decimal_byte_parts::DecimalBytePartsExecutor; pub use for_::FoRExecutor; +pub use sequence::SequenceExecutor; pub use zigzag::ZigZagExecutor; pub use zstd::ZstdExecutor; pub use zstd::ZstdKernelPrep; diff --git a/vortex-cuda/src/kernel/encodings/sequence.rs b/vortex-cuda/src/kernel/encodings/sequence.rs new file mode 100644 index 00000000000..d098a02d971 --- /dev/null +++ b/vortex-cuda/src/kernel/encodings/sequence.rs @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::sync::Arc; + +use async_trait::async_trait; +use cudarc::driver::DeviceRepr; +use cudarc::driver::sys::CUevent_flags::CU_EVENT_DISABLE_TIMING; +use vortex_array::ArrayRef; +use vortex_array::Canonical; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::buffer::BufferHandle; +use vortex_cuda_macros::cuda_tests; +use vortex_dtype::NativePType; +use vortex_dtype::Nullability; +use vortex_dtype::match_each_native_ptype; +use vortex_error::VortexResult; +use vortex_error::vortex_err; +use vortex_sequence::SequenceArrayParts; +use vortex_sequence::SequenceVTable; + +use crate::CudaDeviceBuffer; +use crate::CudaExecutionCtx; +use crate::executor::CudaExecute; +use crate::launch_cuda_kernel; + +/// CUDA execution for `SequenceArray`. +#[derive(Debug)] +pub struct SequenceExecutor; + +#[async_trait] +impl CudaExecute for SequenceExecutor { + async fn execute( + &self, + array: ArrayRef, + ctx: &mut CudaExecutionCtx, + ) -> VortexResult { + let array = array + .try_into::() + .map_err(|_| vortex_err!("SequenceExecutor can only accept SequenceArray"))?; + + let SequenceArrayParts { + base, + multiplier, + len, + ptype, + nullability, + } = array.into_parts(); + + match_each_native_ptype!(ptype, |P| { + let base = base.cast::

(); + let multiplier = multiplier.cast::

(); + + execute_typed::

(base, multiplier, len, nullability, ctx).await + }) + } +} + +async fn execute_typed( + base: T, + multiplier: T, + len: usize, + nullability: Nullability, + ctx: &mut CudaExecutionCtx, +) -> VortexResult { + let buffer = ctx.device_alloc::(len)?; + + let len_u64 = len as u64; + + let _events = launch_cuda_kernel!( + execution_ctx: ctx, + module: "sequence", + ptypes: &[T::PTYPE.to_string().as_str()], + launch_args: [buffer, base, multiplier, len_u64], + event_recording: CU_EVENT_DISABLE_TIMING, + array_len: len + ); + + let output_buf = BufferHandle::new_device(Arc::new(CudaDeviceBuffer::new(buffer))); + + Ok(Canonical::Primitive(PrimitiveArray::from_buffer_handle( + output_buf, + T::PTYPE, + nullability.into(), + ))) +} + +#[cuda_tests] +mod tests { + use futures::executor::block_on; + use rstest::rstest; + use vortex_array::IntoArray; + use vortex_array::assert_arrays_eq; + use vortex_dtype::NativePType; + use vortex_dtype::Nullability; + use vortex_scalar::PValue; + use vortex_sequence::SequenceArray; + use vortex_session::VortexSession; + + use crate::CanonicalCudaExt; + use crate::CudaSession; + use crate::executor::CudaExecute; + use crate::kernel::encodings::sequence::SequenceExecutor; + + #[rstest] + #[case::u8(10u8, 2u8, 10)] + #[case::u16(10u16, 2u16, 100)] + #[case::u32(10u32, 2u32, 1000)] + #[case::u64(100u64, 20u64, 500)] + fn test_sequence>( + #[case] base: T, + #[case] multiplier: T, + #[case] len: usize, + ) { + block_on( + async move { test_ptype::(base, multiplier, len, Nullability::NonNullable).await }, + ); + + block_on( + async move { test_ptype::(base, multiplier, len, Nullability::Nullable).await }, + ); + } + + async fn test_ptype>( + base: P, + multiplier: P, + len: usize, + nullability: Nullability, + ) { + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()).unwrap(); + + let array = SequenceArray::typed_new(base, multiplier, nullability, len).unwrap(); + + let cpu_result = array.to_canonical().unwrap().into_array(); + + let gpu_result = SequenceExecutor + .execute(array.into_array(), &mut cuda_ctx) + .await + .unwrap() + .into_host() + .await + .unwrap() + .into_array(); + + assert_arrays_eq!(cpu_result, gpu_result); + } +} diff --git a/vortex-cuda/src/lib.rs b/vortex-cuda/src/lib.rs index c914fe1b749..b464281fde8 100644 --- a/vortex-cuda/src/lib.rs +++ b/vortex-cuda/src/lib.rs @@ -42,9 +42,11 @@ use vortex_decimal_byte_parts::DecimalBytePartsVTable; use vortex_fastlanes::BitPackedVTable; use vortex_fastlanes::FoRVTable; pub use vortex_nvcomp as nvcomp; +use vortex_sequence::SequenceVTable; use vortex_zigzag::ZigZagVTable; use vortex_zstd::ZstdVTable; +use crate::kernel::SequenceExecutor; use crate::kernel::SliceExecutor; /// Checks if CUDA is available on the system by looking for nvcc. @@ -63,6 +65,7 @@ pub fn initialize_cuda(session: &CudaSession) { session.register_kernel(DecimalBytePartsVTable::ID, &DecimalBytePartsExecutor); session.register_kernel(DictVTable::ID, &DictExecutor); session.register_kernel(FoRVTable::ID, &FoRExecutor); + session.register_kernel(SequenceVTable::ID, &SequenceExecutor); session.register_kernel(ZigZagVTable::ID, &ZigZagExecutor); session.register_kernel(ZstdVTable::ID, &ZstdExecutor);