diff --git a/vortex-cuda/benches/for_cuda.rs b/vortex-cuda/benches/for_cuda.rs index 1cfa2ff69b3..290835639b9 100644 --- a/vortex-cuda/benches/for_cuda.rs +++ b/vortex-cuda/benches/for_cuda.rs @@ -211,7 +211,7 @@ fn benchmark_for_u8(c: &mut Criterion) { for _ in 0..iters { let device_data = cuda_ctx - .copy_buffer_to_device(unpacked_slice) + .copy_buffer_to_device_sync(unpacked_slice) .vortex_expect("failed to copy to device"); let kernel_time = launch_for_kernel_timed_u8( @@ -260,7 +260,7 @@ fn benchmark_for_u16(c: &mut Criterion) { for _ in 0..iters { let device_data = cuda_ctx - .copy_buffer_to_device(unpacked_slice) + .copy_buffer_to_device_sync(unpacked_slice) .vortex_expect("failed to copy to device"); let kernel_time = launch_for_kernel_timed_u16( @@ -309,7 +309,7 @@ fn benchmark_for_u32(c: &mut Criterion) { for _ in 0..iters { let device_data = cuda_ctx - .copy_buffer_to_device(unpacked_slice) + .copy_buffer_to_device_sync(unpacked_slice) .vortex_expect("failed to copy to device"); let kernel_time = launch_for_kernel_timed_u32( @@ -358,7 +358,7 @@ fn benchmark_for_u64(c: &mut Criterion) { for _ in 0..iters { let device_data = cuda_ctx - .copy_buffer_to_device(unpacked_slice) + .copy_buffer_to_device_sync(unpacked_slice) .vortex_expect("failed to copy to device"); let kernel_time = launch_for_kernel_timed_u64( diff --git a/vortex-cuda/src/executor.rs b/vortex-cuda/src/executor.rs index 4ae59c6dd2b..258499e3a97 100644 --- a/vortex-cuda/src/executor.rs +++ b/vortex-cuda/src/executor.rs @@ -92,6 +92,11 @@ impl CudaExecutionCtx { self.cuda_session.load_function(module_name, ptypes) } + /// Returns a reference to the CUDA stream. + pub fn stream(&self) -> &Arc<CudaStream> { + &self.stream + } + /// Returns a launch builder for a CUDA kernel function. /// /// Arguments can be added to the kernel launch with `.arg(buffer)`. @@ -104,7 +109,7 @@ impl CudaExecutionCtx { } /// Copies host data to the device, returning a [`CudaDeviceBuffer`]. 
- pub fn copy_buffer_to_device<T: DeviceRepr>( + pub fn copy_buffer_to_device_sync<T: DeviceRepr>( &self, data: &[T], ) -> VortexResult<CudaDeviceBuffer<T>> { @@ -127,7 +132,7 @@ impl CudaExecutionCtx { /// # Returns /// /// A future that resolves to the device buffer handle when the copy completes. - pub fn copy_buffer_to_device_async<T: DeviceRepr>( + pub fn copy_buffer_to_device<T: DeviceRepr>( &self, handle: BufferHandle, ) -> VortexResult>> { diff --git a/vortex-cuda/src/kernel/encodings/for_.rs b/vortex-cuda/src/kernel/encodings/for_.rs index bea4c690081..148ce15b2f4 100644 --- a/vortex-cuda/src/kernel/encodings/for_.rs +++ b/vortex-cuda/src/kernel/encodings/for_.rs @@ -3,7 +3,6 @@ use async_trait::async_trait; use cudarc::driver::DeviceRepr; -use cudarc::driver::PushKernelArg; use cudarc::driver::sys::CUevent_flags::CU_EVENT_DISABLE_TIMING; use vortex_array::ArrayRef; use vortex_array::Canonical; @@ -19,7 +18,7 @@ use crate::CudaBufferExt; use crate::executor::CudaArrayExt; use crate::executor::CudaExecute; use crate::executor::CudaExecutionCtx; -use crate::launch_cuda_kernel; +use crate::kernel::KernelLauncher; /// CUDA executor for frame-of-reference. #[derive(Debug)] @@ -61,7 +60,7 @@ async fn execute_for_typed( let device_buffer_handle = if buffer_handle.is_on_device() { buffer_handle } else { - ctx.copy_buffer_to_device_async::<T>(buffer_handle)?.await? + ctx.copy_buffer_to_device::<T>(buffer_handle)?.await? }; let cuda_view = device_buffer_handle.cuda_view::<T>()?; @@ -69,15 +68,12 @@ async fn execute_for_typed( // Ignore the CUDA events returned from the kernel launch, as the CUDA slice, // owned by the buffer handle, holds CUDA events that can be checked for completion. - let _cuda_events = launch_cuda_kernel!( - execution_ctx: ctx, - module: "for", - ptypes: &[array.ptype()], - launch_args: [cuda_view, reference, array_len], - // CUDA events are automatically submitted before and after the kernel launch. - event_recording: CU_EVENT_DISABLE_TIMING, - array_len: array.len() - ); + let _cuda_events = KernelLauncher::new(ctx, "for", &[array.ptype()])? + .arg_cuda_view(&cuda_view) + .arg(&reference) + .arg(&array_len) + .event_flags(CU_EVENT_DISABLE_TIMING) + .launch(array.len())?; Ok(Canonical::Primitive(PrimitiveArray::from_buffer_handle( device_buffer_handle, diff --git a/vortex-cuda/src/kernel/launcher.rs b/vortex-cuda/src/kernel/launcher.rs new file mode 100644 index 00000000000..50ba4640e45 --- /dev/null +++ b/vortex-cuda/src/kernel/launcher.rs @@ -0,0 +1,281 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Type-safe CUDA kernel launcher that replaces the `launch_cuda_kernel!` macro. +//! +//! This module provides a builder-based API for launching CUDA kernels without +//! requiring a macro. The key challenge is that CUDA's `cuLaunchKernel` requires +//! a `void**` (array of pointers to arguments), so we need to store argument values +//! and provide stable pointers to them. +//! +//! # Example +//! +//! ```ignore +//! use vortex_cuda::kernel::KernelLauncher; +//! use cudarc::driver::sys::CUevent_flags::CU_EVENT_DISABLE_TIMING; +//! +//! // Instead of: +//! // launch_cuda_kernel!( +//! // execution_ctx: ctx, +//! // module: "for", +//! // ptypes: &[array.ptype()], +//! // launch_args: [cuda_view, reference, array_len], +//! // event_recording: CU_EVENT_DISABLE_TIMING, +//! // array_len: array.len() +//! // ); +//! +//! // Use: +//! 
let events = KernelLauncher::new(ctx, "for", &[array.ptype()])? +//! .arg_cuda_view(&cuda_view) +//! .arg(&reference) +//! .arg(&array_len) +//! .event_flags(CU_EVENT_DISABLE_TIMING) +//! .launch(array.len())?; +//! ``` + +use std::sync::Arc; + +use cudarc::driver::CudaFunction; +use cudarc::driver::CudaStream; +use cudarc::driver::CudaView; +use cudarc::driver::DevicePtr; +use cudarc::driver::DeviceRepr; +use cudarc::driver::LaunchConfig; +use cudarc::driver::PushKernelArg; +use cudarc::driver::sys::CUevent_flags; +use vortex_dtype::PType; +use vortex_error::VortexResult; +use vortex_error::vortex_err; + +use crate::CudaKernelEvents; +use crate::executor::CudaExecutionCtx; + +/// A builder for launching CUDA kernels with type-safe argument handling. +/// +/// This struct collects kernel arguments and configuration, then launches the kernel. +/// Arguments are stored internally to ensure their memory remains valid until launch. +/// +/// # Memory Layout +/// +/// Arguments are stored as `u64` values (8 bytes each), which is sufficient for: +/// - All primitive scalar types (u8, u16, u32, u64, i8, i16, i32, i64, f32, f64) +/// - Device pointers (`CUdeviceptr` is a `u64`) +/// +/// The arguments are added to cudarc's `LaunchArgs` builder at launch time, +/// after all arguments have been collected and storage is stable. +pub struct KernelLauncher<'a> { + stream: &'a Arc<CudaStream>, + function: CudaFunction, + /// Storage for argument values. Each value occupies one u64 slot. + storage: Vec<u64>, + /// Event recording flags (None means no event recording). + event_flags: Option<CUevent_flags>, +} + +impl<'a> KernelLauncher<'a> { + /// Creates a new kernel launcher for the specified module and ptypes. 
+ /// + /// # Arguments + /// + /// * `ctx` - The CUDA execution context + /// * `module_name` - Name of the PTX module (e.g., "for") + /// * `ptypes` - Primitive types that determine the kernel variant (e.g., `&[PType::U32]`) + /// + /// # Errors + /// + /// Returns an error if the kernel function cannot be loaded. + pub fn new( + ctx: &'a CudaExecutionCtx, + module_name: &str, + ptypes: &[PType], + ) -> VortexResult<Self> { + let function = ctx.load_function(module_name, ptypes)?; + let stream = ctx.stream(); + Ok(Self { + stream, + function, + storage: Vec::new(), + event_flags: None, + }) + } + + /// Adds a scalar argument to the kernel launch. + /// + /// Supports any type that implements `DeviceRepr` and `Copy` and fits in 8 bytes: + /// - Integers: u8, u16, u32, u64, i8, i16, i32, i64 + /// - Floats: f32, f64 + /// + /// # Panics + /// + /// Panics if `size_of::<T>() > 8`. + /// + /// # Example + /// + /// ```ignore + /// launcher + /// .arg(&42u32) + /// .arg(&3.14f64) + /// .arg(&array_len); + /// ``` + pub fn arg<T: DeviceRepr + Copy>(mut self, value: &T) -> Self { + assert!( + size_of::<T>() <= 8, + "Scalar argument must fit in 8 bytes, got {} bytes for {}", + size_of::<T>(), + std::any::type_name::<T>() + ); + + // Store the value as a u64 (zeroed first to ensure padding is deterministic) + let mut storage_value: u64 = 0; + // SAFETY: We've asserted that T fits in 8 bytes, and storage_value is properly aligned + // for u64 which is at least as aligned as any primitive type <= 8 bytes. + unsafe { + std::ptr::copy_nonoverlapping( + value as *const T as *const u8, + (&raw mut storage_value).cast::<u8>(), + size_of::<T>(), + ); + } + self.storage.push(storage_value); + self + } + + /// Adds a CUDA device buffer view as an argument. + /// + /// This extracts the device pointer from the view and stores it for the kernel. + /// The view's underlying memory must remain valid until the kernel completes execution. 
+ /// + /// # Example + /// + /// ```ignore + /// let cuda_view = buffer_handle.cuda_view::<T>()?; + /// launcher.arg_cuda_view(&cuda_view); + /// ``` + pub fn arg_cuda_view<T: DeviceRepr>(mut self, view: &CudaView<'_, T>) -> Self { + // Get the device pointer value (CUdeviceptr is a u64) + // The _sync guard is dropped immediately, but that's fine since we're just + // reading the pointer value, not scheduling any work yet. + let (device_ptr, _sync) = view.device_ptr(self.stream); + self.storage.push(device_ptr); + self + } + + /// Sets the event recording flags for kernel launch timing. + /// + /// Events are recorded before and after the kernel launch for synchronization + /// and optional timing measurements. + /// + /// # Arguments + /// + /// * `flags` - Event flags. Use `CU_EVENT_DISABLE_TIMING` for minimal overhead, + /// or `CU_EVENT_DEFAULT` to enable timestamps for profiling. + /// + /// # Example + /// + /// ```ignore + /// use cudarc::driver::sys::CUevent_flags::CU_EVENT_DISABLE_TIMING; + /// + /// launcher.event_flags(CU_EVENT_DISABLE_TIMING); + /// ``` + pub fn event_flags(mut self, flags: CUevent_flags) -> Self { + self.event_flags = Some(flags); + self + } + + /// Launches the kernel with the configured arguments. + /// + /// # Arguments + /// + /// * `array_len` - The total number of elements to process + /// + /// # Launch Configuration + /// + /// The kernel is launched with: + /// - `grid_dim`: `(ceil(array_len / 2048), 1, 1)` blocks + /// - `block_dim`: `(64, 1, 1)` threads per block (2 warps) + /// - Each thread processes 32 elements + /// - Each block processes 2048 elements + /// - The last block/thread may process fewer elements + /// + /// # Returns + /// + /// Returns `CudaKernelEvents` with before/after launch events for synchronization. 
+ /// + /// # Safety + /// + /// The kernel launch is inherently unsafe because: + /// - We cannot verify that arguments match the kernel signature + /// - We cannot verify argument order or types at compile time + /// - The kernel may access memory outside bounds + /// - Device buffers passed as arguments may be mutated by the kernel + /// + /// The caller is responsible for ensuring arguments are correct. + /// + /// # Errors + /// + /// Returns an error if: + /// - Event flags were not set (required for this API) + /// - Event recording fails + /// - The kernel launch fails + pub fn launch(self, array_len: usize) -> VortexResult<CudaKernelEvents> { + let num_chunks = u32::try_from(array_len.div_ceil(2048))?; + + let config = LaunchConfig { + grid_dim: (num_chunks, 1, 1), + block_dim: (64, 1, 1), + shared_mem_bytes: 0, + }; + + // Get the event flags - required for this API to match macro behavior + let event_flags = self + .event_flags + .ok_or_else(|| vortex_err!("Event flags must be set before launch"))?; + + // Build LaunchArgs using cudarc's builder. + // Storage is now stable (no more additions), so references remain valid. + let mut launch_args = self.stream.launch_builder(&self.function); + + // Add all stored arguments to the launch builder + for storage_val in self.storage.iter() { + launch_args.arg(storage_val); + } + + // Enable event recording + launch_args.record_kernel_launch(event_flags); + + // Launch the kernel + // SAFETY: This is unsafe because we cannot verify argument types match the kernel. + // The caller is responsible for ensuring arguments are correct. + unsafe { + launch_args + .launch(config) + .map_err(|e| vortex_err!("Failed to launch kernel: {}", e)) + .and_then(|events| { + events + .ok_or_else(|| vortex_err!("CUDA events not recorded")) + .map(|(before_launch, after_launch)| CudaKernelEvents { + before_launch, + after_launch, + }) + }) + } + } +} + +#[cfg(test)] +mod tests { + #[test] + fn test_arg_storage_size() { + // Verify that all supported scalar types fit in 8 bytes + assert!(size_of::<u8>() <= 8); + assert!(size_of::<u16>() <= 8); + assert!(size_of::<u32>() <= 8); + assert!(size_of::<u64>() <= 8); + assert!(size_of::<i8>() <= 8); + assert!(size_of::<i16>() <= 8); + assert!(size_of::<i32>() <= 8); + assert!(size_of::<i64>() <= 8); + assert!(size_of::<f32>() <= 8); + assert!(size_of::<f64>() <= 8); + } +} diff --git a/vortex-cuda/src/kernel/mod.rs b/vortex-cuda/src/kernel/mod.rs index a4efa4727ed..514f37242b4 100644 --- a/vortex-cuda/src/kernel/mod.rs +++ b/vortex-cuda/src/kernel/mod.rs @@ -22,14 +22,19 @@ use vortex_utils::aliases::dash_map::DashMap; mod arrays; mod encodings; +mod launcher; pub use arrays::DictExecutor; pub use encodings::FoRExecutor; +pub use launcher::KernelLauncher; use crate::CudaKernelEvents; /// Convenience macro to launch a CUDA kernel. /// +/// **Deprecated**: Prefer using [`KernelLauncher`] instead, which provides +/// the same functionality with a cleaner builder API. +/// /// The kernel gets launched on the stream of the execution context. /// /// The kernel launch config: diff --git a/vortex-cuda/src/lib.rs b/vortex-cuda/src/lib.rs index d9da884a5d9..ac3ee70f62a 100644 --- a/vortex-cuda/src/lib.rs +++ b/vortex-cuda/src/lib.rs @@ -17,6 +17,7 @@ pub use executor::CudaExecutionCtx; pub use executor::CudaKernelEvents; use kernel::DictExecutor; use kernel::FoRExecutor; +pub use kernel::KernelLauncher; pub use session::CudaSession; use vortex_array::arrays::DictVTable;