diff --git a/encodings/runend/Cargo.toml b/encodings/runend/Cargo.toml index 88cb70ef1a5..7817d65e216 100644 --- a/encodings/runend/Cargo.toml +++ b/encodings/runend/Cargo.toml @@ -48,3 +48,7 @@ harness = false [[bench]] name = "run_end_compress" harness = false + +[[bench]] +name = "run_end_decode" +harness = false diff --git a/encodings/runend/benches/run_end_decode.rs b/encodings/runend/benches/run_end_decode.rs new file mode 100644 index 00000000000..93aa5aa58f3 --- /dev/null +++ b/encodings/runend/benches/run_end_decode.rs @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +#![allow(clippy::unwrap_used, clippy::cast_possible_truncation)] + +use divan::Bencher; +use vortex_array::arrays::BoolArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::compute::warm_up_vtables; +use vortex_array::validity::Validity; +use vortex_buffer::BitBuffer; +use vortex_buffer::BufferMut; +use vortex_runend::decompress_bool::runend_decode_bools; + +fn main() { + warm_up_vtables(); + divan::main(); +} + +/// Distribution types for bool benchmarks +#[derive(Clone, Copy)] +enum BoolDistribution { + /// Alternating true/false (50/50) + Alternating, + /// Mostly true (90% true runs) + MostlyTrue, + /// Mostly false (90% false runs) + MostlyFalse, + /// All true + AllTrue, + /// All false + AllFalse, +} + +/// Creates bool test data with configurable distribution +fn create_bool_test_data( + total_length: usize, + avg_run_length: usize, + distribution: BoolDistribution, +) -> (PrimitiveArray, BoolArray) { + let mut ends = BufferMut::::with_capacity(total_length / avg_run_length + 1); + let mut values = Vec::with_capacity(total_length / avg_run_length + 1); + + let mut pos = 0usize; + let mut run_index = 0usize; + + while pos < total_length { + let run_len = avg_run_length.min(total_length - pos); + pos += run_len; + ends.push(pos as u32); + + let val = match distribution { + BoolDistribution::Alternating => run_index % 2 == 0, + BoolDistribution::MostlyTrue => run_index % 10 != 0, // 90% true + BoolDistribution::MostlyFalse => run_index % 10 == 0, // 10% true (90% false) + BoolDistribution::AllTrue => true, + BoolDistribution::AllFalse => false, + }; + values.push(val); + run_index += 1; + } + + ( + PrimitiveArray::new(ends.freeze(), Validity::NonNullable), + BoolArray::from(BitBuffer::from(values)), + ) +} + +// Medium size: 10k elements with various run lengths +const BOOL_ARGS: &[(usize, usize)] = &[ + (10_000, 2), // Very short runs (5000 runs) + (10_000, 10), // Short runs (1000 runs) + (10_000, 100), // Medium runs (100 runs) + (10_000, 1000), // Long runs (10 runs) +]; + +#[divan::bench(args = BOOL_ARGS)] +fn decode_bool_alternating(bencher: Bencher, (total_length, avg_run_length): (usize, usize)) { + let (ends, values) = + create_bool_test_data(total_length, avg_run_length, BoolDistribution::Alternating); + bencher.bench(|| runend_decode_bools(ends.clone(), values.clone(), 0, total_length)); +} + +#[divan::bench(args = BOOL_ARGS)] +fn decode_bool_mostly_true(bencher: Bencher, (total_length, avg_run_length): (usize, usize)) { + let (ends, values) = + create_bool_test_data(total_length, avg_run_length, BoolDistribution::MostlyTrue); + bencher.bench(|| runend_decode_bools(ends.clone(), values.clone(), 0, total_length)); +} + +#[divan::bench(args = BOOL_ARGS)] +fn decode_bool_mostly_false(bencher: Bencher, (total_length, avg_run_length): (usize, usize)) { + let (ends, values) = + create_bool_test_data(total_length, avg_run_length, BoolDistribution::MostlyFalse); + bencher.bench(|| runend_decode_bools(ends.clone(), values.clone(), 0, total_length)); +} + +#[divan::bench(args = BOOL_ARGS)] +fn decode_bool_all_true(bencher: Bencher, (total_length, avg_run_length): (usize, usize)) { + let (ends, values) = + create_bool_test_data(total_length, avg_run_length, BoolDistribution::AllTrue); + bencher.bench(|| runend_decode_bools(ends.clone(), values.clone(), 0, total_length)); +} + +#[divan::bench(args = BOOL_ARGS)] +fn decode_bool_all_false(bencher: Bencher, (total_length, avg_run_length): (usize, usize)) { + let (ends, values) = + create_bool_test_data(total_length, avg_run_length, BoolDistribution::AllFalse); + bencher.bench(|| runend_decode_bools(ends.clone(), values.clone(), 0, total_length)); +} diff --git a/encodings/runend/src/compress.rs b/encodings/runend/src/compress.rs index 8d8af3a828f..007e184ec65 100644 --- a/encodings/runend/src/compress.rs +++ b/encodings/runend/src/compress.rs @@ -186,23 +186,7 @@ pub fn runend_decode_primitive( })) } -pub fn runend_decode_bools( - ends: PrimitiveArray, - values: BoolArray, - offset: usize, - length: usize, -) -> VortexResult { - let validity_mask = values.validity_mask()?; - Ok(match_each_unsigned_integer_ptype!(ends.ptype(), |E| { - runend_decode_typed_bool( - trimmed_ends_iter(ends.as_slice::(), offset, length), - &values.to_bit_buffer(), - validity_mask, - values.dtype().nullability(), - length, - ) - })) -} +pub use crate::decompress_bool::runend_decode_bools; pub fn runend_decode_typed_primitive( run_ends: impl Iterator, @@ -263,47 +247,6 @@ pub fn runend_decode_typed_primitive( } } -pub fn runend_decode_typed_bool( - run_ends: impl Iterator, - values: &BitBuffer, - values_validity: Mask, - values_nullability: Nullability, - length: usize, -) -> BoolArray { - match values_validity { - Mask::AllTrue(_) => { - let mut decoded = BitBufferMut::with_capacity(length); - for (end, value) in run_ends.zip_eq(values.iter()) { - decoded.append_n(value, end - decoded.len()); - } - BoolArray::new(decoded.freeze(), values_nullability.into()) - } - Mask::AllFalse(_) => BoolArray::new(BitBuffer::new_unset(length), Validity::AllInvalid), - Mask::Values(mask) => { - let mut decoded = BitBufferMut::with_capacity(length); - let mut decoded_validity = BitBufferMut::with_capacity(length); - for (end, value) in run_ends.zip_eq( - values - .iter() - .zip(mask.bit_buffer().iter()) - .map(|(v, is_valid)| is_valid.then_some(v)), - ) { - match value { - None => { - decoded_validity.append_n(false, end - decoded.len()); - decoded.append_n(false, end - decoded.len()); - } - Some(value) => { - decoded_validity.append_n(true, end - decoded.len()); - decoded.append_n(value, end - decoded.len()); - } - } - } - BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())) - } - } -} - #[cfg(test)] mod test { use vortex_array::ToCanonical; diff --git a/encodings/runend/src/decompress_bool.rs b/encodings/runend/src/decompress_bool.rs new file mode 100644 index 00000000000..280b0d9f38f --- /dev/null +++ b/encodings/runend/src/decompress_bool.rs @@ -0,0 +1,353 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Optimized run-end decoding for boolean arrays. +//! +//! Uses an adaptive strategy that pre-fills the buffer with the majority value +//! (0s or 1s) and only fills the minority runs, minimizing work for skewed distributions. + +use itertools::Itertools; +use vortex_array::arrays::BoolArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::validity::Validity; +use vortex_buffer::BitBuffer; +use vortex_buffer::BitBufferMut; +use vortex_dtype::Nullability; +use vortex_dtype::match_each_unsigned_integer_ptype; +use vortex_error::VortexResult; +use vortex_mask::Mask; + +use crate::iter::trimmed_ends_iter; + +/// Decodes run-end encoded boolean values into a flat `BoolArray`. +pub fn runend_decode_bools( + ends: PrimitiveArray, + values: BoolArray, + offset: usize, + length: usize, +) -> VortexResult { + let validity = values.validity_mask()?; + Ok(match_each_unsigned_integer_ptype!(ends.ptype(), |E| { + runend_decode_typed_bool( + trimmed_ends_iter(ends.as_slice::(), offset, length), + &values.to_bit_buffer(), + validity, + values.dtype().nullability(), + length, + ) + })) +} + +/// Fills bits in range [start, end) to true using byte-level operations. +/// Assumes the buffer is pre-initialized to all zeros. +#[inline(always)] +fn fill_bits_true(slice: &mut [u8], start: usize, end: usize) { + if start >= end { + return; + } + + let start_byte = start / 8; + let start_bit = start % 8; + let end_byte = end / 8; + let end_bit = end % 8; + + if start_byte == end_byte { + // All bits in same byte + // Use u16 to avoid overflow, then truncate (guaranteed to fit in u8 since max is 0xFF) + #[allow(clippy::cast_possible_truncation)] + let mask = ((1u16 << (end_bit - start_bit)) - 1) as u8; + slice[start_byte] |= mask << start_bit; + } else { + // First partial byte + if start_bit != 0 { + slice[start_byte] |= !((1u8 << start_bit) - 1); + } + + // Middle bytes (bulk memset to 0xFF) + let fill_start = if start_bit != 0 { + start_byte + 1 + } else { + start_byte + }; + if fill_start < end_byte { + slice[fill_start..end_byte].fill(0xFF); + } + + // Last partial byte + if end_bit != 0 { + slice[end_byte] |= (1u8 << end_bit) - 1; + } + } +} + +/// Clears bits in range [start, end) to false using byte-level operations. +/// Assumes the buffer is pre-initialized to all ones. +#[inline(always)] +fn fill_bits_false(slice: &mut [u8], start: usize, end: usize) { + if start >= end { + return; + } + + let start_byte = start / 8; + let start_bit = start % 8; + let end_byte = end / 8; + let end_bit = end % 8; + + if start_byte == end_byte { + // All bits in same byte - create mask with 0s in the range we want to clear + #[allow(clippy::cast_possible_truncation)] + let mask = ((1u16 << (end_bit - start_bit)) - 1) as u8; + slice[start_byte] &= !(mask << start_bit); + } else { + // First partial byte - clear high bits from start_bit + if start_bit != 0 { + slice[start_byte] &= (1u8 << start_bit) - 1; + } + + // Middle bytes (bulk memset to 0x00) + let fill_start = if start_bit != 0 { + start_byte + 1 + } else { + start_byte + }; + if fill_start < end_byte { + slice[fill_start..end_byte].fill(0x00); + } + + // Last partial byte - clear low bits up to end_bit + if end_bit != 0 { + slice[end_byte] &= !((1u8 << end_bit) - 1); + } + } +} + +/// Decodes run-end encoded boolean values using an adaptive strategy. +/// +/// The strategy counts true vs false runs and chooses the optimal approach: +/// - If more true runs: pre-fill with 1s, clear false runs +/// - If more false runs: pre-fill with 0s, fill true runs +/// +/// This minimizes work for skewed distributions (e.g., sparse validity masks). +pub fn runend_decode_typed_bool( + run_ends: impl Iterator, + values: &BitBuffer, + values_validity: Mask, + values_nullability: Nullability, + length: usize, +) -> BoolArray { + match values_validity { + Mask::AllTrue(_) => { + // Adaptive strategy: choose based on which value is more common + // If more runs have true values, pre-fill with 1s and clear false runs + // If more runs have false values, pre-fill with 0s and fill true runs + let true_count = values.true_count(); + let false_count = values.len() - true_count; + + if true_count > false_count { + // More true runs - pre-fill with 1s and clear false runs + let mut decoded = BitBufferMut::new_set(length); + let decoded_bytes = decoded.as_mut_slice(); + let mut current_pos = 0usize; + + for (end, value) in run_ends.zip_eq(values.iter()) { + // Only clear when value is false (true is already 1) + if end > current_pos && !value { + fill_bits_false(decoded_bytes, current_pos, end); + } + current_pos = end; + } + BoolArray::new(decoded.freeze(), values_nullability.into()) + } else { + // More or equal false runs - pre-fill with 0s and fill true runs + let mut decoded = BitBufferMut::new_unset(length); + let decoded_bytes = decoded.as_mut_slice(); + let mut current_pos = 0usize; + + for (end, value) in run_ends.zip_eq(values.iter()) { + // Only fill when value is true (false is already 0) + if end > current_pos && value { + fill_bits_true(decoded_bytes, current_pos, end); + } + current_pos = end; + } + BoolArray::new(decoded.freeze(), values_nullability.into()) + } + } + Mask::AllFalse(_) => BoolArray::new(BitBuffer::new_unset(length), Validity::AllInvalid), + Mask::Values(mask) => { + // For nullable values, adaptive strategy based on true count + // (counting only valid values as true) + let valid_true_count = values + .iter() + .zip(mask.bit_buffer().iter()) + .filter(|&(v, is_valid)| is_valid && v) + .count(); + let valid_false_count = values + .iter() + .zip(mask.bit_buffer().iter()) + .filter(|&(v, is_valid)| is_valid && !v) + .count(); + + if valid_true_count > valid_false_count { + // More true runs - pre-fill with 1s and clear false/null runs + let mut decoded = BitBufferMut::new_set(length); + let mut decoded_validity = BitBufferMut::new_unset(length); + let decoded_bytes = decoded.as_mut_slice(); + let validity_bytes = decoded_validity.as_mut_slice(); + let mut current_pos = 0usize; + + for (end, value) in run_ends.zip_eq( + values + .iter() + .zip(mask.bit_buffer().iter()) + .map(|(v, is_valid)| is_valid.then_some(v)), + ) { + if end > current_pos { + match value { + None => { + // Null: clear decoded bits, validity stays false + fill_bits_false(decoded_bytes, current_pos, end); + } + Some(v) => { + // Valid: set validity bits to true + fill_bits_true(validity_bytes, current_pos, end); + // Clear decoded bits if value is false + if !v { + fill_bits_false(decoded_bytes, current_pos, end); + } + } + } + current_pos = end; + } + } + BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())) + } else { + // More or equal false runs - pre-fill with 0s and fill true runs + let mut decoded = BitBufferMut::new_unset(length); + let mut decoded_validity = BitBufferMut::new_unset(length); + let decoded_bytes = decoded.as_mut_slice(); + let validity_bytes = decoded_validity.as_mut_slice(); + let mut current_pos = 0usize; + + for (end, value) in run_ends.zip_eq( + values + .iter() + .zip(mask.bit_buffer().iter()) + .map(|(v, is_valid)| is_valid.then_some(v)), + ) { + if end > current_pos { + match value { + None => { + // Validity stays false (already 0), decoded stays false + } + Some(v) => { + // Set validity bits to true + fill_bits_true(validity_bytes, current_pos, end); + // Set decoded bits if value is true + if v { + fill_bits_true(decoded_bytes, current_pos, end); + } + } + } + current_pos = end; + } + } + BoolArray::new(decoded.freeze(), Validity::from(decoded_validity.freeze())) + } + } + } +} + +#[cfg(test)] +mod tests { + use vortex_array::arrays::BoolArray; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::assert_arrays_eq; + use vortex_buffer::BitBuffer; + use vortex_error::VortexResult; + + use super::runend_decode_bools; + + #[test] + fn decode_bools_alternating() -> VortexResult<()> { + // Alternating true/false: [T, T, F, F, F, T, T, T, T, T] + let ends = PrimitiveArray::from_iter([2u32, 5, 10]); + let values = BoolArray::from(BitBuffer::from(vec![true, false, true])); + let decoded = runend_decode_bools(ends, values, 0, 10)?; + + let expected = BoolArray::from(BitBuffer::from(vec![ + true, true, false, false, false, true, true, true, true, true, + ])); + assert_arrays_eq!(decoded, expected); + Ok(()) + } + + #[test] + fn decode_bools_mostly_true() -> VortexResult<()> { + // Mostly true: [T, T, T, T, T, F, T, T, T, T] - triggers true-heavy path + let ends = PrimitiveArray::from_iter([5u32, 6, 10]); + let values = BoolArray::from(BitBuffer::from(vec![true, false, true])); + let decoded = runend_decode_bools(ends, values, 0, 10)?; + + let expected = BoolArray::from(BitBuffer::from(vec![ + true, true, true, true, true, false, true, true, true, true, + ])); + assert_arrays_eq!(decoded, expected); + Ok(()) + } + + #[test] + fn decode_bools_mostly_false() -> VortexResult<()> { + // Mostly false: [F, F, F, F, F, T, F, F, F, F] - triggers false-heavy path + let ends = PrimitiveArray::from_iter([5u32, 6, 10]); + let values = BoolArray::from(BitBuffer::from(vec![false, true, false])); + let decoded = runend_decode_bools(ends, values, 0, 10)?; + + let expected = BoolArray::from(BitBuffer::from(vec![ + false, false, false, false, false, true, false, false, false, false, + ])); + assert_arrays_eq!(decoded, expected); + Ok(()) + } + + #[test] + fn decode_bools_all_true() -> VortexResult<()> { + // All true: single run + let ends = PrimitiveArray::from_iter([10u32]); + let values = BoolArray::from(BitBuffer::from(vec![true])); + let decoded = runend_decode_bools(ends, values, 0, 10)?; + + let expected = BoolArray::from(BitBuffer::from(vec![ + true, true, true, true, true, true, true, true, true, true, + ])); + assert_arrays_eq!(decoded, expected); + Ok(()) + } + + #[test] + fn decode_bools_all_false() -> VortexResult<()> { + // All false: single run + let ends = PrimitiveArray::from_iter([10u32]); + let values = BoolArray::from(BitBuffer::from(vec![false])); + let decoded = runend_decode_bools(ends, values, 0, 10)?; + + let expected = BoolArray::from(BitBuffer::from(vec![ + false, false, false, false, false, false, false, false, false, false, + ])); + assert_arrays_eq!(decoded, expected); + Ok(()) + } + + #[test] + fn decode_bools_with_offset() -> VortexResult<()> { + // Test with offset: [T, T, F, F, F, T, T, T, T, T] -> slice [2..8] = [F, F, F, T, T, T] + let ends = PrimitiveArray::from_iter([2u32, 5, 10]); + let values = BoolArray::from(BitBuffer::from(vec![true, false, true])); + let decoded = runend_decode_bools(ends, values, 2, 6)?; + + let expected = + BoolArray::from(BitBuffer::from(vec![false, false, false, true, true, true])); + assert_arrays_eq!(decoded, expected); + Ok(()) + } +} diff --git a/encodings/runend/src/lib.rs b/encodings/runend/src/lib.rs index 589b16e2c65..5be018b036d 100644 --- a/encodings/runend/src/lib.rs +++ b/encodings/runend/src/lib.rs @@ -13,6 +13,7 @@ mod array; mod arrow; pub mod compress; mod compute; +pub mod decompress_bool; mod iter; mod kernel; mod ops;