Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
c7d9267
add bitwise ops
rluvaton Oct 12, 2025
d14e5b7
add bitwise ops
rluvaton Oct 12, 2025
739fe0a
cleanup
rluvaton Oct 12, 2025
0e15b32
pub(crate) as I don't like that we have both mutable and only left mu…
rluvaton Oct 12, 2025
c442299
start adding tests
rluvaton Oct 12, 2025
2f28dc3
add tests
rluvaton Oct 12, 2025
c4676a6
add trait for left
rluvaton Oct 15, 2025
da03628
format
rluvaton Oct 15, 2025
652a256
revert changes
rluvaton Oct 15, 2025
0c29f0e
fix validation
rluvaton Oct 15, 2025
bcd4863
remove many unsafe and cleanup
rluvaton Oct 15, 2025
6b7bfe9
format
rluvaton Oct 15, 2025
aec92d6
add reproduction test
rluvaton Oct 26, 2025
db3e853
extract, cleanup and add comments
rluvaton Oct 26, 2025
0a64bcb
add comments
rluvaton Oct 26, 2025
ca621f8
Merge remote-tracking branch 'apache/main' into add-bitwise-ops-to-bo…
alamb Nov 3, 2025
d63d72c
Update arrow-buffer/src/buffer/mutable_ops.rs
alamb Nov 3, 2025
464e56c
Merge branch 'add-bitwise-ops-to-boolean-buffer-builder' of github.co…
alamb Nov 3, 2025
07679d7
Revert changes to boolean
alamb Nov 3, 2025
bfdf381
Restore enough for the tests
alamb Nov 3, 2025
246d4e2
Improve docs
alamb Nov 3, 2025
b9acb34
Move into mutable module
alamb Nov 3, 2025
d590ee1
Add example/doc tests
alamb Nov 3, 2025
ccf266f
Add tests for out of bounds
alamb Nov 3, 2025
005c444
Add tests for unary ops
alamb Nov 3, 2025
3a8e760
Add panic doc
alamb Nov 3, 2025
cf52bdf
fmt
alamb Nov 3, 2025
6dbed0b
Move buffer modification to bit_utils
alamb Nov 5, 2025
9ca7e45
Move tests and remove changes to MutableBufer
alamb Nov 5, 2025
5cb50d5
Merge remote-tracking branch 'apache/main' into add-bitwise-ops-to-bo…
alamb Nov 5, 2025
379d1ec
Update docs
alamb Nov 5, 2025
1fb4981
fix docs
alamb Nov 5, 2025
b0cf38b
Use new `bitwise_binary_op` in boolean kernels
alamb Nov 5, 2025
5e4e242
hack
alamb Nov 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions arrow-buffer/src/buffer/immutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -524,6 +524,12 @@ impl std::ops::Deref for Buffer {
}
}

impl AsRef<[u8]> for &Buffer {
fn as_ref(&self) -> &[u8] {
self.as_slice()
}
}

impl From<MutableBuffer> for Buffer {
#[inline]
fn from(buffer: MutableBuffer) -> Self {
Expand Down
10 changes: 8 additions & 2 deletions arrow-buffer/src/buffer/mutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,15 +69,15 @@ pub struct MutableBuffer {
}

impl MutableBuffer {
/// Allocate a new [MutableBuffer] with initial capacity to be at least `capacity`.
/// Allocate a new [MutableBuffer] with initial capacity to be at least `capacity` bytes
///
/// See [`MutableBuffer::with_capacity`].
#[inline]
pub fn new(capacity: usize) -> Self {
Self::with_capacity(capacity)
}

/// Allocate a new [MutableBuffer] with initial capacity to be at least `capacity`.
/// Allocate a new [MutableBuffer] with initial capacity to be at least `capacity` bytes
///
/// # Panics
///
Expand Down Expand Up @@ -812,6 +812,12 @@ impl std::ops::DerefMut for MutableBuffer {
}
}

impl AsRef<[u8]> for &MutableBuffer {
fn as_ref(&self) -> &[u8] {
self.as_slice()
}
}

impl Drop for MutableBuffer {
fn drop(&mut self) {
if self.layout.size() != 0 {
Expand Down
134 changes: 99 additions & 35 deletions arrow-buffer/src/buffer/ops.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
// under the License.

use super::{Buffer, MutableBuffer};
use crate::bit_util::{bitwise_binary_op, bitwise_unary_op};
use crate::util::bit_util::ceil;

/// Apply a bitwise operation `op` to four inputs and return the result as a Buffer.
Expand Down Expand Up @@ -60,39 +61,70 @@ where

/// Apply a bitwise operation `op` to two inputs and return the result as a Buffer.
/// The inputs are treated as bitmaps, meaning that offsets and length are specified in number of bits.
///
/// The output is guaranteed to
/// 1. have all bits outside the specified range set to zero
/// 2. start at offset zero
pub fn bitwise_bin_op_helper<F>(
left: &Buffer,
left_offset_in_bits: usize,
right: &Buffer,
right_offset_in_bits: usize,
len_in_bits: usize,
mut op: F,
op: F,
) -> Buffer
where
F: FnMut(u64, u64) -> u64,
{
let left_chunks = left.bit_chunks(left_offset_in_bits, len_in_bits);
let right_chunks = right.bit_chunks(right_offset_in_bits, len_in_bits);
if len_in_bits == 0 {
return Buffer::default();
}

let chunks = left_chunks
.iter()
.zip(right_chunks.iter())
.map(|(left, right)| op(left, right));
// Soundness: `chunks` is built from `BitChunks` iterators, which
// correctly report their upper bound
let mut buffer = unsafe { MutableBuffer::from_trusted_len_iter(chunks) };
// figure out the starting byte for left buffer
let start_byte = left_offset_in_bits / 8;
let starting_bit_in_byte = left_offset_in_bits % 8;

let remainder_bytes = ceil(left_chunks.remainder_len(), 8);
let rem = op(left_chunks.remainder_bits(), right_chunks.remainder_bits());
// we count bits starting from the least significant bit, so to_le_bytes should be correct
let rem = &rem.to_le_bytes()[0..remainder_bytes];
buffer.extend_from_slice(rem);
let len_bytes = ceil(starting_bit_in_byte + len_in_bits, 8);
let mut result = left[start_byte..len_bytes].to_vec();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This does an extra copy which wasn't there before (using from_trusted_len_iter)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🤔 you are right. We do need a new allocation to write into but don't need to copy the values

It is fascinating however, that this code is often still faster than the previous one (maybe due to fewer branches)

I'll see if I can perhaps optimize the case when the offsets are zero which I think is a common case

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

optimization works well: #8807

bitwise_binary_op(
&mut result,
starting_bit_in_byte,
right,
right_offset_in_bits,
len_in_bits,
op,
);

buffer.into()
// shift result to the left so that it starts at offset zero (TODO do this a word at a time)
shift_left_by(&mut result, starting_bit_in_byte);
result.into()
}

/// Shifts the whole buffer left by `starting_bit_in_byte` bits, in place.
///
/// The buffer is treated as one contiguous bit string whose first bit is the
/// most significant bit of `buffer[0]`. Bits pushed out of the top of a byte
/// land in the low bits of the preceding byte, the top bits of `buffer[0]`
/// are discarded, and the low bits of the last byte are zero-filled.
///
/// # Panics
///
/// Panics if `starting_bit_in_byte` is 8 or greater.
fn shift_left_by(buffer: &mut [u8], starting_bit_in_byte: usize) {
    // A zero shift leaves every byte untouched.
    if starting_bit_in_byte == 0 {
        return;
    }
    assert!(starting_bit_in_byte < 8);

    // Right-shifting a byte by this amount isolates the high bits that spill
    // over into the preceding byte.
    let spill_shift = 8 - starting_bit_in_byte;

    // Walk from the last byte towards the first so that each byte's spilled
    // bits are available when the byte before it is rewritten.
    let mut carry = 0u8;
    for byte in buffer.iter_mut().rev() {
        let shifted = (*byte << starting_bit_in_byte) | carry;
        carry = *byte >> spill_shift;
        *byte = shifted;
    }
}

/// Apply a bitwise operation `op` to one input and return the result as a Buffer.
/// The input is treated as a bitmap, meaning that offset and length are specified in number of bits.
///
/// The output is guaranteed to
/// 1. have all bits outside the specified range set to zero
/// 2. start at offset zero
pub fn bitwise_unary_op_helper<F>(
left: &Buffer,
offset_in_bits: usize,
Expand All @@ -102,26 +134,22 @@ pub fn bitwise_unary_op_helper<F>(
where
F: FnMut(u64) -> u64,
{
// reserve capacity and set length so we can get a typed view of u64 chunks
let mut result =
MutableBuffer::new(ceil(len_in_bits, 8)).with_bitset(len_in_bits / 64 * 8, false);

let left_chunks = left.bit_chunks(offset_in_bits, len_in_bits);

let result_chunks = result.typed_data_mut::<u64>().iter_mut();

result_chunks
.zip(left_chunks.iter())
.for_each(|(res, left)| {
*res = op(left);
if len_in_bits == 0 {
return Buffer::default();
}
// already byte aligned, copy over directly
let len_in_bytes = ceil(len_in_bits, 8);
let mut result;
if offset_in_bits == 0 {
result = left.as_slice()[0..len_in_bytes].to_vec();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This shouldn't be needed as well

bitwise_unary_op(&mut result, 0, len_in_bits, op);
} else {
// need to align bits
result = vec![0u8; len_in_bytes];
bitwise_binary_op(&mut result, 0, left, offset_in_bits, len_in_bits, |_, b| {
op(b)
});

let remainder_bytes = ceil(left_chunks.remainder_len(), 8);
let rem = op(left_chunks.remainder_bits());
// we count bits starting from the least significant bit, so to_le_bytes should be correct
let rem = &rem.to_le_bytes()[0..remainder_bytes];
result.extend_from_slice(rem);

}
result.into()
}

Expand Down Expand Up @@ -206,3 +234,39 @@ pub fn buffer_bin_and_not(
pub fn buffer_unary_not(left: &Buffer, offset_in_bits: usize, len_in_bits: usize) -> Buffer {
    // Bitwise NOT is applied to every 64-bit word the helper passes to the closure.
    let invert = |word: u64| !word;
    bitwise_unary_op_helper(left, offset_in_bits, len_in_bits, invert)
}


#[cfg(test)]
mod tests {
    /// Checks `shift_left_by` against hand-computed results for every shift
    /// amount from 0 through 7 on a fixed three-byte input.
    #[test]
    fn test_shift_left_by() {
        let input: [u8; 3] = [0b10110011, 0b00011100, 0b11111111];
        // Row `i` holds the bytes expected after shifting `input` left by `i` bits.
        let expected_by_shift: [[u8; 3]; 8] = [
            [0b10110011, 0b00011100, 0b11111111],
            [0b01100110, 0b00111001, 0b11111110],
            [0b11001100, 0b01110011, 0b11111100],
            [0b10011000, 0b11100111, 0b11111000],
            [0b00110001, 0b11001111, 0b11110000],
            [0b01100011, 0b10011111, 0b11100000],
            [0b11000111, 0b00111111, 0b11000000],
            [0b10001110, 0b01111111, 0b10000000],
        ];
        for (shift, expected) in expected_by_shift.iter().enumerate() {
            do_shift_left_by(&input, shift, expected);
        }
    }

    /// Shifts a copy of `input` by `shift` bits and asserts that it equals
    /// `expected`, printing all three buffers in binary on failure.
    fn do_shift_left_by(input: &[u8], shift: usize, expected: &[u8]) {
        let mut actual = input.to_vec();
        super::shift_left_by(&mut actual, shift);
        assert_eq!(
            actual,
            expected,
            "\nshift_left_by({}, {})\nactual: {}\nexpected: {}",
            buffer_string(input),
            shift,
            buffer_string(&actual),
            buffer_string(expected)
        );
    }

    /// Renders each byte as eight binary digits followed by a space.
    fn buffer_string(buffer: &[u8]) -> String {
        buffer
            .iter()
            .map(|byte| format!("{:08b} ", byte))
            .collect()
    }
}
Loading
Loading