Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,7 @@
from pypaimon.read.reader.iface.record_batch_reader import RecordBatchReader
from pypaimon.read.reader.iface.record_iterator import RecordIterator
from pypaimon.deletionvectors.deletion_vector import DeletionVector

from pyroaring import BitMap

from pypaimon.utils.roaring_bitmap import RoaringBitmap
from pypaimon.read.reader.iface.record_reader import RecordReader


Expand Down Expand Up @@ -57,10 +55,11 @@ def read_arrow_batch(self) -> Optional[RecordBatch]:
if arrow_batch is None:
return None
# Remove the deleted rows from the batch
range_bitmap = BitMap(
range(self._reader.return_batch_pos() - arrow_batch.num_rows, self._reader.return_batch_pos()))
intersection_bitmap = range_bitmap - self._deletion_vector.bit_map()
added_row_list = [x - (self._reader.return_batch_pos() - arrow_batch.num_rows) for x in
range_bitmap = RoaringBitmap()
return_batch_pos = self._reader.return_batch_pos()
range_bitmap.add_range(return_batch_pos - arrow_batch.num_rows, return_batch_pos - 1)
intersection_bitmap = RoaringBitmap.remove_all(range_bitmap, self._deletion_vector.bit_map())
added_row_list = [x - (return_batch_pos - arrow_batch.num_rows) for x in
list(intersection_bitmap)]
return arrow_batch.take(pyarrow.array(added_row_list, type=pyarrow.int32()))

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from pypaimon.deletionvectors.deletion_vector import DeletionVector
import struct
import zlib
from pyroaring import BitMap
from pypaimon.utils.roaring_bitmap import RoaringBitmap


class BitmapDeletionVector(DeletionVector):
Expand All @@ -31,14 +31,14 @@ class BitmapDeletionVector(DeletionVector):
MAGIC_NUMBER_SIZE_BYTES = 4
MAX_VALUE = 2147483647

def __init__(self, bitmap: RoaringBitmap = None):
    """
    Create a deletion vector backed by a roaring bitmap.

    Args:
        bitmap: Bitmap to wrap. When None, a new empty RoaringBitmap is
            created and used instead.
    """
    if bitmap is None:
        bitmap = RoaringBitmap()
    self._bitmap = bitmap

def delete(self, position: int) -> None:
"""
Expand Down Expand Up @@ -121,7 +121,7 @@ def deserialize_from_bytes(data: bytes) -> 'BitmapDeletionVector':
Returns:
A BitmapDeletionVector instance.
"""
bitmap = BitMap.deserialize(data)
bitmap = RoaringBitmap.deserialize(data)
return BitmapDeletionVector(bitmap)

def bit_map(self):
Expand Down
3 changes: 2 additions & 1 deletion paimon-python/pypaimon/deletionvectors/deletion_vector.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

from pypaimon.common.file_io import FileIO
from pypaimon.table.source.deletion_file import DeletionFile
from pypaimon.utils.roaring_bitmap import RoaringBitmap


class DeletionVector(ABC):
Expand All @@ -28,7 +29,7 @@ class DeletionVector(ABC):
"""

@abstractmethod
def bit_map(self):
def bit_map(self) -> RoaringBitmap:
"""
Returns the bitmap of the DeletionVector.
"""
Expand Down
2 changes: 0 additions & 2 deletions paimon-python/pypaimon/globalindex/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
GlobalIndexScanBuilder,
RowRangeGlobalIndexScanner,
)
from pypaimon.globalindex.roaring_bitmap import RoaringBitmap64
from pypaimon.globalindex.range import Range

__all__ = [
Expand All @@ -46,6 +45,5 @@
'GlobalIndexEvaluator',
'GlobalIndexScanBuilder',
'RowRangeGlobalIndexScanner',
'RoaringBitmap64',
'Range',
]
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
from pypaimon.globalindex.global_index_meta import GlobalIndexIOMeta
from pypaimon.globalindex.global_index_reader import FieldRef, GlobalIndexReader
from pypaimon.globalindex.global_index_result import GlobalIndexResult
from pypaimon.globalindex.roaring_bitmap import RoaringBitmap64
from pypaimon.utils.roaring_bitmap import RoaringBitmap64
from pypaimon.globalindex.btree.btree_file_footer import BTreeFileFooter
from pypaimon.globalindex.btree.sst_file_reader import SstFileReader
from pypaimon.globalindex.btree.memory_slice_input import MemorySliceInput
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
from pypaimon.globalindex.global_index_result import GlobalIndexResult
from pypaimon.globalindex.global_index_meta import GlobalIndexIOMeta
from pypaimon.globalindex.vector_search_result import DictBasedScoredIndexResult
from pypaimon.globalindex.roaring_bitmap import RoaringBitmap64
from pypaimon.utils.roaring_bitmap import RoaringBitmap64
from pypaimon.globalindex.faiss.faiss_options import (
FaissVectorIndexOptions,
FaissVectorMetric,
Expand Down
2 changes: 1 addition & 1 deletion paimon-python/pypaimon/globalindex/global_index_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from abc import ABC, abstractmethod
from typing import Callable, List, Optional

from pypaimon.globalindex.roaring_bitmap import RoaringBitmap64
from pypaimon.utils.roaring_bitmap import RoaringBitmap64
from pypaimon.globalindex.range import Range


Expand Down
2 changes: 1 addition & 1 deletion paimon-python/pypaimon/globalindex/vector_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def offset_range(self, from_: int, to: int) -> 'VectorSearch':
"""
Create a new VectorSearch with include_row_ids offset to the given range.
"""
from pypaimon.globalindex.roaring_bitmap import RoaringBitmap64
from pypaimon.utils.roaring_bitmap import RoaringBitmap64

if self.include_row_ids is not None:
range_bitmap = RoaringBitmap64()
Expand Down
2 changes: 1 addition & 1 deletion paimon-python/pypaimon/globalindex/vector_search_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from typing import Callable, Dict, Optional

from pypaimon.globalindex.global_index_result import GlobalIndexResult
from pypaimon.globalindex.roaring_bitmap import RoaringBitmap64
from pypaimon.utils.roaring_bitmap import RoaringBitmap64


# Type alias for score getter function
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,29 +20,28 @@
Roaring Bitmap.
"""

from typing import Iterator, Set
import struct
from typing import Iterator


class RoaringBitmap64:
"""
A 64-bit roaring bitmap implementation.

This class provides efficient storage and operations for sets of 64-bit integers.
It uses a set-based implementation for simplicity, which can be replaced with
a more efficient roaring bitmap library if needed.
It uses pyroaring.BitMap64 for better performance and memory efficiency.
"""

def __init__(self):
    """Create an empty bitmap backed by ``pyroaring.BitMap64``."""
    # Function-scope import: pyroaring is loaded when a bitmap is first
    # constructed, not when this module is imported.
    import pyroaring
    self._data = pyroaring.BitMap64()

def add(self, value: int) -> None:
    """
    Add a single value to the bitmap.

    Args:
        value: The integer to insert; it is forwarded unchanged to the
            backing bitmap's ``add``.
    """
    self._data.add(value)

def add_range(self, from_: int, to: int) -> None:
    """
    Add every value in the closed interval [from_, to] to the bitmap.

    Args:
        from_: First value to add (inclusive).
        to: Last value to add (inclusive).
    """
    # The backing add_range takes an exclusive upper bound, hence the +1
    # so that ``to`` itself ends up in the set.
    exclusive_end = to + 1
    self._data.add_range(from_, exclusive_end)

def contains(self, value: int) -> bool:
"""Check if the bitmap contains the given value."""
Expand All @@ -58,7 +57,7 @@ def cardinality(self) -> int:

def __iter__(self) -> Iterator[int]:
"""Iterate over all values in the bitmap in sorted order."""
return iter(sorted(self._data))
return iter(self._data)

def __len__(self) -> int:
"""Return the number of elements in the bitmap."""
Expand All @@ -74,7 +73,7 @@ def clear(self) -> None:

def to_list(self) -> list:
"""Return a sorted list of all values in the bitmap."""
return sorted(self._data)
return list(self._data)

def to_range_list(self) -> list:
"""
Expand All @@ -85,8 +84,9 @@ def to_range_list(self) -> list:
if self.is_empty():
return []

sorted_values = sorted(self._data)
# Use pyroaring's efficient iteration
ranges = []
sorted_values = list(self._data)
start = sorted_values[0]
end = start

Expand Down Expand Up @@ -127,23 +127,14 @@ def remove_all(a: 'RoaringBitmap64', b: 'RoaringBitmap64') -> 'RoaringBitmap64':

def serialize(self) -> bytes:
    """
    Serialize the bitmap to bytes.

    The byte layout is whatever the backing bitmap's ``serialize``
    produces; ``deserialize`` reads the same layout back.
    """
    payload = self._data.serialize()
    return payload

@staticmethod
def deserialize(data: bytes) -> 'RoaringBitmap64':
    """
    Deserialize a bitmap from bytes.

    Args:
        data: Bytes in the layout accepted by
            ``pyroaring.BitMap64.deserialize`` (as produced by
            ``serialize``).

    Returns:
        A new RoaringBitmap64 wrapping the decoded bitmap.
    """
    from pyroaring import BitMap64

    # Bypass __init__: it would allocate an empty BitMap64 that is
    # replaced on the very next line.
    result = RoaringBitmap64.__new__(RoaringBitmap64)
    result._data = BitMap64.deserialize(data)
    return result

def __eq__(self, other: object) -> bool:
Expand All @@ -152,9 +143,109 @@ def __eq__(self, other: object) -> bool:
return self._data == other._data

def __hash__(self) -> int:
return hash(frozenset(self._data))
return hash(tuple(sorted(self._data)))

def __repr__(self) -> str:
values = list(self._data)
if len(values) <= 10:
return f"RoaringBitmap64({values})"
return f"RoaringBitmap64({len(values)} elements)"


class RoaringBitmap:
    """
    A 32-bit roaring bitmap.

    Thin wrapper around ``pyroaring.BitMap`` that stores a set of 32-bit
    integers. pyroaring is imported inside the methods that need it, so it
    is loaded on first construction/deserialization rather than when this
    module is imported.
    """

    def __init__(self):
        """Create an empty bitmap."""
        from pyroaring import BitMap
        self._data = BitMap()

    @classmethod
    def _wrap(cls, data) -> 'RoaringBitmap':
        """Wrap an existing backing bitmap without allocating an empty one."""
        result = cls.__new__(cls)
        result._data = data
        return result

    def add(self, value: int) -> None:
        """Add a single value to the bitmap."""
        self._data.add(value)

    def add_range(self, from_: int, to: int) -> None:
        """Add a range of values [from_, to] (both inclusive) to the bitmap."""
        # The backing add_range takes an exclusive upper bound.
        self._data.add_range(from_, to + 1)

    def contains(self, value: int) -> bool:
        """Check if the bitmap contains the given value."""
        return value in self._data

    def is_empty(self) -> bool:
        """Check if the bitmap is empty."""
        return len(self._data) == 0

    def cardinality(self) -> int:
        """Return the number of elements in the bitmap."""
        return len(self._data)

    def __iter__(self) -> Iterator[int]:
        """Iterate over all values in the bitmap in sorted order."""
        return iter(self._data)

    def __len__(self) -> int:
        """Return the number of elements in the bitmap."""
        return len(self._data)

    def __contains__(self, value: int) -> bool:
        """Check if the bitmap contains the given value."""
        return self.contains(value)

    def clear(self) -> None:
        """Clear all values from the bitmap."""
        self._data.clear()

    def to_list(self) -> list:
        """Return a sorted list of all values in the bitmap."""
        return list(self._data)

    @staticmethod
    def and_(a: 'RoaringBitmap', b: 'RoaringBitmap') -> 'RoaringBitmap':
        """Return the intersection of two bitmaps as a new bitmap."""
        return RoaringBitmap._wrap(a._data & b._data)

    @staticmethod
    def or_(a: 'RoaringBitmap', b: 'RoaringBitmap') -> 'RoaringBitmap':
        """Return the union of two bitmaps as a new bitmap."""
        return RoaringBitmap._wrap(a._data | b._data)

    @staticmethod
    def remove_all(a: 'RoaringBitmap', b: 'RoaringBitmap') -> 'RoaringBitmap':
        """Return a new bitmap holding the values of ``a`` that are not in ``b``."""
        return RoaringBitmap._wrap(a._data - b._data)

    def serialize(self) -> bytes:
        """Serialize the bitmap to bytes using the backing bitmap's format."""
        return self._data.serialize()

    @staticmethod
    def deserialize(data: bytes) -> 'RoaringBitmap':
        """Deserialize a bitmap from bytes produced by ``serialize``."""
        from pyroaring import BitMap
        return RoaringBitmap._wrap(BitMap.deserialize(data))

    def __eq__(self, other: object) -> bool:
        """Two bitmaps are equal when they hold exactly the same values."""
        if not isinstance(other, RoaringBitmap):
            # Let Python try the reflected comparison; ``==`` against an
            # unrelated type still evaluates to False.
            return NotImplemented
        return self._data == other._data

    def __hash__(self) -> int:
        """
        Return a hash derived from the bitmap's values.

        The bitmap is mutable, so an instance must not be modified while
        it is used as a dict key or set member.
        """
        # Iteration is already in ascending order; no extra sort needed.
        return hash(tuple(self._data))

    def __repr__(self) -> str:
        """List the values for small bitmaps, otherwise report the count."""
        # Check the size first so a large bitmap is never copied into a
        # list just to be summarised.
        size = len(self._data)
        if size <= 10:
            return f"RoaringBitmap({list(self._data)})"
        return f"RoaringBitmap({size} elements)"