Skip to content

Commit 35d3853

Browse files
committed
Centralize the choice of a default buffer size for apply.
1 parent f84ecd8 commit 35d3853

File tree

6 files changed

+83
-23
lines changed

6 files changed

+83
-23
lines changed

src/delayedarray/DelayedArray.py

Lines changed: 21 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -822,7 +822,7 @@ def __getitem__(self, subset) -> Union["DelayedArray", ndarray]:
822822

823823

824824
# For python-level compute.
825-
def sum(self, axis: Optional[Union[int, Tuple[int, ...]]] = None, dtype: Optional[numpy.dtype] = None, buffer_size: int = 1e8) -> numpy.ndarray:
825+
def sum(self, axis: Optional[Union[int, Tuple[int, ...]]] = None, dtype: Optional[numpy.dtype] = None, buffer_size: Optional[int] = None) -> numpy.ndarray:
826826
"""
827827
Take the sum of values across the ``DelayedArray``, possibly over a
828828
given axis or set of axes. If the seed has a ``sum()`` method, that
@@ -840,8 +840,9 @@ def sum(self, axis: Optional[Union[int, Tuple[int, ...]]] = None, dtype: Optiona
840840
:py:func:`~numpy.sum` for details.
841841
842842
buffer_size:
843-
Buffer size in bytes to use for block processing. Larger values
844-
generally improve speed at the cost of memory.
843+
Buffer size in bytes to use for block processing.
844+
Larger values generally improve speed at the cost of memory.
845+
If ``None``, defaults to the value returned by :py:func:`~delayedarray.default_buffer_size.default_buffer_size`.
845846
846847
Returns:
847848
A NumPy array containing the sums. If ``axis = None``, this will be
@@ -859,7 +860,7 @@ def sum(self, axis: Optional[Union[int, Tuple[int, ...]]] = None, dtype: Optiona
859860
)
860861

861862

862-
def mean(self, axis: Optional[Union[int, Tuple[int, ...]]] = None, dtype: Optional[numpy.dtype] = None, buffer_size: int = 1e8) -> numpy.ndarray:
863+
def mean(self, axis: Optional[Union[int, Tuple[int, ...]]] = None, dtype: Optional[numpy.dtype] = None, buffer_size: Optional[int] = None) -> numpy.ndarray:
863864
"""
864865
Take the mean of values across the ``DelayedArray``, possibly over a
865866
given axis or set of axes. If the seed has a ``mean()`` method, that
@@ -877,8 +878,9 @@ def mean(self, axis: Optional[Union[int, Tuple[int, ...]]] = None, dtype: Option
877878
:py:func:`~numpy.mean` for details.
878879
879880
buffer_size:
880-
Buffer size in bytes to use for block processing. Larger values
881-
generally improve speed at the cost of memory.
881+
Buffer size in bytes to use for block processing.
882+
Larger values generally improve speed at the cost of memory.
883+
If ``None``, defaults to the value returned by :py:func:`~delayedarray.default_buffer_size.default_buffer_size`.
882884
883885
Returns:
884886
A NumPy array containing the means. If ``axis = None``, this will
@@ -896,7 +898,7 @@ def mean(self, axis: Optional[Union[int, Tuple[int, ...]]] = None, dtype: Option
896898
)
897899

898900

899-
def var(self, axis: Optional[Union[int, Tuple[int, ...]]] = None, dtype: Optional[numpy.dtype] = None, ddof: int = 0, buffer_size: int = 1e8) -> numpy.ndarray:
901+
def var(self, axis: Optional[Union[int, Tuple[int, ...]]] = None, dtype: Optional[numpy.dtype] = None, ddof: int = 0, buffer_size: Optional[int] = None) -> numpy.ndarray:
900902
"""
901903
Take the variances of values across the ``DelayedArray``, possibly over
902904
a given axis or set of axes. If the seed has a ``var()`` method, that
@@ -918,8 +920,9 @@ def var(self, axis: Optional[Union[int, Tuple[int, ...]]] = None, dtype: Optiona
918920
Typically set to 1 to obtain the sample variance.
919921
920922
buffer_size:
921-
Buffer size in bytes to use for block processing. Larger values
922-
generally improve speed at the cost of memory.
923+
Buffer size in bytes to use for block processing.
924+
Larger values generally improve speed at the cost of memory.
925+
If ``None``, defaults to the value returned by :py:func:`~delayedarray.default_buffer_size.default_buffer_size`.
923926
924927
Returns:
925928
A NumPy array containing the variances. If ``axis = None``,
@@ -937,7 +940,7 @@ def var(self, axis: Optional[Union[int, Tuple[int, ...]]] = None, dtype: Optiona
937940
masked=is_masked(self),
938941
)
939942

940-
def any(self, axis: Optional[Union[int, Tuple[int, ...]]] = None, dtype: Optional[numpy.dtype] = None, buffer_size: int = 1e8) -> numpy.ndarray:
943+
def any(self, axis: Optional[Union[int, Tuple[int, ...]]] = None, dtype: Optional[numpy.dtype] = None, buffer_size: Optional[int] = None) -> numpy.ndarray:
941944
"""Test whether any array element along a given axis evaluates to True.
942945
943946
Compute this test across the ``DelayedArray``, possibly over a
@@ -956,8 +959,9 @@ def any(self, axis: Optional[Union[int, Tuple[int, ...]]] = None, dtype: Optiona
956959
:py:func:`~numpy.any` for details.
957960
958961
buffer_size:
959-
Buffer size in bytes to use for block processing. Larger values
960-
generally improve speed at the cost of memory.
962+
Buffer size in bytes to use for block processing.
963+
Larger values generally improve speed at the cost of memory.
964+
If ``None``, defaults to the value returned by :py:func:`~delayedarray.default_buffer_size.default_buffer_size`.
961965
962966
Returns:
963967
A NumPy array containing the boolean values. If ``axis = None``, this will
@@ -974,7 +978,7 @@ def any(self, axis: Optional[Union[int, Tuple[int, ...]]] = None, dtype: Optiona
974978
masked=is_masked(self),
975979
)
976980

977-
def all(self, axis: Optional[Union[int, Tuple[int, ...]]] = None, dtype: Optional[numpy.dtype] = None, buffer_size: int = 1e8) -> numpy.ndarray:
981+
def all(self, axis: Optional[Union[int, Tuple[int, ...]]] = None, dtype: Optional[numpy.dtype] = None, buffer_size: Optional[int] = None) -> numpy.ndarray:
978982
"""Test whether all array elements along a given axis evaluate to True.
979983
980984
Compute this test across the ``DelayedArray``, possibly over a
@@ -993,8 +997,9 @@ def all(self, axis: Optional[Union[int, Tuple[int, ...]]] = None, dtype: Optiona
993997
:py:func:`~numpy.all` for details.
994998
995999
buffer_size:
996-
Buffer size in bytes to use for block processing. Larger values
997-
generally improve speed at the cost of memory.
1000+
Buffer size in bytes to use for block processing.
1001+
Larger values generally improve speed at the cost of memory.
1002+
If ``None``, defaults to the value returned by :py:func:`~delayedarray.default_buffer_size.default_buffer_size`.
9981003
9991004
Returns:
10001005
A NumPy array containing the boolean values. If ``axis = None``, this will
@@ -1115,7 +1120,7 @@ def _reduce_SparseNdarray(x: SparseNdarray, multipliers: List[int], axes: List[i
11151120
return
11161121

11171122

1118-
def _reduce(x: DelayedArray, axes: List[int], operation: Callable, buffer_size: int):
1123+
def _reduce(x: DelayedArray, axes: List[int], operation: Callable, buffer_size: Optional[int]):
11191124
multipliers = _create_offset_multipliers(x.shape, axes)
11201125
if is_sparse(x):
11211126
apply_over_blocks(

src/delayedarray/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
from .RegularTicks import RegularTicks
3838
from .apply_over_dimension import apply_over_dimension
3939
from .apply_over_blocks import apply_over_blocks
40+
from .default_buffer_size import default_buffer_size
4041

4142
from .create_dask_array import create_dask_array
4243
from .is_sparse import is_sparse

src/delayedarray/apply_over_blocks.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,14 @@
66
from .is_sparse import is_sparse
77
from .extract_dense_array import extract_dense_array
88
from .extract_sparse_array import extract_sparse_array
9+
from .default_buffer_size import default_buffer_size
910

1011
__author__ = "ltla"
1112
__copyright__ = "ltla"
1213
__license__ = "MIT"
1314

1415

15-
def apply_over_blocks(x, fun: Callable, allow_sparse: bool = False, grid: Optional[AbstractGrid] = None, buffer_size: int = 1e8) -> list:
16+
def apply_over_blocks(x, fun: Callable, allow_sparse: bool = False, grid: Optional[AbstractGrid] = None, buffer_size: Optional[int] = None) -> list:
1617
"""
1718
Iterate over an array by blocks. We apply a user-provided function and
1819
collect the results before proceeding to the next block.
@@ -38,8 +39,9 @@ def apply_over_blocks(x, fun: Callable, allow_sparse: bool = False, grid: Option
3839
of :py:func:`~delayedarray.chunk_grid.chunk_grid` on ``x``.
3940
4041
buffer_size:
41-
Buffer_size in bytes, to hold a single block per iteration. Larger
42-
values generally improve speed at the cost of memory.
42+
Buffer_size in bytes, to hold a single block per iteration.
43+
Larger values generally improve speed at the cost of memory.
44+
If ``None``, defaults to the value returned by :py:func:`~delayedarray.default_buffer_size.default_buffer_size`.
4345
4446
Returns:
4547
List containing the output of ``fun`` on each block.
@@ -54,6 +56,9 @@ def apply_over_blocks(x, fun: Callable, allow_sparse: bool = False, grid: Option
5456

5557
dims = (*range(len(x.shape)),)
5658
collected = []
59+
60+
if buffer_size is None:
61+
buffer_size = default_buffer_size()
5762
buffer_elements = buffer_size // x.dtype.itemsize
5863

5964
for job in grid.iterate(dims, buffer_elements = buffer_elements):

src/delayedarray/apply_over_dimension.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,14 @@
66
from .is_sparse import is_sparse
77
from .extract_dense_array import extract_dense_array
88
from .extract_sparse_array import extract_sparse_array
9+
from .default_buffer_size import default_buffer_size
910

1011
__author__ = "ltla"
1112
__copyright__ = "ltla"
1213
__license__ = "MIT"
1314

1415

15-
def apply_over_dimension(x, dimension: int, fun: Callable, allow_sparse: bool = False, grid: Optional[AbstractGrid] = None, buffer_size: int = 1e8) -> list:
16+
def apply_over_dimension(x, dimension: int, fun: Callable, allow_sparse: bool = False, grid: Optional[AbstractGrid] = None, buffer_size: Optional[int] = None) -> list:
1617
"""
1718
Iterate over an array on a certain dimension. At each iteration, the block
1819
of observations consists of the full extent of all dimensions other than
@@ -42,8 +43,9 @@ def apply_over_dimension(x, dimension: int, fun: Callable, allow_sparse: bool =
4243
of :py:func:`~delayedarray.chunk_grid.chunk_grid` on ``x``.
4344
4445
buffer_size:
45-
Buffer_size in bytes, to hold a single block per iteration. Larger
46-
values generally improve speed at the cost of memory.
46+
Buffer_size in bytes, to hold a single block per iteration.
47+
Larger values generally improve speed at the cost of memory.
48+
If ``None``, defaults to the value returned by :py:func:`~delayedarray.default_buffer_size.default_buffer_size`.
4749
4850
Returns:
4951
List containing the output of ``fun`` on each block.
@@ -57,9 +59,11 @@ def apply_over_dimension(x, dimension: int, fun: Callable, allow_sparse: bool =
5759
else:
5860
extractor = extract_dense_array
5961

60-
collected = []
62+
if buffer_size is None:
63+
buffer_size = default_buffer_size()
6164
buffer_elements = buffer_size // x.dtype.itemsize
6265

66+
collected = []
6367
for job in grid.iterate((dimension,), buffer_elements = buffer_elements):
6468
subsets = (*(range(s, e) for s, e in job),)
6569
output = fun(job[dimension], extractor(x, subsets))
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
from typing import Optional
2+
3+
__author__ = "ltla"
4+
__copyright__ = "ltla"
5+
__license__ = "MIT"
6+
7+
8+
old_buffer_size = 1e8
9+
10+
11+
def default_buffer_size(buffer_size: Optional[int] = None) -> int:
12+
"""
13+
Get or set the default buffer size used by :py:func:`~delayedarray.apply_over_blocks.apply_over_blocks`,
14+
:py:func:`~delayedarray.apply_over_dimension.apply_over_dimension`, etc.
15+
16+
Args:
17+
buffer_size:
18+
Buffer size in bytes.
19+
The buffer is typically used to load a block of an array in memory for further processing.
20+
Alternatively ``None``.
21+
22+
Returns:
23+
If ``buffer_size = None``, the current default buffer size is returned.
24+
25+
If ``buffer_size`` is an integer, the default buffer size is set to this value, and the previous buffer size is returned.
26+
"""
27+
global old_buffer_size
28+
if buffer_size is None:
29+
return old_buffer_size
30+
31+
previous = old_buffer_size
32+
old_buffer_size = buffer_size
33+
return previous

tests/test_default_buffer_size.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
import delayedarray
2+
3+
4+
def test_default_buffer_size():
5+
assert delayedarray.default_buffer_size() == 1e8
6+
7+
old = delayedarray.default_buffer_size(500)
8+
assert old == 1e8
9+
assert delayedarray.default_buffer_size() == 500
10+
11+
delayedarray.default_buffer_size(old)
12+
assert delayedarray.default_buffer_size() == old

0 commit comments

Comments
 (0)