Skip to content
Draft
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
cf13aad
[python-package]add_features_from with PyArrow Table incorrectly free…
suk1yak1 Apr 23, 2025
99ee66f
[python-package] add PyArrow Table case to test_add_features_from_dif…
suk1yak1 Apr 23, 2025
a3256a3
[python-package] fix handling and tests for PyArrow Table input in ad…
suk1yak1 Apr 24, 2025
9de2650
Merge branch 'master' into fix/6891-pyarrow-table-add-features
suk1yak1 Apr 24, 2025
01bc668
delete unnecessary-else
suk1yak1 Apr 28, 2025
27e2545
Merge branch 'master' of https://github.com/microsoft/LightGBM into f…
suk1yak1 Apr 28, 2025
219e61c
[python-package]add test for pyarrow table in Dataset.get_data()
suk1yak1 Apr 28, 2025
d4de309
[python-package]add test for add_features_from with pyarrow tables
suk1yak1 Apr 28, 2025
6686a60
Merge branch 'master' into fix/6891-pyarrow-table-add-features
suk1yak1 May 7, 2025
22ec314
[python-package] add PyArrow Table to get_data
suk1yak1 May 9, 2025
c7594c8
[python-package] add test for subset of PyArrow table dataset
suk1yak1 May 10, 2025
e0fce82
Merge branch 'master' into fix/6891-pyarrow-table-get-data
StrikerRUS May 20, 2025
1af15c6
Merge branch 'master' into fix/6891-pyarrow-table-get-data
suk1yak1 May 29, 2025
b5395a0
[python-package] improve PyArrow table subset tests for null values a…
suk1yak1 May 30, 2025
5f067ec
Merge branch 'master' into fix/6891-pyarrow-table-get-data
jameslamb Jul 27, 2025
eaf9510
Merge branch 'master' into fix/6891-pyarrow-table-get-data
suk1yak1 Aug 12, 2025
04db4aa
Merge branch 'master' into fix/6891-pyarrow-table-get-data
suk1yak1 Sep 1, 2025
e569ac5
[python-package]avoid TypeError: ChunkedArray.to_numpy() takes no key…
suk1yak1 Sep 1, 2025
5b4b197
Merge branch 'master' into fix/6891-pyarrow-table-get-data
suk1yak1 Sep 4, 2025
e826a4f
[python-package]Move final assertion before for loop to fail faster a…
suk1yak1 Sep 12, 2025
17dbc43
[python-package]Rename test helper and add docstring to clarify purpose
suk1yak1 Sep 12, 2025
8135e92
[python-package]Rename test helper
suk1yak1 Sep 12, 2025
71a65e5
Merge branch 'fix/6891-pyarrow-table-get-data' of github.com:suk1yak1…
suk1yak1 Sep 30, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
132 changes: 132 additions & 0 deletions python-package/lightgbm/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@
pd_Series,
)

if PYARROW_INSTALLED:
import pyarrow as pa

if TYPE_CHECKING:
from typing import Literal

Expand Down Expand Up @@ -3293,6 +3296,8 @@ def get_data(self) -> Optional[_LGBM_TrainDataType]:
self.data = self.data[self.used_indices, :]
elif isinstance(self.data, Sequence):
self.data = self.data[self.used_indices]
elif isinstance(self.data, pa_Table):
self.data = self.data.take(self.used_indices)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks! This was definitely just something we'd missed.

Can you please add a test in https://github.com/microsoft/LightGBM/blob/master/tests/python_package_test/test_arrow.py just for this change to get_data()? The other changes you made in test_basic.py do not cover these changes.

When you do that, please check that the content of self.data AND the returned value are correct (e.g., contain exactly the expected values and data types).

If you'd like, I'd even support opening a new pull request that only has the get_data() changes + test (and then making this PR only about add_features_from()). Totally your choice, I want to be respectful of your time.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the suggestion!
I added tests for both get_data() and add_features_from() directly in test_arrow.py as part of this PR.
Please let me know if there’s anything else you’d like me to adjust!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jameslamb
I’ve opened a new pull request (#6911) that includes only the changes to get_data() along with the corresponding test. This should help keep things focused. I’d appreciate it if you could take a look when you have time.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks! I'll focus there.

elif _is_list_of_sequences(self.data) and len(self.data) > 0:
self.data = np.array(list(self._yield_row_from_seqlist(self.data, self.used_indices)))
else:
Expand Down Expand Up @@ -3480,6 +3485,22 @@ def add_features_from(self, other: "Dataset") -> "Dataset":
elif isinstance(other.data, dt_DataTable):
_emit_datatable_deprecation_warning()
self.data = np.hstack((self.data, other.data.to_numpy()))
elif isinstance(other.data, pa_Table):
if not PYARROW_INSTALLED:
raise LightGBMError(
"Cannot add features to pyarrow.Table type of raw data "
"without pyarrow installed. "
"Install pyarrow and restart your session."
)
else:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please remove this else:. We have a slight preference in this project for just raising exceptions, to reduce unnecessary extra indentation.

Like this:

if not (PYARROW_INSTALLED and CFFI_INSTALLED):
raise LightGBMError("Cannot init Dataset from Arrow without 'pyarrow' and 'cffi' installed.")

That's enforced by convention today... if there's some ruff rule that could enforce that (like no-unnecessary-else or something), I'd support adding it.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the feedback!
I created an issue to propose adding a Ruff rule for this pattern: #6903.
Following that, I also opened a pull request #6904 to enable the RET506 (superfluous-else-raise) rule and fix the related code.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this commit, I removed the unnecessary else block as suggested.

self.data = np.hstack(
(
self.data,
np.column_stack(
[other.data.column(i).to_numpy() for i in range(len(other.data.column_names))]
),
)
)
else:
self.data = None
elif isinstance(self.data, scipy.sparse.spmatrix):
Expand All @@ -3491,6 +3512,23 @@ def add_features_from(self, other: "Dataset") -> "Dataset":
elif isinstance(other.data, dt_DataTable):
_emit_datatable_deprecation_warning()
self.data = scipy.sparse.hstack((self.data, other.data.to_numpy()), format=sparse_format)
elif isinstance(other.data, pa_Table):
if not PYARROW_INSTALLED:
raise LightGBMError(
"Cannot add features to pyarrow.Table type of raw data "
"without pyarrow installed. "
"Install pyarrow and restart your session."
)
else:
self.data = scipy.sparse.hstack(
(
self.data,
np.column_stack(
[other.data.column(i).to_numpy() for i in range(len(other.data.column_names))]
),
),
format=sparse_format,
)
else:
self.data = None
elif isinstance(self.data, pd_DataFrame):
Expand All @@ -3509,6 +3547,27 @@ def add_features_from(self, other: "Dataset") -> "Dataset":
elif isinstance(other.data, dt_DataTable):
_emit_datatable_deprecation_warning()
self.data = concat((self.data, pd_DataFrame(other.data.to_numpy())), axis=1, ignore_index=True)
elif isinstance(other.data, pa_Table):
if not PYARROW_INSTALLED:
raise LightGBMError(
"Cannot add features to pyarrow.Table type of raw data "
"without pyarrow installed. "
"Install pyarrow and restart your session."
)
else:
self.data = concat(
(
self.data,
pd_DataFrame(
{
other.data.column_names[i]: other.data.column(i).to_numpy()
for i in range(len(other.data.column_names))
}
),
),
axis=1,
ignore_index=True,
)
else:
self.data = None
elif isinstance(self.data, dt_DataTable):
Expand All @@ -3521,6 +3580,79 @@ def add_features_from(self, other: "Dataset") -> "Dataset":
self.data = dt_DataTable(np.hstack((self.data.to_numpy(), other.data.values)))
elif isinstance(other.data, dt_DataTable):
self.data = dt_DataTable(np.hstack((self.data.to_numpy(), other.data.to_numpy())))
elif isinstance(other.data, pa_Table):
if not PYARROW_INSTALLED:
raise LightGBMError(
"Cannot add features to pyarrow.Table type of raw data "
"without pyarrow installed. "
"Install pyarrow and restart your session."
)
else:
self.data = dt_DataTable(
np.hstack(
(
self.data.to_numpy(),
np.column_stack(
[other.data.column(i).to_numpy() for i in range(len(other.data.column_names))]
),
)
)
)
else:
self.data = None
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seeing more datatable code getting added here is making me think we should just do what xgboost did (dmlc/xgboost#11070) and just fully drop support for it now.

In #6662, I'd proposed having deprecation warnings for "2-3 releases", but I'm going to put up a PR just proposing dropping this in the next release. We got a deprecation warning into 4.6.0 (#6670), which was released about 2 months ago, and it'll probably be at least another 2 months until the next LightGBM release... I think that's enough time.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Opened #6894

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for thinking through this and for moving things forward!
I appreciate you taking the initiative to propose a clear path for dropping datatable support.

elif isinstance(self.data, pa_Table):
if not PYARROW_INSTALLED:
raise LightGBMError(
"Cannot add features to pyarrow.Table type of raw data "
"without pyarrow installed. "
"Install pyarrow and restart your session."
)
if isinstance(other.data, np.ndarray):
self.data = pa_Table.from_arrays(
[
*self.data.columns,
*[pa.array(other.data[:, i]) for i in range(other.data.shape[1])],
],
names=[
*self.data.column_names,
*[f"D{len(self.data.column_names) + i + 1}" for i in range(other.data.shape[1])],
],
)
elif isinstance(other.data, scipy.sparse.spmatrix):
other_array = other.data.toarray()
self.data = pa_Table.from_arrays(
[
*self.data.columns,
*[pa.array(other_array[:, i]) for i in range(other_array.shape[1])],
],
names=[
*self.data.column_names,
*[f"D{len(self.data.column_names) + i + 1}" for i in range(other_array.shape[1])],
],
)
elif isinstance(other.data, pd_DataFrame):
self.data = pa_Table.from_arrays(
[
*self.data.columns,
*[pa.array(other.data.iloc[:, i].values) for i in range(len(other.data.columns))],
],
names=[*self.data.column_names, *map(str, other.data.columns.tolist())],
)
elif isinstance(other.data, dt_DataTable):
_emit_datatable_deprecation_warning()
other_array = other.data.to_numpy()
self.data = pa_Table.from_arrays(
[
*self.data.columns,
*[pa.array(other_array[:, i]) for i in range(other_array.shape[1])],
],
names=[*self.data.column_names, *other.data.names],
)
elif isinstance(other.data, pa_Table):
self.data = pa_Table.from_arrays(
[*self.data.columns, *other.data.columns],
names=[*self.data.column_names, *other.data.column_names],
)
else:
self.data = None
else:
Expand Down
22 changes: 21 additions & 1 deletion tests/python_package_test/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,15 @@

from .utils import dummy_obj, load_breast_cancer, mse_obj, np_assert_array_equal

if getenv("ALLOW_SKIP_ARROW_TESTS") == "1":
pa = pytest.importorskip("pyarrow")
else:
import pyarrow as pa # type: ignore

assert lgb.compat.PYARROW_INSTALLED is True, (
"'pyarrow' and its dependencies must be installed to run the arrow tests"
)


def test_basic(tmp_path):
X_train, X_test, y_train, y_test = train_test_split(
Expand Down Expand Up @@ -345,7 +354,18 @@ def test_add_features_from_different_sources(rng):
n_row = 100
n_col = 5
X = rng.uniform(size=(n_row, n_col))
xxs = [X, sparse.csr_matrix(X), pd.DataFrame(X)]
xxs = [
X,
sparse.csr_matrix(X),
pd.DataFrame(X),
]
if getenv("ALLOW_SKIP_ARROW_TESTS") != "1":
xxs.append(
pa.Table.from_arrays(
[pa.array(X[:, i]) for i in range(X.shape[1])], names=[f"D{i}" for i in range(X.shape[1])]
)
)

names = [f"col_{i}" for i in range(n_col)]
seq = _create_sequence_from_ndarray(X, 1, 30)
seq_ds = lgb.Dataset(seq, feature_name=names, free_raw_data=False).construct()
Expand Down
Loading