Skip to content
Draft
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
cf13aad
[python-package]add_features_from with PyArrow Table incorrectly free…
suk1yak1 Apr 23, 2025
99ee66f
[python-package] add PyArrow Table case to test_add_features_from_dif…
suk1yak1 Apr 23, 2025
a3256a3
[python-package] fix handling and tests for PyArrow Table input in ad…
suk1yak1 Apr 24, 2025
9de2650
Merge branch 'master' into fix/6891-pyarrow-table-add-features
suk1yak1 Apr 24, 2025
01bc668
delete unnecessary-else
suk1yak1 Apr 28, 2025
27e2545
Merge branch 'master' of https://github.com/microsoft/LightGBM into f…
suk1yak1 Apr 28, 2025
219e61c
[python-package]add test for pyarrow table in Dataset.get_data()
suk1yak1 Apr 28, 2025
d4de309
[python-package]add test for add_features_from with pyarrow tables
suk1yak1 Apr 28, 2025
6686a60
Merge branch 'master' into fix/6891-pyarrow-table-add-features
suk1yak1 May 7, 2025
22ec314
[python-package] add PyArrow Table to get_data
suk1yak1 May 9, 2025
c7594c8
[python-package] add test for subset of PyArrow table dataset
suk1yak1 May 10, 2025
e0fce82
Merge branch 'master' into fix/6891-pyarrow-table-get-data
StrikerRUS May 20, 2025
1af15c6
Merge branch 'master' into fix/6891-pyarrow-table-get-data
suk1yak1 May 29, 2025
b5395a0
[python-package] improve PyArrow table subset tests for null values a…
suk1yak1 May 30, 2025
5f067ec
Merge branch 'master' into fix/6891-pyarrow-table-get-data
jameslamb Jul 27, 2025
eaf9510
Merge branch 'master' into fix/6891-pyarrow-table-get-data
suk1yak1 Aug 12, 2025
04db4aa
Merge branch 'master' into fix/6891-pyarrow-table-get-data
suk1yak1 Sep 1, 2025
e569ac5
[python-package]avoid TypeError: ChunkedArray.to_numpy() takes no key…
suk1yak1 Sep 1, 2025
5b4b197
Merge branch 'master' into fix/6891-pyarrow-table-get-data
suk1yak1 Sep 4, 2025
e826a4f
[python-package]Move final assertion before for loop to fail faster a…
suk1yak1 Sep 12, 2025
17dbc43
[python-package]Rename test helper and add docstring to clarify purpose
suk1yak1 Sep 12, 2025
8135e92
[python-package]Rename test helper
suk1yak1 Sep 12, 2025
71a65e5
Merge branch 'fix/6891-pyarrow-table-get-data' of github.com:suk1yak1…
suk1yak1 Sep 30, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
132 changes: 132 additions & 0 deletions python-package/lightgbm/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@
pd_Series,
)

if PYARROW_INSTALLED:
import pyarrow as pa

if TYPE_CHECKING:
from typing import Literal

Expand Down Expand Up @@ -3293,6 +3296,8 @@ def get_data(self) -> Optional[_LGBM_TrainDataType]:
self.data = self.data[self.used_indices, :]
elif isinstance(self.data, Sequence):
self.data = self.data[self.used_indices]
elif isinstance(self.data, pa_Table):
self.data = self.data.take(self.used_indices)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks! This was definitely just something we'd missed.

Can you please add a test in https://github.com/microsoft/LightGBM/blob/master/tests/python_package_test/test_arrow.py just for this change to get_data()? The other changes you made in test_basic.py do not cover these changes.

When you do that, please check that the content of self.data AND the returned value are correct (e.g., contain exactly the expected values and data types).

If you'd like, I'd even support opening a new pull request that only has the get_data() changes + test (and then making this PR only about add_features_from()). Totally your choice, I want to be respectful of your time.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the suggestion!
I added tests for both get_data() and add_features_from() directly in test_arrow.py as part of this PR.
Please let me know if there’s anything else you’d like me to adjust!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jameslamb
I’ve opened a new pull request (#6911) that includes only the changes to get_data() along with the corresponding test. This should help keep things focused. I’d appreciate it if you could take a look when you have time.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks! I'll focus there.

elif _is_list_of_sequences(self.data) and len(self.data) > 0:
self.data = np.array(list(self._yield_row_from_seqlist(self.data, self.used_indices)))
else:
Expand Down Expand Up @@ -3480,6 +3485,22 @@ def add_features_from(self, other: "Dataset") -> "Dataset":
elif isinstance(other.data, dt_DataTable):
_emit_datatable_deprecation_warning()
self.data = np.hstack((self.data, other.data.to_numpy()))
elif isinstance(other.data, pa_Table):
if not PYARROW_INSTALLED:
raise LightGBMError(
"Cannot add features to pyarrow.Table type of raw data "
"without pyarrow installed. "
"Install pyarrow and restart your session."
)
else:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please remove this else:. We have a slight preference in this project for just raising exceptions, to reduce unnecessary extra indentation.

Like this:

if not (PYARROW_INSTALLED and CFFI_INSTALLED):
raise LightGBMError("Cannot init Dataset from Arrow without 'pyarrow' and 'cffi' installed.")

That's enforced by convention today... if there's some ruff rule that could enforce that (like no-unnecessary-else or something), I'd support adding it.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the feedback!
I created an issue to propose adding a Ruff rule for this pattern: #6903.
Following that, I also opened a pull request #6904 to enable the RET506 (superfluous-else-raise) rule and fix the related code.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this commit, I removed the unnecessary else block as suggested.

self.data = np.hstack(
(
self.data,
np.column_stack(
[other.data.column(i).to_numpy() for i in range(len(other.data.column_names))]
),
)
)
else:
self.data = None
elif isinstance(self.data, scipy.sparse.spmatrix):
Expand All @@ -3491,6 +3512,23 @@ def add_features_from(self, other: "Dataset") -> "Dataset":
elif isinstance(other.data, dt_DataTable):
_emit_datatable_deprecation_warning()
self.data = scipy.sparse.hstack((self.data, other.data.to_numpy()), format=sparse_format)
elif isinstance(other.data, pa_Table):
if not PYARROW_INSTALLED:
raise LightGBMError(
"Cannot add features to pyarrow.Table type of raw data "
"without pyarrow installed. "
"Install pyarrow and restart your session."
)
else:
self.data = scipy.sparse.hstack(
(
self.data,
np.column_stack(
[other.data.column(i).to_numpy() for i in range(len(other.data.column_names))]
),
),
format=sparse_format,
)
else:
self.data = None
elif isinstance(self.data, pd_DataFrame):
Expand All @@ -3509,6 +3547,27 @@ def add_features_from(self, other: "Dataset") -> "Dataset":
elif isinstance(other.data, dt_DataTable):
_emit_datatable_deprecation_warning()
self.data = concat((self.data, pd_DataFrame(other.data.to_numpy())), axis=1, ignore_index=True)
elif isinstance(other.data, pa_Table):
if not PYARROW_INSTALLED:
raise LightGBMError(
"Cannot add features to pyarrow.Table type of raw data "
"without pyarrow installed. "
"Install pyarrow and restart your session."
)
else:
self.data = concat(
(
self.data,
pd_DataFrame(
{
other.data.column_names[i]: other.data.column(i).to_numpy()
for i in range(len(other.data.column_names))
}
),
),
axis=1,
ignore_index=True,
)
else:
self.data = None
elif isinstance(self.data, dt_DataTable):
Expand All @@ -3521,6 +3580,79 @@ def add_features_from(self, other: "Dataset") -> "Dataset":
self.data = dt_DataTable(np.hstack((self.data.to_numpy(), other.data.values)))
elif isinstance(other.data, dt_DataTable):
self.data = dt_DataTable(np.hstack((self.data.to_numpy(), other.data.to_numpy())))
elif isinstance(other.data, pa_Table):
if not PYARROW_INSTALLED:
raise LightGBMError(
"Cannot add features to pyarrow.Table type of raw data "
"without pyarrow installed. "
"Install pyarrow and restart your session."
)
else:
self.data = dt_DataTable(
np.hstack(
(
self.data.to_numpy(),
np.column_stack(
[other.data.column(i).to_numpy() for i in range(len(other.data.column_names))]
),
)
)
)
else:
self.data = None
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seeing more datatable code getting added here is making me think we should just do what xgboost did (dmlc/xgboost#11070) and just fully drop support for it now.

In #6662, I'd proposed having deprecation warnings for "2-3 releases", but I'm going to put up a PR just proposing dropping this in the next release. We got a deprecation warning into 4.6.0 (#6670), which was released about 2 months ago, and it'll probably be at least another 2 months until the next LightGBM release... I think that's enough time.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Opened #6894

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for thinking through this and for moving things forward!
I appreciate you taking the initiative to propose a clear path for dropping datatable support.

elif isinstance(self.data, pa_Table):
if not PYARROW_INSTALLED:
raise LightGBMError(
"Cannot add features to pyarrow.Table type of raw data "
"without pyarrow installed. "
"Install pyarrow and restart your session."
)
if isinstance(other.data, np.ndarray):
self.data = pa_Table.from_arrays(
[
*self.data.columns,
*[pa.array(other.data[:, i]) for i in range(other.data.shape[1])],
],
names=[
*self.data.column_names,
*[f"D{len(self.data.column_names) + i + 1}" for i in range(other.data.shape[1])],
],
)
elif isinstance(other.data, scipy.sparse.spmatrix):
other_array = other.data.toarray()
self.data = pa_Table.from_arrays(
[
*self.data.columns,
*[pa.array(other_array[:, i]) for i in range(other_array.shape[1])],
],
names=[
*self.data.column_names,
*[f"D{len(self.data.column_names) + i + 1}" for i in range(other_array.shape[1])],
],
)
elif isinstance(other.data, pd_DataFrame):
self.data = pa_Table.from_arrays(
[
*self.data.columns,
*[pa.array(other.data.iloc[:, i].values) for i in range(len(other.data.columns))],
],
names=[*self.data.column_names, *map(str, other.data.columns.tolist())],
)
elif isinstance(other.data, dt_DataTable):
_emit_datatable_deprecation_warning()
other_array = other.data.to_numpy()
self.data = pa_Table.from_arrays(
[
*self.data.columns,
*[pa.array(other_array[:, i]) for i in range(other_array.shape[1])],
],
names=[*self.data.column_names, *other.data.names],
)
elif isinstance(other.data, pa_Table):
self.data = pa_Table.from_arrays(
[*self.data.columns, *other.data.columns],
names=[*self.data.column_names, *other.data.column_names],
)
else:
self.data = None
else:
Expand Down
22 changes: 21 additions & 1 deletion tests/python_package_test/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,15 @@

from .utils import dummy_obj, load_breast_cancer, mse_obj, np_assert_array_equal

if getenv("ALLOW_SKIP_ARROW_TESTS") == "1":
pa = pytest.importorskip("pyarrow")
else:
import pyarrow as pa # type: ignore

assert lgb.compat.PYARROW_INSTALLED is True, (
"'pyarrow' and its dependencies must be installed to run the arrow tests"
)


def test_basic(tmp_path):
X_train, X_test, y_train, y_test = train_test_split(
Expand Down Expand Up @@ -345,7 +354,18 @@ def test_add_features_from_different_sources(rng):
n_row = 100
n_col = 5
X = rng.uniform(size=(n_row, n_col))
xxs = [X, sparse.csr_matrix(X), pd.DataFrame(X)]
xxs = [
X,
sparse.csr_matrix(X),
pd.DataFrame(X),
]
if getenv("ALLOW_SKIP_ARROW_TESTS") != "1":
xxs.append(
pa.Table.from_arrays(
[pa.array(X[:, i]) for i in range(X.shape[1])], names=[f"D{i}" for i in range(X.shape[1])]
)
)

names = [f"col_{i}" for i in range(n_col)]
seq = _create_sequence_from_ndarray(X, 1, 30)
seq_ds = lgb.Dataset(seq, feature_name=names, free_raw_data=False).construct()
Expand Down
Loading