Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[python-package][R-package] allow using feature names when retrieving number of bins #5116

Merged
merged 10 commits on May 17, 2022
6 changes: 6 additions & 0 deletions R-package/R/lgb.Dataset.R
Original file line number Diff line number Diff line change
@@ -381,6 +381,12 @@ Dataset <- R6::R6Class(
if (lgb.is.null.handle(x = private$handle)) {
stop("Cannot get number of bins in feature before constructing Dataset.")
}
if (is.character(feature)) {
feature <- which(colnames(self) == feature)
if (length(feature) == 0L) {
stop("feature not found")
}
}
num_bin <- integer(1L)
.Call(
LGBM_DatasetGetFeatureNumBin_R
2 changes: 2 additions & 0 deletions R-package/tests/testthat/test_dataset.R
Original file line number Diff line number Diff line change
@@ -547,4 +547,6 @@ test_that("lgb.Dataset$get_feature_num_bin() works", {
)
actual_num_bins <- sapply(1L:5L, ds$get_feature_num_bin)
expect_identical(actual_num_bins, expected_num_bins)
bins_by_name <- sapply(colnames(raw_mat), ds$get_feature_num_bin)
expect_identical(unname(bins_by_name), expected_num_bins)
})
8 changes: 5 additions & 3 deletions python-package/lightgbm/basic.py
Original file line number Diff line number Diff line change
@@ -2386,20 +2386,22 @@ def num_feature(self):
else:
raise LightGBMError("Cannot get num_feature before construct dataset")

def feature_num_bin(self, feature: int) -> int:
def feature_num_bin(self, feature: Union[int, str]) -> int:
"""Get the number of bins for a feature.

Parameters
----------
feature : int
Index of the feature.
feature : int or str
Index or name of the feature.

Returns
-------
number_of_bins : int
The number of constructed bins for the feature in the Dataset.
"""
if self.handle is not None:
if isinstance(feature, str):
feature = self.feature_name.index(feature)
ret = ctypes.c_int(0)
_safe_call(_LIB.LGBM_DatasetGetFeatureNumBin(self.handle,
ctypes.c_int(feature),
6 changes: 5 additions & 1 deletion tests/python_package_test/test_basic.py
Original file line number Diff line number Diff line change
@@ -634,7 +634,9 @@ def test_feature_num_bin(min_data_in_bin):
np.array([1, 2] * 49 + 2 * [np.nan]),
np.zeros(100),
]).T
ds = lgb.Dataset(X, params={'min_data_in_bin': min_data_in_bin}).construct()
feature_name = [f'x{i}' for i in range(X.shape[1])]
ds = lgb.Dataset(X, params={'min_data_in_bin': min_data_in_bin}, feature_name=feature_name)
ds.construct()
expected_num_bins = [
100 // min_data_in_bin + 1, # extra bin for zero
3, # 0, 1, 2
@@ -644,6 +646,8 @@ def test_feature_num_bin(min_data_in_bin):
]
actual_num_bins = [ds.feature_num_bin(i) for i in range(X.shape[1])]
assert actual_num_bins == expected_num_bins
bins_by_name = [ds.feature_num_bin(name) for name in feature_name]
assert bins_by_name == expected_num_bins


def test_feature_num_bin_with_max_bin_by_feature():