[python-package] support sub-classing scikit-learn estimators #6783

Open · wants to merge 7 commits into master
Changes from 1 commit
add docs
jameslamb committed Jan 7, 2025
commit 7b720cb509a06fd97c3aacf44c23879991c9d690
39 changes: 39 additions & 0 deletions docs/FAQ.rst
@@ -377,3 +377,42 @@ We strongly recommend installation from the ``conda-forge`` channel and not from
For some specific examples, see `this comment <https://github.com/microsoft/LightGBM/issues/4948#issuecomment-1013766397>`__.

In addition, as of ``lightgbm==4.4.0``, the ``conda-forge`` package automatically supports CUDA-based GPU acceleration.

5. How do I subclass ``scikit-learn`` estimators?
-------------------------------------------------

For ``lightgbm <= 4.5.0``, copy all of the constructor arguments from the corresponding
``lightgbm`` class into the constructor of your custom estimator.
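
For illustration, here is an abbreviated sketch of that older pattern (the class name is made up, and only a few of
``LGBMRegressor``'s constructor arguments are shown; with ``lightgbm <= 4.5.0`` every one of them would need to be
copied and forwarded like this).

.. code-block:: python

    from lightgbm import LGBMRegressor

    class MyRegressor(LGBMRegressor):

        # re-declare the constructor arguments explicitly (abbreviated here),
        # then forward them to the parent constructor
        def __init__(
            self,
            boosting_type: str = "gbdt",
            num_leaves: int = 31,
            learning_rate: float = 0.1,
            n_estimators: int = 100,
        ):
            super().__init__(
                boosting_type=boosting_type,
                num_leaves=num_leaves,
                learning_rate=learning_rate,
                n_estimators=n_estimators,
            )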

For later versions, just ensure that the constructor of your custom estimator calls ``super().__init__()``.

Consider the example below, which implements a regressor whose predictions can be truncated at a given maximum value.
This pattern will work with ``lightgbm > 4.5.0``.

.. code-block:: python

import numpy as np
from lightgbm import LGBMRegressor
from sklearn.datasets import make_regression

class TruncatedRegressor(LGBMRegressor):

def __init__(self, **kwargs):
super().__init__(**kwargs)

def predict(self, X, max_score: float = np.inf):
preds = super().predict(X)
preds[np.where(preds > max_score)] = max_score
return preds

X, y = make_regression(n_samples=1_000, n_features=4)

reg_trunc = TruncatedRegressor().fit(X, y)

preds = reg_trunc.predict(X)
print(f"mean: {preds.mean():.2f}, max: {preds.max():.2f}")
# mean: -6.81, max: 345.10

preds_trunc = reg_trunc.predict(X, max_score = preds.mean())
print(f"mean: {preds_trunc.mean():.2f}, max: {preds_trunc.max():.2f}")
# mean: -56.50, max: -6.81
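
Because ``TruncatedRegressor`` forwards everything through ``super().__init__()``, keyword arguments (including
parameter aliases such as ``eta``) are remembered and reported by ``get_params()``. A minimal check of that
behavior, again assuming ``lightgbm > 4.5.0``:

.. code-block:: python

    reg = TruncatedRegressor(n_estimators=13, eta=0.07)

    params = reg.get_params()
    print(params["n_estimators"])   # 13
    print(params["eta"])            # 0.07
    print(params["learning_rate"])  # 0.1 -- the default; the 'eta' override is kept separately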
1 change: 0 additions & 1 deletion python-package/lightgbm/dask.py
@@ -44,7 +44,6 @@
LGBMModel,
LGBMRanker,
LGBMRegressor,
_LGBM_ScikitCustomObjectiveFunction,
_LGBM_ScikitEvalMetricType,
_lgbmmodel_doc_custom_eval_note,
_lgbmmodel_doc_fit,
9 changes: 3 additions & 6 deletions python-package/lightgbm/sklearn.py
@@ -628,7 +628,6 @@ def __init__(
For multi-class task, y_pred is a numpy 2-D array of shape = [n_samples, n_classes],
and grad and hess should be returned in the same format.
"""
print("LGBMModel.__init__()")
if not SKLEARN_INSTALLED:
raise LightGBMError(
"scikit-learn is required for lightgbm.sklearn. "
@@ -752,9 +751,9 @@ def get_params(self, deep: bool = True) -> Dict[str, Any]:
#
# `get_params()` flows like this:
#
# 0. Return parameters in subclass (self.__class__) first, by using inspect.
# 1. Return parameters in all parent classes (especially `LGBMModel`).
# 2. Return whatever is in `**kwargs`.
# 0. Get parameters in subclass (self.__class__) first, by using inspect.
# 1. Get parameters in all parent classes (especially `LGBMModel`).
# 2. Get whatever was passed via `**kwargs`.
# 3. Merge them.
#
# This needs to accommodate being called recursively in the following
@@ -768,7 +767,6 @@ def get_params(self, deep: bool = True) -> Dict[str, Any]:
#
params = super().get_params(deep=deep)
cp = copy.copy(self)
print(f"--- {cp.__class__.__bases__}")
# If the immediate parent defines get_params(), use that.
if callable(getattr(cp.__class__.__bases__[0], "get_params", None)):
cp.__class__ = cp.__class__.__bases__[0]
@@ -1322,7 +1320,6 @@ def __init__(self, **kwargs: Any):
# - https://stackoverflow.com/questions/40025406/inherit-from-scikit-learns-lassocv-model/40027200#40027200
# - https://stackoverflow.com/questions/79320289/why-cant-i-wrap-lgbm
# - https://github.com/dmlc/xgboost/blob/bd92b1c9c0db3e75ec3dfa513e1435d518bb535d/python-package/xgboost/sklearn.py#L941
print("LGBMRegressor.__init__()")
super().__init__(**kwargs)

__init__.__doc__ = LGBMModel.__init__.__doc__
189 changes: 188 additions & 1 deletion tests/python_package_test/test_sklearn.py
@@ -22,6 +22,7 @@

import lightgbm as lgb
from lightgbm.compat import (
DASK_INSTALLED,
DATATABLE_INSTALLED,
PANDAS_INSTALLED,
_sklearn_version,
@@ -83,6 +84,30 @@ def __call__(self, env):
env.model.attr_set_inside_callback = env.iteration * 10


class ExtendedLGBMClassifier(lgb.LGBMClassifier):
"""Class for testing that inheriting from LGBMClassifier works"""

def __init__(self, *, some_other_param: str = "lgbm-classifier", **kwargs):
self.some_other_param = some_other_param
super().__init__(**kwargs)


class ExtendedLGBMRanker(lgb.LGBMRanker):
"""Class for testing that inheriting from LGBMRanker works"""

def __init__(self, *, some_other_param: str = "lgbm-ranker", **kwargs):
self.some_other_param = some_other_param
super().__init__(**kwargs)


class ExtendedLGBMRegressor(lgb.LGBMRegressor):
"""Class for testing that inheriting from LGBMRegressor works"""

def __init__(self, *, some_other_param: str = "lgbm-regressor", **kwargs):
self.some_other_param = some_other_param
super().__init__(**kwargs)


def custom_asymmetric_obj(y_true, y_pred):
residual = (y_true - y_pred).astype(np.float64)
grad = np.where(residual < 0, -2 * 10.0 * residual, -2 * residual)
@@ -475,6 +500,165 @@ def test_clone_and_property():
assert isinstance(clf.feature_importances_, np.ndarray)


def test_subclassing_get_params_works():
expected_params = {
"boosting_type": "gbdt",
"class_weight": None,
"colsample_bytree": 1.0,
"importance_type": "split",
"learning_rate": 0.1,
"max_depth": -1,
"min_child_samples": 20,
"min_child_weight": 0.001,
"min_split_gain": 0.0,
"n_estimators": 100,
"n_jobs": None,
"num_leaves": 31,
"objective": None,
"random_state": None,
"reg_alpha": 0.0,
"reg_lambda": 0.0,
"subsample": 1.0,
"subsample_for_bin": 200000,
"subsample_freq": 0,
}

# Overrides, used to test that passing through **kwargs works as expected.
#
# why these?
#
# - 'n_estimators' directly matches a keyword arg for the scikit-learn estimators
# - 'eta' is a parameter alias for 'learning_rate'
overrides = {"n_estimators": 13, "eta": 0.07}

# lightgbm-official classes
for est in [lgb.LGBMModel, lgb.LGBMClassifier, lgb.LGBMRanker, lgb.LGBMRegressor]:
assert est().get_params() == expected_params
assert est(**overrides).get_params() == {
**expected_params,
"eta": 0.07,
"n_estimators": 13,
"learning_rate": 0.1,
}

if DASK_INSTALLED:
for est in [lgb.DaskLGBMClassifier, lgb.DaskLGBMRanker, lgb.DaskLGBMRegressor]:
assert est().get_params() == {
**expected_params,
"client": None,
}
assert est(**overrides).get_params() == {
**expected_params,
"eta": 0.07,
"n_estimators": 13,
"learning_rate": 0.1,
"client": None,
}

# custom sub-classes
assert ExtendedLGBMClassifier().get_params() == {**expected_params, "some_other_param": "lgbm-classifier"}
assert ExtendedLGBMClassifier(**overrides).get_params() == {
**expected_params,
"eta": 0.07,
"n_estimators": 13,
"learning_rate": 0.1,
"some_other_param": "lgbm-classifier",
}
assert ExtendedLGBMRanker().get_params() == {
**expected_params,
"some_other_param": "lgbm-ranker",
}
assert ExtendedLGBMRanker(**overrides).get_params() == {
**expected_params,
"eta": 0.07,
"n_estimators": 13,
"learning_rate": 0.1,
"some_other_param": "lgbm-ranker",
}
assert ExtendedLGBMRegressor().get_params() == {
**expected_params,
"some_other_param": "lgbm-regressor",
}
assert ExtendedLGBMRegressor(**overrides).get_params() == {
**expected_params,
"eta": 0.07,
"n_estimators": 13,
"learning_rate": 0.1,
"some_other_param": "lgbm-regressor",
}


@pytest.mark.parametrize("task", all_tasks)
def test_subclassing_works(task):
# param values to make training deterministic and
# just train a small, cheap model
params = {
"deterministic": True,
"force_row_wise": True,
"n_jobs": 1,
"n_estimators": 5,
"num_leaves": 11,
"random_state": 708,
}

X, y, g = _create_data(task=task)
if task == "ranking":
est = lgb.LGBMRanker(**params).fit(X, y, group=g)
est_sub = ExtendedLGBMRanker(**params).fit(X, y, group=g)
elif task.endswith("classification"):
est = lgb.LGBMClassifier(**params).fit(X, y)
est_sub = ExtendedLGBMClassifier(**params).fit(X, y)
else:
est = lgb.LGBMRegressor(**params).fit(X, y)
est_sub = ExtendedLGBMRegressor(**params).fit(X, y)

np.testing.assert_allclose(est.predict(X), est_sub.predict(X))


@pytest.mark.parametrize(
"estimator_to_task",
[
(lgb.LGBMClassifier, "binary-classification"),
(ExtendedLGBMClassifier, "binary-classification"),
(lgb.LGBMRanker, "ranking"),
(ExtendedLGBMRanker, "ranking"),
(lgb.LGBMRegressor, "regression"),
(ExtendedLGBMRegressor, "regression"),
],
)
def test_parameter_aliases_are_handled_correctly(estimator_to_task):
estimator, task = estimator_to_task
# scikit-learn estimators should remember every parameter passed
# via keyword arguments in the estimator constructor, but then
# only pass the correct value down to LightGBM's C++ side
params = {
"eta": 0.08,
"num_iterations": 3,
"num_leaves": 5,
}
X, y, g = _create_data(task=task)
mod = estimator(**params)
if task == "ranking":
mod.fit(X, y, group=g)
else:
mod.fit(X, y)

# scikit-learn get_params()
p = mod.get_params()
assert p["eta"] == 0.08
assert p["learning_rate"] == 0.1

# lgb.Booster's 'params' attribute
p = mod.booster_.params
assert p["eta"] == 0.08
assert p["learning_rate"] == 0.1

# Config in the 'LightGBM::Booster' on the C++ side
p = mod.booster_._get_loaded_param()
assert p["learning_rate"] == 0.1
assert "eta" not in p


def test_joblib(tmp_path):
X, y = make_synthetic_regression()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
@@ -1463,7 +1647,10 @@ def _get_expected_failed_tests(estimator):
return estimator._more_tags()["_xfail_checks"]


@parametrize_with_checks([lgb.LGBMClassifier(), lgb.LGBMRegressor()], expected_failed_checks=_get_expected_failed_tests)
@parametrize_with_checks(
[ExtendedLGBMClassifier(), ExtendedLGBMRegressor(), lgb.LGBMClassifier(), lgb.LGBMRegressor()],
expected_failed_checks=_get_expected_failed_tests,
)
def test_sklearn_integration(estimator, check):
estimator.set_params(min_child_samples=1, min_data_in_bin=1)
check(estimator)