diff --git a/docs/FAQ.rst b/docs/FAQ.rst
index 14c7f7dd7265..6f8b71378ddf 100644
--- a/docs/FAQ.rst
+++ b/docs/FAQ.rst
@@ -377,3 +377,42 @@ We strongly recommend installation from the ``conda-forge`` channel and not from
 For some specific examples, see `this comment `__.
 
 In addition, as of ``lightgbm==4.4.0``, the ``conda-forge`` package automatically supports CUDA-based GPU acceleration.
+
+5. How do I subclass ``scikit-learn`` estimators?
+-------------------------------------------------
+
+For ``lightgbm <= 4.5.0``, copy all of the constructor arguments from the corresponding
+``lightgbm`` class into the constructor of your custom estimator.
+
+For later versions, just ensure that the constructor of your custom estimator calls ``super().__init__()``.
+
+Consider the example below, which implements a regressor that allows creation of truncated predictions.
+This pattern will work with ``lightgbm > 4.5.0``.
+
+.. code-block:: python
+
+    import numpy as np
+    from lightgbm import LGBMRegressor
+    from sklearn.datasets import make_regression
+
+    class TruncatedRegressor(LGBMRegressor):
+
+        def __init__(self, **kwargs):
+            super().__init__(**kwargs)
+
+        def predict(self, X, max_score: float = np.inf):
+            preds = super().predict(X)
+            preds[np.where(preds > max_score)] = max_score
+            return preds
+
+    X, y = make_regression(n_samples=1_000, n_features=4)
+
+    reg_trunc = TruncatedRegressor().fit(X, y)
+
+    preds = reg_trunc.predict(X)
+    print(f"mean: {preds.mean():.2f}, max: {preds.max():.2f}")
+    # mean: -6.81, max: 345.10
+
+    preds_trunc = reg_trunc.predict(X, max_score=preds.mean())
+    print(f"mean: {preds_trunc.mean():.2f}, max: {preds_trunc.max():.2f}")
+    # mean: -56.50, max: -6.81
diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py
index dcdacba7366c..cd1648c8c1cf 100644
--- a/python-package/lightgbm/dask.py
+++ b/python-package/lightgbm/dask.py
@@ -44,7 +44,6 @@
     LGBMModel,
     LGBMRanker,
     LGBMRegressor,
-    _LGBM_ScikitCustomObjectiveFunction,
     _LGBM_ScikitEvalMetricType,
     _lgbmmodel_doc_custom_eval_note,
     _lgbmmodel_doc_fit,
@@ -1115,52 +1114,13 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
 
     def __init__(
         self,
-        boosting_type: str = "gbdt",
-        num_leaves: int = 31,
-        max_depth: int = -1,
-        learning_rate: float = 0.1,
-        n_estimators: int = 100,
-        subsample_for_bin: int = 200000,
-        objective: Optional[Union[str, _LGBM_ScikitCustomObjectiveFunction]] = None,
-        class_weight: Optional[Union[dict, str]] = None,
-        min_split_gain: float = 0.0,
-        min_child_weight: float = 1e-3,
-        min_child_samples: int = 20,
-        subsample: float = 1.0,
-        subsample_freq: int = 0,
-        colsample_bytree: float = 1.0,
-        reg_alpha: float = 0.0,
-        reg_lambda: float = 0.0,
-        random_state: Optional[Union[int, np.random.RandomState, "np.random.Generator"]] = None,
-        n_jobs: Optional[int] = None,
-        importance_type: str = "split",
+        *,
         client: Optional[Client] = None,
         **kwargs: Any,
     ):
         """Docstring is inherited from the lightgbm.LGBMClassifier.__init__."""
         self.client = client
-        super().__init__(
-            boosting_type=boosting_type,
-            num_leaves=num_leaves,
-            max_depth=max_depth,
-            learning_rate=learning_rate,
-            n_estimators=n_estimators,
-            subsample_for_bin=subsample_for_bin,
-            objective=objective,
-            class_weight=class_weight,
-            min_split_gain=min_split_gain,
-            min_child_weight=min_child_weight,
-            min_child_samples=min_child_samples,
-            subsample=subsample,
-            subsample_freq=subsample_freq,
-            colsample_bytree=colsample_bytree,
-            reg_alpha=reg_alpha,
-            reg_lambda=reg_lambda,
-            random_state=random_state,
-            n_jobs=n_jobs,
-            importance_type=importance_type,
-            **kwargs,
-        )
+        super().__init__(**kwargs)
 
     _base_doc = LGBMClassifier.__init__.__doc__
     _before_kwargs, _kwargs, _after_kwargs = _base_doc.partition("**kwargs")  # type: ignore
@@ -1318,52 +1278,13 @@ class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel):
 
     def __init__(
         self,
-        boosting_type: str = "gbdt",
-        num_leaves: int = 31,
-        max_depth: int = -1,
-        learning_rate: float = 0.1,
-        n_estimators: int = 100,
-        subsample_for_bin: int = 200000,
-        objective: Optional[Union[str, _LGBM_ScikitCustomObjectiveFunction]] = None,
-        class_weight: Optional[Union[dict, str]] = None,
-        min_split_gain: float = 0.0,
-        min_child_weight: float = 1e-3,
-        min_child_samples: int = 20,
-        subsample: float = 1.0,
-        subsample_freq: int = 0,
-        colsample_bytree: float = 1.0,
-        reg_alpha: float = 0.0,
-        reg_lambda: float = 0.0,
-        random_state: Optional[Union[int, np.random.RandomState, "np.random.Generator"]] = None,
-        n_jobs: Optional[int] = None,
-        importance_type: str = "split",
+        *,
         client: Optional[Client] = None,
         **kwargs: Any,
     ):
         """Docstring is inherited from the lightgbm.LGBMRegressor.__init__."""
         self.client = client
-        super().__init__(
-            boosting_type=boosting_type,
-            num_leaves=num_leaves,
-            max_depth=max_depth,
-            learning_rate=learning_rate,
-            n_estimators=n_estimators,
-            subsample_for_bin=subsample_for_bin,
-            objective=objective,
-            class_weight=class_weight,
-            min_split_gain=min_split_gain,
-            min_child_weight=min_child_weight,
-            min_child_samples=min_child_samples,
-            subsample=subsample,
-            subsample_freq=subsample_freq,
-            colsample_bytree=colsample_bytree,
-            reg_alpha=reg_alpha,
-            reg_lambda=reg_lambda,
-            random_state=random_state,
-            n_jobs=n_jobs,
-            importance_type=importance_type,
-            **kwargs,
-        )
+        super().__init__(**kwargs)
 
     _base_doc = LGBMRegressor.__init__.__doc__
     _before_kwargs, _kwargs, _after_kwargs = _base_doc.partition("**kwargs")  # type: ignore
@@ -1485,52 +1406,13 @@ class DaskLGBMRanker(LGBMRanker, _DaskLGBMModel):
 
     def __init__(
         self,
-        boosting_type: str = "gbdt",
-        num_leaves: int = 31,
-        max_depth: int = -1,
-        learning_rate: float = 0.1,
-        n_estimators: int = 100,
-        subsample_for_bin: int = 200000,
-        objective: Optional[Union[str, _LGBM_ScikitCustomObjectiveFunction]] = None,
-        class_weight: Optional[Union[dict, str]] = None,
-        min_split_gain: float = 0.0,
-        min_child_weight: float = 1e-3,
-        min_child_samples: int = 20,
-        subsample: float = 1.0,
-        subsample_freq: int = 0,
-        colsample_bytree: float = 1.0,
-        reg_alpha: float = 0.0,
-        reg_lambda: float = 0.0,
-        random_state: Optional[Union[int, np.random.RandomState, "np.random.Generator"]] = None,
-        n_jobs: Optional[int] = None,
-        importance_type: str = "split",
+        *,
         client: Optional[Client] = None,
         **kwargs: Any,
     ):
         """Docstring is inherited from the lightgbm.LGBMRanker.__init__."""
         self.client = client
-        super().__init__(
-            boosting_type=boosting_type,
-            num_leaves=num_leaves,
-            max_depth=max_depth,
-            learning_rate=learning_rate,
-            n_estimators=n_estimators,
-            subsample_for_bin=subsample_for_bin,
-            objective=objective,
-            class_weight=class_weight,
-            min_split_gain=min_split_gain,
-            min_child_weight=min_child_weight,
-            min_child_samples=min_child_samples,
-            subsample=subsample,
-            subsample_freq=subsample_freq,
-            colsample_bytree=colsample_bytree,
-            reg_alpha=reg_alpha,
-            reg_lambda=reg_lambda,
-            random_state=random_state,
-            n_jobs=n_jobs,
-            importance_type=importance_type,
-            **kwargs,
-        )
+        super().__init__(**kwargs)
 
     _base_doc = LGBMRanker.__init__.__doc__
     _before_kwargs, _kwargs, _after_kwargs = _base_doc.partition("**kwargs")  # type: ignore
diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index 108ef1e14498..fc5e716692a3 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -488,6 +488,7 @@ class LGBMModel(_LGBMModelBase):
 
     def __init__(
         self,
+        *,
         boosting_type: str = "gbdt",
         num_leaves: int = 31,
         max_depth: int = -1,
@@ -745,7 +746,35 @@ def get_params(self, deep: bool = True) -> Dict[str, Any]:
         params : dict
             Parameter names mapped to their values.
         """
+        # Based on: https://github.com/dmlc/xgboost/blob/bd92b1c9c0db3e75ec3dfa513e1435d518bb535d/python-package/xgboost/sklearn.py#L941
+        # which was based on: https://stackoverflow.com/questions/59248211
+        #
+        # `get_params()` flows like this:
+        #
+        # 0. Get parameters in subclass (self.__class__) first, by using inspect.
+        # 1. Get parameters in all parent classes (especially `LGBMModel`).
+        # 2. Get whatever was passed via `**kwargs`.
+        # 3. Merge them.
+        #
+        # This needs to accommodate being called recursively in the following
+        # inheritance graphs (and similar for classification and ranking):
+        #
+        #   DaskLGBMRegressor -> LGBMRegressor -> LGBMModel -> BaseEstimator
+        #   (custom subclass) -> LGBMRegressor -> LGBMModel -> BaseEstimator
+        #   LGBMRegressor -> LGBMModel -> BaseEstimator
+        #   (custom subclass) -> LGBMModel -> BaseEstimator
+        #   LGBMModel -> BaseEstimator
+        #
         params = super().get_params(deep=deep)
+        cp = copy.copy(self)
+        # If the immediate parent defines get_params(), use that.
+        if callable(getattr(cp.__class__.__bases__[0], "get_params", None)):
+            cp.__class__ = cp.__class__.__bases__[0]
+        # Otherwise, skip it and assume the next class will have it.
+        # This is here primarily for cases where the first class in MRO is a scikit-learn mixin.
+        else:
+            cp.__class__ = cp.__class__.__bases__[1]
+        params.update(cp.__class__.get_params(cp, deep))
         params.update(self._other_params)
         return params
@@ -1285,6 +1314,11 @@ def feature_names_in_(self) -> None:
 class LGBMRegressor(_LGBMRegressorBase, LGBMModel):
     """LightGBM regressor."""
 
+    def __init__(self, **kwargs: Any):
+        super().__init__(**kwargs)
+
+    __init__.__doc__ = LGBMModel.__init__.__doc__
+
     def _more_tags(self) -> Dict[str, Any]:
         # handle the case where RegressorMixin possibly provides _more_tags()
         if callable(getattr(_LGBMRegressorBase, "_more_tags", None)):
@@ -1344,6 +1378,11 @@ class LGBMClassifier(_LGBMClassifierBase, LGBMModel):
     """LightGBM classifier."""
 
+    def __init__(self, **kwargs: Any) -> None:
+        super().__init__(**kwargs)
+
+    __init__.__doc__ = LGBMModel.__init__.__doc__
+
     def _more_tags(self) -> Dict[str, Any]:
         # handle the case where ClassifierMixin possibly provides _more_tags()
         if callable(getattr(_LGBMClassifierBase, "_more_tags", None)):
@@ -1554,6 +1593,11 @@ class LGBMRanker(LGBMModel):
     Please use this class mainly for training and applying ranking models in common sklearnish way.
""" + def __init__(self, **kwargs: Any) -> None: + super().__init__(**kwargs) + + __init__.__doc__ = LGBMModel.__init__.__doc__ + def fit( # type: ignore[override] self, X: _LGBM_ScikitMatrixLike, diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index b5e17991f63d..dacef3305547 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -1385,13 +1385,14 @@ def test_dask_classes_and_sklearn_equivalents_have_identical_constructors_except sklearn_spec = inspect.getfullargspec(classes[1]) assert dask_spec.varargs == sklearn_spec.varargs assert dask_spec.varkw == sklearn_spec.varkw - assert dask_spec.kwonlyargs == sklearn_spec.kwonlyargs - assert dask_spec.kwonlydefaults == sklearn_spec.kwonlydefaults - # "client" should be the only different, and the final argument - assert dask_spec.args[:-1] == sklearn_spec.args - assert dask_spec.defaults[:-1] == sklearn_spec.defaults - assert dask_spec.args[-1] == "client" + assert dask_spec.kwonlyargs == [*sklearn_spec.kwonlyargs, "client"] + assert dask_spec.kwonlydefaults == {"client": None} + assert sklearn_spec.kwonlydefaults is None + + # only positional argument should be 'self' + assert dask_spec.args == sklearn_spec.args + assert dask_spec.args == ["self"] assert dask_spec.defaults[-1] is None diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index 1cdd047f1857..991b3e5f8cf8 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -22,6 +22,7 @@ import lightgbm as lgb from lightgbm.compat import ( + DASK_INSTALLED, DATATABLE_INSTALLED, PANDAS_INSTALLED, _sklearn_version, @@ -83,6 +84,30 @@ def __call__(self, env): env.model.attr_set_inside_callback = env.iteration * 10 +class ExtendedLGBMClassifier(lgb.LGBMClassifier): + """Class for testing that inheriting from LGBMClassifier works""" + + def __init__(self, *, some_other_param: str = "lgbm-classifier", **kwargs): + self.some_other_param = some_other_param + super().__init__(**kwargs) + + +class ExtendedLGBMRanker(lgb.LGBMRanker): + """Class for testing that inheriting from LGBMRanker works""" + + def __init__(self, *, some_other_param: str = "lgbm-ranker", **kwargs): + self.some_other_param = some_other_param + super().__init__(**kwargs) + + +class ExtendedLGBMRegressor(lgb.LGBMRegressor): + """Class for testing that inheriting from LGBMRegressor works""" + + def __init__(self, *, some_other_param: str = "lgbm-regressor", **kwargs): + self.some_other_param = some_other_param + super().__init__(**kwargs) + + def custom_asymmetric_obj(y_true, y_pred): residual = (y_true - y_pred).astype(np.float64) grad = np.where(residual < 0, -2 * 10.0 * residual, -2 * residual) @@ -475,6 +500,165 @@ def test_clone_and_property(): assert isinstance(clf.feature_importances_, np.ndarray) +def test_subclassing_get_params_works(): + expected_params = { + "boosting_type": "gbdt", + "class_weight": None, + "colsample_bytree": 1.0, + "importance_type": "split", + "learning_rate": 0.1, + "max_depth": -1, + "min_child_samples": 20, + "min_child_weight": 0.001, + "min_split_gain": 0.0, + "n_estimators": 100, + "n_jobs": None, + "num_leaves": 31, + "objective": None, + "random_state": None, + "reg_alpha": 0.0, + "reg_lambda": 0.0, + "subsample": 1.0, + "subsample_for_bin": 200000, + "subsample_freq": 0, + } + + # Overrides, used to test that passing through **kwargs works as expected. + # + # why these? 
+    #
+    # - 'n_estimators' directly matches a keyword arg for the scikit-learn estimators
+    # - 'eta' is a parameter alias for 'learning_rate'
+    overrides = {"n_estimators": 13, "eta": 0.07}
+
+    # lightgbm-official classes
+    for est in [lgb.LGBMModel, lgb.LGBMClassifier, lgb.LGBMRanker, lgb.LGBMRegressor]:
+        assert est().get_params() == expected_params
+        assert est(**overrides).get_params() == {
+            **expected_params,
+            "eta": 0.07,
+            "n_estimators": 13,
+            "learning_rate": 0.1,
+        }
+
+    if DASK_INSTALLED:
+        for est in [lgb.DaskLGBMClassifier, lgb.DaskLGBMRanker, lgb.DaskLGBMRegressor]:
+            assert est().get_params() == {
+                **expected_params,
+                "client": None,
+            }
+            assert est(**overrides).get_params() == {
+                **expected_params,
+                "eta": 0.07,
+                "n_estimators": 13,
+                "learning_rate": 0.1,
+                "client": None,
+            }
+
+    # custom sub-classes
+    assert ExtendedLGBMClassifier().get_params() == {**expected_params, "some_other_param": "lgbm-classifier"}
+    assert ExtendedLGBMClassifier(**overrides).get_params() == {
+        **expected_params,
+        "eta": 0.07,
+        "n_estimators": 13,
+        "learning_rate": 0.1,
+        "some_other_param": "lgbm-classifier",
+    }
+    assert ExtendedLGBMRanker().get_params() == {
+        **expected_params,
+        "some_other_param": "lgbm-ranker",
+    }
+    assert ExtendedLGBMRanker(**overrides).get_params() == {
+        **expected_params,
+        "eta": 0.07,
+        "n_estimators": 13,
+        "learning_rate": 0.1,
+        "some_other_param": "lgbm-ranker",
+    }
+    assert ExtendedLGBMRegressor().get_params() == {
+        **expected_params,
+        "some_other_param": "lgbm-regressor",
+    }
+    assert ExtendedLGBMRegressor(**overrides).get_params() == {
+        **expected_params,
+        "eta": 0.07,
+        "n_estimators": 13,
+        "learning_rate": 0.1,
+        "some_other_param": "lgbm-regressor",
+    }
+
+
+@pytest.mark.parametrize("task", all_tasks)
+def test_subclassing_works(task):
+    # param values to make training deterministic and
+    # just train a small, cheap model
+    params = {
+        "deterministic": True,
+        "force_row_wise": True,
+        "n_jobs": 1,
+        "n_estimators": 5,
+        "num_leaves": 11,
+        "random_state": 708,
+    }
+
+    X, y, g = _create_data(task=task)
+    if task == "ranking":
+        est = lgb.LGBMRanker(**params).fit(X, y, group=g)
+        est_sub = ExtendedLGBMRanker(**params).fit(X, y, group=g)
+    elif task.endswith("classification"):
+        est = lgb.LGBMClassifier(**params).fit(X, y)
+        est_sub = ExtendedLGBMClassifier(**params).fit(X, y)
+    else:
+        est = lgb.LGBMRegressor(**params).fit(X, y)
+        est_sub = ExtendedLGBMRegressor(**params).fit(X, y)
+
+    np.testing.assert_allclose(est.predict(X), est_sub.predict(X))
+
+
+@pytest.mark.parametrize(
+    "estimator_to_task",
+    [
+        (lgb.LGBMClassifier, "binary-classification"),
+        (ExtendedLGBMClassifier, "binary-classification"),
+        (lgb.LGBMRanker, "ranking"),
+        (ExtendedLGBMRanker, "ranking"),
+        (lgb.LGBMRegressor, "regression"),
+        (ExtendedLGBMRegressor, "regression"),
+    ],
+)
+def test_parameter_aliases_are_handled_correctly(estimator_to_task):
+    estimator, task = estimator_to_task
+    # scikit-learn estimators should remember every parameter passed
+    # via keyword arguments in the estimator constructor, but then
+    # only pass the correct value down to LightGBM's C++ side
+    params = {
+        "eta": 0.08,
+        "num_iterations": 3,
+        "num_leaves": 5,
+    }
+    X, y, g = _create_data(task=task)
+    mod = estimator(**params)
+    if task == "ranking":
+        mod.fit(X, y, group=g)
+    else:
+        mod.fit(X, y)
+
+    # scikit-learn get_params()
+    p = mod.get_params()
+    assert p["eta"] == 0.08
+    assert p["learning_rate"] == 0.1
+
+    # lgb.Booster's 'params' attribute
+    p = mod.booster_.params
+    assert p["eta"] == 0.08
+    assert p["learning_rate"] == 0.1
+
+    # Config in the 'LightGBM::Booster' on the C++ side
+    p = mod.booster_._get_loaded_param()
+    assert p["learning_rate"] == 0.1
+    assert "eta" not in p
+
+
 def test_joblib(tmp_path):
     X, y = make_synthetic_regression()
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
@@ -1463,7 +1647,10 @@ def _get_expected_failed_tests(estimator):
     return estimator._more_tags()["_xfail_checks"]
 
 
-@parametrize_with_checks([lgb.LGBMClassifier(), lgb.LGBMRegressor()], expected_failed_checks=_get_expected_failed_tests)
+@parametrize_with_checks(
+    [ExtendedLGBMClassifier(), ExtendedLGBMRegressor(), lgb.LGBMClassifier(), lgb.LGBMRegressor()],
+    expected_failed_checks=_get_expected_failed_tests,
+)
 def test_sklearn_integration(estimator, check):
     estimator.set_params(min_child_samples=1, min_data_in_bin=1)
     check(estimator)