Skip to content

Commit

Permalink
minor docs improvements and added defines for importance types
Browse files Browse the repository at this point in the history
  • Loading branch information
StrikerRUS committed Jul 13, 2020
1 parent 28cc2e0 commit 4f3803d
Show file tree
Hide file tree
Showing 6 changed files with 51 additions and 33 deletions.
4 changes: 3 additions & 1 deletion docs/Parameters.rst
Original file line number Diff line number Diff line change
Expand Up @@ -578,7 +578,9 @@ Learning Control Parameters

- the feature importance type in the saved model file

- ``0``: count-based feature importance; ``1``: gain-based feature importance
- ``0``: count-based feature importance (numbers of splits are counted); ``1``: gain-based feature importance (values of gain are counted)

- **Note**: can be used only in CLI version

- ``snapshot_freq`` :raw-html:`<a id="snapshot_freq" title="Permalink to this parameter" href="#snapshot_freq">&#x1F517;&#xFE0E;</a>`, default = ``-1``, type = int, aliases: ``save_period``

Expand Down
6 changes: 3 additions & 3 deletions include/LightGBM/boosting.h
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ class LIGHTGBM_EXPORT Boosting {
* \brief Dump model to json format string
* \param start_iteration Start index of the iteration that should be dumped
* \param num_iteration Number of iterations to dump, -1 means dump all
* \param feature_importance_type type of feature importance, 0: count, 1: gain
* \param feature_importance_type Type of feature importance, 0: split, 1: gain
* \return Json format string of model
*/
virtual std::string DumpModel(int start_iteration, int num_iteration, int feature_importance_type) const = 0;
Expand All @@ -200,7 +200,7 @@ class LIGHTGBM_EXPORT Boosting {
* \brief Save model to file
* \param start_iteration Start index of the iteration that should be saved
* \param num_iterations Number of iterations to save, -1 means save all
* \param feature_importance_type type of feature importance, 0: count, 1:gain
* \param feature_importance_type Type of feature importance, 0: split, 1: gain
* \param filename Name of the file to save the model to
* \return true if succeeded
*/
Expand All @@ -210,7 +210,7 @@ class LIGHTGBM_EXPORT Boosting {
* \brief Save model to string
* \param start_iteration Start index of the iteration that should be saved
* \param num_iterations Number of iterations to save, -1 means save all
* \param feature_importance_type type of feature importance, 0: count, 1:gain
* \param feature_importance_type Type of feature importance, 0: split, 1: gain
* \return Non-empty string if succeeded
*/
virtual std::string SaveModelToString(int start_iteration, int num_iterations, int feature_importance_type) const = 0;
Expand Down
13 changes: 8 additions & 5 deletions include/LightGBM/c_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ typedef void* BoosterHandle; /*!< \brief Handle of booster. */
#define C_API_MATRIX_TYPE_CSR (0) /*!< \brief CSR sparse matrix type. */
#define C_API_MATRIX_TYPE_CSC (1) /*!< \brief CSC sparse matrix type. */

#define C_API_FEATURE_IMPORTANCE_SPLIT (0) /*!< \brief Split type of feature importance. */
#define C_API_FEATURE_IMPORTANCE_GAIN (1) /*!< \brief Gain type of feature importance. */

/*!
* \brief Get string message of the last error.
* \return Error information
Expand Down Expand Up @@ -996,7 +999,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMats(BoosterHandle handle,
* \param handle Handle of booster
* \param start_iteration Start index of the iteration that should be saved
* \param num_iteration Index of the iteration that should be saved, <= 0 means save all
* \param feature_importance_type type of feature importance, 0: count, 1:gain
* \param feature_importance_type Type of feature importance, can be ``C_API_FEATURE_IMPORTANCE_SPLIT`` or ``C_API_FEATURE_IMPORTANCE_GAIN``
* \param filename The name of the file
* \return 0 when succeed, -1 when failure happens
*/
Expand All @@ -1011,7 +1014,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterSaveModel(BoosterHandle handle,
* \param handle Handle of booster
* \param start_iteration Start index of the iteration that should be saved
* \param num_iteration Index of the iteration that should be saved, <= 0 means save all
* \param feature_importance_type type of feature importance, 0: count, 1:gain
* \param feature_importance_type Type of feature importance, can be ``C_API_FEATURE_IMPORTANCE_SPLIT`` or ``C_API_FEATURE_IMPORTANCE_GAIN``
* \param buffer_len String buffer length, if ``buffer_len < out_len``, you should re-allocate buffer
* \param[out] out_len Actual output length
* \param[out] out_str String of model, should pre-allocate memory
Expand All @@ -1030,7 +1033,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterSaveModelToString(BoosterHandle handle,
* \param handle Handle of booster
* \param start_iteration Start index of the iteration that should be dumped
* \param num_iteration Index of the iteration that should be dumped, <= 0 means dump all
* \param feature_importance_type type of feature importance, 0: count, 1:gain
* \param feature_importance_type Type of feature importance, can be ``C_API_FEATURE_IMPORTANCE_SPLIT`` or ``C_API_FEATURE_IMPORTANCE_GAIN``
* \param buffer_len String buffer length, if ``buffer_len < out_len``, you should re-allocate buffer
* \param[out] out_len Actual output length
* \param[out] out_str JSON format string of model, should pre-allocate memory
Expand Down Expand Up @@ -1075,8 +1078,8 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterSetLeafValue(BoosterHandle handle,
* \param handle Handle of booster
* \param num_iteration Number of iterations for which feature importance is calculated, <= 0 means use all
* \param importance_type Method of importance calculation:
* - 0 for split, result contains numbers of times the feature is used in a model;
* - 1 for gain, result contains total gains of splits which use the feature
* - ``C_API_FEATURE_IMPORTANCE_SPLIT``: result contains numbers of times the feature is used in a model;
* - ``C_API_FEATURE_IMPORTANCE_GAIN``: result contains total gains of splits which use the feature
* \param[out] out_results Result array with feature importance
* \return 0 when succeed, -1 when failure happens
*/
Expand Down
3 changes: 2 additions & 1 deletion include/LightGBM/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -533,7 +533,8 @@ struct Config {
std::string output_model = "LightGBM_model.txt";

// desc = the feature importance type in the saved model file
// desc = ``0``: count-based feature importance; ``1``: gain-based feature importance
// desc = ``0``: count-based feature importance (numbers of splits are counted); ``1``: gain-based feature importance (values of gain are counted)
// desc = **Note**: can be used only in CLI version
int saved_feature_importance_type = 0;

// [no-save]
Expand Down
52 changes: 32 additions & 20 deletions python-package/lightgbm/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,12 +284,20 @@ def get(cls, *args):
C_API_MATRIX_TYPE_CSR = 0
C_API_MATRIX_TYPE_CSC = 1

"""Macro definition of feature importance type"""
C_API_FEATURE_IMPORTANCE_SPLIT = 0
C_API_FEATURE_IMPORTANCE_GAIN = 1

"""Data type of data field"""
FIELD_TYPE_MAPPER = {"label": C_API_DTYPE_FLOAT32,
"weight": C_API_DTYPE_FLOAT32,
"init_score": C_API_DTYPE_FLOAT64,
"group": C_API_DTYPE_INT32}

"""String name to int feature importance type mapper"""
FEATURE_IMPORTANCE_TYPE_MAPPER = {"split": C_API_FEATURE_IMPORTANCE_SPLIT,
"gain": C_API_FEATURE_IMPORTANCE_GAIN}


def convert_from_sliced_object(data):
"""Fix the memory of multi-dimensional sliced object."""
Expand Down Expand Up @@ -2600,7 +2608,7 @@ def eval_valid(self, feval=None):
return [item for i in range_(1, self.__num_dataset)
for item in self.__inner_eval(self.name_valid_sets[i - 1], i, feval)]

def save_model(self, filename, num_iteration=None, start_iteration=0, feature_importance_type=0):
def save_model(self, filename, num_iteration=None, start_iteration=0, importance_type='split'):
"""Save Booster to file.
Parameters
Expand All @@ -2613,8 +2621,10 @@ def save_model(self, filename, num_iteration=None, start_iteration=0, feature_im
If <= 0, all iterations are saved.
start_iteration : int, optional (default=0)
Start index of the iteration that should be saved.
feature_importance_type : int, optional (default=0)
0: count-based; 1: gain-based
importance_type : string, optional (default="split")
What type of feature importance should be saved.
If "split", result contains numbers of times the feature is used in a model.
If "gain", result contains total gains of splits which use the feature.
Returns
-------
Expand All @@ -2623,11 +2633,12 @@ def save_model(self, filename, num_iteration=None, start_iteration=0, feature_im
"""
if num_iteration is None:
num_iteration = self.best_iteration
importance_type_int = FEATURE_IMPORTANCE_TYPE_MAPPER[importance_type]
_safe_call(_LIB.LGBM_BoosterSaveModel(
self.handle,
ctypes.c_int(start_iteration),
ctypes.c_int(num_iteration),
ctypes.c_int(feature_importance_type),
ctypes.c_int(importance_type_int),
c_str(filename)))
_dump_pandas_categorical(self.pandas_categorical, filename)
return self
Expand Down Expand Up @@ -2688,7 +2699,7 @@ def model_from_string(self, model_str, verbose=True):
self.pandas_categorical = _load_pandas_categorical(model_str=model_str)
return self

def model_to_string(self, num_iteration=None, start_iteration=0, feature_importance_type=0):
def model_to_string(self, num_iteration=None, start_iteration=0, importance_type='split'):
"""Save Booster to string.
Parameters
Expand All @@ -2699,8 +2710,10 @@ def model_to_string(self, num_iteration=None, start_iteration=0, feature_importa
If <= 0, all iterations are saved.
start_iteration : int, optional (default=0)
Start index of the iteration that should be saved.
feature_importance_type : int, optional (default=0)
0: count-based; 1: gain-based
importance_type : string, optional (default="split")
What type of feature importance should be saved.
If "split", result contains numbers of times the feature is used in a model.
If "gain", result contains total gains of splits which use the feature.
Returns
-------
Expand All @@ -2709,6 +2722,7 @@ def model_to_string(self, num_iteration=None, start_iteration=0, feature_importa
"""
if num_iteration is None:
num_iteration = self.best_iteration
importance_type_int = FEATURE_IMPORTANCE_TYPE_MAPPER[importance_type]
buffer_len = 1 << 20
tmp_out_len = ctypes.c_int64(0)
string_buffer = ctypes.create_string_buffer(buffer_len)
Expand All @@ -2717,7 +2731,7 @@ def model_to_string(self, num_iteration=None, start_iteration=0, feature_importa
self.handle,
ctypes.c_int(start_iteration),
ctypes.c_int(num_iteration),
ctypes.c_int(feature_importance_type),
ctypes.c_int(importance_type_int),
ctypes.c_int64(buffer_len),
ctypes.byref(tmp_out_len),
ptr_string_buffer))
Expand All @@ -2730,15 +2744,15 @@ def model_to_string(self, num_iteration=None, start_iteration=0, feature_importa
self.handle,
ctypes.c_int(start_iteration),
ctypes.c_int(num_iteration),
ctypes.c_int(feature_importance_type),
ctypes.c_int(importance_type_int),
ctypes.c_int64(actual_len),
ctypes.byref(tmp_out_len),
ptr_string_buffer))
ret = string_buffer.value.decode('utf-8')
ret += _dump_pandas_categorical(self.pandas_categorical)
return ret

def dump_model(self, num_iteration=None, start_iteration=0, feature_importance_type=0):
def dump_model(self, num_iteration=None, start_iteration=0, importance_type='split'):
"""Dump Booster to JSON format.
Parameters
Expand All @@ -2749,8 +2763,10 @@ def dump_model(self, num_iteration=None, start_iteration=0, feature_importance_t
If <= 0, all iterations are dumped.
start_iteration : int, optional (default=0)
Start index of the iteration that should be dumped.
feature_importance_type : int, optional (default=0)
0: count-based; 1: gain-based
importance_type : string, optional (default="split")
What type of feature importance should be dumped.
If "split", result contains numbers of times the feature is used in a model.
If "gain", result contains total gains of splits which use the feature.
Returns
-------
Expand All @@ -2759,6 +2775,7 @@ def dump_model(self, num_iteration=None, start_iteration=0, feature_importance_t
"""
if num_iteration is None:
num_iteration = self.best_iteration
importance_type_int = FEATURE_IMPORTANCE_TYPE_MAPPER[importance_type]
buffer_len = 1 << 20
tmp_out_len = ctypes.c_int64(0)
string_buffer = ctypes.create_string_buffer(buffer_len)
Expand All @@ -2767,7 +2784,7 @@ def dump_model(self, num_iteration=None, start_iteration=0, feature_importance_t
self.handle,
ctypes.c_int(start_iteration),
ctypes.c_int(num_iteration),
ctypes.c_int(feature_importance_type),
ctypes.c_int(importance_type_int),
ctypes.c_int64(buffer_len),
ctypes.byref(tmp_out_len),
ptr_string_buffer))
Expand All @@ -2780,7 +2797,7 @@ def dump_model(self, num_iteration=None, start_iteration=0, feature_importance_t
self.handle,
ctypes.c_int(start_iteration),
ctypes.c_int(num_iteration),
ctypes.c_int(feature_importance_type),
ctypes.c_int(importance_type_int),
ctypes.c_int64(actual_len),
ctypes.byref(tmp_out_len),
ptr_string_buffer))
Expand Down Expand Up @@ -2980,12 +2997,7 @@ def feature_importance(self, importance_type='split', iteration=None):
"""
if iteration is None:
iteration = self.best_iteration
if importance_type == "split":
importance_type_int = 0
elif importance_type == "gain":
importance_type_int = 1
else:
importance_type_int = -1
importance_type_int = FEATURE_IMPORTANCE_TYPE_MAPPER[importance_type]
result = np.zeros(self.num_feature(), dtype=np.float64)
_safe_call(_LIB.LGBM_BoosterFeatureImportance(
self.handle,
Expand Down
6 changes: 3 additions & 3 deletions src/boosting/gbdt.h
Original file line number Diff line number Diff line change
Expand Up @@ -249,7 +249,7 @@ class GBDT : public GBDTBase {
* \brief Dump model to json format string
* \param start_iteration Start index of the iteration that should be dumped
* \param num_iteration Number of iterations to dump, -1 means dump all
* \param feature_importance_type type of feature importance, 0: count, 1:gain
* \param feature_importance_type Type of feature importance, 0: split, 1: gain
* \return Json format string of model
*/
std::string DumpModel(int start_iteration, int num_iteration,
Expand All @@ -274,7 +274,7 @@ class GBDT : public GBDTBase {
* \brief Save model to file
* \param start_iteration Start index of the iteration that should be saved
* \param num_iterations Number of iterations to save, -1 means save all
* \param feature_importance_type type of feature importance, 0: count, 1:gain
* \param feature_importance_type Type of feature importance, 0: split, 1: gain
* \param filename Name of the file to save the model to
* \return true if succeeded
*/
Expand All @@ -286,7 +286,7 @@ class GBDT : public GBDTBase {
* \brief Save model to string
* \param start_iteration Start index of the iteration that should be saved
* \param num_iterations Number of iterations to save, -1 means save all
* \param feature_importance_type type of feature importance, 0: count, 1:gain
* \param feature_importance_type Type of feature importance, 0: split, 1: gain
* \return Non-empty string if succeeded
*/
std::string SaveModelToString(int start_iteration, int num_iterations, int feature_importance_type) const override;
Expand Down

0 comments on commit 4f3803d

Please sign in to comment.