minor docs improvements and added defines for importance types

microsoft · Jul 13, 2020 · 4f3803d · 4f3803d
1 parent 28cc2e0
commit 4f3803d
Show file tree

Hide file tree

Showing 6 changed files with 51 additions and 33 deletions.
diff --git a/docs/Parameters.rst b/docs/Parameters.rst
@@ -578,7 +578,9 @@ Learning Control Parameters
 
    -  the feature importance type in the saved model file
 
-   -  ``0``: count-based feature importance; ``1``: gain-based feature importance
+   -  ``0``: count-based feature importance (numbers of splits are counted); ``1``: gain-based feature importance (gains of splits are summed)
+
+   -  **Note**: can be used only in CLI version
 
 -  ``snapshot_freq`` :raw-html:`<a id="snapshot_freq" title="Permalink to this parameter" href="#snapshot_freq">&#x1F517;&#xFE0E;</a>`, default = ``-1``, type = int, aliases: ``save_period``
 

diff --git a/include/LightGBM/boosting.h b/include/LightGBM/boosting.h
@@ -176,7 +176,7 @@ class LIGHTGBM_EXPORT Boosting {
   * \brief Dump model to json format string
   * \param start_iteration The model will be saved start from
   * \param num_iteration Number of iterations that want to dump, -1 means dump all
-  * \param feature_importance_type type of feature importance, 0: count, 1: gain
+  * \param feature_importance_type Type of feature importance, 0: split, 1: gain
   * \return Json format string of model
   */
   virtual std::string DumpModel(int start_iteration, int num_iteration, int feature_importance_type) const = 0;
@@ -200,7 +200,7 @@ class LIGHTGBM_EXPORT Boosting {
   * \brief Save model to file
   * \param start_iteration The model will be saved start from
   * \param num_iterations Number of model that want to save, -1 means save all
-  * \param feature_importance_type type of feature importance, 0: count, 1:gain
+  * \param feature_importance_type Type of feature importance, 0: split, 1: gain
   * \param filename Filename that want to save to
   * \return true if succeeded
   */
@@ -210,7 +210,7 @@ class LIGHTGBM_EXPORT Boosting {
   * \brief Save model to string
   * \param start_iteration The model will be saved start from
   * \param num_iterations Number of model that want to save, -1 means save all
-  * \param feature_importance_type type of feature importance, 0: count, 1:gain
+  * \param feature_importance_type Type of feature importance, 0: split, 1: gain
   * \return Non-empty string if succeeded
   */
   virtual std::string SaveModelToString(int start_iteration, int num_iterations, int feature_importance_type) const = 0;

diff --git a/include/LightGBM/c_api.h b/include/LightGBM/c_api.h
@@ -36,6 +36,9 @@ typedef void* BoosterHandle;  /*!< \brief Handle of booster. */
 #define C_API_MATRIX_TYPE_CSR (0)  /*!< \brief CSR sparse matrix type. */
 #define C_API_MATRIX_TYPE_CSC (1)  /*!< \brief CSC sparse matrix type. */
 
+#define C_API_FEATURE_IMPORTANCE_SPLIT (0)  /*!< \brief Split type of feature importance. */
+#define C_API_FEATURE_IMPORTANCE_GAIN  (1)  /*!< \brief Gain type of feature importance. */
+
 /*!
  * \brief Get string message of the last error.
  * \return Error information
@@ -996,7 +999,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMats(BoosterHandle handle,
  * \param handle Handle of booster
  * \param start_iteration Start index of the iteration that should be saved
  * \param num_iteration Index of the iteration that should be saved, <= 0 means save all
- * \param feature_importance_type type of feature importance, 0: count, 1:gain
+ * \param feature_importance_type Type of feature importance, can be ``C_API_FEATURE_IMPORTANCE_SPLIT`` or ``C_API_FEATURE_IMPORTANCE_GAIN``
  * \param filename The name of the file
  * \return 0 when succeed, -1 when failure happens
  */
@@ -1011,7 +1014,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterSaveModel(BoosterHandle handle,
  * \param handle Handle of booster
  * \param start_iteration Start index of the iteration that should be saved
  * \param num_iteration Index of the iteration that should be saved, <= 0 means save all
- * \param feature_importance_type type of feature importance, 0: count, 1:gain
+ * \param feature_importance_type Type of feature importance, can be ``C_API_FEATURE_IMPORTANCE_SPLIT`` or ``C_API_FEATURE_IMPORTANCE_GAIN``
  * \param buffer_len String buffer length, if ``buffer_len < out_len``, you should re-allocate buffer
  * \param[out] out_len Actual output length
  * \param[out] out_str String of model, should pre-allocate memory
@@ -1030,7 +1033,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterSaveModelToString(BoosterHandle handle,
  * \param handle Handle of booster
  * \param start_iteration Start index of the iteration that should be dumped
  * \param num_iteration Index of the iteration that should be dumped, <= 0 means dump all
- * \param feature_importance_type type of feature importance, 0: count, 1:gain
+ * \param feature_importance_type Type of feature importance, can be ``C_API_FEATURE_IMPORTANCE_SPLIT`` or ``C_API_FEATURE_IMPORTANCE_GAIN``
  * \param buffer_len String buffer length, if ``buffer_len < out_len``, you should re-allocate buffer
  * \param[out] out_len Actual output length
  * \param[out] out_str JSON format string of model, should pre-allocate memory
@@ -1075,8 +1078,8 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterSetLeafValue(BoosterHandle handle,
  * \param handle Handle of booster
  * \param num_iteration Number of iterations for which feature importance is calculated, <= 0 means use all
  * \param importance_type Method of importance calculation:
- *   - 0 for split, result contains numbers of times the feature is used in a model;
- *   - 1 for gain, result contains total gains of splits which use the feature
+ *   - ``C_API_FEATURE_IMPORTANCE_SPLIT``: result contains numbers of times the feature is used in a model;
+ *   - ``C_API_FEATURE_IMPORTANCE_GAIN``: result contains total gains of splits which use the feature
  * \param[out] out_results Result array with feature importance
  * \return 0 when succeed, -1 when failure happens
  */

diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
@@ -533,7 +533,8 @@ struct Config {
   std::string output_model = "LightGBM_model.txt";
 
   // desc = the feature importance type in the saved model file
-  // desc = ``0``: count-based feature importance; ``1``: gain-based feature importance
+  // desc = ``0``: count-based feature importance (numbers of splits are counted); ``1``: gain-based feature importance (gains of splits are summed)
+  // desc = **Note**: can be used only in CLI version
   int saved_feature_importance_type = 0;
 
   // [no-save]

diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py
@@ -284,12 +284,20 @@ def get(cls, *args):
 C_API_MATRIX_TYPE_CSR = 0
 C_API_MATRIX_TYPE_CSC = 1
 
+"""Macro definition of feature importance type"""
+C_API_FEATURE_IMPORTANCE_SPLIT = 0
+C_API_FEATURE_IMPORTANCE_GAIN = 1
+
 """Data type of data field"""
 FIELD_TYPE_MAPPER = {"label": C_API_DTYPE_FLOAT32,
                      "weight": C_API_DTYPE_FLOAT32,
                      "init_score": C_API_DTYPE_FLOAT64,
                      "group": C_API_DTYPE_INT32}
 
+"""String name to int feature importance type mapper"""
+FEATURE_IMPORTANCE_TYPE_MAPPER = {"split": C_API_FEATURE_IMPORTANCE_SPLIT,
+                                  "gain": C_API_FEATURE_IMPORTANCE_GAIN}
+
 
 def convert_from_sliced_object(data):
     """Fix the memory of multi-dimensional sliced object."""
@@ -2600,7 +2608,7 @@ def eval_valid(self, feval=None):
         return [item for i in range_(1, self.__num_dataset)
                 for item in self.__inner_eval(self.name_valid_sets[i - 1], i, feval)]
 
-    def save_model(self, filename, num_iteration=None, start_iteration=0, feature_importance_type=0):
+    def save_model(self, filename, num_iteration=None, start_iteration=0, importance_type='split'):
         """Save Booster to file.
 
         Parameters
@@ -2613,8 +2621,10 @@ def save_model(self, filename, num_iteration=None, start_iteration=0, feature_im
             If <= 0, all iterations are saved.
         start_iteration : int, optional (default=0)
             Start index of the iteration that should be saved.
-        feature_importance_type : int, optional (default=0)
-            0: count-based; 1: gain-based
+        importance_type : string, optional (default="split")
+            What type of feature importance should be saved.
+            If "split", result contains numbers of times the feature is used in a model.
+            If "gain", result contains total gains of splits which use the feature.
 
         Returns
         -------
@@ -2623,11 +2633,12 @@ def save_model(self, filename, num_iteration=None, start_iteration=0, feature_im
         """
         if num_iteration is None:
             num_iteration = self.best_iteration
+        importance_type_int = FEATURE_IMPORTANCE_TYPE_MAPPER[importance_type]
         _safe_call(_LIB.LGBM_BoosterSaveModel(
             self.handle,
             ctypes.c_int(start_iteration),
             ctypes.c_int(num_iteration),
-            ctypes.c_int(feature_importance_type),
+            ctypes.c_int(importance_type_int),
             c_str(filename)))
         _dump_pandas_categorical(self.pandas_categorical, filename)
         return self
@@ -2688,7 +2699,7 @@ def model_from_string(self, model_str, verbose=True):
         self.pandas_categorical = _load_pandas_categorical(model_str=model_str)
         return self
 
-    def model_to_string(self, num_iteration=None, start_iteration=0, feature_importance_type=0):
+    def model_to_string(self, num_iteration=None, start_iteration=0, importance_type='split'):
         """Save Booster to string.
 
         Parameters
@@ -2699,8 +2710,10 @@ def model_to_string(self, num_iteration=None, start_iteration=0, feature_importa
             If <= 0, all iterations are saved.
         start_iteration : int, optional (default=0)
             Start index of the iteration that should be saved.
-        feature_importance_type : int, optional (default=0)
-            0: count-based; 1: gain-based
+        importance_type : string, optional (default="split")
+            What type of feature importance should be saved.
+            If "split", result contains numbers of times the feature is used in a model.
+            If "gain", result contains total gains of splits which use the feature.
 
         Returns
         -------
@@ -2709,6 +2722,7 @@ def model_to_string(self, num_iteration=None, start_iteration=0, feature_importa
         """
         if num_iteration is None:
             num_iteration = self.best_iteration
+        importance_type_int = FEATURE_IMPORTANCE_TYPE_MAPPER[importance_type]
         buffer_len = 1 << 20
         tmp_out_len = ctypes.c_int64(0)
         string_buffer = ctypes.create_string_buffer(buffer_len)
@@ -2717,7 +2731,7 @@ def model_to_string(self, num_iteration=None, start_iteration=0, feature_importa
             self.handle,
             ctypes.c_int(start_iteration),
             ctypes.c_int(num_iteration),
-            ctypes.c_int(feature_importance_type),
+            ctypes.c_int(importance_type_int),
             ctypes.c_int64(buffer_len),
             ctypes.byref(tmp_out_len),
             ptr_string_buffer))
@@ -2730,15 +2744,15 @@ def model_to_string(self, num_iteration=None, start_iteration=0, feature_importa
                 self.handle,
                 ctypes.c_int(start_iteration),
                 ctypes.c_int(num_iteration),
-                ctypes.c_int(feature_importance_type),
+                ctypes.c_int(importance_type_int),
                 ctypes.c_int64(actual_len),
                 ctypes.byref(tmp_out_len),
                 ptr_string_buffer))
         ret = string_buffer.value.decode('utf-8')
         ret += _dump_pandas_categorical(self.pandas_categorical)
         return ret
 
-    def dump_model(self, num_iteration=None, start_iteration=0, feature_importance_type=0):
+    def dump_model(self, num_iteration=None, start_iteration=0, importance_type='split'):
         """Dump Booster to JSON format.
 
         Parameters
@@ -2749,8 +2763,10 @@ def dump_model(self, num_iteration=None, start_iteration=0, feature_importance_t
             If <= 0, all iterations are dumped.
         start_iteration : int, optional (default=0)
             Start index of the iteration that should be dumped.
-        feature_importance_type : int, optional (default=0)
-            0: count-based; 1: gain-based
+        importance_type : string, optional (default="split")
+            What type of feature importance should be dumped.
+            If "split", result contains numbers of times the feature is used in a model.
+            If "gain", result contains total gains of splits which use the feature.
 
         Returns
         -------
@@ -2759,6 +2775,7 @@ def dump_model(self, num_iteration=None, start_iteration=0, feature_importance_t
         """
         if num_iteration is None:
             num_iteration = self.best_iteration
+        importance_type_int = FEATURE_IMPORTANCE_TYPE_MAPPER[importance_type]
         buffer_len = 1 << 20
         tmp_out_len = ctypes.c_int64(0)
         string_buffer = ctypes.create_string_buffer(buffer_len)
@@ -2767,7 +2784,7 @@ def dump_model(self, num_iteration=None, start_iteration=0, feature_importance_t
             self.handle,
             ctypes.c_int(start_iteration),
             ctypes.c_int(num_iteration),
-            ctypes.c_int(feature_importance_type),
+            ctypes.c_int(importance_type_int),
             ctypes.c_int64(buffer_len),
             ctypes.byref(tmp_out_len),
             ptr_string_buffer))
@@ -2780,7 +2797,7 @@ def dump_model(self, num_iteration=None, start_iteration=0, feature_importance_t
                 self.handle,
                 ctypes.c_int(start_iteration),
                 ctypes.c_int(num_iteration),
-                ctypes.c_int(feature_importance_type),
+                ctypes.c_int(importance_type_int),
                 ctypes.c_int64(actual_len),
                 ctypes.byref(tmp_out_len),
                 ptr_string_buffer))
@@ -2980,12 +2997,7 @@ def feature_importance(self, importance_type='split', iteration=None):
         """
         if iteration is None:
             iteration = self.best_iteration
-        if importance_type == "split":
-            importance_type_int = 0
-        elif importance_type == "gain":
-            importance_type_int = 1
-        else:
-            importance_type_int = -1
+        importance_type_int = FEATURE_IMPORTANCE_TYPE_MAPPER[importance_type]
         result = np.zeros(self.num_feature(), dtype=np.float64)
         _safe_call(_LIB.LGBM_BoosterFeatureImportance(
             self.handle,

diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h
@@ -249,7 +249,7 @@ class GBDT : public GBDTBase {
   * \brief Dump model to json format string
   * \param start_iteration The model will be saved start from
   * \param num_iteration Number of iterations that want to dump, -1 means dump all
-  * \param feature_importance_type type of feature importance, 0: count, 1:gain
+  * \param feature_importance_type Type of feature importance, 0: split, 1: gain
   * \return Json format string of model
   */
   std::string DumpModel(int start_iteration, int num_iteration,
@@ -274,7 +274,7 @@ class GBDT : public GBDTBase {
   * \brief Save model to file
   * \param start_iteration The model will be saved start from
   * \param num_iterations Number of model that want to save, -1 means save all
-  * \param feature_importance_type type of feature importance, 0: count, 1:gain
+  * \param feature_importance_type Type of feature importance, 0: split, 1: gain
   * \param filename Filename that want to save to
   * \return is_finish Is training finished or not
   */
@@ -286,7 +286,7 @@ class GBDT : public GBDTBase {
   * \brief Save model to string
   * \param start_iteration The model will be saved start from
   * \param num_iterations Number of model that want to save, -1 means save all
-  * \param feature_importance_type type of feature importance, 0: count, 1:gain
+  * \param feature_importance_type Type of feature importance, 0: split, 1: gain
   * \return Non-empty string if succeeded
   */
   std::string SaveModelToString(int start_iteration, int num_iterations, int feature_importance_type) const override;