[R-package] Add Leaf (in)Stability Example (microsoft#386)
* Add leaf stability example

* 00index leaf stability

* Add leaf stability to README
Laurae2 authored and guolinke committed Apr 7, 2017
1 parent ddcbe71 commit 3c14900
Showing 3 changed files with 129 additions and 0 deletions.
1 change: 1 addition & 0 deletions R-package/demo/00Index
@@ -3,3 +3,4 @@ boost_from_prediction Boosting from existing prediction
early_stopping Early Stop in training
cross_validation Cross Validation
multiclass Multiclass training/prediction
leaf_stability Leaf (in)Stability example
1 change: 1 addition & 0 deletions R-package/demo/README.md
@@ -5,3 +5,4 @@ LightGBM R examples
* [Early Stopping](early_stopping.R)
* [Cross Validation](cross_validation.R)
* [Multiclass Training/Prediction](multiclass.R)
* [Leaf (in)Stability](leaf_stability.R)
127 changes: 127 additions & 0 deletions R-package/demo/leaf_stability.R
@@ -0,0 +1,127 @@
# This demo shows how training for too many iterations can make individual predictions unstable.
# Here we are in a controlled environment: the agaricus rules are deterministic, so overfitting them is harmless.
# Do not train like this in a real scenario.

# First, we load our libraries
library(lightgbm)
library(ggplot2)

# Second, we load our data
data(agaricus.train, package = "lightgbm")
train <- agaricus.train
dtrain <- lgb.Dataset(train$data, label = train$label)
data(agaricus.test, package = "lightgbm")
test <- agaricus.test
dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
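
# Optional sanity check (not part of the original demo): the agaricus data
# is a sparse binary feature matrix with a 0/1 label
dim(train$data)
table(train$label)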

# Third, we set up the parameters and train a model
# Note: we use the regression objective on a 0/1 label, so the raw predictions approximate probabilities
params <- list(objective = "regression", metric = "l2")
valids <- list(test = dtest)
model <- lgb.train(params,
dtrain,
50,
valids,
min_data = 1,
learning_rate = 0.1,
bagging_fraction = 0.1,
bagging_freq = 1,
bagging_seed = 1)
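
# As a side check (not in the original demo), we can pull the recorded l2
# metric on the validation set with lgb.get.eval.result() and watch how the
# validation loss evolved over the 50 iterations
l2_curve <- lgb.get.eval.result(model, "test", "l2")
plot(unlist(l2_curve), type = "l", xlab = "Iteration", ylab = "Validation l2")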

# We create a data.frame with the following structure:
# X = average leaf index of the observation across all trees
# Y = prediction probability (clamped to [1e-15, 1 - 1e-15])
# Z = per-observation logloss
# binned = decile bin of the average leaf index
new_data <- data.frame(X = rowMeans(predict(model,
agaricus.test$data,
predleaf = TRUE)),
Y = pmin(pmax(predict(model,
agaricus.test$data), 1e-15), 1 - 1e-15))
new_data$Z <- -(agaricus.test$label * log(new_data$Y) + (1 - agaricus.test$label) * log(1 - new_data$Y))
new_data$binned <- .bincode(x = new_data$X,
breaks = quantile(x = new_data$X,
probs = (1:9)/10),
right = TRUE,
include.lowest = TRUE)
new_data$binned[is.na(new_data$binned)] <- 0
new_data$binned <- as.factor(new_data$binned)
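
# A quick look at the decile breaks used above (not part of the original demo)
# Note that .bincode() returns NA for values outside the break range, so
# observations below the 10% decile and above the 90% decile both fall in bin 0
quantile(new_data$X, probs = (1:9)/10)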

# We can check the binned content
table(new_data$binned)

# We can plot the binned content
# On the second plot, notice that the lower the bin (i.e. the lower the average leaf index), the higher the loss
# On the third plot, the density is smooth
ggplot(data = new_data, mapping = aes(x = X, y = Y, color = binned)) + geom_point() + theme_bw() + labs(title = "Prediction Depth", x = "Leaf Bin", y = "Prediction Probability")
ggplot(data = new_data, mapping = aes(x = binned, y = Z, fill = binned, group = binned)) + geom_boxplot() + theme_bw() + labs(title = "Prediction Depth Spread", x = "Leaf Bin", y = "Logloss")
ggplot(data = new_data, mapping = aes(x = Y, y = ..count.., fill = binned)) + geom_density(position = "fill") + theme_bw() + labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")


# Now, let's train a second model with a higher learning rate and no bagging
model2 <- lgb.train(params,
dtrain,
100,
valids,
min_data = 1,
learning_rate = 1)

# We build the same data.frame, this time for model2
new_data2 <- data.frame(X = rowMeans(predict(model2,
agaricus.test$data,
predleaf = TRUE)),
Y = pmin(pmax(predict(model2,
agaricus.test$data), 1e-15), 1 - 1e-15))
new_data2$Z <- -(agaricus.test$label * log(new_data2$Y) + (1 - agaricus.test$label) * log(1 - new_data2$Y))
new_data2$binned <- .bincode(x = new_data2$X,
breaks = quantile(x = new_data2$X,
probs = (1:9)/10),
right = TRUE,
include.lowest = TRUE)
new_data2$binned[is.na(new_data2$binned)] <- 0
new_data2$binned <- as.factor(new_data2$binned)

# We can check the binned content
table(new_data2$binned)

# We can plot the binned content
# On the second plot, notice again that the lower the bin, the higher the loss
# On the third plot, the density is clearly not smooth! We are severely overfitting, but since the agaricus rules are deterministic, this is not an issue here
# However, if the rules did not hold exactly, the loss would explode.
ggplot(data = new_data2, mapping = aes(x = X, y = Y, color = binned)) + geom_point() + theme_bw() + labs(title = "Prediction Depth", x = "Leaf Bin", y = "Prediction Probability")
ggplot(data = new_data2, mapping = aes(x = binned, y = Z, fill = binned, group = binned)) + geom_boxplot() + theme_bw() + labs(title = "Prediction Depth Spread", x = "Leaf Bin", y = "Logloss")
ggplot(data = new_data2, mapping = aes(x = Y, y = ..count.., fill = binned)) + geom_density(position = "fill") + theme_bw() + labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")


# Now, let's push to very severe overfitting with 1000 iterations
model3 <- lgb.train(params,
dtrain,
1000,
valids,
min_data = 1,
learning_rate = 1)

# We build the same data.frame, this time for model3
new_data3 <- data.frame(X = rowMeans(predict(model3,
agaricus.test$data,
predleaf = TRUE)),
Y = pmin(pmax(predict(model3,
agaricus.test$data), 1e-15), 1 - 1e-15))
new_data3$Z <- -(agaricus.test$label * log(new_data3$Y) + (1 - agaricus.test$label) * log(1 - new_data3$Y))
new_data3$binned <- .bincode(x = new_data3$X,
breaks = quantile(x = new_data3$X,
probs = (1:9)/10),
right = TRUE,
include.lowest = TRUE)
new_data3$binned[is.na(new_data3$binned)] <- 0
new_data3$binned <- as.factor(new_data3$binned)

# We can check the binned content
table(new_data3$binned)

# We can plot the binned content
# The density is clearly not smooth! We are severely overfitting, but since the agaricus rules are deterministic, this is not an issue here.
# However, if the rules did not hold exactly, the loss would explode. See the sudden spikes?
ggplot(data = new_data3, mapping = aes(x = Y, y = ..count.., fill = binned)) + geom_density(position = "fill") + theme_bw() + labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")

# Compare with our second model: the difference is striking. This density is smooth.
ggplot(data = new_data2, mapping = aes(x = Y, y = ..count.., fill = binned)) + geom_density(position = "fill") + theme_bw() + labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
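
# A direct numeric comparison (not part of the original demo): the average
# test logloss of the three models, from the per-observation Z computed above
c(model1 = mean(new_data$Z),
  model2 = mean(new_data2$Z),
  model3 = mean(new_data3$Z))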
