RndForest InfoGainRatio Split + NFoldCrossValidation + AUC_ROC Upgrade
Adding InfoGainRatio split for RndForest
- Implement InfoGainRatio-based split in RndForest LearnD

NFoldCrossValidation
- Implement N-Fold CrossValidation MACRO

AUC_ROC Upgrade
- Replace the prior AUC_ROC with a new implementation that calculates
  AUC_ROC for multiple classification results
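For reference, a minimal usage sketch of the new switch (hedged: the learner/dataset names below are hypothetical; the signature is the one introduced in ML/Classify.ecl by this commit):

    // GiniSplit = FALSE selects the new InfoGainRatio-based split in LearnD
    learner := ML.Classify.RandomForest(25, 5, 1.0, 10, FALSE);
    model := learner.LearnD(myIndepD, myDepD);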
DonVitorio committed Nov 10, 2015
1 parent e9d385a commit f6a77d9
Showing 5 changed files with 376 additions and 67 deletions.
100 changes: 54 additions & 46 deletions ML/Classify.ecl
@@ -1526,9 +1526,10 @@ Configuration Input
Purity p <= 1.0
Depth max tree level
*/
EXPORT RandomForest(t_Count treeNum, t_Count fsNum, REAL Purity=1.0, INTEGER1 Depth=32):= MODULE
EXPORT RandomForest(t_Count treeNum, t_Count fsNum, REAL Purity=1.0, t_level Depth=32, BOOLEAN GiniSplit = TRUE):= MODULE
EXPORT LearnD(DATASET(Types.DiscreteField) Indep, DATASET(Types.DiscreteField) Dep) := FUNCTION
nodes := Ensemble.SplitFeatureSampleGI(Indep, Dep, treeNum, fsNum, Purity, Depth);
nodes := IF(GiniSplit, Ensemble.SplitFeatureSampleGI(Indep, Dep, treeNum, fsNum, Purity, Depth),
Ensemble.SplitFeatureSampleIGR(Indep, Dep, treeNum, fsNum, Depth));
RETURN ML.Ensemble.ToDiscreteForest(nodes);
END;
EXPORT LearnC(DATASET(Types.NumericField) Indep, DATASET(Types.DiscreteField) Dep) := FUNCTION
@@ -1561,77 +1562,84 @@ Configuration Input
END; // RandomTree module

/*
Area Under the ROC curve
Area Under the ROC curve - Multiple Classification Results
// The function calculates the Area Under the ROC curve based on:
// - classProbDist : probability distribution for each instance
// - positiveClass : the class of interest
// - Dep : instance's class value
// The function returns all points of the ROC curve for graphic purposes:
// The function returns all points of the ROC curves for graphic purposes:
// label: threshold, point: (threshold's false positive rate, threshold's true positive rate).
// The area under the ROC curve is returned in the AUC field of the last record.
// The area under the ROC curve is returned in the last record's AUC field for each classification result (group_id).
// Note: threshold = 100 means classifying all instances as negative; it is not necessarily part of the curve
*/
EXPORT AUC_ROC(DATASET(l_result) classProbDist, Types.t_Discrete positiveClass, DATASET(Types.DiscreteField) Dep) := FUNCTION
EXPORT AUCcurvePoint:= RECORD
t_Count id;
t_Discrete posClass;
t_FieldNumber classifier;
t_FieldReal thresho;
t_FieldReal fpr;
t_FieldReal tpr;
t_FieldReal deltaPos:=0;
t_FieldReal deltaNeg:=0;
t_FieldReal cumNeg:=0;
t_FieldReal AUC:=0;
END;
EXPORT AUC_ROC(DATASET(l_result) classProbDist, Types.t_Discrete positiveClass, DATASET(Types.DiscreteField) allDep) := FUNCTION
SHARED cntREC:= RECORD
Types.t_FieldNumber classifier; // The classifier in question (value of 'number' on outcome data)
Types.t_Discrete c_actual; // The value of c provided
Types.t_FieldReal score :=-1;
Types.t_count tp_cnt:=0;
Types.t_count fn_cnt:=0;
Types.t_count fp_cnt:=0;
Types.t_count tn_cnt:=0;
t_FieldNumber classifier; // The classifier in question (value of 'number' on outcome data)
t_Discrete c_actual; // The value of c provided
t_FieldReal score :=-1;
t_count tp_cnt:= 0;
t_count fn_cnt:= 0;
t_count fp_cnt:= 0;
t_count tn_cnt:= 0;
t_count totPos:= 0;
t_count totNeg:= 0;
END;
SHARED compREC:= RECORD(cntREC)
Types.t_Discrete c_modeled;
t_Discrete c_modeled;
END;
classOfInterest := classProbDist(value = positiveClass);
compared:= JOIN(classOfInterest, Dep, LEFT.id=RIGHT.id AND LEFT.number=RIGHT.number,
TRANSFORM(compREC, SELF.classifier:= LEFT.number, SELF.c_actual:=RIGHT.value,
SELF.c_modeled:=LEFT.value, SELF.score:=LEFT.conf), HASH);
sortComp:= SORT(compared, score);
dCPD := DISTRIBUTE(classOfInterest, HASH(id));
dDep := DISTRIBUTE(allDep, HASH(id));
compared:= JOIN(dCPD, dDep, LEFT.id=RIGHT.id AND LEFT.number=RIGHT.number,
TRANSFORM(compREC, SELF.classifier:= RIGHT.number, SELF.c_actual:=RIGHT.value,
SELF.c_modeled:= positiveClass, SELF.score:=ROUND(LEFT.conf, 14)), RIGHT OUTER, LOCAL);
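// Editor's note (hedged reading): RIGHT OUTER keeps every allDep instance even when
// classProbDist carries no positiveClass row for it; such rows take score 0 from the empty LEFT side.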
sortComp:= SORT(compared, classifier, score);
coi_acc:= TABLE(sortComp, {classifier, score, cntPos:= COUNT(GROUP, c_actual = c_modeled),
cntNeg:= COUNT(GROUP, c_actual<>c_modeled)}, classifier, score, LOCAL);
coi_tot:= TABLE(coi_acc, {classifier, totPos:= SUM(GROUP, cntPos), totNeg:= SUM(GROUP, cntNeg)}, classifier, FEW);
totPos:=EVALUATE(coi_tot[1], totPos);
totNeg:=EVALUATE(coi_tot[1], totNeg);
// Count and accumulate number of TP, FP, TN and FN instances for each threshold (score)
acc_sorted:= PROJECT(coi_acc, TRANSFORM(cntREC, SELF.c_actual:= positiveClass, SELF.fn_cnt:= LEFT.cntPos,
SELF.tn_cnt:= LEFT.cntNeg, SELF:= LEFT), LOCAL);
acc_tot:= JOIN(coi_acc, coi_tot, LEFT.classifier = RIGHT.classifier, TRANSFORM(cntREC , SELF.c_actual:= positiveClass,
SELF.fn_cnt:= LEFT.cntPos, SELF.tn_cnt:= LEFT.cntNeg,
SELF.totPos:= RIGHT.totPos, SELF.totNeg:= RIGHT.totNeg, SELF:= LEFT),LOOKUP);
cntREC accNegPos(cntREC l, cntREC r) := TRANSFORM
deltaPos:= l.fn_cnt + r.fn_cnt;
deltaNeg:= l.tn_cnt + r.tn_cnt;
deltaPos:= IF(l.classifier <> r.classifier, 0, l.fn_cnt) + r.fn_cnt;
deltaNeg:= IF(l.classifier <> r.classifier, 0, l.tn_cnt) + r.tn_cnt;
SELF.score:= r.score;
SELF.tp_cnt:= totPos - deltaPos;
SELF.tp_cnt:= r.totPos - deltaPos;
SELF.fn_cnt:= deltaPos;
SELF.fp_cnt:= totNeg - deltaNeg;
SELF.fp_cnt:= r.totNeg - deltaNeg;
SELF.tn_cnt:= deltaNeg;
SELF:= r;
END;
cntNegPos:= ITERATE(acc_sorted, accNegPos(LEFT, RIGHT));
accnew := DATASET([{1,positiveClass,-1,totPos,0,totNeg,0}], cntREC) + cntNegPos;
curvePoint:= RECORD
Types.t_Count id;
Types.t_FieldNumber classifier;
Types.t_FieldReal thresho;
Types.t_FieldReal fpr;
Types.t_FieldReal tpr;
Types.t_FieldReal deltaPos:=0;
Types.t_FieldReal deltaNeg:=0;
Types.t_FieldReal cumNeg:=0;
Types.t_FieldReal AUC:=0;
END;
cntNegPos:= ITERATE(acc_tot, accNegPos(LEFT, RIGHT));
tmp:= PROJECT(coi_tot, TRANSFORM(cntREC, SELF.classifier:= LEFT.classifier, SELF.c_actual:= positiveClass,
SELF.score:= -1, SELF.tp_cnt:= LEFT.totPos, SELF.fn_cnt:= 0,
SELF.fp_cnt:= LEFT.totNeg, SELF.tn_cnt:= 0, SELF:= LEFT ));
accnew:= SORT(cntNegPos + tmp, classifier, score);
// Transform all into ROC curve points
rocPoints:= PROJECT(accnew, TRANSFORM(curvePoint, SELF.id:=COUNTER, SELF.thresho:=LEFT.score,
SELF.fpr:= LEFT.fp_cnt/totNeg, SELF.tpr:= LEFT.tp_cnt/totPos, SELF.AUC:=IF(totNeg=0,1,0) ,SELF:=LEFT));
rocPoints:= PROJECT(accnew, TRANSFORM(AUCcurvePoint, SELF.id:=COUNTER, SELF.thresho:=LEFT.score, SELF.posClass:= positiveClass,
SELF.fpr:= LEFT.fp_cnt/LEFT.totNeg, SELF.tpr:= LEFT.tp_cnt/LEFT.totPos, SELF.AUC:=IF(LEFT.totNeg=0,1,0) ,SELF:=LEFT));
// Calculate the area under the curve (cumulative iteration)
curvePoint rocArea(curvePoint l, curvePoint r) := TRANSFORM
deltaPos := if(l.tpr > r.tpr, l.tpr - r.tpr, 0.0);
deltaNeg := if( l.fpr > r.fpr, l.fpr - r.fpr, 0.0);
AUCcurvePoint rocArea(AUCcurvePoint l, AUCcurvePoint r) := TRANSFORM
deltaPos := if(l.classifier <> r.classifier, 0.0, l.tpr - r.tpr);
deltaNeg := if(l.classifier <> r.classifier, 0.0, l.fpr - r.fpr);
SELF.deltaPos := deltaPos;
SELF.deltaNeg := deltaNeg;
// A classification without incorrectly classified instances must return AUC = 1
SELF.AUC := IF(r.fpr=0 AND l.tpr=0 AND r.tpr=1, 1, l.AUC) + deltaPos * (l.cumNeg + 0.5* deltaNeg);
SELF.cumNeg := l.cumNeg + deltaNeg;
SELF.AUC := IF(l.classifier <> r.classifier, 0, IF(r.fpr=0 AND l.tpr=0 AND r.tpr=1, 1, l.AUC) + deltaPos * (l.cumNeg + 0.5* deltaNeg));
SELF.cumNeg := IF(l.classifier <> r.classifier, 0, l.cumNeg + deltaNeg);
SELF:= r;
END;
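// Editor's note (hedged): the points walk from (fpr,tpr)=(1,1) at score=-1 down to (0,0),
// so each step adds deltaPos*(l.cumNeg + 0.5*deltaNeg), a trapezoid of (1 - fpr) over tpr;
// the accumulated total equals the standard AUC = integral of tpr d(fpr).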
RETURN ITERATE(rocPoints, rocArea(LEFT, RIGHT));
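A minimal usage sketch for the upgraded function (hedged: predDist/testDep are hypothetical datasets shaped as l_result and Types.DiscreteField, and the module path is assumed to be ML.Classify):

    rocPts := ML.Classify.AUC_ROC(predDist, 1, testDep);
    // rocPts comes back ordered by classifier and threshold; the last point per classifier carries its AUC
    aucByClf := DEDUP(SORT(rocPts, classifier, id), classifier, RIGHT);
    OUTPUT(TABLE(aucByClf, {classifier, AUC}));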
155 changes: 142 additions & 13 deletions ML/Ensemble.ecl
@@ -61,7 +61,15 @@ EXPORT Ensemble := MODULE
SELF.number := l.number;
SELF.value := l.value;
END;

SHARED gNodeInstDisc init(DiscreteField dep, Sampling.idListGroupRec depG) := TRANSFORM
SELF.group_id := depG.gNum;
SELF.node_id := depG.gNum;
SELF.level := 1;
SELF.id := depG.id;
SELF.number := 1;
SELF.value := dep.value;
SELF.depend := dep.value;
END;
/* Discrete implementation*/
// Function to split a set of nodes based on Feature Selection and Gini Impurity, used in Random Forest Classifier Discrete Learning,
// the nodes received were generated sampling with replacement nTrees times.
@@ -75,22 +83,13 @@
depth := MIN(255, maxLevel); // Max number of iterations when building trees (max 256 levels)
// Sampling the original dataset with replacement to generate treeNum datasets
grList0 := Sampling.GenerateNSampleList(treeNum, N); // the number of records will be N * treeNum
dgrLstOld := DISTRIBUTE(grList0, HASH(oldId));
dDep := DISTRIBUTE(Dep, HASH(id));
dgrLstOld := DISTRIBUTE(grList0 , HASH(oldId));
dDep := DISTRIBUTE(Dep , HASH(id));
dIndep := DISTRIBUTE(Indep , HASH(id));
// Generating initial node-instance recordset; all instances go to their respective tree's root
gNodeInstDisc init(Types.DiscreteField l_dep, Sampling.idListGroupRec r_depGLO) := TRANSFORM
SELF.group_id := r_depGLO.gNum;
SELF.node_id := r_depGLO.gNum;
SELF.level := 1;
SELF.id := r_depGLO.id;
SELF.number := 1;
SELF.value := l_dep.value;
SELF.depend := l_dep.value;
END;
roots := JOIN(dDep, dgrLstOld, LEFT.id = RIGHT.oldId, init(LEFT, RIGHT), LOCAL);
distRoots := DISTRIBUTE(roots, HASH(group_id, node_id));
// Calculated only once, used at each iteration inside loopbody
dIndep := DISTRIBUTE(Indep, HASH(id));
all_Data0 := JOIN(dIndep, dgrLstOld, LEFT.id = RIGHT.oldId, GetDRecords(LEFT, RIGHT), LOCAL);
all_Data := DISTRIBUTE(all_Data0, HASH(id));
dgrLstNew := DISTRIBUTE(grList0 , HASH(id));
@@ -166,6 +165,136 @@ EXPORT Ensemble := MODULE
maxlevel_leafs:= PROJECT(depCnt, TRANSFORM(gSplitD, SELF.number:=0, SELF.value:= LEFT.depend, SELF.support:= LEFT.cnt, SELF.new_node_id:=0, SELF:= LEFT));
RETURN new_nodes + maxlevel_leafs;
END;

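// Function to split a set of nodes based on Feature Selection and Information Gain Ratio, used in Random Forest Classifier Discrete Learning,
// the nodes received were generated sampling with replacement treeNum times.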
EXPORT SplitFeatureSampleIGR(DATASET(DiscreteField) Indep, DATASET(DiscreteField) Dep, t_Count treeNum, t_Count fsNum, t_level maxLevel=32) := FUNCTION
N := MAX(Dep, id); // Number of Instances
totFeat := COUNT(Indep(id=N)); // Number of Features
depth := MIN(255, maxLevel); // Max number of iterations when building trees (max 256 levels)
// Sampling the original dataset with replacement to generate treeNum datasets
gNodeInstDisc init(DiscreteField dep, Sampling.idListGroupRec depG) := TRANSFORM
SELF.group_id := depG.gNum;
SELF.node_id := depG.gNum;
SELF.level := 1;
SELF.id := depG.id;
SELF.number := 1;
SELF.value := dep.value;
SELF.depend := dep.value;
END;
grList0 := Sampling.GenerateNSampleList(treeNum, N); // the number of records will be N * treeNum
dgrLstOld := DISTRIBUTE(grList0 , HASH(oldId));
dDep := DISTRIBUTE(Dep , HASH(Id));
depG := JOIN(dDep, dgrLstOld, LEFT.id = RIGHT.oldId, init(LEFT, RIGHT), LOCAL);
dIndep := DISTRIBUTE(Indep , HASH(id));
all_Data0 := JOIN(dIndep, dgrLstOld, LEFT.id = RIGHT.oldId, GetDRecords(LEFT, RIGHT), LOCAL);
// Calculated only once, used at each iteration inside loopbody
all_Data := DISTRIBUTE(all_Data0, HASH(id));
dgrLstNew := DISTRIBUTE(grList0 , HASH(id));
// loopbody function
gNodeInstDisc RndFeatSelPartitionGRBased(DATASET(gNodeInstDisc) nodes, t_Count p_level):= FUNCTION
dNodes := DISTRIBUTE(nodes, HASH(group_id, node_id));
// Calculating Information Entropy of Nodes
top_dep := TABLE(dNodes , {group_id, node_id, depend, cnt:= COUNT(GROUP)}, group_id, node_id, depend, LOCAL);
top_dep_tot := TABLE(top_dep, {group_id, node_id, tot:= SUM(GROUP, cnt)} , group_id, node_id, LOCAL);
tdp := RECORD
top_dep;
REAL4 prop; // Proportion based only on dependent value
REAL4 plogp:= 0;
END;
P_Log_P(REAL P) := IF(P=1, 0, -P*LOG(P)/LOG(2));
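// Editor's note: LOG(P)/LOG(2) converts to a base-2 logarithm, so P_Log_P(P) = -P*log2(P)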
top_dep_p := JOIN(top_dep, top_dep_tot, LEFT.group_id = RIGHT.group_id AND LEFT.node_id = RIGHT.node_id,
TRANSFORM(tdp, SELF.prop:= LEFT.cnt/RIGHT.tot, SELF.plogp:= P_LOG_P(LEFT.cnt/RIGHT.tot), SELF:=LEFT), LOCAL);
top_info := TABLE(top_dep_p, {group_id, node_id, info:= SUM(GROUP, plogp)}, group_id, node_id, LOCAL); // Nodes Information Entropy
// Filtering pure nodes
PureNodes := top_info(info = 0); // Pure Nodes have Info Entropy = 0
// Node-instances in pure nodes pass through
pass_thru := JOIN(top_dep, PureNodes, LEFT.group_id = RIGHT.group_id AND LEFT.node_id=RIGHT.node_id, TRANSFORM(gNodeInstDisc,
SELF.level:= p_level, SELF.depend:=LEFT.depend, SELF.support:=LEFT.cnt, SELF.id:=0, SELF.number:=0, SELF.value:=0, SELF:=LEFT), LOCAL);
// New working set after removing pass through node-instances
nodes_toSplit := DISTRIBUTE(JOIN(dnodes, PureNodes, LEFT.node_id=RIGHT.node_id, TRANSFORM(LEFT), LEFT ONLY, LOCAL), HASH(id));
// Gather only the data needed for each LOOP iteration,
// featSetInst preserves dgrLstNew distribution, then it can be JOINed LOCALly with all_Data (HASH(id) distribution also)
featSet := NxKoutofM(treeNum, fsNum, totFeat); // generating list of features selected for each tree
ftSetInst := JOIN(dgrLstNew, featSet, LEFT.gNum = RIGHT.gNum, MANY LOOKUP);
loop_Data := JOIN(all_Data, ftSetInst, LEFT.id = RIGHT.id AND LEFT.number = RIGHT.number, TRANSFORM(LEFT), LOCAL);
// Populating nodes' attributes to split
toSplit := JOIN(loop_Data, nodes_toSplit, LEFT.id = RIGHT.id, TRANSFORM(gNodeInstDisc, SELF.number:= LEFT.number; SELF.value:= LEFT.value; SELF:= RIGHT;), LOCAL);
this_set := DISTRIBUTE(toSplit, HASH(group_id, node_id));
// Calculating Information Gain of possible splits
child := TABLE(this_set, {group_id, node_id, number, value, depend, cnt := COUNT(GROUP)}, group_id, node_id, number, value, depend, LOCAL);
child_tot := TABLE(child, {group_id, node_id, number, value, tot := SUM(GROUP, cnt)}, group_id, node_id, number, value, LOCAL);
csp := RECORD
child_tot;
REAL4 prop;
REAL4 plogp;
END;
// Calculating Intrinsic Information Entropy of each attribute(split) per node
csplit_p := JOIN(child_tot, top_dep_tot, LEFT.group_id = RIGHT.group_id AND LEFT.node_id = RIGHT.node_id,
TRANSFORM(csp, SELF.prop:= LEFT.tot/RIGHT.tot, SELF.plogp:= P_LOG_P(LEFT.tot/RIGHT.tot), SELF:=LEFT), LOCAL);
csplit := TABLE(csplit_p, {group_id, node_id, number, split_info:=SUM(GROUP, plogp)}, group_id, node_id, number, LOCAL); // Intrinsic Info
chp := RECORD
child;
REAL4 prop; // Proportion pertaining to this dependent value
REAL4 plogp:= 0;
END;
// Information Entropy of new branches per split
cprop := JOIN(child, child_tot, LEFT.group_id = RIGHT.group_id AND LEFT.node_id=RIGHT.node_id AND LEFT.number=RIGHT.number AND LEFT.value = RIGHT.value,
TRANSFORM(chp, SELF.prop := LEFT.cnt/RIGHT.tot, SELF.plogp:= P_LOG_P(LEFT.cnt/RIGHT.tot), SELF:=LEFT), LOCAL);
cplogp := TABLE(cprop, {group_id, node_id, number, value, cont:= SUM(GROUP,cnt), inf0:= SUM(GROUP, plogp)}, group_id, node_id, number, value, LOCAL);
// Information Entropy of possible splits per node
cinfo := TABLE(cplogp, {group_id, node_id, number, info:=SUM(GROUP, cont*inf0)/SUM(GROUP, cont)}, group_id, node_id, number, LOCAL);
gainRec := RECORD
t_count group_id;
t_node node_id;
t_Discrete number;
REAL4 gain;
END;
// Information Gain of possible splits per node
gain := JOIN(cinfo, top_info, LEFT.group_id = RIGHT.group_id AND LEFT.node_id=RIGHT.node_id,
TRANSFORM(gainRec, SELF.gain:= RIGHT.info - LEFT.info, SELF:= LEFT), LOCAL);
gainRateRec := RECORD
t_count group_id;
t_node node_id;
t_Discrete number;
REAL4 gain_ratio;
END;
// Information Gain Ratio of possible splits per node
gainRatio := JOIN(gain, csplit, LEFT.node_id=RIGHT.node_id AND LEFT.number=RIGHT.number,
TRANSFORM(gainRateRec, SELF.gain_ratio:= LEFT.gain/RIGHT.split_info, SELF:= LEFT), LOCAL);
// Selecting the split with max Info Gain Ratio per node
split := DEDUP(SORT(gainRatio, group_id, node_id, -gain_ratio, LOCAL), group_id, node_id, LOCAL);
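// Editor's note (hedged mapping to C4.5): top_info is Info(D) = -SUM_i p_i*log2(p_i);
// cinfo is Info_A(D) = SUM_v (|D_v|/|D|)*Info(D_v); csplit is SplitInfo(A) = -SUM_v (|D_v|/|D|)*log2(|D_v|/|D|);
// the surviving row per node maximizes GainRatio(A) = (Info(D) - Info_A(D)) / SplitInfo(A).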

// new split nodes found
new_spl0 := JOIN(child_tot, split, LEFT.group_id = RIGHT.group_id AND LEFT.node_id = RIGHT.node_id AND LEFT.number = RIGHT.number, TRANSFORM(LEFT), LOCAL);
node_base := MAX(nodes, node_id);
new_split := PROJECT(new_spl0, TRANSFORM(gNodeInstDisc, SELF.value:= node_base + COUNTER; SELF.depend:= LEFT.value;
SELF.level:= p_level; SELF.support:= LEFT.tot; SELF := LEFT; SELF := [];));
// reassigning instances to new nodes
node_inst := JOIN(this_set, new_split, LEFT.group_id = RIGHT.group_id AND LEFT.node_id=RIGHT.node_id AND LEFT.number=RIGHT.number AND LEFT.value=RIGHT.depend,
TRANSFORM(gNodeInstDisc, SELF.node_id:=RIGHT.value, SELF.level:= RIGHT.level +1, SELF.value:= LEFT.depend, SELF:= LEFT ), LOCAL);
RETURN pass_thru + new_split + node_inst; // returning leaf nodes, new splits nodes and reassigned instances
END;
// generating best feature_selection-info_gain_ratio splits; the loopfilter level = COUNTER lets through only the nodes to be split at the current level
res := LOOP(depG, LEFT.level=COUNTER, COUNTER < depth , RndFeatSelPartitionGRBased(ROWS(LEFT), COUNTER));
// Turning LOOP results into splits and leaf nodes
gSplitD toNewNode(gNodeInstDisc NodeInst) := TRANSFORM
SELF.new_node_id := IF(NodeInst.number>0, NodeInst.value, 0);
SELF.number := IF(NodeInst.number>0, NodeInst.number, 0);
SELF.value := NodeInst.depend;
SELF:= NodeInst;
END;
new_nodes:= PROJECT(res(id=0), toNewNode(LEFT)); // node splits and leaf nodes
// Taking care of instances (id>0) that reached maximum level and did not turn into a leaf yet
mode_r := RECORD
res.group_id;
res.node_id;
res.level;
res.depend;
Cnt := COUNT(GROUP);
END;
depCnt := TABLE(res(id>0),mode_r, group_id, node_id, level, depend, FEW);
maxlevel_leafs:= PROJECT(depCnt, TRANSFORM(gSplitD, SELF.number:=0, SELF.value:= LEFT.depend, SELF.support:= LEFT.cnt, SELF.new_node_id:=0, SELF:= LEFT));
RETURN new_nodes + maxlevel_leafs;
END;
EXPORT FromDiscreteForest(DATASET(NumericField) mod) := FUNCTION
FromField(mod, gSplitD,o, modelD_Map);
RETURN o;