RndForest InfoGainRatio Split + NFoldCrossValidation + AUC_ROC Upgrade
Adding InfoGainRatio split for RndForest
- Implement InfoGainRatio-based split in RndForest LearnD

NFoldCrossValidation
- Implement N-Fold CrossValidation MACRO

AUC_ROC Upgrade
- Replace the prior AUC_ROC with a new implementation that calculates
  AUC_ROC for multiple classification results
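For reference, a minimal usage sketch of the new switch (hedged: the learner/dataset names below are hypothetical; the signature is the one introduced in ML/Classify.ecl by this commit):

    // GiniSplit = FALSE selects the new InfoGainRatio-based split in LearnD
    learner := ML.Classify.RandomForest(25, 5, 1.0, 10, FALSE);
    model := learner.LearnD(myIndepD, myDepD);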
DonVitorio committed Nov 10, 2015
1 parent e9d385a commit f6a77d9
Showing 5 changed files with 376 additions and 67 deletions.
100 changes: 54 additions & 46 deletions ML/Classify.ecl
@@ -1526,9 +1526,10 @@ Configuration Input
Purity p <= 1.0
Depth max tree level
*/
EXPORT RandomForest(t_Count treeNum, t_Count fsNum, REAL Purity=1.0, INTEGER1 Depth=32):= MODULE
EXPORT RandomForest(t_Count treeNum, t_Count fsNum, REAL Purity=1.0, t_level Depth=32, BOOLEAN GiniSplit = TRUE):= MODULE
EXPORT LearnD(DATASET(Types.DiscreteField) Indep, DATASET(Types.DiscreteField) Dep) := FUNCTION
nodes := Ensemble.SplitFeatureSampleGI(Indep, Dep, treeNum, fsNum, Purity, Depth);
nodes := IF(GiniSplit, Ensemble.SplitFeatureSampleGI(Indep, Dep, treeNum, fsNum, Purity, Depth),
Ensemble.SplitFeatureSampleIGR(Indep, Dep, treeNum, fsNum, Depth));
RETURN ML.Ensemble.ToDiscreteForest(nodes);
END;
EXPORT LearnC(DATASET(Types.NumericField) Indep, DATASET(Types.DiscreteField) Dep) := FUNCTION
@@ -1561,77 +1562,84 @@ Configuration Input
END; // RandomTree module

/*
Area Under the ROC curve
Area Under the ROC curve - Multiple Classification Results
// The function calculates the Area Under the ROC curve based on:
// - classProbDist : probability distribution for each instance
// - positiveClass : the class of interest
// - Dep : instance's class value
// The function returns all points of the ROC curve for graphic purposes:
// The function returns all points of the ROC curves for graphic purposes:
// label: threshold, point: (threshold's false positive rate, threshold's true positive rate).
// The area under the ROC curve is returned in the AUC field of the last record.
// The area under the ROC curve is returned in the last record's AUC field for each classification result (group_id).
// Note: threshold = 100 means classifying all instances as negative; it is not necessarily part of the curve
*/
EXPORT AUC_ROC(DATASET(l_result) classProbDist, Types.t_Discrete positiveClass, DATASET(Types.DiscreteField) Dep) := FUNCTION
EXPORT AUCcurvePoint:= RECORD
t_Count id;
t_Discrete posClass;
t_FieldNumber classifier;
t_FieldReal thresho;
t_FieldReal fpr;
t_FieldReal tpr;
t_FieldReal deltaPos:=0;
t_FieldReal deltaNeg:=0;
t_FieldReal cumNeg:=0;
t_FieldReal AUC:=0;
END;
EXPORT AUC_ROC(DATASET(l_result) classProbDist, Types.t_Discrete positiveClass, DATASET(Types.DiscreteField) allDep) := FUNCTION
SHARED cntREC:= RECORD
Types.t_FieldNumber classifier; // The classifier in question (value of 'number' on outcome data)
Types.t_Discrete c_actual; // The value of c provided
Types.t_FieldReal score :=-1;
Types.t_count tp_cnt:=0;
Types.t_count fn_cnt:=0;
Types.t_count fp_cnt:=0;
Types.t_count tn_cnt:=0;
t_FieldNumber classifier; // The classifier in question (value of 'number' on outcome data)
t_Discrete c_actual; // The value of c provided
t_FieldReal score :=-1;
t_count tp_cnt:= 0;
t_count fn_cnt:= 0;
t_count fp_cnt:= 0;
t_count tn_cnt:= 0;
t_count totPos:= 0;
t_count totNeg:= 0;
END;
SHARED compREC:= RECORD(cntREC)
Types.t_Discrete c_modeled;
t_Discrete c_modeled;
END;
classOfInterest := classProbDist(value = positiveClass);
compared:= JOIN(classOfInterest, Dep, LEFT.id=RIGHT.id AND LEFT.number=RIGHT.number,
TRANSFORM(compREC, SELF.classifier:= LEFT.number, SELF.c_actual:=RIGHT.value,
SELF.c_modeled:=LEFT.value, SELF.score:=LEFT.conf), HASH);
sortComp:= SORT(compared, score);
dCPD := DISTRIBUTE(classOfInterest, HASH(id));
dDep := DISTRIBUTE(allDep, HASH(id));
compared:= JOIN(dCPD, dDep, LEFT.id=RIGHT.id AND LEFT.number=RIGHT.number,
TRANSFORM(compREC, SELF.classifier:= RIGHT.number, SELF.c_actual:=RIGHT.value,
SELF.c_modeled:= positiveClass, SELF.score:=ROUND(LEFT.conf, 14)), RIGHT OUTER, LOCAL);
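// Editor's note (hedged reading): RIGHT OUTER keeps every allDep instance even when
// classProbDist carries no positiveClass row for it; such rows take score 0 from the empty LEFT side.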
sortComp:= SORT(compared, classifier, score);
coi_acc:= TABLE(sortComp, {classifier, score, cntPos:= COUNT(GROUP, c_actual = c_modeled),
cntNeg:= COUNT(GROUP, c_actual<>c_modeled)}, classifier, score, LOCAL);
coi_tot:= TABLE(coi_acc, {classifier, totPos:= SUM(GROUP, cntPos), totNeg:= SUM(GROUP, cntNeg)}, classifier, FEW);
totPos:=EVALUATE(coi_tot[1], totPos);
totNeg:=EVALUATE(coi_tot[1], totNeg);
// Count and accumulate number of TP, FP, TN and FN instances for each threshold (score)
acc_sorted:= PROJECT(coi_acc, TRANSFORM(cntREC, SELF.c_actual:= positiveClass, SELF.fn_cnt:= LEFT.cntPos,
SELF.tn_cnt:= LEFT.cntNeg, SELF:= LEFT), LOCAL);
acc_tot:= JOIN(coi_acc, coi_tot, LEFT.classifier = RIGHT.classifier, TRANSFORM(cntREC , SELF.c_actual:= positiveClass,
SELF.fn_cnt:= LEFT.cntPos, SELF.tn_cnt:= LEFT.cntNeg,
SELF.totPos:= RIGHT.totPos, SELF.totNeg:= RIGHT.totNeg, SELF:= LEFT),LOOKUP);
cntREC accNegPos(cntREC l, cntREC r) := TRANSFORM
deltaPos:= l.fn_cnt + r.fn_cnt;
deltaNeg:= l.tn_cnt + r.tn_cnt;
deltaPos:= IF(l.classifier <> r.classifier, 0, l.fn_cnt) + r.fn_cnt;
deltaNeg:= IF(l.classifier <> r.classifier, 0, l.tn_cnt) + r.tn_cnt;
SELF.score:= r.score;
SELF.tp_cnt:= totPos - deltaPos;
SELF.tp_cnt:= r.totPos - deltaPos;
SELF.fn_cnt:= deltaPos;
SELF.fp_cnt:= totNeg - deltaNeg;
SELF.fp_cnt:= r.totNeg - deltaNeg;
SELF.tn_cnt:= deltaNeg;
SELF:= r;
END;
cntNegPos:= ITERATE(acc_sorted, accNegPos(LEFT, RIGHT));
accnew := DATASET([{1,positiveClass,-1,totPos,0,totNeg,0}], cntREC) + cntNegPos;
curvePoint:= RECORD
Types.t_Count id;
Types.t_FieldNumber classifier;
Types.t_FieldReal thresho;
Types.t_FieldReal fpr;
Types.t_FieldReal tpr;
Types.t_FieldReal deltaPos:=0;
Types.t_FieldReal deltaNeg:=0;
Types.t_FieldReal cumNeg:=0;
Types.t_FieldReal AUC:=0;
END;
cntNegPos:= ITERATE(acc_tot, accNegPos(LEFT, RIGHT));
tmp:= PROJECT(coi_tot, TRANSFORM(cntREC, SELF.classifier:= LEFT.classifier, SELF.c_actual:= positiveClass,
SELF.score:= -1, SELF.tp_cnt:= LEFT.totPos, SELF.fn_cnt:= 0,
SELF.fp_cnt:= LEFT.totNeg, SELF.tn_cnt:= 0, SELF:= LEFT ));
accnew:= SORT(cntNegPos + tmp, classifier, score);
// Transform all into ROC curve points
rocPoints:= PROJECT(accnew, TRANSFORM(curvePoint, SELF.id:=COUNTER, SELF.thresho:=LEFT.score,
SELF.fpr:= LEFT.fp_cnt/totNeg, SELF.tpr:= LEFT.tp_cnt/totPos, SELF.AUC:=IF(totNeg=0,1,0) ,SELF:=LEFT));
rocPoints:= PROJECT(accnew, TRANSFORM(AUCcurvePoint, SELF.id:=COUNTER, SELF.thresho:=LEFT.score, SELF.posClass:= positiveClass,
SELF.fpr:= LEFT.fp_cnt/LEFT.totNeg, SELF.tpr:= LEFT.tp_cnt/LEFT.totPos, SELF.AUC:=IF(LEFT.totNeg=0,1,0) ,SELF:=LEFT));
// Calculate the area under the curve (cumulative iteration)
curvePoint rocArea(curvePoint l, curvePoint r) := TRANSFORM
deltaPos := if(l.tpr > r.tpr, l.tpr - r.tpr, 0.0);
deltaNeg := if( l.fpr > r.fpr, l.fpr - r.fpr, 0.0);
AUCcurvePoint rocArea(AUCcurvePoint l, AUCcurvePoint r) := TRANSFORM
deltaPos := if(l.classifier <> r.classifier, 0.0, l.tpr - r.tpr);
deltaNeg := if(l.classifier <> r.classifier, 0.0, l.fpr - r.fpr);
SELF.deltaPos := deltaPos;
SELF.deltaNeg := deltaNeg;
// A classification without incorrectly classified instances must return AUC = 1
SELF.AUC := IF(r.fpr=0 AND l.tpr=0 AND r.tpr=1, 1, l.AUC) + deltaPos * (l.cumNeg + 0.5* deltaNeg);
SELF.cumNeg := l.cumNeg + deltaNeg;
SELF.AUC := IF(l.classifier <> r.classifier, 0, IF(r.fpr=0 AND l.tpr=0 AND r.tpr=1, 1, l.AUC) + deltaPos * (l.cumNeg + 0.5* deltaNeg));
SELF.cumNeg := IF(l.classifier <> r.classifier, 0, l.cumNeg + deltaNeg);
SELF:= r;
END;
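// Editor's note (hedged): the points walk from (fpr,tpr)=(1,1) at score=-1 down to (0,0),
// so each step adds deltaPos*(l.cumNeg + 0.5*deltaNeg), a trapezoid of (1 - fpr) over tpr;
// the accumulated total equals the standard AUC = integral of tpr d(fpr).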
RETURN ITERATE(rocPoints, rocArea(LEFT, RIGHT));
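A minimal usage sketch for the upgraded function (hedged: predDist/testDep are hypothetical datasets shaped as l_result and Types.DiscreteField, and the module path is assumed to be ML.Classify):

    rocPts := ML.Classify.AUC_ROC(predDist, 1, testDep);
    // rocPts comes back ordered by classifier and threshold; the last point per classifier carries its AUC
    aucByClf := DEDUP(SORT(rocPts, classifier, id), classifier, RIGHT);
    OUTPUT(TABLE(aucByClf, {classifier, AUC}));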
155 changes: 142 additions & 13 deletions ML/Ensemble.ecl
@@ -61,7 +61,15 @@ EXPORT Ensemble := MODULE
SELF.number := l.number;
SELF.value := l.value;
END;

SHARED gNodeInstDisc init(DiscreteField dep, Sampling.idListGroupRec depG) := TRANSFORM
SELF.group_id := depG.gNum;
SELF.node_id := depG.gNum;
SELF.level := 1;
SELF.id := depG.id;
SELF.number := 1;
SELF.value := dep.value;
SELF.depend := dep.value;
END;
/* Discrete implementation*/
// Function to split a set of nodes based on Feature Selection and Gini Impurity, used in Random Forest Classifier Discrete Learning,
// the nodes received were generated sampling with replacement nTrees times.
@@ -75,22 +83,13 @@
depth := MIN(255, maxLevel); // Max number of iterations when building trees (max 256 levels)
// Sampling the original dataset with replacement to generate treeNum datasets
grList0 := Sampling.GenerateNSampleList(treeNum, N); // the number of records will be N * treeNum
dgrLstOld := DISTRIBUTE(grList0, HASH(oldId));
dDep := DISTRIBUTE(Dep, HASH(id));
dgrLstOld := DISTRIBUTE(grList0 , HASH(oldId));
dDep := DISTRIBUTE(Dep , HASH(id));
dIndep := DISTRIBUTE(Indep , HASH(id));
// Generating initial node-instance recordset; all instances go to their respective tree's root
gNodeInstDisc init(Types.DiscreteField l_dep, Sampling.idListGroupRec r_depGLO) := TRANSFORM
SELF.group_id := r_depGLO.gNum;
SELF.node_id := r_depGLO.gNum;
SELF.level := 1;
SELF.id := r_depGLO.id;
SELF.number := 1;
SELF.value := l_dep.value;
SELF.depend := l_dep.value;
END;
roots := JOIN(dDep, dgrLstOld, LEFT.id = RIGHT.oldId, init(LEFT, RIGHT), LOCAL);
distRoots := DISTRIBUTE(roots, HASH(group_id, node_id));
// Calculated only once, used at each iteration inside loopbody
dIndep := DISTRIBUTE(Indep, HASH(id));
all_Data0 := JOIN(dIndep, dgrLstOld, LEFT.id = RIGHT.oldId, GetDRecords(LEFT, RIGHT), LOCAL);
all_Data := DISTRIBUTE(all_Data0, HASH(id));
dgrLstNew := DISTRIBUTE(grList0 , HASH(id));
@@ -166,6 +165,136 @@ EXPORT Ensemble := MODULE
maxlevel_leafs:= PROJECT(depCnt, TRANSFORM(gSplitD, SELF.number:=0, SELF.value:= LEFT.depend, SELF.support:= LEFT.cnt, SELF.new_node_id:=0, SELF:= LEFT));
RETURN new_nodes + maxlevel_leafs;
END;

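// Function to split a set of nodes based on Feature Selection and Information Gain Ratio, used in Random Forest Classifier Discrete Learning,
// the nodes received were generated sampling with replacement treeNum times.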
EXPORT SplitFeatureSampleIGR(DATASET(DiscreteField) Indep, DATASET(DiscreteField) Dep, t_Count treeNum, t_Count fsNum, t_level maxLevel=32) := FUNCTION
N := MAX(Dep, id); // Number of Instances
totFeat := COUNT(Indep(id=N)); // Number of Features
depth := MIN(255, maxLevel); // Max number of iterations when building trees (max 256 levels)
// Sampling the original dataset with replacement to generate treeNum datasets
gNodeInstDisc init(DiscreteField dep, Sampling.idListGroupRec depG) := TRANSFORM
SELF.group_id := depG.gNum;
SELF.node_id := depG.gNum;
SELF.level := 1;
SELF.id := depG.id;
SELF.number := 1;
SELF.value := dep.value;
SELF.depend := dep.value;
END;
grList0 := Sampling.GenerateNSampleList(treeNum, N); // the number of records will be N * treeNum
dgrLstOld := DISTRIBUTE(grList0 , HASH(oldId));
dDep := DISTRIBUTE(Dep , HASH(Id));
depG := JOIN(dDep, dgrLstOld, LEFT.id = RIGHT.oldId, init(LEFT, RIGHT), LOCAL);
dIndep := DISTRIBUTE(Indep , HASH(id));
all_Data0 := JOIN(dIndep, dgrLstOld, LEFT.id = RIGHT.oldId, GetDRecords(LEFT, RIGHT), LOCAL);
// Calculated only once, used at each iteration inside loopbody
all_Data := DISTRIBUTE(all_Data0, HASH(id));
dgrLstNew := DISTRIBUTE(grList0 , HASH(id));
// loopbody function
gNodeInstDisc RndFeatSelPartitionGRBased(DATASET(gNodeInstDisc) nodes, t_Count p_level):= FUNCTION
dNodes := DISTRIBUTE(nodes, HASH(group_id, node_id));
// Calculating Information Entropy of Nodes
top_dep := TABLE(dNodes , {group_id, node_id, depend, cnt:= COUNT(GROUP)}, group_id, node_id, depend, LOCAL);
top_dep_tot := TABLE(top_dep, {group_id, node_id, tot:= SUM(GROUP, cnt)} , group_id, node_id, LOCAL);
tdp := RECORD
top_dep;
REAL4 prop; // Proportion based only on dependent value
REAL4 plogp:= 0;
END;
P_Log_P(REAL P) := IF(P=1, 0, -P*LOG(P)/LOG(2));
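// Editor's note: LOG(P)/LOG(2) converts to a base-2 logarithm, so P_Log_P(P) = -P*log2(P)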
top_dep_p := JOIN(top_dep, top_dep_tot, LEFT.group_id = RIGHT.group_id AND LEFT.node_id = RIGHT.node_id,
TRANSFORM(tdp, SELF.prop:= LEFT.cnt/RIGHT.tot, SELF.plogp:= P_LOG_P(LEFT.cnt/RIGHT.tot), SELF:=LEFT), LOCAL);
top_info := TABLE(top_dep_p, {group_id, node_id, info:= SUM(GROUP, plogp)}, group_id, node_id, LOCAL); // Nodes Information Entropy
// Filtering pure nodes
PureNodes := top_info(info = 0); // Pure Nodes have Info Entropy = 0
// Node-instances in pure nodes pass through
pass_thru := JOIN(top_dep, PureNodes, LEFT.group_id = RIGHT.group_id AND LEFT.node_id=RIGHT.node_id, TRANSFORM(gNodeInstDisc,
SELF.level:= p_level, SELF.depend:=LEFT.depend, SELF.support:=LEFT.cnt, SELF.id:=0, SELF.number:=0, SELF.value:=0, SELF:=LEFT), LOCAL);
// New working set after removing pass through node-instances
nodes_toSplit := DISTRIBUTE(JOIN(dnodes, PureNodes, LEFT.node_id=RIGHT.node_id, TRANSFORM(LEFT), LEFT ONLY, LOCAL), HASH(id));
// Gather only the data needed for each LOOP iteration,
// featSetInst preserves dgrLstNew distribution, then it can be JOINed LOCALly with all_Data (HASH(id) distribution also)
featSet := NxKoutofM(treeNum, fsNum, totFeat); // generating list of features selected for each tree
ftSetInst := JOIN(dgrLstNew, featSet, LEFT.gNum = RIGHT.gNum, MANY LOOKUP);
loop_Data := JOIN(all_Data, ftSetInst, LEFT.id = RIGHT.id AND LEFT.number = RIGHT.number, TRANSFORM(LEFT), LOCAL);
// Populating nodes' attributes to split
toSplit := JOIN(loop_Data, nodes_toSplit, LEFT.id = RIGHT.id, TRANSFORM(gNodeInstDisc, SELF.number:= LEFT.number; SELF.value:= LEFT.value; SELF:= RIGHT;), LOCAL);
this_set := DISTRIBUTE(toSplit, HASH(group_id, node_id));
// Calculating Information Gain of possible splits
child := TABLE(this_set, {group_id, node_id, number, value, depend, cnt := COUNT(GROUP)}, group_id, node_id, number, value, depend, LOCAL);
child_tot := TABLE(child, {group_id, node_id, number, value, tot := SUM(GROUP, cnt)}, group_id, node_id, number, value, LOCAL);
csp := RECORD
child_tot;
REAL4 prop;
REAL4 plogp;
END;
// Calculating Intrinsic Information Entropy of each attribute(split) per node
csplit_p := JOIN(child_tot, top_dep_tot, LEFT.group_id = RIGHT.group_id AND LEFT.node_id = RIGHT.node_id,
TRANSFORM(csp, SELF.prop:= LEFT.tot/RIGHT.tot, SELF.plogp:= P_LOG_P(LEFT.tot/RIGHT.tot), SELF:=LEFT), LOCAL);
csplit := TABLE(csplit_p, {group_id, node_id, number, split_info:=SUM(GROUP, plogp)}, group_id, node_id, number, LOCAL); // Intrinsic Info
chp := RECORD
child;
REAL4 prop; // Proportion pertaining to this dependent value
REAL4 plogp:= 0;
END;
// Information Entropy of new branches per split
cprop := JOIN(child, child_tot, LEFT.group_id = RIGHT.group_id AND LEFT.node_id=RIGHT.node_id AND LEFT.number=RIGHT.number AND LEFT.value = RIGHT.value,
TRANSFORM(chp, SELF.prop := LEFT.cnt/RIGHT.tot, SELF.plogp:= P_LOG_P(LEFT.cnt/RIGHT.tot), SELF:=LEFT), LOCAL);
cplogp := TABLE(cprop, {group_id, node_id, number, value, cont:= SUM(GROUP,cnt), inf0:= SUM(GROUP, plogp)}, group_id, node_id, number, value, LOCAL);
// Information Entropy of possible splits per node
cinfo := TABLE(cplogp, {group_id, node_id, number, info:=SUM(GROUP, cont*inf0)/SUM(GROUP, cont)}, group_id, node_id, number, LOCAL);
gainRec := RECORD
t_count group_id;
t_node node_id;
t_Discrete number;
REAL4 gain;
END;
// Information Gain of possible splits per node
gain := JOIN(cinfo, top_info, LEFT.group_id = RIGHT.group_id AND LEFT.node_id=RIGHT.node_id,
TRANSFORM(gainRec, SELF.gain:= RIGHT.info - LEFT.info, SELF:= LEFT), LOCAL);
gainRateRec := RECORD
t_count group_id;
t_node node_id;
t_Discrete number;
REAL4 gain_ratio;
END;
// Information Gain Ratio of possible splits per node
gainRatio := JOIN(gain, csplit, LEFT.node_id=RIGHT.node_id AND LEFT.number=RIGHT.number,
TRANSFORM(gainRateRec, SELF.gain_ratio:= LEFT.gain/RIGHT.split_info, SELF:= LEFT), LOCAL);
// Selecting the split with max Info Gain Ratio per node
split := DEDUP(SORT(gainRatio, group_id, node_id, -gain_ratio, LOCAL), group_id, node_id, LOCAL);
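// Editor's note (hedged mapping to C4.5): top_info is Info(D) = -SUM_i p_i*log2(p_i);
// cinfo is Info_A(D) = SUM_v (|D_v|/|D|)*Info(D_v); csplit is SplitInfo(A) = -SUM_v (|D_v|/|D|)*log2(|D_v|/|D|);
// the surviving row per node maximizes GainRatio(A) = (Info(D) - Info_A(D)) / SplitInfo(A).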

// new split nodes found
new_spl0 := JOIN(child_tot, split, LEFT.group_id = RIGHT.group_id AND LEFT.node_id = RIGHT.node_id AND LEFT.number = RIGHT.number, TRANSFORM(LEFT), LOCAL);
node_base := MAX(nodes, node_id);
new_split := PROJECT(new_spl0, TRANSFORM(gNodeInstDisc, SELF.value:= node_base + COUNTER; SELF.depend:= LEFT.value;
SELF.level:= p_level; SELF.support:= LEFT.tot; SELF := LEFT; SELF := [];));
// reassigning instances to new nodes
node_inst := JOIN(this_set, new_split, LEFT.group_id = RIGHT.group_id AND LEFT.node_id=RIGHT.node_id AND LEFT.number=RIGHT.number AND LEFT.value=RIGHT.depend,
TRANSFORM(gNodeInstDisc, SELF.node_id:=RIGHT.value, SELF.level:= RIGHT.level +1, SELF.value:= LEFT.depend, SELF:= LEFT ), LOCAL);
RETURN pass_thru + new_split + node_inst; // returning leaf nodes, new splits nodes and reassigned instances
END;
// generating best feature_selection-info_gain_ratio splits; the loopfilter level = COUNTER lets through only the nodes to be split at the current level
res := LOOP(depG, LEFT.level=COUNTER, COUNTER < depth , RndFeatSelPartitionGRBased(ROWS(LEFT), COUNTER));
// Turning LOOP results into splits and leaf nodes
gSplitD toNewNode(gNodeInstDisc NodeInst) := TRANSFORM
SELF.new_node_id := IF(NodeInst.number>0, NodeInst.value, 0);
SELF.number := IF(NodeInst.number>0, NodeInst.number, 0);
SELF.value := NodeInst.depend;
SELF:= NodeInst;
END;
new_nodes:= PROJECT(res(id=0), toNewNode(LEFT)); // node splits and leaf nodes
// Taking care of instances (id>0) that reached maximum level and did not turn into a leaf yet
mode_r := RECORD
res.group_id;
res.node_id;
res.level;
res.depend;
Cnt := COUNT(GROUP);
END;
depCnt := TABLE(res(id>0),mode_r, group_id, node_id, level, depend, FEW);
maxlevel_leafs:= PROJECT(depCnt, TRANSFORM(gSplitD, SELF.number:=0, SELF.value:= LEFT.depend, SELF.support:= LEFT.cnt, SELF.new_node_id:=0, SELF:= LEFT));
RETURN new_nodes + maxlevel_leafs;
END;
EXPORT FromDiscreteForest(DATASET(NumericField) mod) := FUNCTION
FromField(mod, gSplitD,o, modelD_Map);
RETURN o;