diff --git a/ML/Classify.ecl b/ML/Classify.ecl
index 870a9bb2..c23140da 100644
--- a/ML/Classify.ecl
+++ b/ML/Classify.ecl
@@ -1526,9 +1526,11 @@ Configuration Input
Purity p <= 1.0
Depth max tree level
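+GiniSplit TRUE (default): split on Gini Impurity; FALSE: split on Information Gain Ratio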
*/
- EXPORT RandomForest(t_Count treeNum, t_Count fsNum, REAL Purity=1.0, INTEGER1 Depth=32):= MODULE
+ EXPORT RandomForest(t_Count treeNum, t_Count fsNum, REAL Purity=1.0, t_level Depth=32, BOOLEAN GiniSplit = TRUE):= MODULE
EXPORT LearnD(DATASET(Types.DiscreteField) Indep, DATASET(Types.DiscreteField) Dep) := FUNCTION
- nodes := Ensemble.SplitFeatureSampleGI(Indep, Dep, treeNum, fsNum, Purity, Depth);
+ nodes := IF(GiniSplit, Ensemble.SplitFeatureSampleGI(Indep, Dep, treeNum, fsNum, Purity, Depth),
+ Ensemble.SplitFeatureSampleIGR(Indep, Dep, treeNum, fsNum, Depth));
RETURN ML.Ensemble.ToDiscreteForest(nodes);
END;
EXPORT LearnC(DATASET(Types.NumericField) Indep, DATASET(Types.DiscreteField) Dep) := FUNCTION
@@ -1561,77 +1562,89 @@ Configuration Input
END; // RandomTree module
/*
- Area Under the ROC curve
+ Area Under the ROC curve - Multiple Classification Results
    // The function calculates the Area Under the ROC curve based on:
    // - classProbDist : class probability distribution for each instance
    // - positiveClass : the class of interest
    // - allDep : each instance's actual class value
- // The function returns all points of the ROC curve for graphic purposes:
+ // The function returns all points of the ROC curves for graphic purposes:
    // label: threshold, point: (threshold's false positive rate, threshold's true positive rate).
- // The area under the ROC curve is returned in the AUC field of the last record.
+    // The area under each ROC curve is returned in the AUC field of the last record of each classification result (classifier).
    // Note: threshold = 100 means classifying all instances as negative; it is not necessarily part of the curve
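+    // Illustrative usage (mirrors ML/Tests/Explanatory/RandomForest.ecl):
+    //   ClassDist := learner.ClassProbDistribD(indepData, model);
+    //   AUC0 := Classify.AUC_ROC(ClassDist, 0, depData); // curve points; AUC in each classifier's last record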
*/
- EXPORT AUC_ROC(DATASET(l_result) classProbDist, Types.t_Discrete positiveClass, DATASET(Types.DiscreteField) Dep) := FUNCTION
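+  // ROC curve point: one record per threshold per classifier; the last record of each classifier carries the accumulated AUC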
+  EXPORT AUCcurvePoint:= RECORD
+    t_Count id;               // sequential point id
+    t_Discrete posClass;      // positive class of interest
+    t_FieldNumber classifier; // classification result the point belongs to
+    t_FieldReal thresho;      // score threshold at this point
+    t_FieldReal fpr;          // false positive rate at this threshold
+    t_FieldReal tpr;          // true positive rate at this threshold
+    t_FieldReal deltaPos:=0;
+    t_FieldReal deltaNeg:=0;
+    t_FieldReal cumNeg:=0;
+    t_FieldReal AUC:=0;       // accumulated area; final value in the classifier's last record
+  END;
+ EXPORT AUC_ROC(DATASET(l_result) classProbDist, Types.t_Discrete positiveClass, DATASET(Types.DiscreteField) allDep) := FUNCTION
SHARED cntREC:= RECORD
- Types.t_FieldNumber classifier; // The classifier in question (value of 'number' on outcome data)
- Types.t_Discrete c_actual; // The value of c provided
- Types.t_FieldReal score :=-1;
- Types.t_count tp_cnt:=0;
- Types.t_count fn_cnt:=0;
- Types.t_count fp_cnt:=0;
- Types.t_count tn_cnt:=0;
+ t_FieldNumber classifier; // The classifier in question (value of 'number' on outcome data)
+ t_Discrete c_actual; // The value of c provided
+ t_FieldReal score :=-1;
+ t_count tp_cnt:= 0;
+ t_count fn_cnt:= 0;
+ t_count fp_cnt:= 0;
+ t_count tn_cnt:= 0;
+ t_count totPos:= 0;
+ t_count totNeg:= 0;
END;
SHARED compREC:= RECORD(cntREC)
- Types.t_Discrete c_modeled;
+ t_Discrete c_modeled;
END;
classOfInterest := classProbDist(value = positiveClass);
- compared:= JOIN(classOfInterest, Dep, LEFT.id=RIGHT.id AND LEFT.number=RIGHT.number,
- TRANSFORM(compREC, SELF.classifier:= LEFT.number, SELF.c_actual:=RIGHT.value,
- SELF.c_modeled:=LEFT.value, SELF.score:=LEFT.conf), HASH);
- sortComp:= SORT(compared, score);
+ dCPD := DISTRIBUTE(classOfInterest, HASH(id));
+ dDep := DISTRIBUTE(allDep, HASH(id));
+ compared:= JOIN(dCPD, dDep, LEFT.id=RIGHT.id AND LEFT.number=RIGHT.number,
+ TRANSFORM(compREC, SELF.classifier:= RIGHT.number, SELF.c_actual:=RIGHT.value,
+ SELF.c_modeled:= positiveClass, SELF.score:=ROUND(LEFT.conf, 14)), RIGHT OUTER, LOCAL);
+ sortComp:= SORT(compared, classifier, score);
coi_acc:= TABLE(sortComp, {classifier, score, cntPos:= COUNT(GROUP, c_actual = c_modeled),
cntNeg:= COUNT(GROUP, c_actual<>c_modeled)}, classifier, score, LOCAL);
coi_tot:= TABLE(coi_acc, {classifier, totPos:= SUM(GROUP, cntPos), totNeg:= SUM(GROUP, cntNeg)}, classifier, FEW);
- totPos:=EVALUATE(coi_tot[1], totPos);
- totNeg:=EVALUATE(coi_tot[1], totNeg);
// Count and accumulate number of TP, FP, TN and FN instances for each threshold (score)
- acc_sorted:= PROJECT(coi_acc, TRANSFORM(cntREC, SELF.c_actual:= positiveClass, SELF.fn_cnt:= LEFT.cntPos,
- SELF.tn_cnt:= LEFT.cntNeg, SELF:= LEFT), LOCAL);
+ acc_tot:= JOIN(coi_acc, coi_tot, LEFT.classifier = RIGHT.classifier, TRANSFORM(cntREC , SELF.c_actual:= positiveClass,
+ SELF.fn_cnt:= LEFT.cntPos, SELF.tn_cnt:= LEFT.cntNeg,
+ SELF.totPos:= RIGHT.totPos, SELF.totNeg:= RIGHT.totNeg, SELF:= LEFT),LOOKUP);
cntREC accNegPos(cntREC l, cntREC r) := TRANSFORM
- deltaPos:= l.fn_cnt + r.fn_cnt;
- deltaNeg:= l.tn_cnt + r.tn_cnt;
+ deltaPos:= IF(l.classifier <> r.classifier, 0, l.fn_cnt) + r.fn_cnt;
+ deltaNeg:= IF(l.classifier <> r.classifier, 0, l.tn_cnt) + r.tn_cnt;
SELF.score:= r.score;
- SELF.tp_cnt:= totPos - deltaPos;
+ SELF.tp_cnt:= r.totPos - deltaPos;
SELF.fn_cnt:= deltaPos;
- SELF.fp_cnt:= totNeg - deltaNeg;
+ SELF.fp_cnt:= r.totNeg - deltaNeg;
SELF.tn_cnt:= deltaNeg;
SELF:= r;
END;
- cntNegPos:= ITERATE(acc_sorted, accNegPos(LEFT, RIGHT));
- accnew := DATASET([{1,positiveClass,-1,totPos,0,totNeg,0}], cntREC) + cntNegPos;
- curvePoint:= RECORD
- Types.t_Count id;
- Types.t_FieldNumber classifier;
- Types.t_FieldReal thresho;
- Types.t_FieldReal fpr;
- Types.t_FieldReal tpr;
- Types.t_FieldReal deltaPos:=0;
- Types.t_FieldReal deltaNeg:=0;
- Types.t_FieldReal cumNeg:=0;
- Types.t_FieldReal AUC:=0;
- END;
+ cntNegPos:= ITERATE(acc_tot, accNegPos(LEFT, RIGHT));
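+    // Per-classifier starting point (score = -1): all instances classified as positive, i.e. (fpr, tpr) = (1, 1)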
+ tmp:= PROJECT(coi_tot, TRANSFORM(cntREC, SELF.classifier:= LEFT.classifier, SELF.c_actual:= positiveClass,
+ SELF.score:= -1, SELF.tp_cnt:= LEFT.totPos, SELF.fn_cnt:= 0,
+ SELF.fp_cnt:= LEFT.totNeg, SELF.tn_cnt:= 0, SELF:= LEFT ));
+ accnew:= SORT(cntNegPos + tmp, classifier, score);
// Transform all into ROC curve points
- rocPoints:= PROJECT(accnew, TRANSFORM(curvePoint, SELF.id:=COUNTER, SELF.thresho:=LEFT.score,
- SELF.fpr:= LEFT.fp_cnt/totNeg, SELF.tpr:= LEFT.tp_cnt/totPos, SELF.AUC:=IF(totNeg=0,1,0) ,SELF:=LEFT));
+ rocPoints:= PROJECT(accnew, TRANSFORM(AUCcurvePoint, SELF.id:=COUNTER, SELF.thresho:=LEFT.score, SELF.posClass:= positiveClass,
+ SELF.fpr:= LEFT.fp_cnt/LEFT.totNeg, SELF.tpr:= LEFT.tp_cnt/LEFT.totPos, SELF.AUC:=IF(LEFT.totNeg=0,1,0) ,SELF:=LEFT));
// Calculate the area under the curve (cumulative iteration)
- curvePoint rocArea(curvePoint l, curvePoint r) := TRANSFORM
- deltaPos := if(l.tpr > r.tpr, l.tpr - r.tpr, 0.0);
- deltaNeg := if( l.fpr > r.fpr, l.fpr - r.fpr, 0.0);
+ AUCcurvePoint rocArea(AUCcurvePoint l, AUCcurvePoint r) := TRANSFORM
+ deltaPos := if(l.classifier <> r.classifier, 0.0, l.tpr - r.tpr);
+ deltaNeg := if(l.classifier <> r.classifier, 0.0, l.fpr - r.fpr);
SELF.deltaPos := deltaPos;
SELF.deltaNeg := deltaNeg;
// A classification without incorrectly classified instances must return AUC = 1
- SELF.AUC := IF(r.fpr=0 AND l.tpr=0 AND r.tpr=1, 1, l.AUC) + deltaPos * (l.cumNeg + 0.5* deltaNeg);
- SELF.cumNeg := l.cumNeg + deltaNeg;
+ SELF.AUC := IF(l.classifier <> r.classifier, 0, IF(r.fpr=0 AND l.tpr=0 AND r.tpr=1, 1, l.AUC) + deltaPos * (l.cumNeg + 0.5* deltaNeg));
+ SELF.cumNeg := IF(l.classifier <> r.classifier, 0, l.cumNeg + deltaNeg);
SELF:= r;
END;
RETURN ITERATE(rocPoints, rocArea(LEFT, RIGHT));
diff --git a/ML/Ensemble.ecl b/ML/Ensemble.ecl
index f17bc0bf..dbd3db77 100644
--- a/ML/Ensemble.ecl
+++ b/ML/Ensemble.ecl
@@ -61,7 +61,15 @@ EXPORT Ensemble := MODULE
SELF.number := l.number;
SELF.value := l.value;
END;
-
+ SHARED gNodeInstDisc init(DiscreteField dep, Sampling.idListGroupRec depG) := TRANSFORM
+ SELF.group_id := depG.gNum;
+ SELF.node_id := depG.gNum;
+ SELF.level := 1;
+ SELF.id := depG.id;
+ SELF.number := 1;
+      SELF.value := dep.value;
+      SELF.depend := dep.value;
+ END;
/* Discrete implementation*/
// Function to split a set of nodes based on Feature Selection and Gini Impurity, used in Random Forest Classifier Discrete Learning,
// the nodes received were generated sampling with replacement nTrees times.
@@ -75,22 +83,13 @@ EXPORT Ensemble := MODULE
depth := MIN(255, maxLevel); // Max number of iterations when building trees (max 256 levels)
// Sampling with replacement the original dataset to generate treeNum Datasets
grList0 := Sampling.GenerateNSampleList(treeNum, N); // the number of records will be N * treeNum
- dgrLstOld := DISTRIBUTE(grList0, HASH(oldId));
- dDep := DISTRIBUTE(Dep, HASH(id));
+ dgrLstOld := DISTRIBUTE(grList0 , HASH(oldId));
+ dDep := DISTRIBUTE(Dep , HASH(id));
+ dIndep := DISTRIBUTE(Indep , HASH(id));
// Generating initial node-instance recordset, all instances go to their respective tree's root
- gNodeInstDisc init(Types.DiscreteField l_dep, Sampling.idListGroupRec r_depGLO) := TRANSFORM
- SELF.group_id := r_depGLO.gNum;
- SELF.node_id := r_depGLO.gNum;
- SELF.level := 1;
- SELF.id := r_depGLO.id;
- SELF.number := 1;
- SELF.value := l_dep.value;
- SELF.depend := l_dep.value;
- END;
roots := JOIN(dDep, dgrLstOld, LEFT.id = RIGHT.oldId, init(LEFT, RIGHT), LOCAL);
distRoots := DISTRIBUTE(roots, HASH(group_id, node_id));
// Calculated only once, used at each iteration inside loopbody
- dIndep := DISTRIBUTE(Indep, HASH(id));
all_Data0 := JOIN(dIndep, dgrLstOld, LEFT.id = RIGHT.oldId, GetDRecords(LEFT, RIGHT), LOCAL);
all_Data := DISTRIBUTE(all_Data0, HASH(id));
dgrLstNew := DISTRIBUTE(grList0 , HASH(id));
@@ -166,6 +165,130 @@ EXPORT Ensemble := MODULE
maxlevel_leafs:= PROJECT(depCnt, TRANSFORM(gSplitD, SELF.number:=0, SELF.value:= LEFT.depend, SELF.support:= LEFT.cnt, SELF.new_node_id:=0, SELF:= LEFT));
RETURN new_nodes + maxlevel_leafs;
END;
+
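+  // Function to split a set of nodes based on Feature Selection and Information Gain Ratio,
+  // used in Random Forest Classifier Discrete Learning,
+  // the nodes received were generated sampling with replacement treeNum times.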
+ EXPORT SplitFeatureSampleIGR(DATASET(DiscreteField) Indep, DATASET(DiscreteField) Dep, t_Count treeNum, t_Count fsNum, t_level maxLevel=32) := FUNCTION
+ N := MAX(Dep, id); // Number of Instances
+ totFeat := COUNT(Indep(id=N)); // Number of Features
+ depth := MIN(255, maxLevel); // Max number of iterations when building trees (max 256 levels)
+ // sampling with replacement the original dataset to generate treeNum Datasets
+ grList0 := Sampling.GenerateNSampleList(treeNum, N); // the number of records will be N * treeNum
+ dgrLstOld := DISTRIBUTE(grList0 , HASH(oldId));
+ dDep := DISTRIBUTE(Dep , HASH(Id));
+ depG := JOIN(dDep, dgrLstOld, LEFT.id = RIGHT.oldId, init(LEFT, RIGHT), LOCAL);
+ dIndep := DISTRIBUTE(Indep , HASH(id));
+ all_Data0 := JOIN(dIndep, dgrLstOld, LEFT.id = RIGHT.oldId, GetDRecords(LEFT, RIGHT), LOCAL);
+ // Calculated only once, used at each iteration inside loopbody
+ all_Data := DISTRIBUTE(all_Data0, HASH(id));
+ dgrLstNew := DISTRIBUTE(grList0 , HASH(id));
+ // loopbody function
+ gNodeInstDisc RndFeatSelPartitionGRBased(DATASET(gNodeInstDisc) nodes, t_Count p_level):= FUNCTION
+ dNodes := DISTRIBUTE(nodes, HASH(group_id, node_id));
+ // Calculating Information Entropy of Nodes
+ top_dep := TABLE(dNodes , {group_id, node_id, depend, cnt:= COUNT(GROUP)}, group_id, node_id, depend, LOCAL);
+ top_dep_tot := TABLE(top_dep, {group_id, node_id, tot:= SUM(GROUP, cnt)} , group_id, node_id, LOCAL);
+ tdp := RECORD
+ top_dep;
+ REAL4 prop; // Proportion based only on dependent value
+ REAL4 plogp:= 0;
+ END;
+      P_Log_P(REAL P) := IF(P=1, 0, -P*LOG(P)/LOG(2)); // entropy term -p*log2(p); a pure proportion (p=1) contributes 0
+ top_dep_p := JOIN(top_dep, top_dep_tot, LEFT.group_id = RIGHT.group_id AND LEFT.node_id = RIGHT.node_id,
+ TRANSFORM(tdp, SELF.prop:= LEFT.cnt/RIGHT.tot, SELF.plogp:= P_LOG_P(LEFT.cnt/RIGHT.tot), SELF:=LEFT), LOCAL);
+ top_info := TABLE(top_dep_p, {group_id, node_id, info:= SUM(GROUP, plogp)}, group_id, node_id, LOCAL); // Nodes Information Entropy
+ // Filtering pure nodes
+ PureNodes := top_info(info = 0); // Pure Nodes have Info Entropy = 0
+ // Node-instances in pure nodes pass through
+ pass_thru := JOIN(top_dep, PureNodes, LEFT.group_id = RIGHT.group_id AND LEFT.node_id=RIGHT.node_id, TRANSFORM(gNodeInstDisc,
+ SELF.level:= p_level, SELF.depend:=LEFT.depend, SELF.support:=LEFT.cnt, SELF.id:=0, SELF.number:=0, SELF.value:=0, SELF:=LEFT), LOCAL);
+ // New working set after removing pass through node-instances
+ nodes_toSplit := DISTRIBUTE(JOIN(dnodes, PureNodes, LEFT.node_id=RIGHT.node_id, TRANSFORM(LEFT), LEFT ONLY, LOCAL), HASH(id));
+ // Gather only the data needed for each LOOP iteration,
+ // featSetInst preserves dgrLstNew distribution, then it can be JOINed LOCALly with all_Data (HASH(id) distribution also)
+ featSet := NxKoutofM(treeNum, fsNum, totFeat); // generating list of features selected for each tree
+ ftSetInst := JOIN(dgrLstNew, featSet, LEFT.gNum = RIGHT.gNum, MANY LOOKUP);
+ loop_Data := JOIN(all_Data, ftSetInst, LEFT.id = RIGHT.id AND LEFT.number = RIGHT.number, TRANSFORM(LEFT), LOCAL);
+ // Populating nodes' attributes to split
+ toSplit := JOIN(loop_Data, nodes_toSplit, LEFT.id = RIGHT.id, TRANSFORM(gNodeInstDisc, SELF.number:= LEFT.number; SELF.value:= LEFT.value; SELF:= RIGHT;), LOCAL);
+ this_set := DISTRIBUTE(toSplit, HASH(group_id, node_id));
+ // Calculating Information Gain of possible splits
+ child := TABLE(this_set, {group_id, node_id, number, value, depend, cnt := COUNT(GROUP)}, group_id, node_id, number, value, depend, LOCAL);
+ child_tot := TABLE(child, {group_id, node_id, number, value, tot := SUM(GROUP, cnt)}, group_id, node_id, number, value, LOCAL);
+ csp := RECORD
+ child_tot;
+ REAL4 prop;
+ REAL4 plogp;
+ END;
+ // Calculating Intrinsic Information Entropy of each attribute(split) per node
+ csplit_p := JOIN(child_tot, top_dep_tot, LEFT.group_id = RIGHT.group_id AND LEFT.node_id = RIGHT.node_id,
+ TRANSFORM(csp, SELF.prop:= LEFT.tot/RIGHT.tot, SELF.plogp:= P_LOG_P(LEFT.tot/RIGHT.tot), SELF:=LEFT), LOCAL);
+ csplit := TABLE(csplit_p, {group_id, node_id, number, split_info:=SUM(GROUP, plogp)}, group_id, node_id, number, LOCAL); // Intrinsic Info
+ chp := RECORD
+ child;
+        REAL4 prop; // Proportion pertaining to this dependent value
+ REAL4 plogp:= 0;
+ END;
+ // Information Entropy of new branches per split
+ cprop := JOIN(child, child_tot, LEFT.group_id = RIGHT.group_id AND LEFT.node_id=RIGHT.node_id AND LEFT.number=RIGHT.number AND LEFT.value = RIGHT.value,
+ TRANSFORM(chp, SELF.prop := LEFT.cnt/RIGHT.tot, SELF.plogp:= P_LOG_P(LEFT.cnt/RIGHT.tot), SELF:=LEFT), LOCAL);
+ cplogp := TABLE(cprop, {group_id, node_id, number, value, cont:= SUM(GROUP,cnt), inf0:= SUM(GROUP, plogp)}, group_id, node_id, number, value, LOCAL);
+ // Information Entropy of possible splits per node
+ cinfo := TABLE(cplogp, {group_id, node_id, number, info:=SUM(GROUP, cont*inf0)/SUM(GROUP, cont)}, group_id, node_id, number, LOCAL);
+ gainRec := RECORD
+ t_count group_id;
+ t_node node_id;
+ t_Discrete number;
+ REAL4 gain;
+ END;
+ // Information Gain of possible splits per node
+ gain := JOIN(cinfo, top_info, LEFT.group_id = RIGHT.group_id AND LEFT.node_id=RIGHT.node_id,
+ TRANSFORM(gainRec, SELF.gain:= RIGHT.info - LEFT.info, SELF:= LEFT), LOCAL);
+ gainRateRec := RECORD
+ t_count group_id;
+ t_node node_id;
+ t_Discrete number;
+ REAL4 gain_ratio;
+ END;
+      // Information Gain Ratio of possible splits per node: gain_ratio = gain / intrinsic split info (as in C4.5)
+ gainRatio := JOIN(gain, csplit, LEFT.node_id=RIGHT.node_id AND LEFT.number=RIGHT.number,
+ TRANSFORM(gainRateRec, SELF.gain_ratio:= LEFT.gain/RIGHT.split_info, SELF:= LEFT), LOCAL);
+ // Selecting the split with max Info Gain Ratio per node
+ split := DEDUP(SORT(gainRatio, group_id, node_id, -gain_ratio, LOCAL), group_id, node_id, LOCAL);
+
+ // new split nodes found
+ new_spl0 := JOIN(child_tot, split, LEFT.group_id = RIGHT.group_id AND LEFT.node_id = RIGHT.node_id AND LEFT.number = RIGHT.number, TRANSFORM(LEFT), LOCAL);
+ node_base := MAX(nodes, node_id);
+ new_split := PROJECT(new_spl0, TRANSFORM(gNodeInstDisc, SELF.value:= node_base + COUNTER; SELF.depend:= LEFT.value;
+ SELF.level:= p_level; SELF.support:= LEFT.tot; SELF := LEFT; SELF := [];));
+      // reassigning instances to new nodes
+ node_inst := JOIN(this_set, new_split, LEFT.group_id = RIGHT.group_id AND LEFT.node_id=RIGHT.node_id AND LEFT.number=RIGHT.number AND LEFT.value=RIGHT.depend,
+ TRANSFORM(gNodeInstDisc, SELF.node_id:=RIGHT.value, SELF.level:= RIGHT.level +1, SELF.value:= LEFT.depend, SELF:= LEFT ), LOCAL);
+ RETURN pass_thru + new_split + node_inst; // returning leaf nodes, new splits nodes and reassigned instances
+ END;
+    // generating best feature_selection-info_gain_ratio splits; the LOOP filter (level = COUNTER) passes only the nodes still to be split at the current level
+ res := LOOP(depG, LEFT.level=COUNTER, COUNTER < depth , RndFeatSelPartitionGRBased(ROWS(LEFT), COUNTER));
+ // Turning LOOP results into splits and leaf nodes
+ gSplitD toNewNode(gNodeInstDisc NodeInst) := TRANSFORM
+ SELF.new_node_id := IF(NodeInst.number>0, NodeInst.value, 0);
+ SELF.number := IF(NodeInst.number>0, NodeInst.number, 0);
+ SELF.value := NodeInst.depend;
+ SELF:= NodeInst;
+ END;
+ new_nodes:= PROJECT(res(id=0), toNewNode(LEFT)); // node splits and leaf nodes
+ // Taking care of instances (id>0) that reached maximum level and did not turn into a leaf yet
+ mode_r := RECORD
+ res.group_id;
+ res.node_id;
+ res.level;
+ res.depend;
+ Cnt := COUNT(GROUP);
+ END;
+ depCnt := TABLE(res(id>0),mode_r, group_id, node_id, level, depend, FEW);
+ maxlevel_leafs:= PROJECT(depCnt, TRANSFORM(gSplitD, SELF.number:=0, SELF.value:= LEFT.depend, SELF.support:= LEFT.cnt, SELF.new_node_id:=0, SELF:= LEFT));
+ RETURN new_nodes + maxlevel_leafs;
+ END;
EXPORT FromDiscreteForest(DATASET(NumericField) mod) := FUNCTION
FromField(mod, gSplitD,o, modelD_Map);
RETURN o;
diff --git a/ML/NFoldCrossValidation.ecl b/ML/NFoldCrossValidation.ecl
new file mode 100644
index 00000000..dbd78842
--- /dev/null
+++ b/ML/NFoldCrossValidation.ecl
@@ -0,0 +1,139 @@
+IMPORT * FROM ML;
+IMPORT * FROM ML.Types;
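+/*
+  N-fold cross-validation of a classifier over discretized independent/dependent data.
+  Minimal usage sketch (mirrors ML/Tests/Explanatory/10CrossValidation.ecl):
+    learner := Classify.RandomForest(100, 8, 1.0, 125, TRUE);
+    cv := NFoldCrossValidation(indepData, depData, learner, 10);
+    OUTPUT(cv.Accuracy, NAMED('Accuracy'));
+    OUTPUT(cv.AUC_scores, NAMED('AUC_scores'));
+*/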
+EXPORT NFoldCrossValidation(IndepDS, DepDS, LearnerName, NumFolds) := FUNCTIONMACRO
+ learner:= LearnerName;
+ #DECLARE(fields);
+ #DECLARE(ilearn) #SET(ilearn,'')
+ #DECLARE(icpd) #SET(icpd,'')
+ #DECLARE(iclass) #SET(iclass,'')
+ #EXPORTXML(fields, RECORDOF(IndepDS));
+ #FOR(fields)
+ #FOR(Field)
+ #IF(%'{@label}'% = 'value')
+ #IF(%'{@type}'% = 'integer')
+ #SET(ilearn, 'RETURN learner.LearnD(indData, depData); \n')
+ #SET(icpd, 'RETURN learner.ClassProbDistribD(indData, mod); \n')
+ #SET(iclass, 'RETURN learner.ClassifyD(indData, mod); \n')
+ #ELSE
+ #SET(ilearn, 'RETURN learner.LearnC(indData, depData); \n')
+ #SET(icpd, 'RETURN learner.ClassProbDistribC(indData, mod); \n')
+ #SET(iclass, 'RETURN learner.ClassifyC(indData, mod); \n')
+ #END
+ #END
+ #END
+ #END
+
+ AUC_Rec:= RECORD(ML.Types.NumericField)
+ DATASET(ML.Classify.AUCcurvePoint) curvePoints;
+ END;
+ idFoldRec := RECORD
+ Types.t_FieldNumber fold;
+ Types.t_RecordID id;
+ END;
+ dsRecordRnd := RECORD(ML.Types.DiscreteField)
+ Types.t_FieldNumber rnd:= 0;
+ END;
+ dsRecordRnd AddRandom(ML.Types.DiscreteField l) :=TRANSFORM
+ SELF.rnd := RANDOM();
+ SELF := l;
+ END;
+ Learn(DATASET(RECORDOF(IndepDS)) indData, DATASET(Types.DiscreteField) depData) := FUNCTION
+ #EXPAND(%'ilearn'%)
+ END;
+ ClassProbDistrib(DATASET(RECORDOF(IndepDS)) indData, DATASET(Types.NumericField) mod) := FUNCTION
+ #EXPAND(%'icpd'%)
+ END;
+ Classify(DATASET(RECORDOF(IndepDS)) indData, DATASET(Types.NumericField) mod) := FUNCTION
+ #EXPAND(%'iclass'%)
+ END;
+ FoldNDS(DATASET(RECORDOF(IndepDS)) indData, DATASET(Types.DiscreteField) depData, DATASET(idFoldRec) ds_folds, Types.t_Discrete num_fold, Types.t_RecordID baseId = 0) := MODULE
+ EXPORT trainIndep := JOIN(indData, ds_folds(fold <> num_fold), LEFT.id = RIGHT.id, TRANSFORM(RECORDOF(IndepDS), SELF.id:= LEFT.id + baseId, SELF:=LEFT), LOCAL);
+ EXPORT trainDep := JOIN(depData, ds_folds(fold <> num_fold), LEFT.id = RIGHT.id, TRANSFORM(Types.DiscreteField, SELF.id:= LEFT.id + baseId, SELF.number:= num_fold, SELF:=LEFT), LOCAL);
+ EXPORT testIndep := JOIN(indData, ds_folds(fold = num_fold), LEFT.id = RIGHT.id, TRANSFORM(RECORDOF(IndepDS), SELF.id:= LEFT.id + baseId, SELF:=LEFT), LOCAL);
+ EXPORT testDep := JOIN(depData, ds_folds(fold = num_fold), LEFT.id = RIGHT.id, TRANSFORM(Types.DiscreteField, SELF.id:= LEFT.id + baseId, SELF.number:= num_fold, SELF:=LEFT), LOCAL);
+ END;
+ toFoldResult(DATASET(ML.Types.l_result) iCPD, DATASET(ML.Types.l_result) iClass, ML.Types.t_Discrete num_fold) := MODULE
+ EXPORT CPD := PROJECT(iCPD, TRANSFORM(ML.Types.l_result, SELF.number:= num_fold; SELF:= LEFT), LOCAL);
+ EXPORT Class := PROJECT(iClass, TRANSFORM(ML.Types.l_result, SELF.number:= num_fold; SELF:= LEFT), LOCAL);
+ END;
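+  // Stratified fold assignment: sort instances by class value with a random tie-breaker, then deal them round-robin into NumFolds folds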
+ dRnd := PROJECT(DepDS, AddRandom(LEFT), LOCAL);
+ dRndSorted := SORT(dRnd,value,rnd);
+ ds_parts := DISTRIBUTE(PROJECT(dRndSorted, TRANSFORM(idFoldRec, SELF.fold := COUNTER%NumFolds + 1, SELF:= LEFT)), id);
+ dIndep := DISTRIBUTE(IndepDS, id);
+ dDep := DISTRIBUTE(DepDS, id);
+// classes := TABLE(dDep,{number, value}, number, value);
+ #DECLARE (SetString) #SET (SetString, '');
+ #DECLARE (SetLearner) #SET (SetLearner, REGEXREPLACE('[^a-zA-Z0-9_.,()]',#TEXT(LearnerName),''));
+ #DECLARE (Ndx)
+ #SET (Ndx, 1);
+ #LOOP
+ #IF (%Ndx% > NumFolds)
+ #BREAK // break out of the loop
+ #ELSE //otherwise
+ #APPEND(SetString,'fold' + %'Ndx'% + ':= FoldNDS(dIndep, dDep, ds_parts, ' + %'Ndx'% + '); \n');
+ #APPEND(SetString,'indepN' + %'Ndx'% + ':= fold' + %'Ndx'% + '.trainIndep; \n');
+ #APPEND(SetString,'depN' + %'Ndx'% + ':= fold' + %'Ndx'% + '.trainDep; \n');
+ #APPEND(SetString,'t_indepN' + %'Ndx'% + ':= fold' + %'Ndx'% + '.testIndep; \n');
+ #APPEND(SetString,'t_depN' + %'Ndx'% + ':= fold' + %'Ndx'% + '.testDep; \n');
+ #APPEND(SetString,'modN' + %'Ndx'% + ':= Learn(indepN' + %'Ndx'% + ', depN' + %'Ndx'% + '); \n');
+ #APPEND(SetString,'cpdN' + %'Ndx'% + ':= ClassProbDistrib(t_indepN' + %'Ndx'% + ', modN' + %'Ndx'% + '); \n');
+ #APPEND(SetString,'classN' + %'Ndx'% + ':= Classify(t_indepN' + %'Ndx'% + ', modN' + %'Ndx'% + '); \n');
+ #APPEND(SetString,'tfResN' + %'Ndx'% + ':= toFoldResult(cpdN' + %'Ndx'% + ', classN' + %'Ndx'% + ',' + %'Ndx'% + '); \n');
+ #SET (Ndx, %Ndx% + 1) //and increment the value of Ndx
+ #END
+ #END
+ #EXPAND(%'SetString'%);
+ #SET (Ndx, 1);
+ #DECLARE (aggCPD) #SET (aggCPD, 'allCPD:= ');
+ #DECLARE (aggClass) #SET (aggClass, 'allClass:= ');
+ #DECLARE (aggtDep) #SET (aggtDep, 'alltDep:= ');
+ #LOOP
+ #IF (%Ndx% < NumFolds)
+ #APPEND(aggCPD, 'tfResN' + %'Ndx'% + '.CPD +');
+ #APPEND(aggClass, 'tfResN' + %'Ndx'% + '.Class +');
+ #APPEND(aggtDep, 't_depN' + %'Ndx'% + ' +');
+ #SET (Ndx, %Ndx% + 1) //and increment the value of Ndx
+ #ELSE //otherwise
+ #APPEND(aggCPD, 'tfResN' + %'Ndx'% + '.CPD; \n');
+ #APPEND(aggClass, 'tfResN' + %'Ndx'% + '.Class; \n');
+ #APPEND(aggtDep, 't_depN' + %'Ndx'% + '; \n');
+ #BREAK // break out of the loop
+ #END
+ #END
+ #EXPAND(%'aggCPD'%);
+ #EXPAND(%'aggClass'%);
+ #EXPAND(%'aggtDep'%);
+
+ CVResults(DATASET(ML.Types.DiscreteField) it_depN, DATASET(ML.Types.l_result) iCPDN, DATASET(ML.Types.l_result) iclassN) := MODULE
+ tclass:= TABLE(it_depN, {value}, value);
+ fclass:= PROJECT(tclass, TRANSFORM(ML.Types.NumericField, SELF.id:= COUNTER, SELF.number:= LEFT.value, SELF.value:= 0));
+ AUC_Rec GetAUC(ML.Types.NumericField L) := TRANSFORM
+ SELF.curvePoints:= ML.Classify.AUC_ROC(iCPDN, L.number, it_depN);
+ SELF := L;
+ END;
+ AUC_class:= PROJECT(fclass, GetAUC(LEFT));
+ ML.Classify.AUCcurvePoint GetCurvePoints(ML.Classify.AUCcurvePoint R) := TRANSFORM
+ SELF := R;
+ END;
+ EXPORT AUC_curvePoints := NORMALIZE(AUC_class, LEFT.curvePoints, GetCurvePoints(RIGHT));
+ EXPORT AUC_scores := TABLE(AUC_curvePoints, {posClass, classifier, AUC_score:= MAX(GROUP, auc)}, posClass, classifier);
+ SHARED TestModule := ML.Classify.Compare(it_depN, iclassN);
+ EXPORT CrossAssignments := TestModule.CrossAssignments;
+ EXPORT RecallByClass := TestModule.RecallByClass;
+ EXPORT PrecisionByClass := TestModule.PrecisionByClass;
+ EXPORT FP_Rate_ByClass := TestModule.FP_Rate_ByClass;
+ EXPORT Accuracy := TestModule.Accuracy;
+ EXPORT CPDN := iCPDN;
+ EXPORT classN := iclassN;
+ END;
+ RETURN CVResults(alltDep, allCPD, allClass);
+ENDMACRO;
\ No newline at end of file
diff --git a/ML/Tests/Explanatory/10CrossValidation.ecl b/ML/Tests/Explanatory/10CrossValidation.ecl
new file mode 100644
index 00000000..a25113aa
--- /dev/null
+++ b/ML/Tests/Explanatory/10CrossValidation.ecl
@@ -0,0 +1,20 @@
+#option('outputLimit',100);
+IMPORT * FROM ML;
+IMPORT ML.Tests.Explanatory as TE;
+//Medium Large dataset for tests
+indep_data:= TABLE(TE.AdultDS.Train_Data,{id, Age, WorkClass, education, education_num, marital_status, occupation, relationship, race, sex, capital_gain, capital_loss, hours_per_week, native_country});
+dep_data:= TABLE(TE.AdultDS.Train_Data,{id, Outcome});
+
+ToField(indep_data, pr_indep);
+indepData := ML.Discretize.ByRounding(pr_indep);
+ToField(dep_data, pr_dep);
+depData := ML.Discretize.ByRounding(pr_dep);
+learner := Classify.RandomForest(100, 8, 1.0, 125, TRUE);
+
+cv:= NFoldCrossValidation(indepData, depData, learner, 10);
+OUTPUT(cv.CrossAssignments, NAMED('CA_part'), ALL);
+OUTPUT(cv.RecallByClass, NAMED('RecallByClass'));
+OUTPUT(cv.PrecisionByClass, NAMED('PrecByClass'));
+OUTPUT(cv.FP_Rate_ByClass, NAMED('FP_Rate_ByClass'));
+OUTPUT(cv.Accuracy, NAMED('Accuracy'));
+OUTPUT(cv.AUC_scores, NAMED('AUC_scores'));
diff --git a/ML/Tests/Explanatory/RandomForest.ecl b/ML/Tests/Explanatory/RandomForest.ecl
index bac6b107..9a50dae0 100644
--- a/ML/Tests/Explanatory/RandomForest.ecl
+++ b/ML/Tests/Explanatory/RandomForest.ecl
@@ -1,6 +1,8 @@
-IMPORT * FROM ML;
+//RandomForest.ecl
+IMPORT * FROM ML;
IMPORT ML.Tests.Explanatory as TE;
-/*
+/*
+//Tiny dataset for tests
weatherRecord := RECORD
Types.t_RecordID id;
Types.t_FieldNumber outlook;
@@ -28,25 +30,41 @@ weatherRecord);
OUTPUT(weather_Data, NAMED('weather_Data'));
indep_Data:= TABLE(weather_Data,{id, outlook, temperature, humidity, windy});
dep_Data:= TABLE(weather_Data,{id, play});
-*/
+
+//Medium dataset for tests
indep_data:= TABLE(TE.MonkDS.Train_Data,{id, a1, a2, a3, a4, a5, a6});
dep_data:= TABLE(TE.MonkDS.Train_Data,{id, class});
+*/
+
+//Medium Large dataset for tests
+indep_data:= TABLE(TE.AdultDS.Train_Data,{id, Age, WorkClass, education, education_num, marital_status, occupation, relationship, race, sex, capital_gain, capital_loss, hours_per_week, native_country});
+dep_data:= TABLE(TE.AdultDS.Train_Data,{id, Outcome});
ToField(indep_data, pr_indep);
indepData := ML.Discretize.ByRounding(pr_indep);
ToField(dep_data, pr_dep);
depData := ML.Discretize.ByRounding(pr_dep);
+/*
+// Won't work with the largest dataset; delete ", ALL" from these OUTPUTs
+// (and from the commented-out OUTPUT lines further down) before enabling them.
// Using a small dataset to facilitate understanding of algorithm
OUTPUT(indepData, NAMED('indepData'), ALL);
OUTPUT(depData, NAMED('depData'), ALL);
-//Generating a random forest of 25 trees selecting 4 features for splits using impurity:=1.0 and max depth:= 10
-learner := Classify.RandomForest(25, 4, 1.0, 10);
-result := learner.learnd(IndepData, DepData); // model to use when classifying
-OUTPUT(result,NAMED('learnd_output'), ALL); // group_id represent number of tree
+*/
+
+// Generating a random forest of 100 trees selecting 7 features for splits using impurity:=1.0 and max depth:= 100.
+//learner := Classify.RandomForest(100, 7, 1.0, 100); // GiniSplit = TRUE (default) uses Gini Impurity as split criteria
+learner := Classify.RandomForest(100, 7, 1.0, 100, FALSE); // GiniSplit = FALSE uses Info Gain Ratio as split criteria
+result := learner.LearnD(IndepData, DepData); // model to use when classifying
+// OUTPUT(result,NAMED('learnd_output'), ALL); // group_id represent number of tree
model:= learner.model(result); // transforming the model into an easier-to-read form
-OUTPUT(SORT(model, group_id, node_id),NAMED('model_ouput'), ALL); // group_id represent number of tree
+// Showing only the first 100 records ("result limit" is 100 by default)
+OUTPUT(SORT(model, group_id, node_id, value), NAMED('model_output'));
+//OUTPUT(SORT(model, group_id, node_id, value), NAMED('model_output_all'), ALL);
+// To review the whole model, use the following line instead:
+//OUTPUT(SORT(model, group_id, node_id, value),, '~user::rdnforest_model', OVERWRITE); // stored in cluster
//Class distribution for each Instance
ClassDist:= learner.ClassProbDistribD(IndepData, result);
@@ -57,6 +75,10 @@ OUTPUT(class, NAMED('class_result'), ALL); // conf show voting percentage
//Measuring Performance of Classifier
performance:= Classify.Compare(depData, class);
OUTPUT(performance.CrossAssignments, NAMED('CrossAssig'));
+OUTPUT(performance.RecallByClass, NAMED('RecallByClass'));
+OUTPUT(performance.PrecisionByClass, NAMED('PrecisionByClass'));
+OUTPUT(performance.FP_Rate_ByClass, NAMED('FP_Rate_ByClass'));
+OUTPUT(performance.Accuracy, NAMED('Accuracy'));
//AUC_ROC returns all the ROC points; the area under the curve is in the AUC field of the last record
AUC0:= Classify.AUC_ROC(ClassDist, 0, depData); //Area under ROC Curve for class "0"
OUTPUT(AUC0, ALL, NAMED('AUC_0'));