diff --git a/ML/Classify.ecl b/ML/Classify.ecl index 870a9bb2..c23140da 100644 --- a/ML/Classify.ecl +++ b/ML/Classify.ecl @@ -1526,9 +1526,10 @@ Configuration Input Purity p <= 1.0 Depth max tree level */ - EXPORT RandomForest(t_Count treeNum, t_Count fsNum, REAL Purity=1.0, INTEGER1 Depth=32):= MODULE + EXPORT RandomForest(t_Count treeNum, t_Count fsNum, REAL Purity=1.0, t_level Depth=32, BOOLEAN GiniSplit = TRUE):= MODULE EXPORT LearnD(DATASET(Types.DiscreteField) Indep, DATASET(Types.DiscreteField) Dep) := FUNCTION - nodes := Ensemble.SplitFeatureSampleGI(Indep, Dep, treeNum, fsNum, Purity, Depth); + nodes := IF(GiniSplit, Ensemble.SplitFeatureSampleGI(Indep, Dep, treeNum, fsNum, Purity, Depth), + Ensemble.SplitFeatureSampleIGR(Indep, Dep, treeNum, fsNum, Depth)); RETURN ML.Ensemble.ToDiscreteForest(nodes); END; EXPORT LearnC(DATASET(Types.NumericField) Indep, DATASET(Types.DiscreteField) Dep) := FUNCTION @@ -1561,77 +1562,84 @@ Configuration Input END; // RandomTree module /* - Area Under the ROC curve + Area Under the ROC curve - Multiple Classification Results // The function calculates the Area Under the ROC curve based on: // - classProbDist : class probability distribution for each instance // - positiveClass : the class of interest // - allDep : instance's class value - // The function returns all points of the ROC curve for graphic purposes: + // The function returns all points of the ROC curves for graphic purposes: // label: threshold, point: (threshold's false positive rate, threshold's true positive rate). - // The area under the ROC curve is returned in the AUC field of the last record. + // The area under the ROC curve is returned in the last record's AUC field for each classification result (group_id). // Note: threshold = 100 means classifying all instances as negative; it is not necessarily part of the curve */ - EXPORT AUC_ROC(DATASET(l_result) classProbDist, Types.t_Discrete positiveClass, DATASET(Types.DiscreteField) Dep) := FUNCTION + EXPORT AUCcurvePoint:= RECORD + t_Count id; + t_Discrete posClass; + t_FieldNumber classifier; + t_FieldReal thresho; + t_FieldReal fpr; + t_FieldReal tpr; + t_FieldReal deltaPos:=0; + t_FieldReal deltaNeg:=0; + t_FieldReal cumNeg:=0; + t_FieldReal AUC:=0; + END; + EXPORT AUC_ROC(DATASET(l_result) classProbDist, Types.t_Discrete positiveClass, DATASET(Types.DiscreteField) allDep) := FUNCTION SHARED cntREC:= RECORD - Types.t_FieldNumber classifier; // The classifier in question (value of 'number' on outcome data) - Types.t_Discrete c_actual; // The value of c provided - Types.t_FieldReal score :=-1; - Types.t_count tp_cnt:=0; - Types.t_count fn_cnt:=0; - Types.t_count fp_cnt:=0; - Types.t_count tn_cnt:=0; + t_FieldNumber classifier; // The classifier in question (value of 'number' on outcome data) + t_Discrete c_actual; // The value of c provided + t_FieldReal score :=-1; + t_count tp_cnt:= 0; + t_count fn_cnt:= 0; + t_count fp_cnt:= 0; + t_count tn_cnt:= 0; + t_count totPos:= 0; + t_count totNeg:= 0; END; SHARED compREC:= RECORD(cntREC) - Types.t_Discrete c_modeled; + t_Discrete c_modeled; END; classOfInterest := classProbDist(value = positiveClass); - compared:= JOIN(classOfInterest, Dep, LEFT.id=RIGHT.id AND LEFT.number=RIGHT.number, - TRANSFORM(compREC, SELF.classifier:= LEFT.number, SELF.c_actual:=RIGHT.value, - SELF.c_modeled:=LEFT.value, SELF.score:=LEFT.conf), HASH); - sortComp:= SORT(compared, score); + dCPD := DISTRIBUTE(classOfInterest, HASH(id)); + dDep := DISTRIBUTE(allDep, HASH(id)); + compared:= JOIN(dCPD, dDep,
LEFT.id=RIGHT.id AND LEFT.number=RIGHT.number, + TRANSFORM(compREC, SELF.classifier:= RIGHT.number, SELF.c_actual:=RIGHT.value, + SELF.c_modeled:= positiveClass, SELF.score:=ROUND(LEFT.conf, 14)), RIGHT OUTER, LOCAL); + sortComp:= SORT(compared, classifier, score); coi_acc:= TABLE(sortComp, {classifier, score, cntPos:= COUNT(GROUP, c_actual = c_modeled), cntNeg:= COUNT(GROUP, c_actual<>c_modeled)}, classifier, score, LOCAL); coi_tot:= TABLE(coi_acc, {classifier, totPos:= SUM(GROUP, cntPos), totNeg:= SUM(GROUP, cntNeg)}, classifier, FEW); - totPos:=EVALUATE(coi_tot[1], totPos); - totNeg:=EVALUATE(coi_tot[1], totNeg); // Count and accumulate number of TP, FP, TN and FN instances for each threshold (score) - acc_sorted:= PROJECT(coi_acc, TRANSFORM(cntREC, SELF.c_actual:= positiveClass, SELF.fn_cnt:= LEFT.cntPos, - SELF.tn_cnt:= LEFT.cntNeg, SELF:= LEFT), LOCAL); + acc_tot:= JOIN(coi_acc, coi_tot, LEFT.classifier = RIGHT.classifier, TRANSFORM(cntREC , SELF.c_actual:= positiveClass, + SELF.fn_cnt:= LEFT.cntPos, SELF.tn_cnt:= LEFT.cntNeg, + SELF.totPos:= RIGHT.totPos, SELF.totNeg:= RIGHT.totNeg, SELF:= LEFT),LOOKUP); cntREC accNegPos(cntREC l, cntREC r) := TRANSFORM - deltaPos:= l.fn_cnt + r.fn_cnt; - deltaNeg:= l.tn_cnt + r.tn_cnt; + deltaPos:= IF(l.classifier <> r.classifier, 0, l.fn_cnt) + r.fn_cnt; + deltaNeg:= IF(l.classifier <> r.classifier, 0, l.tn_cnt) + r.tn_cnt; SELF.score:= r.score; - SELF.tp_cnt:= totPos - deltaPos; + SELF.tp_cnt:= r.totPos - deltaPos; SELF.fn_cnt:= deltaPos; - SELF.fp_cnt:= totNeg - deltaNeg; + SELF.fp_cnt:= r.totNeg - deltaNeg; SELF.tn_cnt:= deltaNeg; SELF:= r; END; - cntNegPos:= ITERATE(acc_sorted, accNegPos(LEFT, RIGHT)); - accnew := DATASET([{1,positiveClass,-1,totPos,0,totNeg,0}], cntREC) + cntNegPos; - curvePoint:= RECORD - Types.t_Count id; - Types.t_FieldNumber classifier; - Types.t_FieldReal thresho; - Types.t_FieldReal fpr; - Types.t_FieldReal tpr; - Types.t_FieldReal deltaPos:=0; - Types.t_FieldReal deltaNeg:=0; - Types.t_FieldReal cumNeg:=0; - Types.t_FieldReal AUC:=0; - END; + cntNegPos:= ITERATE(acc_tot, accNegPos(LEFT, RIGHT)); + tmp:= PROJECT(coi_tot, TRANSFORM(cntREC, SELF.classifier:= LEFT.classifier, SELF.c_actual:= positiveClass, + SELF.score:= -1, SELF.tp_cnt:= LEFT.totPos, SELF.fn_cnt:= 0, + SELF.fp_cnt:= LEFT.totNeg, SELF.tn_cnt:= 0, SELF:= LEFT )); + accnew:= SORT(cntNegPos + tmp, classifier, score); // Transform all into ROC curve points - rocPoints:= PROJECT(accnew, TRANSFORM(curvePoint, SELF.id:=COUNTER, SELF.thresho:=LEFT.score, - SELF.fpr:= LEFT.fp_cnt/totNeg, SELF.tpr:= LEFT.tp_cnt/totPos, SELF.AUC:=IF(totNeg=0,1,0) ,SELF:=LEFT)); + rocPoints:= PROJECT(accnew, TRANSFORM(AUCcurvePoint, SELF.id:=COUNTER, SELF.thresho:=LEFT.score, SELF.posClass:= positiveClass, + SELF.fpr:= LEFT.fp_cnt/LEFT.totNeg, SELF.tpr:= LEFT.tp_cnt/LEFT.totPos, SELF.AUC:=IF(LEFT.totNeg=0,1,0) ,SELF:=LEFT)); // Calculate the area under the curve (cumulative iteration) - curvePoint rocArea(curvePoint l, curvePoint r) := TRANSFORM - deltaPos := if(l.tpr > r.tpr, l.tpr - r.tpr, 0.0); - deltaNeg := if( l.fpr > r.fpr, l.fpr - r.fpr, 0.0); + AUCcurvePoint rocArea(AUCcurvePoint l, AUCcurvePoint r) := TRANSFORM + deltaPos := if(l.classifier <> r.classifier, 0.0, l.tpr - r.tpr); + deltaNeg := if(l.classifier <> r.classifier, 0.0, l.fpr - r.fpr); SELF.deltaPos := deltaPos; SELF.deltaNeg := deltaNeg; // A classification without incorrectly classified instances must return AUC = 1 - SELF.AUC := IF(r.fpr=0 AND l.tpr=0 AND r.tpr=1, 1, l.AUC) + deltaPos * (l.cumNeg + 0.5* deltaNeg); 
- SELF.cumNeg := l.cumNeg + deltaNeg; + SELF.AUC := IF(l.classifier <> r.classifier, 0, IF(r.fpr=0 AND l.tpr=0 AND r.tpr=1, 1, l.AUC) + deltaPos * (l.cumNeg + 0.5* deltaNeg)); + SELF.cumNeg := IF(l.classifier <> r.classifier, 0, l.cumNeg + deltaNeg); SELF:= r; END; RETURN ITERATE(rocPoints, rocArea(LEFT, RIGHT)); diff --git a/ML/Ensemble.ecl b/ML/Ensemble.ecl index f17bc0bf..dbd3db77 100644 --- a/ML/Ensemble.ecl +++ b/ML/Ensemble.ecl @@ -61,7 +61,15 @@ EXPORT Ensemble := MODULE SELF.number := l.number; SELF.value := l.value; END; - + SHARED gNodeInstDisc init(DiscreteField dep, Sampling.idListGroupRec depG) := TRANSFORM + SELF.group_id := depG.gNum; + SELF.node_id := depG.gNum; + SELF.level := 1; + SELF.id := depG.id; + SELF.number := 1; + SELF.value := dep.value; + SELF.depend := dep.value; + END; /* Discrete implementation*/ // Function to split a set of nodes based on Feature Selection and Gini Impurity, used in Random Forest Classifier Discrete Learning, // the nodes received were generated sampling with replacement nTrees times. @@ -75,22 +83,13 @@ EXPORT Ensemble := MODULE depth := MIN(255, maxLevel); // Max number of iterations when building trees (max 256 levels) // Sampling with replacement the original dataset to generate treeNum Datasets grList0 := Sampling.GenerateNSampleList(treeNum, N); // the number of records will be N * treeNum - dgrLstOld := DISTRIBUTE(grList0, HASH(oldId)); - dDep := DISTRIBUTE(Dep, HASH(id)); + dgrLstOld := DISTRIBUTE(grList0 , HASH(oldId)); + dDep := DISTRIBUTE(Dep , HASH(id)); + dIndep := DISTRIBUTE(Indep , HASH(id)); // Generating initial node-instance recordset, all instances go to their respective tree's root - gNodeInstDisc init(Types.DiscreteField l_dep, Sampling.idListGroupRec r_depGLO) := TRANSFORM - SELF.group_id := r_depGLO.gNum; - SELF.node_id := r_depGLO.gNum; - SELF.level := 1; - SELF.id := r_depGLO.id; - SELF.number := 1; - SELF.value := l_dep.value; - SELF.depend := l_dep.value; - END; roots := JOIN(dDep, dgrLstOld, LEFT.id = RIGHT.oldId, init(LEFT, RIGHT), LOCAL); distRoots := DISTRIBUTE(roots, HASH(group_id, node_id)); // Calculated only once, used at each iteration inside loopbody - dIndep := DISTRIBUTE(Indep, HASH(id)); all_Data0 := JOIN(dIndep, dgrLstOld, LEFT.id = RIGHT.oldId, GetDRecords(LEFT, RIGHT), LOCAL); all_Data := DISTRIBUTE(all_Data0, HASH(id)); dgrLstNew := DISTRIBUTE(grList0 , HASH(id)); @@ -166,6 +165,136 @@ EXPORT Ensemble := MODULE maxlevel_leafs:= PROJECT(depCnt, TRANSFORM(gSplitD, SELF.number:=0, SELF.value:= LEFT.depend, SELF.support:= LEFT.cnt, SELF.new_node_id:=0, SELF:= LEFT)); RETURN new_nodes + maxlevel_leafs; END; + + EXPORT SplitFeatureSampleIGR(DATASET(DiscreteField) Indep, DATASET(DiscreteField) Dep, t_Count treeNum, t_Count fsNum, t_level maxLevel=32) := FUNCTION + N := MAX(Dep, id); // Number of Instances + totFeat := COUNT(Indep(id=N)); // Number of Features + depth := MIN(255, maxLevel); // Max number of iterations when building trees (max 256 levels) + // Sampling with replacement the original dataset to generate treeNum Datasets; tree roots are seeded via the module's SHARED init TRANSFORM above + grList0 := Sampling.GenerateNSampleList(treeNum, N); // the number of records will be N * treeNum + dgrLstOld := DISTRIBUTE(grList0 , HASH(oldId)); + dDep := DISTRIBUTE(Dep , HASH(id)); + depG := JOIN(dDep,
dgrLstOld, LEFT.id = RIGHT.oldId, init(LEFT, RIGHT), LOCAL); + dIndep := DISTRIBUTE(Indep , HASH(id)); + all_Data0 := JOIN(dIndep, dgrLstOld, LEFT.id = RIGHT.oldId, GetDRecords(LEFT, RIGHT), LOCAL); + // Calculated only once, used at each iteration inside loopbody + all_Data := DISTRIBUTE(all_Data0, HASH(id)); + dgrLstNew := DISTRIBUTE(grList0 , HASH(id)); + // loopbody function + gNodeInstDisc RndFeatSelPartitionGRBased(DATASET(gNodeInstDisc) nodes, t_Count p_level):= FUNCTION + dNodes := DISTRIBUTE(nodes, HASH(group_id, node_id)); + // Calculating Information Entropy of Nodes + top_dep := TABLE(dNodes , {group_id, node_id, depend, cnt:= COUNT(GROUP)}, group_id, node_id, depend, LOCAL); + top_dep_tot := TABLE(top_dep, {group_id, node_id, tot:= SUM(GROUP, cnt)} , group_id, node_id, LOCAL); + tdp := RECORD + top_dep; + REAL4 prop; // Proportion based only on dependent value + REAL4 plogp:= 0; + END; + P_Log_P(REAL P) := IF(P=1, 0, -P*LOG(P)/LOG(2)); + top_dep_p := JOIN(top_dep, top_dep_tot, LEFT.group_id = RIGHT.group_id AND LEFT.node_id = RIGHT.node_id, + TRANSFORM(tdp, SELF.prop:= LEFT.cnt/RIGHT.tot, SELF.plogp:= P_LOG_P(LEFT.cnt/RIGHT.tot), SELF:=LEFT), LOCAL); + top_info := TABLE(top_dep_p, {group_id, node_id, info:= SUM(GROUP, plogp)}, group_id, node_id, LOCAL); // Nodes Information Entropy + // Filtering pure nodes + PureNodes := top_info(info = 0); // Pure Nodes have Info Entropy = 0 + // Node-instances in pure nodes pass through + pass_thru := JOIN(top_dep, PureNodes, LEFT.group_id = RIGHT.group_id AND LEFT.node_id=RIGHT.node_id, TRANSFORM(gNodeInstDisc, + SELF.level:= p_level, SELF.depend:=LEFT.depend, SELF.support:=LEFT.cnt, SELF.id:=0, SELF.number:=0, SELF.value:=0, SELF:=LEFT), LOCAL); + // New working set after removing pass through node-instances + nodes_toSplit := DISTRIBUTE(JOIN(dnodes, PureNodes, LEFT.node_id=RIGHT.node_id, TRANSFORM(LEFT), LEFT ONLY, LOCAL), HASH(id)); + // Gather only the data needed for each LOOP iteration, + // ftSetInst preserves the dgrLstNew distribution, so it can be JOINed LOCALly with all_Data (also distributed by HASH(id)) + featSet := NxKoutofM(treeNum, fsNum, totFeat); // generating list of features selected for each tree + ftSetInst := JOIN(dgrLstNew, featSet, LEFT.gNum = RIGHT.gNum, MANY LOOKUP); + loop_Data := JOIN(all_Data, ftSetInst, LEFT.id = RIGHT.id AND LEFT.number = RIGHT.number, TRANSFORM(LEFT), LOCAL); + // Populating nodes' attributes to split + toSplit := JOIN(loop_Data, nodes_toSplit, LEFT.id = RIGHT.id, TRANSFORM(gNodeInstDisc, SELF.number:= LEFT.number; SELF.value:= LEFT.value; SELF:= RIGHT;), LOCAL); + this_set := DISTRIBUTE(toSplit, HASH(group_id, node_id)); + // Calculating Information Gain of possible splits + child := TABLE(this_set, {group_id, node_id, number, value, depend, cnt := COUNT(GROUP)}, group_id, node_id, number, value, depend, LOCAL); + child_tot := TABLE(child, {group_id, node_id, number, value, tot := SUM(GROUP, cnt)}, group_id, node_id, number, value, LOCAL); + csp := RECORD + child_tot; + REAL4 prop; + REAL4 plogp; + END; + // Calculating Intrinsic Information Entropy of each attribute (split) per node + csplit_p := JOIN(child_tot, top_dep_tot, LEFT.group_id = RIGHT.group_id AND LEFT.node_id = RIGHT.node_id, + TRANSFORM(csp, SELF.prop:= LEFT.tot/RIGHT.tot, SELF.plogp:= P_LOG_P(LEFT.tot/RIGHT.tot), SELF:=LEFT), LOCAL); + csplit := TABLE(csplit_p, {group_id, node_id, number, split_info:=SUM(GROUP, plogp)}, group_id, node_id, number, LOCAL); // Intrinsic Info + chp := RECORD + child; + REAL4 prop; // Proportion
pertaining to this dependent value + REAL4 plogp:= 0; + END; + // Information Entropy of new branches per split + cprop := JOIN(child, child_tot, LEFT.group_id = RIGHT.group_id AND LEFT.node_id=RIGHT.node_id AND LEFT.number=RIGHT.number AND LEFT.value = RIGHT.value, + TRANSFORM(chp, SELF.prop := LEFT.cnt/RIGHT.tot, SELF.plogp:= P_LOG_P(LEFT.cnt/RIGHT.tot), SELF:=LEFT), LOCAL); + cplogp := TABLE(cprop, {group_id, node_id, number, value, cont:= SUM(GROUP,cnt), inf0:= SUM(GROUP, plogp)}, group_id, node_id, number, value, LOCAL); + // Information Entropy of possible splits per node + cinfo := TABLE(cplogp, {group_id, node_id, number, info:=SUM(GROUP, cont*inf0)/SUM(GROUP, cont)}, group_id, node_id, number, LOCAL); + gainRec := RECORD + t_count group_id; + t_node node_id; + t_Discrete number; + REAL4 gain; + END; + // Information Gain of possible splits per node + gain := JOIN(cinfo, top_info, LEFT.group_id = RIGHT.group_id AND LEFT.node_id=RIGHT.node_id, + TRANSFORM(gainRec, SELF.gain:= RIGHT.info - LEFT.info, SELF:= LEFT), LOCAL); + gainRateRec := RECORD + t_count group_id; + t_node node_id; + t_Discrete number; + REAL4 gain_ratio; + END; + // Information Gain Ratio of possible splits per node + gainRatio := JOIN(gain, csplit, LEFT.group_id = RIGHT.group_id AND LEFT.node_id=RIGHT.node_id AND LEFT.number=RIGHT.number, + TRANSFORM(gainRateRec, SELF.gain_ratio:= LEFT.gain/RIGHT.split_info, SELF:= LEFT), LOCAL); + // Selecting the split with max Info Gain Ratio per node + split := DEDUP(SORT(gainRatio, group_id, node_id, -gain_ratio, LOCAL), group_id, node_id, LOCAL); + + // new split nodes found + new_spl0 := JOIN(child_tot, split, LEFT.group_id = RIGHT.group_id AND LEFT.node_id = RIGHT.node_id AND LEFT.number = RIGHT.number, TRANSFORM(LEFT), LOCAL); + node_base := MAX(nodes, node_id); + new_split := PROJECT(new_spl0, TRANSFORM(gNodeInstDisc, SELF.value:= node_base + COUNTER; SELF.depend:= LEFT.value; + SELF.level:= p_level; SELF.support:= LEFT.tot; SELF := LEFT; SELF := [];)); + // reassigning instances to new nodes + node_inst := JOIN(this_set, new_split, LEFT.group_id = RIGHT.group_id AND LEFT.node_id=RIGHT.node_id AND LEFT.number=RIGHT.number AND LEFT.value=RIGHT.depend, + TRANSFORM(gNodeInstDisc, SELF.node_id:=RIGHT.value, SELF.level:= RIGHT.level +1, SELF.value:= LEFT.depend, SELF:= LEFT ), LOCAL); + RETURN pass_thru + new_split + node_inst; // returning leaf nodes, new split nodes and reassigned instances + END; + // generating the best feature_selection-info_gain_ratio splits; the LOOP filter level = COUNTER lets through only the nodes still to be split at the current level + res := LOOP(depG, LEFT.level=COUNTER, COUNTER < depth, RndFeatSelPartitionGRBased(ROWS(LEFT), COUNTER)); + // Turning LOOP results into splits and leaf nodes + gSplitD toNewNode(gNodeInstDisc NodeInst) := TRANSFORM + SELF.new_node_id := IF(NodeInst.number>0, NodeInst.value, 0); + SELF.number := IF(NodeInst.number>0, NodeInst.number, 0); + SELF.value := NodeInst.depend; + SELF:= NodeInst; + END; + new_nodes:= PROJECT(res(id=0), toNewNode(LEFT)); // node splits and leaf nodes + // Taking care of instances (id>0) that reached maximum level and did not turn into a leaf yet + mode_r := RECORD + res.group_id; + res.node_id; + res.level; + res.depend; + Cnt := COUNT(GROUP); + END; + depCnt := TABLE(res(id>0),mode_r, group_id, node_id, level, depend, FEW); + maxlevel_leafs:= PROJECT(depCnt, TRANSFORM(gSplitD, SELF.number:=0, SELF.value:= LEFT.depend, SELF.support:= LEFT.cnt, SELF.new_node_id:=0, SELF:= LEFT)); + RETURN new_nodes + maxlevel_leafs; + END; EXPORT
FromDiscreteForest(DATASET(NumericField) mod) := FUNCTION FromField(mod, gSplitD,o, modelD_Map); RETURN o; diff --git a/ML/NFoldCrossValidation.ecl b/ML/NFoldCrossValidation.ecl new file mode 100644 index 00000000..dbd78842 --- /dev/null +++ b/ML/NFoldCrossValidation.ecl @@ -0,0 +1,130 @@ +IMPORT * FROM ML; +IMPORT * FROM ML.Types; +EXPORT NFoldCrossValidation(IndepDS, DepDS, LearnerName, NumFolds) := FUNCTIONMACRO + learner:= LearnerName; + #DECLARE(fields); + #DECLARE(ilearn) #SET(ilearn,'') + #DECLARE(icpd) #SET(icpd,'') + #DECLARE(iclass) #SET(iclass,'') + #EXPORTXML(fields, RECORDOF(IndepDS)); + #FOR(fields) + #FOR(Field) + #IF(%'{@label}'% = 'value') + #IF(%'{@type}'% = 'integer') + #SET(ilearn, 'RETURN learner.LearnD(indData, depData); \n') + #SET(icpd, 'RETURN learner.ClassProbDistribD(indData, mod); \n') + #SET(iclass, 'RETURN learner.ClassifyD(indData, mod); \n') + #ELSE + #SET(ilearn, 'RETURN learner.LearnC(indData, depData); \n') + #SET(icpd, 'RETURN learner.ClassProbDistribC(indData, mod); \n') + #SET(iclass, 'RETURN learner.ClassifyC(indData, mod); \n') + #END + #END + #END + #END + + AUC_Rec:= RECORD(ML.Types.NumericField) + DATASET(ML.Classify.AUCcurvePoint) curvePoints; + END; + idFoldRec := RECORD + Types.t_FieldNumber fold; + Types.t_RecordID id; + END; + dsRecordRnd := RECORD(ML.Types.DiscreteField) + Types.t_FieldNumber rnd:= 0; + END; + dsRecordRnd AddRandom(ML.Types.DiscreteField l) :=TRANSFORM + SELF.rnd := RANDOM(); + SELF := l; + END; + Learn(DATASET(RECORDOF(IndepDS)) indData, DATASET(Types.DiscreteField) depData) := FUNCTION + #EXPAND(%'ilearn'%) + END; + ClassProbDistrib(DATASET(RECORDOF(IndepDS)) indData, DATASET(Types.NumericField) mod) := FUNCTION + #EXPAND(%'icpd'%) + END; + Classify(DATASET(RECORDOF(IndepDS)) indData, DATASET(Types.NumericField) mod) := FUNCTION + #EXPAND(%'iclass'%) + END; + FoldNDS(DATASET(RECORDOF(IndepDS)) indData, DATASET(Types.DiscreteField) depData, DATASET(idFoldRec) ds_folds, Types.t_Discrete num_fold, Types.t_RecordID baseId = 0) := MODULE + EXPORT trainIndep := JOIN(indData, ds_folds(fold <> num_fold), LEFT.id = RIGHT.id, TRANSFORM(RECORDOF(IndepDS), SELF.id:= LEFT.id + baseId, SELF:=LEFT), LOCAL); + EXPORT trainDep := JOIN(depData, ds_folds(fold <> num_fold), LEFT.id = RIGHT.id, TRANSFORM(Types.DiscreteField, SELF.id:= LEFT.id + baseId, SELF.number:= num_fold, SELF:=LEFT), LOCAL); + EXPORT testIndep := JOIN(indData, ds_folds(fold = num_fold), LEFT.id = RIGHT.id, TRANSFORM(RECORDOF(IndepDS), SELF.id:= LEFT.id + baseId, SELF:=LEFT), LOCAL); + EXPORT testDep := JOIN(depData, ds_folds(fold = num_fold), LEFT.id = RIGHT.id, TRANSFORM(Types.DiscreteField, SELF.id:= LEFT.id + baseId, SELF.number:= num_fold, SELF:=LEFT), LOCAL); + END; + toFoldResult(DATASET(ML.Types.l_result) iCPD, DATASET(ML.Types.l_result) iClass, ML.Types.t_Discrete num_fold) := MODULE + EXPORT CPD := PROJECT(iCPD, TRANSFORM(ML.Types.l_result, SELF.number:= num_fold; SELF:= LEFT), LOCAL); + EXPORT Class := PROJECT(iClass, TRANSFORM(ML.Types.l_result, SELF.number:= num_fold; SELF:= LEFT), LOCAL); + END; + dRnd := PROJECT(DepDS, AddRandom(LEFT), LOCAL); + dRndSorted := SORT(dRnd,value,rnd); + ds_parts := DISTRIBUTE(PROJECT(dRndSorted, TRANSFORM(idFoldRec, SELF.fold := COUNTER%NumFolds + 1, SELF:= LEFT)), id); + dIndep := DISTRIBUTE(IndepDS, id); + dDep := DISTRIBUTE(DepDS, id); +// classes := TABLE(dDep,{number, value}, number, value); + #DECLARE (SetString) #SET (SetString, ''); + #DECLARE (SetLearner) #SET (SetLearner, 
REGEXREPLACE('[^a-zA-Z0-9_.,()]',#TEXT(LearnerName),'')); + #DECLARE (Ndx) + #SET (Ndx, 1); + #LOOP + #IF (%Ndx% > NumFolds) + #BREAK // break out of the loop + #ELSE //otherwise + #APPEND(SetString,'fold' + %'Ndx'% + ':= FoldNDS(dIndep, dDep, ds_parts, ' + %'Ndx'% + '); \n'); + #APPEND(SetString,'indepN' + %'Ndx'% + ':= fold' + %'Ndx'% + '.trainIndep; \n'); + #APPEND(SetString,'depN' + %'Ndx'% + ':= fold' + %'Ndx'% + '.trainDep; \n'); + #APPEND(SetString,'t_indepN' + %'Ndx'% + ':= fold' + %'Ndx'% + '.testIndep; \n'); + #APPEND(SetString,'t_depN' + %'Ndx'% + ':= fold' + %'Ndx'% + '.testDep; \n'); + #APPEND(SetString,'modN' + %'Ndx'% + ':= Learn(indepN' + %'Ndx'% + ', depN' + %'Ndx'% + '); \n'); + #APPEND(SetString,'cpdN' + %'Ndx'% + ':= ClassProbDistrib(t_indepN' + %'Ndx'% + ', modN' + %'Ndx'% + '); \n'); + #APPEND(SetString,'classN' + %'Ndx'% + ':= Classify(t_indepN' + %'Ndx'% + ', modN' + %'Ndx'% + '); \n'); + #APPEND(SetString,'tfResN' + %'Ndx'% + ':= toFoldResult(cpdN' + %'Ndx'% + ', classN' + %'Ndx'% + ',' + %'Ndx'% + '); \n'); + #SET (Ndx, %Ndx% + 1) //and increment the value of Ndx + #END + #END + #EXPAND(%'SetString'%); + #SET (Ndx, 1); + #DECLARE (aggCPD) #SET (aggCPD, 'allCPD:= '); + #DECLARE (aggClass) #SET (aggClass, 'allClass:= '); + #DECLARE (aggtDep) #SET (aggtDep, 'alltDep:= '); + #LOOP + #IF (%Ndx% < NumFolds) + #APPEND(aggCPD, 'tfResN' + %'Ndx'% + '.CPD +'); + #APPEND(aggClass, 'tfResN' + %'Ndx'% + '.Class +'); + #APPEND(aggtDep, 't_depN' + %'Ndx'% + ' +'); + #SET (Ndx, %Ndx% + 1) //and increment the value of Ndx + #ELSE //otherwise + #APPEND(aggCPD, 'tfResN' + %'Ndx'% + '.CPD; \n'); + #APPEND(aggClass, 'tfResN' + %'Ndx'% + '.Class; \n'); + #APPEND(aggtDep, 't_depN' + %'Ndx'% + '; \n'); + #BREAK // break out of the loop + #END + #END + #EXPAND(%'aggCPD'%); + #EXPAND(%'aggClass'%); + #EXPAND(%'aggtDep'%); + + CVResults(DATASET(ML.Types.DiscreteField) it_depN, DATASET(ML.Types.l_result) iCPDN, DATASET(ML.Types.l_result) iclassN) := MODULE + tclass:= TABLE(it_depN, {value}, value); + fclass:= PROJECT(tclass, TRANSFORM(ML.Types.NumericField, SELF.id:= COUNTER, SELF.number:= LEFT.value, SELF.value:= 0)); + AUC_Rec GetAUC(ML.Types.NumericField L) := TRANSFORM + SELF.curvePoints:= ML.Classify.AUC_ROC(iCPDN, L.number, it_depN); + SELF := L; + END; + AUC_class:= PROJECT(fclass, GetAUC(LEFT)); + ML.Classify.AUCcurvePoint GetCurvePoints(ML.Classify.AUCcurvePoint R) := TRANSFORM + SELF := R; + END; + EXPORT AUC_curvePoints := NORMALIZE(AUC_class, LEFT.curvePoints, GetCurvePoints(RIGHT)); + EXPORT AUC_scores := TABLE(AUC_curvePoints, {posClass, classifier, AUC_score:= MAX(GROUP, auc)}, posClass, classifier); + SHARED TestModule := ML.Classify.Compare(it_depN, iclassN); + EXPORT CrossAssignments := TestModule.CrossAssignments; + EXPORT RecallByClass := TestModule.RecallByClass; + EXPORT PrecisionByClass := TestModule.PrecisionByClass; + EXPORT FP_Rate_ByClass := TestModule.FP_Rate_ByClass; + EXPORT Accuracy := TestModule.Accuracy; + EXPORT CPDN := iCPDN; + EXPORT classN := iclassN; + END; + RETURN CVResults(alltDep, allCPD, allClass); +ENDMACRO; \ No newline at end of file diff --git a/ML/Tests/Explanatory/10CrossValidation.ecl b/ML/Tests/Explanatory/10CrossValidation.ecl new file mode 100644 index 00000000..a25113aa --- /dev/null +++ b/ML/Tests/Explanatory/10CrossValidation.ecl @@ -0,0 +1,20 @@ +#option('outputLimit',100); +IMPORT * FROM ML; +IMPORT ML.Tests.Explanatory as TE; +//Medium Large dataset for tests +indep_data:= TABLE(TE.AdultDS.Train_Data,{id, Age, WorkClass, education, 
education_num, marital_status, occupation, relationship, race, sex, capital_gain, capital_loss, hours_per_week, native_country}); +dep_data:= TABLE(TE.AdultDS.Train_Data,{id, Outcome}); + +ToField(indep_data, pr_indep); +indepData := ML.Discretize.ByRounding(pr_indep); +ToField(dep_data, pr_dep); +depData := ML.Discretize.ByRounding(pr_dep); +learner := Classify.RandomForest(100, 8, 1.0, 125, TRUE); + +cv:= NFoldCrossValidation(indepData, depData, learner, 10); +OUTPUT(cv.CrossAssignments, NAMED('CA_part'), ALL); +OUTPUT(cv.RecallByClass, NAMED('RecallByClass')); +OUTPUT(cv.PrecisionByClass, NAMED('PrecByClass')); +OUTPUT(cv.FP_Rate_ByClass, NAMED('FP_Rate_ByClass')); +OUTPUT(cv.Accuracy, NAMED('Accuracy')); +OUTPUT(cv.AUC_scores, NAMED('AUC_scores')); diff --git a/ML/Tests/Explanatory/RandomForest.ecl b/ML/Tests/Explanatory/RandomForest.ecl index bac6b107..9a50dae0 100644 --- a/ML/Tests/Explanatory/RandomForest.ecl +++ b/ML/Tests/Explanatory/RandomForest.ecl @@ -1,6 +1,8 @@ -IMPORT * FROM ML; +//RandomForest.ecl +IMPORT * FROM ML; IMPORT ML.Tests.Explanatory as TE; -/* +/* +//Tiny dataset for tests weatherRecord := RECORD Types.t_RecordID id; Types.t_FieldNumber outlook; @@ -28,25 +30,41 @@ weatherRecord); OUTPUT(weather_Data, NAMED('weather_Data')); indep_Data:= TABLE(weather_Data,{id, outlook, temperature, humidity, windy}); dep_Data:= TABLE(weather_Data,{id, play}); -*/ + +//Medium dataset for tests indep_data:= TABLE(TE.MonkDS.Train_Data,{id, a1, a2, a3, a4, a5, a6}); dep_data:= TABLE(TE.MonkDS.Train_Data,{id, class}); +*/ + +//Medium Large dataset for tests +indep_data:= TABLE(TE.AdultDS.Train_Data,{id, Age, WorkClass, education, education_num, marital_status, occupation, relationship, race, sex, capital_gain, capital_loss, hours_per_week, native_country}); +dep_data:= TABLE(TE.AdultDS.Train_Data,{id, Outcome}); ToField(indep_data, pr_indep); indepData := ML.Discretize.ByRounding(pr_indep); ToField(dep_data, pr_dep); depData := ML.Discretize.ByRounding(pr_dep); +/* +// Won't work with the largest dataset unless ", ALL" is deleted; +// the same applies to the commented lines further down that use ", ALL" // Using a small dataset to facilitate understanding of algorithm OUTPUT(indepData, NAMED('indepData'), ALL); OUTPUT(depData, NAMED('depData'), ALL); -//Generating a random forest of 25 trees selecting 4 features for splits using impurity:=1.0 and max depth:= 10 -learner := Classify.RandomForest(25, 4, 1.0, 10); -result := learner.learnd(IndepData, DepData); // model to use when classifying -OUTPUT(result,NAMED('learnd_output'), ALL); // group_id represent number of tree +*/ + +// Generating a random forest of 100 trees selecting 7 features for splits using impurity:=1.0 and max depth:= 100.
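+// The fifth parameter (GiniSplit) selects the split criterion: TRUE (the default) routes LearnD through +// Ensemble.SplitFeatureSampleGI (Gini Impurity, which honours the Purity threshold), while FALSE routes it +// through Ensemble.SplitFeatureSampleIGR (Information Gain Ratio; Purity is not passed on that path).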
+//learner := Classify.RandomForest(100, 7, 1.0, 100); // GiniSplit = TRUE (default) uses Gini Impurity as the split criterion +learner := Classify.RandomForest(100, 7, 1.0, 100, FALSE); // GiniSplit = FALSE uses Info Gain Ratio as the split criterion +result := learner.LearnD(IndepData, DepData); // model to use when classifying +// OUTPUT(result,NAMED('learnd_output'), ALL); // group_id represents the tree number model:= learner.model(result); // transforming the model into an easier-to-read form -OUTPUT(SORT(model, group_id, node_id),NAMED('model_ouput'), ALL); // group_id represent number of tree +// Showing only the first 100 records ("result limit" is 100 by default) +OUTPUT(SORT(model, group_id, node_id, value), NAMED('model_output') ); +//OUTPUT(SORT(model, group_id, node_id, value), NAMED('model_output_all'), ALL); +// To review the whole model use the following line instead: +//OUTPUT(SORT(model, group_id, node_id, value),, '~user::rdnforest_model', OVERWRITE); // stored in cluster //Class distribution for each Instance ClassDist:= learner.ClassProbDistribD(IndepData, result); @@ -57,6 +75,10 @@ OUTPUT(class, NAMED('class_result'), ALL); // conf shows the voting percentage //Measuring Performance of Classifier performance:= Classify.Compare(depData, class); OUTPUT(performance.CrossAssignments, NAMED('CrossAssig')); +OUTPUT(performance.RecallByClass, NAMED('RecallByClass')); +OUTPUT(performance.PrecisionByClass, NAMED('PrecisionByClass')); +OUTPUT(performance.FP_Rate_ByClass, NAMED('FP_Rate_ByClass')); +OUTPUT(performance.Accuracy, NAMED('Accuracy')); //AUC_ROC returns all the ROC points and the value of the Area under the curve in the LAST_RECORD(AUC FIELD) AUC0:= Classify.AUC_ROC(ClassDist, 0, depData); //Area under ROC Curve for class "0" OUTPUT(AUC0, ALL, NAMED('AUC_0'));
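Reviewer note (not part of the patch): below is a minimal, standalone ECL sketch of the cumulative area step that rocArea performs as ITERATE walks the sorted ROC points, useful for checking the arithmetic by hand. Every name here (rocRec, grp, toy, accum) and all four ROC points are hypothetical toy values, not library code; the grp guard mirrors the patch's l.classifier <> r.classifier test, which also neutralises the empty first LEFT row that ITERATE supplies.

// Toy accumulation mirroring rocArea: the area grows by deltaPos * (cumNeg + 0.5 * deltaNeg)
rocRec := RECORD
  UNSIGNED1 grp;   // stands in for the classifier id; 0 only in ITERATE's blank first LEFT
  REAL fpr;        // false positive rate at this threshold
  REAL tpr;        // true positive rate at this threshold
  REAL cumNeg;     // running FPR mass already consumed
  REAL auc;        // running area; the last record of each grp holds the total
END;
// Points ordered from the lowest threshold (1,1) down to (0,0), the order AUC_ROC emits
toy := DATASET([{1, 1.0, 1.0, 0, 0},
                {1, 0.5, 0.9, 0, 0},
                {1, 0.25, 0.6, 0, 0},
                {1, 0.0, 0.0, 0, 0}], rocRec);
rocRec accum(rocRec l, rocRec r) := TRANSFORM
  samegrp  := l.grp = r.grp;                    // FALSE on the first row and on a classifier change
  deltaPos := IF(samegrp, l.tpr - r.tpr, 0.0);  // TPR consumed by this threshold step
  deltaNeg := IF(samegrp, l.fpr - r.fpr, 0.0);  // FPR consumed by this threshold step
  SELF.auc    := IF(samegrp, l.auc, 0.0) + deltaPos * (l.cumNeg + 0.5 * deltaNeg);
  SELF.cumNeg := IF(samegrp, l.cumNeg, 0.0) + deltaNeg;
  SELF := r;
END;
OUTPUT(ITERATE(toy, accum(LEFT, RIGHT)), NAMED('toy_auc'));

For these toy points the final record's auc evaluates to 0.7375, the same value the plain trapezoidal rule gives for the polyline through (0,0), (0.25,0.6), (0.5,0.9), (1,1); a perfect classifier (points (1,1), (0,1), (0,0)) accumulates exactly 1.0.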