Skip to content

Commit

Permalink
ch6
Browse files Browse the repository at this point in the history
  • Loading branch information
JohnMount committed Dec 11, 2015
1 parent f8f9f2b commit 84d9444
Show file tree
Hide file tree
Showing 16 changed files with 1,170 additions and 107 deletions.
Binary file modified CodeExamples.zip
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# (example 6.3 of section 6.2.1) : Memorization methods : Building single-variable models : Using categorical features
# Title: Churn rates grouped by variable 218 codes

> print(table218[,2]/(table218[,1]+table218[,2]))
cJvF UYBR <NA>
0.05994389 0.08223821 0.26523297
# Churn rate per level of Var218: second column of the contingency
# table over the row total. NOTE(review): the <NA> level's rate
# (~0.265) is far above the coded levels — missingness itself looks
# informative; presumably table218 is table(Var218, churn), confirm
# against the construction upstream.
print(table218[,2]/(table218[,1]+table218[,2]))
## cJvF       UYBR       <NA>
## 0.05994389 0.08223821 0.26523297

Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@

library('ROCR')

> calcAUC <- function(predcol,outcol) {
# Compute the AUC of a score column against an outcome column using
# ROCR. Positive class is whatever the global `pos` holds — the
# outcome is binarized with outcol == pos before scoring.
calcAUC <- function(predcol, outcol) {
  pred_obj <- prediction(predcol, outcol == pos)  # ROCR prediction object
  auc_perf <- performance(pred_obj, 'auc')        # extract the AUC measure
  as.numeric(auc_perf@y.values)                   # unwrap from the S4 slot
}

> for(v in catVars) {
for(v in catVars) {
pi <- paste('pred',v,sep='')
aucTrain <- calcAUC(dTrain[,pi],dTrain[,outcome])
if(aucTrain>=0.8) {
Expand All @@ -18,8 +18,8 @@ library('ROCR')
pi,aucTrain,aucCal))
}
}
[1] "predVar200, trainAUC: 0.828 calibrationAUC: 0.527"
[1] "predVar202, trainAUC: 0.829 calibrationAUC: 0.522"
[1] "predVar214, trainAUC: 0.828 calibrationAUC: 0.527"
[1] "predVar217, trainAUC: 0.898 calibrationAUC: 0.553"
## [1] "predVar200, trainAUC: 0.828 calibrationAUC: 0.527"
## [1] "predVar202, trainAUC: 0.829 calibrationAUC: 0.522"
## [1] "predVar214, trainAUC: 0.828 calibrationAUC: 0.527"
## [1] "predVar217, trainAUC: 0.898 calibrationAUC: 0.553"

Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,14 @@
# (example 6.7 of section 6.2.2) : Memorization methods : Building single-variable models : Using numeric features
# Title: Scoring numeric variables by AUC

> mkPredN <- function(outCol,varCol,appCol) {
# Score a numeric variable by discretizing it into (up to) decile bins
# and reusing the categorical single-variable model mkPredC().
#   outCol: training outcome vector.
#   varCol: training numeric column; its quantiles define the bin cuts.
#   appCol: numeric column to score (train, calibration, or test).
# Returns the per-row prediction vector produced by mkPredC().
mkPredN <- function(outCol, varCol, appCol) {
  # unique() guards against duplicate quantiles from heavily tied data.
  # na.rm=TRUE (spelled out — T is reassignable) so NAs don't poison
  # the cut points.
  cuts <- unique(as.numeric(quantile(varCol,
     probs=seq(0, 1, 0.1), na.rm=TRUE)))
  varC <- cut(varCol, cuts)  # discretized training column
  appC <- cut(appCol, cuts)  # same bins applied to the application data
  mkPredC(outCol, varC, appC)  # treat the bins as a categorical variable
}
> for(v in numericVars) {
for(v in numericVars) {
pi <- paste('pred',v,sep='')
dTrain[,pi] <- mkPredN(dTrain[,outcome],dTrain[,v],dTrain[,v])
dTest[,pi] <- mkPredN(dTrain[,outcome],dTrain[,v],dTest[,v])
Expand All @@ -21,14 +21,14 @@
pi,aucTrain,aucCal))
}
}
[1] "predVar6, trainAUC: 0.557 calibrationAUC: 0.554"
[1] "predVar7, trainAUC: 0.555 calibrationAUC: 0.565"
[1] "predVar13, trainAUC: 0.568 calibrationAUC: 0.553"
[1] "predVar73, trainAUC: 0.608 calibrationAUC: 0.616"
[1] "predVar74, trainAUC: 0.574 calibrationAUC: 0.566"
[1] "predVar81, trainAUC: 0.558 calibrationAUC: 0.542"
[1] "predVar113, trainAUC: 0.557 calibrationAUC: 0.567"
[1] "predVar126, trainAUC: 0.635 calibrationAUC: 0.629"
[1] "predVar140, trainAUC: 0.561 calibrationAUC: 0.560"
[1] "predVar189, trainAUC: 0.574 calibrationAUC: 0.599"
## [1] "predVar6, trainAUC: 0.557 calibrationAUC: 0.554"
## [1] "predVar7, trainAUC: 0.555 calibrationAUC: 0.565"
## [1] "predVar13, trainAUC: 0.568 calibrationAUC: 0.553"
## [1] "predVar73, trainAUC: 0.608 calibrationAUC: 0.616"
## [1] "predVar74, trainAUC: 0.574 calibrationAUC: 0.566"
## [1] "predVar81, trainAUC: 0.558 calibrationAUC: 0.542"
## [1] "predVar113, trainAUC: 0.557 calibrationAUC: 0.567"
## [1] "predVar126, trainAUC: 0.635 calibrationAUC: 0.629"
## [1] "predVar140, trainAUC: 0.561 calibrationAUC: 0.560"
## [1] "predVar189, trainAUC: 0.574 calibrationAUC: 0.599"

Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# (example 6.8 of section 6.2.2) : Memorization methods : Building single-variable models : Using numeric features
# Title: Plotting variable performance

library('ggplot2')
# Density of the single-variable score predVar126 on the calibration
# set, one curve per churn outcome: visible separation between the
# curves indicates the variable carries signal.
ggplot(data=dCal) +
   geom_density(aes(x=predVar126,color=as.factor(churn)))

Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,19 @@
# (example 6.9 of section 6.2.3) : Memorization methods : Building single-variable models : Using cross-validation to estimate effects of overfitting
# Title: Running a repeated cross-validation experiment

> var <- 'Var217'
> aucs <- rep(0,100)
> for(rep in 1:length(aucs)) { # Note: 1
var <- 'Var217'
aucs <- rep(0, 100)
# Repeat a random 90/10 train/calibration split 100 times to estimate
# the distribution of the calibration AUC for this one variable.
# Loop index renamed from 'rep' (which shadowed base::rep right after
# calling it above) to 'i'; seq_along() is safer than 1:length().
for(i in seq_along(aucs)) { # Note: 1
   useForCalRep <- rbinom(n=dim(dTrainAll)[[1]], size=1, prob=0.1) > 0 # Note: 2
   predRep <- mkPredC(dTrainAll[!useForCalRep, outcome], # Note: 3
      dTrainAll[!useForCalRep, var],
      dTrainAll[useForCalRep, var])
   aucs[i] <- calcAUC(predRep, dTrainAll[useForCalRep, outcome]) # Note: 4
}
> mean(aucs)
[1] 0.5556656
> sd(aucs)
[1] 0.01569345
mean(aucs)
## [1] 0.5556656
sd(aucs)
## [1] 0.01569345

# Note 1:
# For 100 iterations...
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@
# (example 6.10 of section 6.2.3) : Memorization methods : Building single-variable models : Using cross-validation to estimate effects of overfitting
# Title: Empirically cross-validating performance

> fCross <- function() {
# One random 90/10 train/calibration split of dTrainAll, scored by
# calibration AUC. Reads the globals dTrainAll, outcome, and var;
# packaged as a function so replicate() can repeat the experiment.
fCross <- function() {
  inCal <- rbinom(n=nrow(dTrainAll), size=1, prob=0.1) > 0
  calPred <- mkPredC(dTrainAll[!inCal, outcome],
     dTrainAll[!inCal, var],
     dTrainAll[inCal, var])
  calcAUC(calPred, dTrainAll[inCal, outcome])
}
> aucs <- replicate(100,fCross())
aucs <- replicate(100,fCross())

Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,14 @@
# (example 6.13 of section 6.3.2) : Memorization methods : Building models using many variables : Using decision trees
# Title: Building a bad decision tree

> library('rpart')
> fV <- paste(outcome,'>0 ~ ',
library('rpart')
fV <- paste(outcome,'>0 ~ ',
paste(c(catVars,numericVars),collapse=' + '),sep='')
> tmodel <- rpart(fV,data=dTrain)
> print(calcAUC(predict(tmodel,newdata=dTrain),dTrain[,outcome]))
[1] 0.9241265
> print(calcAUC(predict(tmodel,newdata=dTest),dTest[,outcome]))
[1] 0.5266172
> print(calcAUC(predict(tmodel,newdata=dCal),dCal[,outcome]))
[1] 0.5126917
tmodel <- rpart(fV,data=dTrain)
print(calcAUC(predict(tmodel,newdata=dTrain),dTrain[,outcome]))
## [1] 0.9241265
print(calcAUC(predict(tmodel,newdata=dTest),dTest[,outcome]))
## [1] 0.5266172
print(calcAUC(predict(tmodel,newdata=dCal),dCal[,outcome]))
## [1] 0.5126917

Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@
# (example 6.14 of section 6.3.2) : Memorization methods : Building models using many variables : Using decision trees
# Title: Building another bad decision tree

> tVars <- paste('pred',c(catVars,numericVars),sep='')
> fV2 <- paste(outcome,'>0 ~ ',paste(tVars,collapse=' + '),sep='')
> tmodel <- rpart(fV2,data=dTrain)
> print(calcAUC(predict(tmodel,newdata=dTrain),dTrain[,outcome]))
[1] 0.928669
> print(calcAUC(predict(tmodel,newdata=dTest),dTest[,outcome]))
[1] 0.5390648
> print(calcAUC(predict(tmodel,newdata=dCal),dCal[,outcome]))
[1] 0.5384152
# Build a formula over the single-variable model columns ('pred' +
# variable name) instead of the raw variables, then fit a tree.
# paste0() replaces the equivalent paste(..., sep='') idiom.
tVars <- paste0('pred', c(catVars, numericVars))
fV2 <- paste0(outcome, '>0 ~ ', paste(tVars, collapse=' + '))
tmodel <- rpart(fV2, data=dTrain)
print(calcAUC(predict(tmodel,newdata=dTrain),dTrain[,outcome]))
## [1] 0.928669
print(calcAUC(predict(tmodel,newdata=dTest),dTest[,outcome]))
## [1] 0.5390648
print(calcAUC(predict(tmodel,newdata=dCal),dCal[,outcome]))
## [1] 0.5384152

Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,14 @@
# (example 6.15 of section 6.3.2) : Memorization methods : Building models using many variables : Using decision trees
# Title: Building yet another bad decision tree

> tmodel <- rpart(fV2,data=dTrain,
# Refit the tree with explicit complexity controls to curb overfitting:
# cp=0.001 (minimum complexity gain per split), minsplit/minbucket=1000
# (no split on, or leaf with, fewer than 1000 rows), maxdepth=5.
tmodel <- rpart(fV2,data=dTrain,
   control=rpart.control(cp=0.001,minsplit=1000,
      minbucket=1000,maxdepth=5)
)
> print(calcAUC(predict(tmodel,newdata=dTrain),dTrain[,outcome]))
[1] 0.9421195
> print(calcAUC(predict(tmodel,newdata=dTest),dTest[,outcome]))
[1] 0.5794633
> print(calcAUC(predict(tmodel,newdata=dCal),dCal[,outcome]))
[1] 0.547967
print(calcAUC(predict(tmodel,newdata=dTrain),dTrain[,outcome]))
## [1] 0.9421195
print(calcAUC(predict(tmodel,newdata=dTest),dTest[,outcome]))
## [1] 0.5794633
print(calcAUC(predict(tmodel,newdata=dCal),dCal[,outcome]))
## [1] 0.547967

Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@
# Title: Building a better decision tree

f <- paste(outcome,'>0 ~ ',paste(selVars,collapse=' + '),sep='')
> tmodel <- rpart(f,data=dTrain,
tmodel <- rpart(f,data=dTrain,
control=rpart.control(cp=0.001,minsplit=1000,
minbucket=1000,maxdepth=5)
)
> print(calcAUC(predict(tmodel,newdata=dTrain),dTrain[,outcome]))
[1] 0.6906852
> print(calcAUC(predict(tmodel,newdata=dTest),dTest[,outcome]))
[1] 0.6843595
> print(calcAUC(predict(tmodel,newdata=dCal),dCal[,outcome]))
[1] 0.6669301
print(calcAUC(predict(tmodel,newdata=dTrain),dTrain[,outcome]))
## [1] 0.6906852
print(calcAUC(predict(tmodel,newdata=dTest),dTest[,outcome]))
## [1] 0.6843595
print(calcAUC(predict(tmodel,newdata=dCal),dCal[,outcome]))
## [1] 0.6669301

Original file line number Diff line number Diff line change
Expand Up @@ -2,31 +2,31 @@
# (example 6.17 of section 6.3.2) : Memorization methods : Building models using many variables : Using decision trees
# Title: Printing the decision tree

> print(tmodel)
n= 40518

node), split, n, deviance, yval
* denotes terminal node

1) root 40518 2769.3550 0.07379436
2) predVar126< 0.07366888 18188 726.4097 0.04167583
4) predVar126< 0.04391312 8804 189.7251 0.02203544 *
5) predVar126>=0.04391312 9384 530.1023 0.06010230
10) predVar189< 0.08449448 8317 410.4571 0.05206204 *
11) predVar189>=0.08449448 1067 114.9166 0.12277410 *
3) predVar126>=0.07366888 22330 2008.9000 0.09995522
6) predVar212< 0.07944508 8386 484.2499 0.06153112
12) predVar73< 0.06813291 4084 167.5012 0.04285015 *
13) predVar73>=0.06813291 4302 313.9705 0.07926546 *
7) predVar212>=0.07944508 13944 1504.8230 0.12306370
14) predVar218< 0.07134103 6728 580.7390 0.09542212
28) predVar126< 0.1015407 3901 271.8426 0.07536529 *
29) predVar126>=0.1015407 2827 305.1617 0.12309870
58) predVar73< 0.07804522 1452 110.0826 0.08264463 *
59) predVar73>=0.07804522 1375 190.1935 0.16581820 *
15) predVar218>=0.07134103 7216 914.1502 0.14883590
30) predVar74< 0.0797246 2579 239.3579 0.10352850 *
31) predVar74>=0.0797246 4637 666.5538 0.17403490
62) predVar189< 0.06775545 1031 102.9486 0.11251210 *
63) predVar189>=0.06775545 3606 558.5871 0.19162510 *
print(tmodel)
## n= 40518
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 40518 2769.3550 0.07379436
## 2) predVar126< 0.07366888 18188 726.4097 0.04167583
## 4) predVar126< 0.04391312 8804 189.7251 0.02203544 *
## 5) predVar126>=0.04391312 9384 530.1023 0.06010230
## 10) predVar189< 0.08449448 8317 410.4571 0.05206204 *
## 11) predVar189>=0.08449448 1067 114.9166 0.12277410 *
## 3) predVar126>=0.07366888 22330 2008.9000 0.09995522
## 6) predVar212< 0.07944508 8386 484.2499 0.06153112
## 12) predVar73< 0.06813291 4084 167.5012 0.04285015 *
## 13) predVar73>=0.06813291 4302 313.9705 0.07926546 *
## 7) predVar212>=0.07944508 13944 1504.8230 0.12306370
## 14) predVar218< 0.07134103 6728 580.7390 0.09542212
## 28) predVar126< 0.1015407 3901 271.8426 0.07536529 *
## 29) predVar126>=0.1015407 2827 305.1617 0.12309870
## 58) predVar73< 0.07804522 1452 110.0826 0.08264463 *
## 59) predVar73>=0.07804522 1375 190.1935 0.16581820 *
## 15) predVar218>=0.07134103 7216 914.1502 0.14883590
## 30) predVar74< 0.0797246 2579 239.3579 0.10352850 *
## 31) predVar74>=0.0797246 4637 666.5538 0.17403490
## 62) predVar189< 0.06775545 1031 102.9486 0.11251210 *
## 63) predVar189>=0.06775545 3606 558.5871 0.19162510 *

Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,22 @@
# (example 6.19 of section 6.3.3) : Memorization methods : Building models using many variables : Using nearest neighbor methods
# Title: Running k-nearest neighbors

> library('class')
> nK <- 200
> knnTrain <- dTrain[,selVars] # Note: 1
> knnCl <- dTrain[,outcome]==pos # Note: 2
> knnPred <- function(df) { # Note: 3
library('class')
nK <- 200
knnTrain <- dTrain[,selVars] # Note: 1
knnCl <- dTrain[,outcome]==pos # Note: 2
# Score each row of df with the fraction of its nK nearest training
# neighbors that churned. Reads the globals knnTrain, knnCl, nK.
# class::knn returns the winning class, with the fraction of neighbor
# votes for that winner in the 'prob' attribute, so the complement is
# taken when the winner is the FALSE class to always report P(churn).
knnPred <- function(df) { # Note: 3
   # prob=TRUE spelled out — T is reassignable and unsafe.
   knnDecision <- knn(knnTrain, df, knnCl, k=nK, prob=TRUE)
   ifelse(knnDecision==TRUE, # Note: 4
      attributes(knnDecision)$prob,
      1-(attributes(knnDecision)$prob))
}
> print(calcAUC(knnPred(dTrain[,selVars]),dTrain[,outcome]))
[1] 0.7443927
> print(calcAUC(knnPred(dCal[,selVars]),dCal[,outcome]))
[1] 0.7119394
> print(calcAUC(knnPred(dTest[,selVars]),dTest[,outcome]))
[1] 0.718256
print(calcAUC(knnPred(dTrain[,selVars]),dTrain[,outcome]))
## [1] 0.7443927
print(calcAUC(knnPred(dCal[,selVars]),dCal[,outcome]))
## [1] 0.7119394
print(calcAUC(knnPred(dTest[,selVars]),dTest[,outcome]))
## [1] 0.718256

# Note 1:
# Build a data frame with only the variables we
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@
# (example 6.22 of section 6.3.3) : Memorization methods : Building models using many variables : Using nearest neighbor methods
# Title: Plotting the performance of a logistic regression model

> gmodel <- glm(as.formula(f),data=dTrain,family=binomial(link='logit'))
> print(calcAUC(predict(gmodel,newdata=dTrain),dTrain[,outcome]))
[1] 0.7309537
> print(calcAUC(predict(gmodel,newdata=dTest),dTest[,outcome]))
[1] 0.7234645
> print(calcAUC(predict(gmodel,newdata=dCal),dCal[,outcome]))
[1] 0.7170824
gmodel <- glm(as.formula(f),data=dTrain,family=binomial(link='logit'))
print(calcAUC(predict(gmodel,newdata=dTrain),dTrain[,outcome]))
## [1] 0.7309537
print(calcAUC(predict(gmodel,newdata=dTest),dTest[,outcome]))
## [1] 0.7234645
print(calcAUC(predict(gmodel,newdata=dCal),dCal[,outcome]))
## [1] 0.7170824

15 changes: 15 additions & 0 deletions RunExamples/rCh06.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
---
title: "rCh06"
author: "Win-Vector LLC"
date: "December 10, 2015"
output: html_document
---

```{r sourceFns}
source('runDir.R')
```

```{r ch6ex, tidy=FALSE,comment='',prompt=FALSE}
runDir('../CodeExamples/c06_Memorization_methods',
'../KDD2009')
```
1,047 changes: 1,047 additions & 0 deletions RunExamples/rCh06.html

Large diffs are not rendered by default.

0 comments on commit 84d9444

Please sign in to comment.