## "Exploratory Analysis - Titanic" ## Classification ## @ Matheus Pimenta ## github.com/omatheuspimenta/titanic_exploratory ########################################################### ##### # Set path setwd("/home/matheus/Dropbox/06_doutorado/2021_01/Bioestatistica/projeto/dataset/") ##### # Libraries library("class") # for KNN classifier library("randomForest") # for RandomForest classifier library("rpart") # for decision tree classifier library("rpart.plot") # for plot decision tree classifier library("infotheo") #for information theory library("caTools") # for split data frame library("scales") # for rescale library("e1071") # for SVM classifier ##### # Load file load("titanic3.RData") ##### # removing some columns titanic3$name <- NULL titanic3$ticket <- NULL titanic3$cabin <- NULL titanic3$boat <- NULL titanic3$body <- NULL titanic3$home.dest <- NULL titanic3$lastname <- NULL titanic3$title <- NULL # Feature Selection - Information Theory # in this part we don't remove any column, but if you want, one idea # is to use Information Theory ##### # discretizing # disc_df <- discretize(titanic3) # mutual information among the columns # mi <- mutinformation(disc_df) # t<-mi[-2,2] # t<-t[order(t, decreasing = TRUE)] # ylim <- c(0, 1.1*max(t)) # xx <- barplot(t, # col = rainbow(20), # main = "Barplot - Mutual Information", # xlab = "Variable", # ylab = "Frequency", # ylim = ylim) # # remove temp variables # remove(ylim, y, xx, mi, disc_df,t) # after this, we don't use "sibsp", "parch" and "embarked" columns. # REMEMBER, this is ONLY A EXAMPLE! If you use this, set a threshold before!!!!! # the column age will be converted to "categorical dummy" # Creating dummy variables for "pclass", "sex", "sibsp", "parch", "embarked", "age" # and, "nfamily" # using the base R # pclass class titanic3$class1 <- ifelse(titanic3$pclass=="1st",1,0) titanic3$class2 <- ifelse(titanic3$pclass=="2nd",1,0) titanic3$class3 <- ifelse(titanic3$pclass=="3rd",1,0) # sex dummy titanic3$sex <- ifelse(titanic3$sex=="female",1,0) # sibsp dummy titanic3$sibsp0 <- ifelse(titanic3$sibsp==0,1,0) titanic3$sibsp1 <- ifelse(titanic3$sibsp==1,1,0) titanic3$sibsp2 <- ifelse(titanic3$sibsp==2,1,0) titanic3$sibsp3 <- ifelse(titanic3$sibsp==3,1,0) titanic3$sibsp4 <- ifelse(titanic3$sibsp==4,1,0) titanic3$sibsp5 <- ifelse(titanic3$sibsp==5,1,0) titanic3$sibsp8 <- ifelse(titanic3$sibsp==8,1,0) # parch dummy titanic3$parch0 <- ifelse(titanic3$parch==0,1,0) titanic3$parch1 <- ifelse(titanic3$parch==1,1,0) titanic3$parch2 <- ifelse(titanic3$parch==2,1,0) titanic3$parch3 <- ifelse(titanic3$parch==3,1,0) titanic3$parch4 <- ifelse(titanic3$parch==4,1,0) titanic3$parch5 <- ifelse(titanic3$parch==5,1,0) titanic3$parch6 <- ifelse(titanic3$parch==6,1,0) titanic3$parch9 <- ifelse(titanic3$parch==9,1,0) # embarked dummy titanic3$Cherbourg <- ifelse(titanic3$embarked == "Cherbourg",1,0) titanic3$Queenstown <- ifelse(titanic3$embarked == "Queenstown",1,0) titanic3$Southampton <- ifelse(titanic3$embarked == "Southampton",1,0) # age dummy titanic3$children <- ifelse(titanic3$age<=11, 1, 0) titanic3$teenage <- ifelse((titanic3$age>11 & titanic3$age<20), 1, 0) titanic3$young <- ifelse((titanic3$age>20 & titanic3$age<30), 1, 0) titanic3$adult <- ifelse((titanic3$age>30 & titanic3$age<60), 1, 0) titanic3$old <- ifelse(titanic3$age>60, 1, 0) # # nfamily dummy titanic3$nfamily1 <- ifelse(titanic3$nfamily == 1,1,0) titanic3$nfamily2 <- ifelse(titanic3$nfamily == 2,1,0) titanic3$nfamily3 <- ifelse(titanic3$nfamily == 3,1,0) titanic3$nfamily4 <- 
#####
# Drop the original columns
titanic3$pclass <- NULL
titanic3$embarked <- NULL
titanic3$sibsp <- NULL
titanic3$parch <- NULL
titanic3$age <- NULL
titanic3$nfamily <- NULL

#####
# Normalize "fare" to [0, 1]
titanic3$fare <- rescale(as.numeric(titanic3$fare))

# Factorize the class label
titanic3$survived <- as.factor(titanic3$survived)

#####
# Classification

# split the data frame: 80% train, 20% test
set.seed(7)
split <- sample.split(titanic3$survived, SplitRatio = 0.8)
train_df <- subset(titanic3, split == TRUE)
test_df <- subset(titanic3, split == FALSE)

#####
# KNN - k = 3
# If you need to improve the method, change 'k'.
# Suggestion: use a grid search to do this (see the sketch below).
knn3 <- knn(train = train_df[, -1],
            test = test_df[, -1],
            cl = train_df[, 1],
            k = 3)
cm_knn3 <- table(test_df[, 1], knn3)  # confusion matrix
confusionMatrix(cm_knn3)
# Confusion Matrix and Statistics
#    knn3
#       0   1
#   0 135  27
#   1  32  68
#
#                Accuracy : 0.7748
#                  95% CI : (0.7194, 0.8239)
#     No Information Rate : 0.6374
#     P-Value [Acc > NIR] : 1.161e-06
#                   Kappa : 0.5183
#  Mcnemar's Test P-Value : 0.6025
#             Sensitivity : 0.8084
#             Specificity : 0.7158
#          Pos Pred Value : 0.8333
#          Neg Pred Value : 0.6800
#              Prevalence : 0.6374
#          Detection Rate : 0.5153
#    Detection Prevalence : 0.6183
#       Balanced Accuracy : 0.7621
#        'Positive' Class : 0

# If you need cross-validation, use the caret library:
# train_control <- trainControl(method = 'repeatedcv',
#                               number = 10,
#                               repeats = 10)  # 10 repeats of 10-fold CV
# model <- train(survived ~ .,
#                data = titanic3,
#                trControl = train_control,
#                method = 'knn')
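#####
# As suggested above, 'k' can be chosen with a simple grid search.
# A minimal sketch (note: scoring on test_df is optimistic; prefer a held-out
# validation split or the caret cross-validation shown above):
# for (k in seq(1, 15, by = 2)) {  # odd k avoids ties between the two classes
#   pred <- knn(train = train_df[, -1],
#               test = test_df[, -1],
#               cl = train_df[, 1],
#               k = k)
#   cat("k =", k, "accuracy =", round(mean(pred == test_df[, 1]), 4), "\n")
# }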
#####
# RandomForest
# If you need to improve the method, change 'ntree'.
# Suggestion: use a grid search to do this.
rf100 <- randomForest(x = train_df[-1],
                      y = train_df$survived,
                      ntree = 100)
rf100_pred <- predict(rf100, newdata = test_df[-1])
rf100_cm <- table(test_df[, 1], rf100_pred)  # confusion matrix
confusionMatrix(rf100_cm)
# Confusion Matrix and Statistics
#    rf100_pred
#       0   1
#   0 145  17
#   1  33  67
#
#                Accuracy : 0.8092
#                  95% CI : (0.7563, 0.8549)
#     No Information Rate : 0.6794
#     P-Value [Acc > NIR] : 1.809e-06
#                   Kappa : 0.5829
#  Mcnemar's Test P-Value : 0.03389
#             Sensitivity : 0.8146
#             Specificity : 0.7976
#          Pos Pred Value : 0.8951
#          Neg Pred Value : 0.6700
#              Prevalence : 0.6794
#          Detection Rate : 0.5534
#    Detection Prevalence : 0.6183
#       Balanced Accuracy : 0.8061
#        'Positive' Class : 0

# If you need cross-validation, use the caret library:
# train_control <- trainControl(method = 'repeatedcv',
#                               number = 10,
#                               repeats = 10)  # 10 repeats of 10-fold CV
# model <- train(survived ~ .,
#                data = titanic3,
#                trControl = train_control,
#                method = 'rf')

#####
# Decision Tree
# fit on the training set only, so the test set stays unseen
dt <- rpart(formula = survived ~ ., data = train_df)
rpart.plot(dt)
dt_pred <- predict(dt, newdata = test_df[-1], type = "class")
dt_cm <- table(test_df[, 1], dt_pred)  # confusion matrix
confusionMatrix(dt_cm)
# Confusion Matrix and Statistics
#    dt_pred
#       0   1
#   0 146  16
#   1  26  74
#
#                Accuracy : 0.8397
#                  95% CI : (0.7896, 0.882)
#     No Information Rate : 0.6565
#     P-Value [Acc > NIR] : 2.532e-11
#                   Kappa : 0.6537
#  Mcnemar's Test P-Value : 0.1649
#             Sensitivity : 0.8488
#             Specificity : 0.8222
#          Pos Pred Value : 0.9012
#          Neg Pred Value : 0.7400
#              Prevalence : 0.6565
#          Detection Rate : 0.5573
#    Detection Prevalence : 0.6183
#       Balanced Accuracy : 0.8355
#        'Positive' Class : 0

#####
# SVM
# If you need to improve the method, change the hyperparameters.
# Suggestion: use a grid search to do this (see the addendum at the end).
svm.model <- svm(formula = survived ~ .,
                 data = train_df,
                 type = 'C-classification',
                 kernel = 'radial',
                 cost = 10.0)
svm.pred <- predict(svm.model, newdata = test_df[-1])
svm_cm <- table(test_df[, 1], svm.pred)  # confusion matrix
confusionMatrix(svm_cm)
# Confusion Matrix and Statistics
#    svm.pred
#       0   1
#   0 142  20
#   1  32  68
#
#                Accuracy : 0.8015
#                  95% CI : (0.748, 0.8481)
#     No Information Rate : 0.6641
#     P-Value [Acc > NIR] : 6.34e-07
#                   Kappa : 0.5696
#  Mcnemar's Test P-Value : 0.1272
#             Sensitivity : 0.8161
#             Specificity : 0.7727
#          Pos Pred Value : 0.8765
#          Neg Pred Value : 0.6800
#              Prevalence : 0.6641
#          Detection Rate : 0.5420
#    Detection Prevalence : 0.6183
#       Balanced Accuracy : 0.7944
#        'Positive' Class : 0

#####
# remove temp variables
remove(cm_knn3, knn3, dt_cm, dt_pred, rf100_cm, rf100_pred, split,
       svm_cm, svm.pred, svm.model, test_df, train_df, dt, rf100)

# You can tune the classifiers, plot graphs, heatmaps and ROC curves.
# We don't do this here because it isn't the purpose of this work.
# Thank you so much for your attention!
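#####
# Addendum: the SVM section above suggests a grid search over the
# hyperparameters. A minimal sketch using e1071::tune() (the cost/gamma
# ranges below are illustrative assumptions, not tuned values; re-create
# train_df first, since it is removed by the cleanup above):
# svm_grid <- tune(svm, survived ~ ., data = train_df,
#                  ranges = list(cost = 10^(-1:2), gamma = c(0.01, 0.1, 1)))
# summary(svm_grid)         # cross-validated error for each combination
# svm_grid$best.parameters  # best cost/gamma found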