---
title: "Final Project"
author: "Camillo"
date: "5/9/2022"
output:
  word_document: default
  html_document: default
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

##

```{r import-data}
# Import data ----
# Build the path explicitly so the session's working directory is left
# untouched; change `data_dir` to wherever the CSV lives on your machine.
data_dir <- "C:/Users/n/Documents"
data <- read.csv(file.path(data_dir, "CarPrice_Assignment.csv"), header = TRUE)
```

```{r prepare-data}
# Form a data frame with only the required variables ----
library(dplyr)

data2 <- data %>%
  dplyr::select(
    price, highwaympg, citympg, horsepower, fuelsystem,
    enginesize, cylindernumber, drivewheel, fueltype
  )

# Check for missing values (prints the total count of NA cells).
sum(is.na(data2))

# Column groups used throughout the analysis. Referring to columns by name
# keeps the later plots and correlations correct even if the column order in
# `data2` changes.
categorical_vars <- c("fuelsystem", "cylindernumber", "drivewheel", "fueltype")
numeric_vars <- c("price", "highwaympg", "citympg", "horsepower", "enginesize")

# Convert chr to factor for the categorical variables.
data2[categorical_vars] <- lapply(data2[categorical_vars], as.factor)
```

```{r visualize-categorical}
# Visualize the categorical variables ----
library(tidyverse)
library(epiDisplay)

# Fuel system: frequency table and bar chart.
tab1(data2$fuelsystem, sort.group = "decreasing", cum.percent = TRUE)
ggplot(data = data2, aes(x = fuelsystem, fill = fuelsystem)) +
  geom_bar()

# Cylinder number: frequency table and bar chart.
tab1(data2$cylindernumber, sort.group = "decreasing", cum.percent = TRUE)
ggplot(data = data2, aes(x = cylindernumber, fill = cylindernumber)) +
  geom_bar()

# Drive wheel: frequency table and bar chart.
tab1(data2$drivewheel, sort.group = "decreasing", cum.percent = TRUE)
ggplot(data = data2, aes(x = drivewheel, fill = drivewheel)) +
  geom_bar()

# Fuel type: frequency table and bar chart.
tab1(data2$fueltype, sort.group = "decreasing", cum.percent = TRUE)
ggplot(data = data2, aes(x = fueltype, fill = fueltype)) +
  geom_bar()
```

```{r descriptive-statistics}
# Mean and standard deviation of price within each categorical level ----
data2 %>%
  group_by(fuelsystem) %>%
  summarise(mean_price = mean(price), sd_price = sd(price))

data2 %>%
  group_by(fueltype) %>%
  summarise(mean_price = mean(price), sd_price = sd(price))

data2 %>%
  group_by(cylindernumber) %>%
  summarise(mean_price = mean(price), sd_price = sd(price))

data2 %>%
  group_by(drivewheel) %>%
  summarise(mean_price = mean(price), sd_price = sd(price))
```

```{r assumption-checks}
# Assumption checks ----

# Normality of price: Q-Q plot with reference line.
qqnorm(data2$price, pch = 1, frame = FALSE)
qqline(data2$price, col = "steelblue", lwd = 2)

# Normality of price: Shapiro-Wilk test.
shapiro.test(data2$price)

# Linear relationship: scatter plot matrix of the numeric variables.
plot(data2[numeric_vars], col = "blue")

# Multicollinearity: correlation matrix of the numeric predictors
# (price is the response, so it is left out).
cor(data2[setdiff(numeric_vars, "price")])
```

```{r full-model}
# The full model: log10(price) regressed on every candidate predictor ----
model <- lm(
  log10(price) ~ highwaympg + citympg + horsepower + fuelsystem +
    enginesize + cylindernumber + drivewheel + fueltype,
  data = data2
)
summary(model)

# Show the four lm diagnostic plots in a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(model)
```

```{r subset-selection}
# Best-subset selection over the same predictors ----
library(leaps)

m1 <- regsubsets(
  log10(price) ~ highwaympg + citympg + horsepower + fuelsystem +
    enginesize + cylindernumber + drivewheel + fueltype,
  data = data2
)
models <- summary(m1)

# Subset size with the highest adjusted R^2. It is stored once and reused
# below so the reported statistics, predictors and coefficients always belong
# to the model that was actually selected.
best_size <- which.max(models$adjr2)
best_size

# Best subset size according to each criterion: adjusted R^2, Cp and BIC.
data.frame(
  Adj.R2 = which.max(models$adjr2),
  CP = which.min(models$cp),
  BIC = which.min(models$bic)
)

# Adjusted R^2, BIC and Cp of the selected model.
models$adjr2[best_size]
models$bic[best_size]
models$cp[best_size]
```

```{r best-model}
# Predictors included in the selected model.
models$which[best_size, ]

# Regression coefficients of the selected model.
coef(m1, best_size)
```