Final project.Rmd

---
title: "Final Project"
author: "Camillo"
date: "5/9/2022"
output:
  word_document: default
  html_document: default
---

```{r setup, include=FALSE}
# Default for every chunk: echo the R source alongside its output
knitr::opts_chunk$set(echo = TRUE)
```

## Car price analysis

```{r }
# Import the car price data set.
getwd()
# Build the full path instead of calling setwd(): knitr restores the working
# directory after every chunk, so a setwd() here would not carry over to later
# chunks and only triggers a warning when the document is knit.
# NOTE(review): the folder is machine-specific; keeping the CSV next to this
# .Rmd and using a relative path would make the report portable.
data_dir <- "C:/Users/n/Documents"
data <- read.csv(file.path(data_dir, "CarPrice_Assignment.csv"), header = TRUE)

```



```{r }
# Build the modelling data frame: the response (price) plus the eight
# candidate predictors.
library(dplyr)
model_vars <- c(
  "price", "highwaympg", "citympg", "horsepower", "fuelsystem",
  "enginesize", "cylindernumber", "drivewheel", "fueltype"
)
data2 <- dplyr::select(data, dplyr::all_of(model_vars))

# Total count of missing cells across the selected columns
sum(is.na(data2))

# Store the categorical predictors as factors rather than character vectors
factor_vars <- c("fuelsystem", "cylindernumber", "drivewheel", "fueltype")
data2[factor_vars] <- lapply(data2[factor_vars], as.factor)
```


```{r }
# Explore the categorical predictors: for each one, print a frequency table
# (sorted by decreasing count, with cumulative percentages) and draw a bar
# chart of the level counts.
library(tidyverse)
library(epiDisplay)

# Fuel system ----
tab1(data2$fuelsystem, cum.percent = TRUE, sort.group = "decreasing")
ggplot(data2, aes(fuelsystem, fill = fuelsystem)) +
  geom_bar()

# Cylinder number ----
tab1(data2$cylindernumber, cum.percent = TRUE, sort.group = "decreasing")
ggplot(data2, aes(cylindernumber, fill = cylindernumber)) +
  geom_bar()

# Drive wheel ----
tab1(data2$drivewheel, cum.percent = TRUE, sort.group = "decreasing")
ggplot(data2, aes(drivewheel, fill = drivewheel)) +
  geom_bar()

# Fuel type ----
tab1(data2$fueltype, cum.percent = TRUE, sort.group = "decreasing")
ggplot(data2, aes(fueltype, fill = fueltype)) +
  geom_bar()
```


```{r }
# Descriptive statistics: mean and standard deviation of price within each
# level of a categorical predictor. `group_var` is an unquoted column name of
# data2.
summarise_price_by <- function(group_var) {
  data2 %>%
    group_by({{ group_var }}) %>%
    summarise(
      mean_price = mean(price),
      sd_price = sd(price)
    )
}

summarise_price_by(fuelsystem)
summarise_price_by(fueltype)
summarise_price_by(cylindernumber)
summarise_price_by(drivewheel)

```


```{r }
# Assumption checks ----

# Normality of the response: Q-Q plot with a reference line, followed by the
# Shapiro-Wilk test.
# NOTE(review): these checks use raw price, whereas the regression models in
# this document are fitted to log10(price) — confirm which scale is intended.
qqnorm(data2$price, pch = 1, frame = FALSE)
qqline(data2$price, col = "steelblue", lwd = 2)
shapiro.test(data2$price)

# Linear relationship: scatter plot matrix of price and the numeric
# predictors. Columns are picked by name rather than by position so the
# selection keeps working if the column order of data2 changes.
numeric_predictors <- c("highwaympg", "citympg", "horsepower", "enginesize")
plot(data2[c("price", numeric_predictors)], col = "blue")

# Multicollinearity: pairwise correlations among the numeric predictors only
cor(data2[numeric_predictors])

```


```{r }
# Full model: log10(price) regressed on every candidate predictor
model <- lm(
  log10(price) ~ highwaympg + citympg + horsepower + fuelsystem +
    enginesize + cylindernumber + drivewheel + fueltype,
  data = data2
)
summary(model)

# Lay the lm diagnostic plots out in a 2 x 2 grid, then restore the previous
# graphics settings so later plots are not drawn into the same layout.
old_par <- par(mfrow = c(2, 2))
plot(model)
par(old_par)

```


```{r }
# Best-subset selection over the same predictors as the full model
library(leaps)
# NOTE(review): nvmax is left at the regsubsets default, which caps the
# largest subset size examined; if the best size reported below equals that
# cap, consider raising nvmax and re-checking the selection.
m1 <- regsubsets(
  log10(price) ~ highwaympg + citympg + horsepower + fuelsystem +
    enginesize + cylindernumber + drivewheel + fueltype,
  data = data2
)
models <- summary(m1)

# Subset size with the highest adjusted R-squared
best_size <- which.max(models$adjr2)
best_size

# Preferred subset size under each criterion (adjusted R-squared, Cp, BIC)
data.frame(
  Adj.R2 = best_size,
  CP = which.min(models$cp),
  BIC = which.min(models$bic)
)

# Adjusted R-squared, BIC and Cp of the chosen model. The size comes from
# best_size rather than a hard-coded 8, so these values always refer to the
# model selected above.
models$adjr2[best_size]
models$bic[best_size]
models$cp[best_size]
```


```{r }
# Terms included in the model with the highest adjusted R-squared.
# The size is derived from `models` instead of hard-coding 8, so it stays
# correct if the data or the candidate predictors change.
best_size <- which.max(models$adjr2)
models$which[best_size, ]

# Regression coefficients of that model
coef(m1, best_size)
```