#' ---
#' title: "Bayesian data analysis - traffic deaths in Finland"
#' author: "Aki Vehtari"
#' date: "First version 2017-09-28. Last modified `r format(Sys.Date())`."
#' output:
#'   html_document:
#'     fig_caption: yes
#'     toc: TRUE
#'     toc_depth: 2
#'     number_sections: TRUE
#'     toc_float:
#'       smooth_scroll: FALSE
#'     theme: readable
#'     code_download: true
#' ---

#' # Setup {.unnumbered}

#+ setup, include=FALSE
knitr::opts_chunk$set(cache=FALSE, message=FALSE, error=FALSE, warning=TRUE, out.width='95%')

#' **Load packages**
#+ comment=NA
library(ggplot2)
library(tidyr)
library(dplyr)
library(gridExtra)
library(rstanarm)
library(brms)
options(brms.backend = "cmdstanr")
library(bayesplot)
theme_set(bayesplot::theme_default(base_family = "sans", base_size = 16))
library(patchwork)
library(loo)
library(rprojroot)
# Locate the project root via its marker file, so data paths work from any
# working directory.
root<-has_file(".BDA_R_demos_root")$make_fix_file()

#' # Introduction
#'
#' This notebook demonstrates time series analysis for traffic deaths per
#' year in Finland. Currently when the number of traffic deaths
#' during previous year are reported, the press release claims that
#' the traffic safety in Finland has improved or worsened depending on
#' whether the number is smaller or larger than the year before. Time
#' series analysis can be used to separate random fluctuation from the
#' slowly changing traffic safety.
#'
#' # Data
#'
#' Read the data (there would be data for earlier years, too, but this is
#' sufficient for the demonstration)
#+
# file preview shows a header row
deaths <- read.csv(root("demos_rstan", "trafficdeaths.csv"), header = TRUE)
head(deaths)
# Subset used for fitting; later years are held out for comparison in plots.
deaths2013 <- deaths |> filter(year<=2013)

#' First plot just the data.
#+
deaths2013 |>
  ggplot(aes(x = year, y = deaths)) +
  geom_point() +
  labs(y = 'Number of traffic deaths in Finland', x = "Year") +
  guides(linetype = "none")
#ggsave('traffic1.pdf',width=6,height=4)
deaths2013 |>
  ggplot(aes(x = year, y = deaths)) +
  geom_point() +
  geom_line() +
  labs(y = 'Number of traffic deaths in Finland', x = "Year") +
  guides(linetype = "none")
#ggsave('traffic2.pdf',width=6,height=4)

#' # Poisson regression model
#'
#' The number of deaths is count data, so we use Poisson observation
#' model. We first fit log-linear model for the Poisson intensity, which
#' corresponds to assuming constant proportional change in the rate.
#+
# NOTE: the original call passed `refresh` twice (1000 and 0); a duplicated
# named argument is an error when it reaches the `refresh` formal of the
# sampler, so only the intended silent setting (refresh=0) is kept.
fit_lin <- stan_glm(deaths ~ year, data = deaths, family = poisson,
                    iter = 1000, chains = 4, seed = 583829, refresh = 0)

#' ESS's and Rhat's are ok (see, e.g., [RStan
#' workflow](http://mc-stan.org/users/documentation/case-studies/rstan_workflow.html)). Let's
#' look at the posterior predictive distribution (median and 5% and 95%
#' intervals).
#+
x_predict <- seq(1993, 2030)
N_predict <- length(x_predict)
y_predict_lin <- posterior_predict(fit_lin, newdata = data.frame(year = x_predict))
# Posterior predictive 5%/50%/95% quantiles per prediction year, in long
# format for ggplot (`pivot_longer` replaces the superseded `gather`).
mu <- apply(t(y_predict_lin), 1, quantile, c(0.05, 0.5, 0.95)) %>%
  t() %>%
  data.frame(x = x_predict, .) %>%
  pivot_longer(cols = -x, names_to = "pct", values_to = "y")
pfit <- ggplot() +
  geom_point(aes(year, deaths), data = deaths2013, size = 1) +
  geom_line(aes(x, y, linetype = pct), data = mu, color = 'red') +
  scale_linetype_manual(values = c(2, 1, 2)) +
  annotate(geom = "text", x = 2031, y = mu$y[mu$x == 2030],
           label = c('5%', '50%', '95%')) +
  theme(legend.position = "none") +
  labs(x = 'Year', y = 'Number of traffic deaths in Finland')
(pfit)

#' The same predictive distribution plotted together with all observed
#' data (the quantiles computed above are reused; re-running
#' `posterior_predict` with identical arguments would only redraw the
#' same distribution).
#+
pfit <- ggplot() +
  geom_point(aes(year, deaths), data = deaths, size = 1) +
  geom_line(aes(x, y, linetype = pct), data = mu, color = 'red') +
  scale_linetype_manual(values = c(2, 1, 2)) +
  annotate(geom = "text", x = 2031, y = mu$y[mu$x == 2030],
           label = c('5%', '50%', '95%')) +
  theme(legend.position = "none") +
  labs(x = 'Year', y = 'Number of traffic deaths in Finland')
(pfit)
#ggsave('traffic5.pdf',width=6,height=4)

#' Next we fit a non-linear spline model with `stan_gamm4`
#+
# NOTE: the original referenced `deaths2020`, which is never defined in this
# file; the full `deaths` data is used instead. `chain=4` is spelled out as
# `chains=4` (arguments forwarded through `...` are matched exactly, so the
# abbreviation was silently dropped), and the duplicated `refresh` is removed.
fit_gam <- stan_gamm4(deaths ~ year + s(year), data = deaths,
                      family = poisson, adapt_delta = 0.999,
                      iter = 2000, chains = 4, seed = 583829, refresh = 0)

#' ESS is clearly smaller than for the linear model, but Rhat's are ok.
#'
#' Let's look at the posterior predictive distribution.
#+
x_predict <- seq(1993, 2030)
N_predict <- length(x_predict)
y_predict_gam <- posterior_predict(fit_gam, newdata = data.frame(year = x_predict))
mu <- apply(t(y_predict_gam), 1, quantile, c(0.05, 0.5, 0.95)) %>%
  t() %>%
  data.frame(x = x_predict, .) %>%
  pivot_longer(cols = -x, names_to = "pct", values_to = "y")
pfit <- ggplot() +
  geom_point(aes(year, deaths), data = deaths, size = 1) +
  geom_line(aes(x, y, linetype = pct), data = mu, color = 'red') +
  scale_linetype_manual(values = c(2, 1, 2)) +
  annotate(geom = "text", x = 2031, y = mu$y[mu$x == 2030],
           label = c('5%', '50%', '95%')) +
  labs(x = 'Year', y = 'Traffic deaths') +
  guides(linetype = 'none')
(pfit)

#' The predictive median is clearly nonlinear. The predictive mean for
#' future years stays at the same level as the most recent observations,
#' but uncertainty increases quickly.
#'
#' Finally we fit Gaussian process centered on linear model.
#' We use
#' brms for this:
#+ comment=NA
# NOTE: `refresh` is a formal argument of brm(), so passing it twice (1000
# and 0) raised "matched by multiple actual arguments"; only refresh=0 is
# kept. `deaths2020` was undefined (the full `deaths` data is used), and
# `chain=4` is spelled out as `chains=4` for exact matching.
fit_gp <- brm(deaths ~ year + gp(year), data = deaths, family = poisson,
              adapt_delta = 0.95, iter = 2000, chains = 4,
              seed = 583829, refresh = 0)

#+
x_predict <- seq(1993, 2030)
N_predict <- length(x_predict)
y_predict_gp <- posterior_predict(fit_gp, newdata = data.frame(year = x_predict))
# Posterior predictive 5%/50%/95% quantiles per year, long format for ggplot.
mu <- apply(t(y_predict_gp), 1, quantile, c(0.05, 0.5, 0.95)) %>%
  t() %>%
  data.frame(x = x_predict, .) %>%
  pivot_longer(cols = -x, names_to = "pct", values_to = "y")
pfit <- ggplot() +
  geom_point(aes(year, deaths), data = deaths, size = 1) +
  geom_line(aes(x, y, linetype = pct), data = mu, color = 'red') +
  scale_linetype_manual(values = c(2, 1, 2)) +
  annotate(geom = "text", x = 2031, y = mu$y[mu$x == 2030],
           label = c('5%', '50%', '95%')) +
  labs(x = 'Year', y = 'Traffic deaths') +
  # "none" instead of the deprecated logical F (and F is reassignable)
  guides(linetype = "none")
(pfit)

#' Finally we compare models using PSIS-LOO predictive performance estimates.
#+
(loo_lin <- loo(fit_lin, save_psis = TRUE))
(loo_gam <- loo(fit_gam, save_psis = TRUE))
(loo_gp <- loo(fit_gp, save_psis = TRUE))
loo_compare(loo_lin, loo_gam, loo_gp)

#' There are no practical differences in predictive performance, which is
#' partially due to small number of observations. Based on the posterior
#' predictive distributions there are clear differences in the future
#' predictions.

#' We can also look at the calibration of leave-one-out predictive
#' distributions
#+
# The first nrow(deaths) prediction columns correspond to the observed
# years (the prediction grid starts at the first data year), so they are
# selected programmatically instead of the hard-coded 1:29.
ppc_loo_intervals(deaths$deaths, y_predict_lin[, seq_len(nrow(deaths))],
                  psis_object = loo_lin$psis_object) +
  labs(title = 'PPC-LOO linear model')
ppc_loo_intervals(deaths$deaths, y_predict_gp[, seq_len(nrow(deaths))],
                  psis_object = loo_gp$psis_object) +
  labs(title = 'PPC-LOO GP model')

#' There is a small difference in favor of GP model.
#'
#'
#' # Licenses {.unnumbered}
#'
#' * Code &copy; 2017-2020, Aki Vehtari, licensed under BSD-3.
#' * Text &copy; 2017-2020, Aki Vehtari, licensed under CC-BY-NC 4.0.
#'
#' # Original Computing Environment {.unnumbered}
#+
sessionInfo()
#'