#' ---
#' title: "Bayesian data analysis - traffic deaths in Finland"
#' author: "Aki Vehtari"
#' date: "First version 2017-09-28. Last modified `r format(Sys.Date())`."
#' output:
#' html_document:
#' fig_caption: yes
#' toc: TRUE
#' toc_depth: 2
#' number_sections: TRUE
#' toc_float:
#' smooth_scroll: FALSE
#' theme: readable
#' code_download: true
#' ---
#' # Setup {.unnumbered}
#+ setup, include=FALSE
knitr::opts_chunk$set(cache=FALSE, message=FALSE, error=FALSE, warning=TRUE, out.width='95%')
#' **Load packages**
#+ comment=NA
library(ggplot2)
library(tidyr)
library(dplyr)
library(gridExtra)
library(rstanarm)
library(brms)
options(brms.backend = "cmdstanr")
library(bayesplot)
theme_set(bayesplot::theme_default(base_family = "sans", base_size = 16))
library(patchwork)
library(loo)
library(rprojroot)
root<-has_file(".BDA_R_demos_root")$make_fix_file()
#' # Introduction
#'
#' This notebook demonstrates time series analysis for traffic deaths per
#' year in Finland. Currently when the the number of traffic deaths
#' during previous year are reported, the press release claims that the
#' the traffic safety in Finland has improved or worsened depending
#' whether the number is smaller or larger than the year before. Time
#' series analysis can be used to separate random fluctuation from the
#' slowly changing traffic safety.
#'
#' # Data
#'
#' Read the data (there would data for earlier years, too, but this is
#' sufficient for the demonstration)
#+
# file preview shows a header row
deaths <- read.csv(root("demos_rstan", "trafficdeaths.csv"), header = TRUE)
head(deaths)
deaths2013 <- deaths |> filter(year<=2013)
#' First plot just the data.
#+
deaths2013 |>
ggplot(aes(x=year, y=deaths)) +
geom_point() +
labs(y = 'Number of traffic deaths in Finland', x= "Year") +
guides(linetype = "none")
#ggsave('traffic1.pdf',width=6,height=4)
deaths2013 |>
ggplot(aes(x=year, y=deaths)) +
geom_point() +
geom_line() +
labs(y = 'Number of traffic deaths in Finland', x= "Year") +
guides(linetype = "none")
#ggsave('traffic2.pdf',width=6,height=4)
#' # Poisson regression model
#'
#' The number of deaths is count data, so we use Poisson observation
#' model. We first fit log-linear model for the Poisson intensity, which
#' corresponds to assuming constant proportional change in the rate.
#+
fit_lin <- stan_glm(deaths ~ year, data=deaths, family=poisson,
refresh=1000, iter=1000, chains=4, seed=583829, refresh=0)
#' ESS's and Rhat's are ok (see, e.g., [RStan
#' workflow](http://mc-stan.org/users/documentation/case-studies/rstan_workflow.html)). Let's
#' look at the posterior predictive distribution (median and 5% and 95%
#' intervals).
#+
x_predict <- seq(1993,2030)
N_predict <- length(x_predict)
y_predict_lin <- posterior_predict(fit_lin, newdata=data.frame(year=x_predict))
mu <- apply(t(y_predict_lin), 1, quantile, c(0.05, 0.5, 0.95)) %>%
t() %>% data.frame(x = x_predict, .) %>% gather(pct, y, -x)
pfit <- ggplot() +
geom_point(aes(year, deaths), data = deaths2013, size = 1) +
geom_line(aes(x, y, linetype = pct), data = mu, color = 'red') +
scale_linetype_manual(values = c(2,1,2)) +
annotate(geom="text", x=2031, y=mu$y[mu$x==2030], label=c('5%','50%','95%'))+
theme(legend.position="none")+
labs(x = 'Year', y = 'Number of traffic deaths in Finland') +
guides(title='f')
(pfit)
x_predict <- seq(1993,2030)
N_predict <- length(x_predict)
y_predict_lin <- posterior_predict(fit_lin, newdata=data.frame(year=x_predict))
mu <- apply(t(y_predict_lin), 1, quantile, c(0.05, 0.5, 0.95)) %>%
t() %>% data.frame(x = x_predict, .) %>% gather(pct, y, -x)
pfit <- ggplot() +
geom_point(aes(year, deaths), data = deaths, size = 1) +
geom_line(aes(x, y, linetype = pct), data = mu, color = 'red') +
scale_linetype_manual(values = c(2,1,2)) +
annotate(geom="text", x=2031, y=mu$y[mu$x==2030], label=c('5%','50%','95%'))+
theme(legend.position="none")+
labs(x = 'Year', y = 'Number of traffic deaths in Finland') +
guides(title='f')
(pfit)
#ggsave('traffic5.pdf',width=6,height=4)
#' Next we fit a non-linear spline model with `stan_gamm4`
#+
fit_gam <- stan_gamm4(deaths ~ year + s(year), data=deaths2020,
family=poisson, adapt_delta=0.999,
refresh=1000, iter=2000, chain=4, seed=583829, refresh=0)
#' ESS is clearly smaller than for the linear model, but Rhat's are ok.
#'
#' Let's look at the posterior predictive distribution.
#+
x_predict=seq(1993,2030)
N_predict=length(x_predict)
y_predict_gam <- posterior_predict(fit_gam, newdata=data.frame(year=x_predict))
mu <- apply(t(y_predict_gam), 1, quantile, c(0.05, 0.5, 0.95)) %>%
t() %>% data.frame(x = x_predict, .) %>% gather(pct, y, -x)
pfit <- ggplot() +
geom_point(aes(year, deaths), data = deaths2020, size = 1) +
geom_line(aes(x, y, linetype = pct), data = mu, color = 'red') +
scale_linetype_manual(values = c(2,1,2)) +
annotate(geom="text", x=2031, y=mu$y[mu$x==2030], label=c('5%','50%','95%'))+
labs(x = 'Year', y = 'Traffic deaths') +
guides(linetype = 'none')
(pfit)
#' The predictive median is clearly nonlinear. The predictive mean for
#' future years stays at the same level as the most recent observations,
#' but uncertainty increases quickly.
#'
#' Finally we fit Gaussian process centered on linear model. We use
#' brms for this:
#+ comment=NA
fit_gp <- brm(deaths ~ year + gp(year), data=deaths2020,
family=poisson, adapt_delta=0.95,
refresh=1000, iter=2000, chain=4, seed=583829, refresh=0)
x_predict=seq(1993,2030)
N_predict=length(x_predict)
y_predict_gp <- posterior_predict(fit_gp, newdata=data.frame(year=x_predict))
mu <- apply(t(y_predict_gp), 1, quantile, c(0.05, 0.5, 0.95)) %>%
t() %>% data.frame(x = x_predict, .) %>% gather(pct, y, -x)
pfit <- ggplot() +
geom_point(aes(year, deaths), data = deaths2020, size = 1) +
geom_line(aes(x, y, linetype = pct), data = mu, color = 'red') +
scale_linetype_manual(values = c(2,1,2)) +
annotate(geom="text", x=2031, y=mu$y[mu$x==2030], label=c('5%','50%','95%'))+
labs(x = 'Year', y = 'Traffic deaths') +
guides(linetype = F)
(pfit)
#' Finally we compare models using PSIS-LOO predictive performance estimates.
#+
(loo_lin<-loo(fit_lin, save_psis=TRUE))
(loo_gam<-loo(fit_gam, save_psis=TRUE))
(loo_gp<-loo(fit_gp, save_psis=TRUE))
loo_compare(loo_lin, loo_gam, loo_gp)
#' There are no practical differences in predictive performance, which is
#' partially due to small number of observations. Based on the posterior
#' predictive distributions there are clear differences in the future
#' predictions.
#' We can also look at the calibration of leave-one-out predictive
#' distributions
ppc_loo_intervals(deaths$deaths,
y_predict_lin[,1:29],
psis_object=loo_lin$psis_object)+
labs(title='PPC-LOO linear model')
ppc_loo_intervals(deaths$deaths,
y_predict_gp[,1:29],
psis_object=loo_gp$psis_object)+
labs(title='PPC-LOO GP model')
#' There is a small difference in favor of GP model.
#'
#'
#' # Licenses {.unnumbered}
#'
#' * Code © 2017-2020, Aki Vehtari, licensed under BSD-3.
#' * Text © 2017-2020, Aki Vehtari, licensed under CC-BY-NC 4.0.
#'
#' # Original Computing Environment {.unnumbered}
#+
sessionInfo()
#'