diff --git a/.Rbuildignore b/.Rbuildignore
index bd73ad6..62c5510 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -6,9 +6,7 @@
 ^.+\.Rproj$
 ^\.Rproj\.user$
 ^quilt_config\.yml$
-^inst/doc/presentation/.*$
-^inst/doc/publication_analyses/.*$
-^inst/doc/website_source/.*\.html$
+^inst/doc/.*$
 ^big_data$
 ^scratch$
 ^inst/extdata/mothur\.summary$
diff --git a/inst/doc/publication_analyses/16s_database_comparison/01--silva.Rmd b/inst/doc/publication_analyses/16s_database_comparison/01--silva.Rmd
new file mode 100644
index 0000000..caff09a
--- /dev/null
+++ b/inst/doc/publication_analyses/16s_database_comparison/01--silva.Rmd
@@ -0,0 +1,116 @@
+---
+title: Graphing the SILVA database
+---
+
+```{r silva_setup, echo=FALSE, warning=FALSE, message=FALSE}
+source("settings.R")
+```
+
+## Requirements 
+
+**NOTE:** This analysis requires at least 10Gb of RAM to run. 
+
+## Parse database
+
+The code below parses and subsets the entire SILVA non-redundant reference database.
+The object made is quite large.
+
+```{r load_silva}
+file_path <- file.path(input_folder, "SILVA_123.1_SSURef_Nr99_tax_silva.fasta")
+system.time(silva <- extract_taxonomy(seqinr::read.fasta(file_path, as.string = TRUE), 
+                                      regex = "^(.*?) (.*)$",
+                                      key = c(id = "obs_info", "class"),
+                                      class_sep = ";"))
+print(silva)
+```
+
+## Subset
+
+```{r subset_silva}
+if (! is.null(max_taxonomy_depth)) {
+  system.time(silva <- filter_taxa(silva, n_supertaxa <= max_taxonomy_depth))
+  print(silva)
+}
+if (! is.null(min_seq_count)) {
+  system.time(silva <- filter_taxa(silva, n_obs >= min_seq_count))
+  print(silva)
+}
+if (just_bacteria) {
+  system.time(silva <- filter_taxa(silva, name == "Bacteria", subtaxa = TRUE))
+  print(silva)
+}
+```
+
+
+## Plot whole database
+
+Although SILVA is such a large database (`r nrow(silva$taxa_data)` taxa) that graphing everything can be a bit overwhelming, it gives an intuitive feel for the complexity of the database:
+
+```{r silva_plot_all}
+system.time(silva_plot_all <- plot(silva,
+                                   node_size = n_obs,
+                                   node_color = n_obs,
+                                   node_label = name,
+                                   node_label_max = 150,
+                                   node_color_axis_label = "Sequence count",
+                                   make_legend = FALSE,
+                                   output_file = file.path(output_folder, paste0("silva--all", output_format))))
+print(silva_plot_all)
+```
+
+
+## PCR
+
+```{r silva_pcr}
+# Replace all u with t so in silico PCR works
+system.time(silva$obs_data$sequence <- gsub(pattern = "u", replacement = "t", silva$obs_data$sequence))
+# in silico PCR
+system.time(silva_pcr <- primersearch(silva,
+                                      forward = forward_primer,
+                                      reverse = reverse_primer,
+                                      mismatch = max_mismatch))
+```
+
+```{r silva_plot_pcr_all}
+system.time(silva_plot_pcr_all <- plot(silva_pcr,
+                                       node_size = n_obs,
+                                       node_label = name,
+                                       node_color = prop_amplified,
+                                       node_color_range =  pcr_success_color_scale,
+                                       node_color_trans = "linear",
+                                       node_label_max = 150,
+                                       node_color_axis_label = "Proportion PCR success",
+                                       node_size_axis_label = "Sequence count",
+                                       output_file = file.path(output_folder, paste0("silva--pcr_all", output_format))))
+print(silva_plot_pcr_all)
+```
+
+```{r, silva_plot_pcr_fail}
+system.time(silva_plot_pcr_fail <- silva_pcr %>%
+              filter_taxa(prop_amplified < pcr_success_cutoff, supertaxa = TRUE) %>%
+              plot(node_size = n_obs - count_amplified,
+                   node_label = name,
+                   node_color = prop_amplified,
+                   node_color_range =  pcr_success_color_scale,
+                   node_color_trans = "linear",
+                   node_color_interval = c(0, 1),
+                   node_label_size_range = c(0.008, 0.015),
+                   node_label_max = 500,
+                   node_color_axis_label = "Proportion PCR success",
+                   node_size_axis_label = "Sequences not amplified",
+                   make_legend = FALSE,
+                   output_file = file.path(output_folder, paste0("silva--pcr_fail", output_format))))
+print(silva_plot_pcr_fail)
+```
+
+
+## Save outputs for composite figure
+
+Some results from this file will be combined with other to make a composite figure.
+Below, the needed objects are saved so that they can be loaded by another Rmd file.
+
+```{r silva_save}
+save(file = file.path(output_folder, "silva_data.RData"),
+     silva_plot_all, silva_plot_pcr_fail)
+```
+
diff --git a/inst/doc/publication_analyses/16s_database_comparison/02--rdp.Rmd b/inst/doc/publication_analyses/16s_database_comparison/02--rdp.Rmd
new file mode 100644
index 0000000..1bb3255
--- /dev/null
+++ b/inst/doc/publication_analyses/16s_database_comparison/02--rdp.Rmd
@@ -0,0 +1,115 @@
+---
+title: Graphing the RDP database
+---
+
+```{r rdp_setup, echo=FALSE, warning=FALSE, message=FALSE}
+source("settings.R")
+```
+
+## Requirements 
+
+**NOTE:** This analysis requires at least 16Gb of RAM to run. 
+
+## Parse database
+
+The code below parses and subsets the entire RDP non-redundant reference database.
+The object made is quite large.
+
+```{r load_rdp}
+file_path <- file.path(input_folder, "rdp_current_Bacteria_unaligned.fa")
+system.time(rdp <- extract_taxonomy(seqinr::read.fasta(file_path, as.string = TRUE),
+                                    regex = "\\tLineage(.*)",
+                                    key = c("class"),
+                                    class_regex = "[;=](.+?);(.+?)",
+                                    class_key = c("name", rdp_rank = "taxon_info")))
+print(rdp)
+```
+
+## Subset
+
+```{r subset_rdp}
+if (! is.null(max_taxonomy_depth)) {
+  system.time(rdp <- filter_taxa(rdp, n_supertaxa <= max_taxonomy_depth))
+  print(rdp)
+}
+if (! is.null(min_seq_count)) {
+  system.time(rdp <- filter_taxa(rdp, n_obs >= min_seq_count))
+  print(rdp)
+}
+if (just_bacteria) {
+  system.time(rdp <- filter_taxa(rdp, name == "Bacteria", subtaxa = TRUE))
+  print(rdp)
+}
+```
+
+
+
+## Plot whole database
+
+Although RDP is such a large database (`r nrow(rdp$taxa_data)` taxa) that graphing everything can be a bit overwhelming, it gives an intuitive feel for the complexity of the database:
+
+```{r rdp_plot_all}
+system.time(rdp_plot_all <- plot(rdp,
+                             node_size = n_obs,
+                             node_color = n_obs,
+                             node_label = name,
+                             node_label_max = 150,
+                             node_color_axis_label = "Sequence count",
+                             make_legend = FALSE,
+                             output_file = file.path(output_folder, paste0("rdp--all", output_format))))
+print(rdp_plot_all)
+```
+
+
+## PCR
+
+```{r rdp_pcr}
+system.time(rdp_pcr <- primersearch(rdp,
+                                      forward = forward_primer,
+                                      reverse = reverse_primer,
+                                      mismatch = max_mismatch))
+```
+
+```{r rdp_plot_pcr_all}
+system.time(rdp_plot_pcr_all <- plot(rdp_pcr,
+                                       node_size = n_obs,
+                                       node_label = name,
+                                       node_color = prop_amplified,
+                                       node_color_range =  pcr_success_color_scale,
+                                       node_color_trans = "linear",
+                                       node_label_max = 150,
+                                       node_color_axis_label = "Proportion PCR success",
+                                       node_size_axis_label = "Sequence count",
+                                       output_file = file.path(output_folder, paste0("rdp--pcr_all", output_format))))
+print(rdp_plot_pcr_all)
+```
+
+```{r, rdp_plot_pcr_fail}
+system.time(rdp_plot_pcr_fail <- rdp_pcr %>%
+              filter_taxa(prop_amplified < pcr_success_cutoff, supertaxa = TRUE) %>%
+              plot(node_size = n_obs - count_amplified,
+                   node_label = name,
+                   node_color = prop_amplified,
+                   node_color_range =  pcr_success_color_scale,
+                   node_color_trans = "linear",
+                   node_color_interval = c(0, 1),
+                   node_label_size_range = c(0.008, 0.015),
+                   node_label_max = 500,
+                   node_color_axis_label = "Proportion PCR success",
+                   node_size_axis_label = "Sequences not amplified",
+                   make_legend = FALSE,
+                   output_file = file.path(output_folder, paste0("rdp--pcr_fail", output_format))))
+print(rdp_plot_pcr_fail)
+```
+
+
+## Save outputs for composite figure
+
+Some results from this file will be combined with other to make a composite figure.
+Below, the needed objects are saved so that they can be loaded by another Rmd file.
+
+```{r rdp_save}
+save(file = file.path(output_folder,"rdp_data.RData"),
+     rdp_plot_all, rdp_plot_pcr_fail)
+```
+
diff --git a/inst/doc/publication_analyses/16s_database_comparison/03--greengenes.Rmd b/inst/doc/publication_analyses/16s_database_comparison/03--greengenes.Rmd
new file mode 100644
index 0000000..f35e99d
--- /dev/null
+++ b/inst/doc/publication_analyses/16s_database_comparison/03--greengenes.Rmd
@@ -0,0 +1,154 @@
+---
+title: Graphing the Greengenes database
+---
+
+```{r greengenes_setup, echo=FALSE, warning=FALSE, message=FALSE}
+source("settings.R")
+```
+
+## Requirements 
+
+**NOTE:** This analysis requires at least 10Gb of RAM to run. 
+
+## Parse database
+
+The greengenes database stores sequences in one file and taxonomy information in another and the order of the two files differ making parseing more difficult than the other databases.
+Since taxonomy inforamtion is needed for creating the `taxmap` data structure, we will parse it first and add the sequence information on after.
+
+### Parse taxonomy file
+
+```{r greengenes_taxonomy}
+gg_taxonomy_path <- "databases/gg_13_5_taxonomy.txt"
+gg_taxonomy <- readLines(gg_taxonomy_path)
+print(gg_taxonomy[1:5])
+```
+
+Note that there are some ranks with no names. 
+These will be removed after parsing the file since they provide no information and an uniform-length taxonomy is not needed.
+
+```{r greengenes_taxonomy_parse}
+# Parse taxonomy file
+system.time(greengenes <- extract_taxonomy(input = gg_taxonomy, key = c(id = "obs_info", "class"), regex = "^([0-9]+)\t(.*)$", class_sep = "; ", class_regex = "^([a-z]{1})__(.*)$", class_key = c(rank = "taxon_info", "name")))
+# Remove data for ranks with no information
+greengenes <- filter_taxa(greengenes, name != "")
+print(greengenes)
+```
+
+
+### Parse sequence file
+
+Next we will parse the sequence file so we can add it to the `obs_data` table of the `greengenes` object.
+
+```{r greengenes_sequence}
+gg_sequence_path <- "databases/gg_13_5.fasta"
+substr(readLines(gg_sequence_path, n = 10), 1, 100)
+```
+
+This can be easily parsed using `seqinr`:
+
+```{r greengenes_sequence_read}
+gg_sequences <- seqinr::read.fasta(gg_sequence_path, as.string = TRUE)
+```
+
+
+### Integrating sequence and taxonomy
+
+We will need to use the Greengenes ID to match up which sequence goes with which row since they are in different orders.
+
+```{r greengenes_combine}
+greengenes <- mutate_obs(greengenes, sequence = unlist(gg_sequences)[as.character(id)])
+```
+
+
+## Subset
+
+```{r subset_greengenes}
+if (! is.null(max_taxonomy_depth)) {
+  system.time(greengenes <- filter_taxa(greengenes, n_supertaxa <= max_taxonomy_depth))
+  print(greengenes)
+}
+if (! is.null(min_seq_count)) {
+  system.time(greengenes <- filter_taxa(greengenes, n_obs >= min_seq_count))
+  print(greengenes)
+}
+if (just_bacteria) {
+  system.time(greengenes <- filter_taxa(greengenes, name == "Bacteria", subtaxa = TRUE))
+  print(greengenes)
+}
+```
+
+
+
+## Plot whole database
+
+Although Greengenes is such a large database (`r nrow(greengenes$taxa_data)` taxa) that graphing everything can be a bit overwhelming, it gives an intuitive feel for the complexity of the database:
+
+```{r greengenes_plot_all}
+system.time(greengenes_plot_all <- plot(greengenes,
+                                        node_size = n_obs,
+                                        node_size_range = c(0.001, 0.01),
+                                        node_size_interval = c(1, 3000000),
+                                        edge_size_range = c(0.001, 0.01),
+                                        edge_size_interval = c(1, 3000000),
+                                        node_color_interval = c(1, 3000000),
+                                        node_color = n_obs,
+                                        node_label = name,
+                                        node_label_max = 150,
+                                        node_color_axis_label = "Sequence count",
+                                        output_file = file.path(output_folder, paste0("greengenes--all", output_format))))
+print(greengenes_plot_all)
+```
+
+
+## PCR
+
+```{r greengenes_pcr}
+system.time(greengenes_pcr <- primersearch(greengenes,
+                                      forward = forward_primer,
+                                      reverse = reverse_primer,
+                                      mismatch = max_mismatch))
+```
+
+```{r greengenes_plot_pcr_all}
+system.time(greengenes_plot_pcr_all <- plot(greengenes_pcr,
+                                       node_size = n_obs,
+                                       node_label = name,
+                                       node_color = prop_amplified,
+                                       node_color_range =  pcr_success_color_scale,
+                                       node_color_trans = "linear",
+                                       node_label_max = 150,
+                                       node_color_axis_label = "Proportion PCR success",
+                                       node_size_axis_label = "Sequence count",
+                                       output_file = file.path(output_folder, paste0("greengenes--pcr_all", output_format))))
+print(greengenes_plot_pcr_all)
+```
+
+```{r, greengenes_plot_pcr_fail}
+system.time(greengenes_plot_pcr_fail <- greengenes_pcr %>%
+              filter_taxa(prop_amplified < pcr_success_cutoff, supertaxa = TRUE) %>%
+              plot(node_size = n_obs - count_amplified,
+                   node_label = name,
+                   node_color = prop_amplified,
+                   node_color_range =  pcr_success_color_scale,
+                   node_color_trans = "linear",
+                   node_color_interval = c(0, 1),
+                   # node_label_size_range = c(0.008, 0.015),
+                   node_label_max = 500,
+                   node_color_axis_label = "Proportion PCR success",
+                   node_size_axis_label = "Sequences not amplified",
+                   output_file = file.path(output_folder, paste0("greengenes--pcr_fail", output_format))))
+print(greengenes_plot_pcr_fail)
+```
+
+
+## Save outputs for composite figure
+
+Some results from this file will be combined with other to make a composite figure.
+Below, the needed objects are saved so that they can be loaded by another Rmd file.
+
+```{r greengenes_save}
+save(file = file.path(output_folder, "greengenes_data.RData"),
+     greengenes_plot_all, greengenes_plot_pcr_fail)
+```
+
+
diff --git a/inst/doc/publication_analyses/16s_database_comparison/04--16s_database_comparision.Rmd b/inst/doc/publication_analyses/16s_database_comparison/04--16s_database_comparision.Rmd
new file mode 100644
index 0000000..692f9dd
--- /dev/null
+++ b/inst/doc/publication_analyses/16s_database_comparison/04--16s_database_comparision.Rmd
@@ -0,0 +1,40 @@
+---
+title: "Comparing 16S databases"
+---
+
+```{r, echo=FALSE, warning=FALSE, message=FALSE}
+source("settings.R")
+```
+
+## Run individual analyses
+
+```{r run_all}
+library(rmarkdown)
+render(input = "01--silva.Rmd")
+render(input = "02--rdp.Rmd")
+render(input = "03--greengenes.Rmd")
+```
+
+
+## Load plots 
+
+```{r load}
+load(file.path(output_folder, "silva_data.RData"))
+load(file.path(output_folder, "rdp_data.RData"))
+load(file.path(output_folder, "greengenes_data.RData"))
+```
+
+
+## Combine plots
+
+```{r combine}
+combo_plot <- gridExtra::grid.arrange(ncol = 2, nrow = 3,
+                                      top = "All data                                                           Not amplified        ",
+                                      left = "SILVA                                                      RDP                                                              Greengenes",
+                                      silva_plot_all, silva_plot_pcr_fail,
+                                      rdp_plot_all, rdp_plot_pcr_fail,
+                                      greengenes_plot_all, greengenes_plot_pcr_fail)
+ggplot2::ggsave(file.path(output_folder, "figure_2--16s_database_comparison.pdf"), 
+                combo_plot, width = 7.5, height = 10)
+```
+
diff --git a/inst/doc/publication_analyses/16s_database_comparison/vignette_06_examples--16s_database_comparision.Rmd b/inst/doc/publication_analyses/16s_database_comparison/vignette_06_examples--16s_database_comparision.Rmd
deleted file mode 100644
index 857f70e..0000000
--- a/inst/doc/publication_analyses/16s_database_comparison/vignette_06_examples--16s_database_comparision.Rmd
+++ /dev/null
@@ -1,351 +0,0 @@
----
-title: "Comparing 16S databases"
-output:
-  html_document:
-    toc: false
-    css: custom.css
----
-
-```{r, echo=FALSE, warning=FALSE, message=FALSE}
-library(metacoder)
-library(knitr)
-source("knitr_settings.R")
-```
-
-## Analyses parameters
-
-```{r}
-input_folder <- "../../../big_data"
-output_folder <- "16s_database_comparison_results"
-output_format <- ".pdf"
-max_taxonomy_depth <- 7
-min_seq_count <- 10
-max_mismatch <- 10 # percentage mismatch tolerated in pcr
-pcr_success_cutoff <- 0.90 # Used to subset for graphing
-forward = c("515F" = "GTGYCAGCMGCCGCGGTAA")
-reverse = c("806R" = "GGACTACNVGGGTWTCTAAT")
-```
-
-```{r}
-dir.create(output_folder, showWarnings = FALSE)
-```
-
-
-## SILVA
-
-### Parse file
-
-The code below parses and subsets the entire SILVA non-redundant reference database.
-It is done in a single step so that no large intermediate files are generated.
-Even so, the object made is quite large.
-
-```{r}
-set.seed(1)
-file_path <- file.path(input_folder, "SILVA_123.1_SSURef_Nr99_tax_silva.fasta")
-system.time(silva <- seqinr::read.fasta(file_path, as.string = TRUE) %>%
-              extract_taxonomy(regex = "^(.*?) (.*)$",
-                               key = c(id = "obs_info", "class"),
-                               class_sep = ";") %>% 
-              filter_taxa(n_supertaxa <= max_taxonomy_depth) %>%
-              filter_taxa(name == "Bacteria", subtaxa = TRUE) %>%
-              filter_taxa(n_obs >= min_seq_count))
-```
-
-### Plot data
-
-```{r}
-system.time(filter_taxa(silva, n_supertaxa <= 4) %>%
-              plot(node_size = n_obs,
-                   node_color = n_obs,
-                   node_label = name,
-                   node_label_max = 150,
-                   title = "SILVA sequence abundance",
-                   title_size = 0.03,
-                   node_color_axis_label = "Sequence count",
-                   output_file = file.path(output_folder, paste0("silva--abundance_rank_5", output_format))))
-```
-
-### PCR
-
-```{r}
-silva$obs_data$sequence <- gsub(pattern = "u", replacement = "t", silva$obs_data$sequence)
-system.time(pcr <- primersearch(silva,
-                                forward = forward_primer,
-                                reverse = reverse_primer,
-                                mismatch = max_mismatch))
-```
-
-```{r}
-system.time(filter_taxa(pcr, n_supertaxa <= 4) %>%
-              plot(node_size = n_obs,
-                   node_label = name,
-                   node_color = prop_amplified,
-                   node_color_range =  c("red", "cyan"),
-                   node_color_trans = "linear",
-                   node_label_max = 150,
-                   title = paste0("SILVA in silico pcr with ", pcr$obs_data$pair_name[1]),
-                   title_size = 0.03,
-                   node_color_axis_label = "Proportion PCR success",
-                   node_size_axis_label = "Sequence count",
-                   output_file = file.path(output_folder, paste0("silva--pcr_all_rank_5", output_format))))
-```
-
-```{r}
-system.time(filter_taxa(pcr, n_supertaxa <= 4) %>%
-              filter_taxa(prop_amplified < pcr_success_cutoff, supertaxa = TRUE) %>%
-              plot(node_size = n_obs,
-                   node_label = name,
-                   node_color = prop_amplified,
-                   node_color_range =  c("red", "cyan"),
-                   node_color_trans = "linear",
-                   node_label_max = 100,
-                   title = paste0("SILVA in silico pcr with ", pcr$obs_data$pair_name[1], 
-                                  " ( < ", pcr_success_cutoff * 100, "%)"),
-                   title_size = 0.03,
-                   node_color_axis_label = "Proportion PCR success",
-                   node_size_axis_label = "Sequence count",
-                   output_file = file.path(output_folder, paste0("silva--pcr_rank_5_less_than_", 
-                                                                 pcr_success_cutoff * 100, 
-                                                                 output_format))))
-```
-
-
-
-
-## RDP
-
-```{r}
-set.seed(1)
-file_path <- file.path(input_folder, "rdp_current_Bacteria_unaligned.fa")
-system.time(rdp <- seqinr::read.fasta(file_path, as.string = TRUE) %>% 
-              extract_taxonomy(regex = "\\tLineage(.*)",
-                               key = c("class"),
-                               class_regex = "[;=](.+?);(.+?)",
-                               class_key = c("name", rdp_rank = "taxon_info")) %>% 
-              filter_taxa(n_supertaxa <= max_taxonomy_depth) %>%
-              filter_taxa(name == "Bacteria", subtaxa = TRUE) %>%
-              filter_taxa(n_obs >= min_seq_count))
-```
-
-
-### Plot data
-
-```{r}
-system.time(filter_taxa(rdp, n_supertaxa <= 4) %>%
-              plot(node_size = n_obs,
-                 node_color = n_obs,
-                 node_label = name,
-                 node_label_max = 150,
-                 title = "RDP sequence abundance",
-                 title_size = 0.03,
-                 node_color_axis_label = "Sequence count",
-                 output_file = file.path(output_folder, paste0("rdp--abundance_rank_5", output_format))))
-```
-
-
-```{r}
-system.time(plot(rdp, node_size = n_obs,
-                 node_color = n_obs,
-                 node_label = name,
-                 node_label_max = 150,
-                 title = "RDP sequence abundance",
-                 title_size = 0.03,
-                 node_color_axis_label = "Sequence count",
-                 output_file = file.path(output_folder, paste0("rdp--abundance_all_taxa", output_format))))
-```
-
-
-### PCR
-
-```{r}
-system.time(pcr <- primersearch(rdp,
-                                forward = forward_primer,
-                                reverse = reverse_primer,
-                                mismatch = max_mismatch))
-```
-
-```{r}
-system.time(filter_taxa(pcr, n_supertaxa <= 4) %>%
-              plot(node_size = n_obs,
-                 node_label = name,
-                 node_color = prop_amplified,
-                 node_color_range =  c("red", "cyan"),
-                 node_color_trans = "linear",
-                 node_label_max = 150,
-                 title = paste0("RDP in silico pcr with ", unique(pcr$obs_data$pair_name)[2]),
-                 title_size = 0.03,
-                 node_color_axis_label = "Proportion PCR success",
-                 node_size_axis_label = "Sequence count",
-                 output_file = file.path(output_folder, paste0("rdp--pcr_rank_5", output_format))))
-```
-
-```{r}
-system.time(plot(pcr,
-                 node_size = n_obs,
-                 node_label = name,
-                 node_color = prop_amplified,
-                 node_color_range =  c("red", "cyan"),
-                 node_color_trans = "linear",
-                 node_label_max = 150,
-                 title = paste0("RDP in silico pcr with ", unique(pcr$obs_data$pair_name)[2]),
-                 title_size = 0.03,
-                 node_color_axis_label = "Proportion PCR success",
-                 node_size_axis_label = "Sequence count",
-                 output_file = file.path(output_folder, paste0("rdp--pcr_all_taxa", output_format))))
-```
-
-
-
-```{r}
-system.time(filter_taxa(pcr, prop_amplified < pcr_success_cutoff, supertaxa = TRUE) %>%
-              filter_taxa(n_supertaxa <= 4) %>%
-              plot(node_size = n_obs,
-                   node_label = name,
-                   node_color = prop_amplified,
-                   node_color_range =  c("red", "cyan"),
-                   node_color_trans = "linear",
-                   node_label_max = 100,
-                   title = paste0("RDP in silico pcr with ", unique(pcr$obs_data$pair_name)[2], 
-                                  " ( < ", pcr_success_cutoff * 100, "%)"),
-                   title_size = 0.03,
-                   node_color_axis_label = "Proportion PCR success",
-                   node_size_axis_label = "Sequence count",
-                   output_file = file.path(output_folder, paste0("rdp--pcr_rank_5_less_than_", 
-                                                                 pcr_success_cutoff * 100, 
-                                                                 output_format))))
-```
-
-
-```{r}
-system.time(filter_taxa(pcr, prop_amplified < pcr_success_cutoff, supertaxa = TRUE) %>%
-              plot(node_size = n_obs,
-                   node_label = name,
-                   node_color = prop_amplified,
-                   node_color_range =  c("red", "cyan"),
-                   node_color_trans = "linear",
-                   node_label_max = 100,
-                   title = paste0("RDP in silico pcr with ", unique(pcr$obs_data$pair_name)[2], 
-                                  " ( < ", pcr_success_cutoff * 100, "%)"),
-                   title_size = 0.03,
-                   node_color_axis_label = "Proportion PCR success",
-                   node_size_axis_label = "Sequence count",
-                   output_file = file.path(output_folder, paste0("rdp--pcr_all_taxa_less_than_", 
-                                                                 pcr_success_cutoff * 100, 
-                                                                 output_format))))
-```
-
-
-
-## Greengenes
-
-### Parse data
-
-The greengenes database stores sequences in one file and taxonomy information in another and the order of the two files differ making parseing more difficult than the other databases.
-Since taxonomy inforamtion is needed for creating the `taxmap` data structure, we will parse it first and add the sequence information on after.
-
-#### Parse taxonomy file
-
-```{r}
-gg_taxonomy_path <- "data/gg_13_5_taxonomy.txt"
-gg_taxonomy <- readLines(gg_taxonomy_path)
-print(gg_taxonomy[1:5])
-```
-
-Note that there are some ranks with no names. 
-These will be removed after parsing the file since they provide no information and an uniform-length taxonomy is not needed.
-
-```{r}
-# Parse taxonomy file
-system.time(greengenes <- extract_taxonomy(input = gg_taxonomy, key = c(id = "obs_info", "class"), regex = "^([0-9]+)\t(.*)$", class_sep = "; ", class_regex = "^([a-z]{1})__(.*)$", class_key = c(rank = "taxon_info", "name")))
-# Remove data for ranks with no information
-greengenes <- filter_taxa(greengenes, name != "")
-print(greengenes)
-```
-
-
-#### Parse sequence file
-
-Next we will parse the sequence file so we can add it to the `obs_data` table of the `greengenes` object.
-
-```{r}
-gg_sequence_path <- "data/gg_13_5.fasta"
-substr(readLines(gg_sequence_path, n = 10), 1, 100)
-```
-
-This can be easily parsed using `seqinr`:
-
-```{r}
-gg_sequences <- seqinr::read.fasta(gg_sequence_path, as.string = TRUE)
-```
-
-
-#### Integrating sequence and taxonomy
-
-We will need to use the Greengenes ID to match up which sequence goes with which row since they are in different orders.
-
-```{r}
-greengenes <- mutate_obs(greengenes, sequence = unlist(gg_sequences)[as.character(id)])
-```
-
-
-### Plot data
-
-```{r}
-system.time(filter_taxa(greengenes, name == "Bacteria", subtaxa = TRUE) %>%
-              plot(node_size = n_obs,
-                   node_color = n_obs,
-                   node_label = name,
-                   node_label_max = 150,
-                   title = "Greengenes sequence abundance",
-                   title_size = 0.03,
-                   node_color_axis_label = "Sequence count",
-                   output_file = file.path(output_folder, paste0("gg--abundance_all_taxa", output_format))))
-```
-
-
-### PCR
-
-```{r}
-system.time(pcr <- primersearch(greengenes,
-                                forward = forward,
-                                reverse = reverse,
-                                mismatch = max_mismatch))
-```
-
-```{r}
-system.time(filter_taxa(pcr, name == "Bacteria", subtaxa = TRUE) %>%
-              plot(node_size = n_obs,
-                   node_label = name,
-                   node_color = prop_amplified,
-                   node_color_range =  c("red", "cyan"),
-                   node_color_trans = "linear",
-                   node_label_max = 150,
-                   title = paste0("Greengenes in silico pcr with ", unique(pcr$obs_data$pair_name)[2]),
-                   title_size = 0.03,
-                   node_color_axis_label = "Proportion PCR success",
-                   node_size_axis_label = "Sequence count",
-                   output_file = file.path(output_folder, paste0("gg--pcr_all_taxa", output_format))))
-```
-
-```{r}
-system.time(filter_taxa(pcr, name == "Bacteria", subtaxa = TRUE) %>%
-              filter_taxa(n_obs >= 3, supertaxa = TRUE) %>%
-              filter_taxa(prop_amplified < pcr_success_cutoff, supertaxa = TRUE) %>%
-              plot(node_size = n_obs - count_amplified,
-                   node_label = name,
-                   node_color = prop_amplified,
-                   node_color_range =  c("red", "orange", "yellow", "green", "cyan"),
-                   node_color_trans = "linear",
-                   node_color_interval = c(0, 1),
-                   node_label_size_range = c(0.008, 0.015),
-                   node_label_max = 500,
-                   title = paste0("Greengenes in silico pcr with ", unique(pcr$obs_data$pair_name)[2], 
-                                  " ( < ", pcr_success_cutoff * 100, "%)"),
-                   title_size = 0.03,
-                   node_color_axis_label = "Proportion PCR success",
-                   node_size_axis_label = "Sequences not amplified",
-                   output_file = file.path(output_folder, paste0("gg--pcr_all_taxa_less_than_", 
-                                                                 pcr_success_cutoff * 100, 
-                                                                 output_format))))
-```
-
diff --git a/inst/doc/publication_analyses/tara_oceans/vignette_05_examples--tara_oceans.Rmd b/inst/doc/publication_analyses/tara_oceans/vignette_05_examples--tara_oceans.Rmd
index bb0e8d1..683576c 100644
--- a/inst/doc/publication_analyses/tara_oceans/vignette_05_examples--tara_oceans.Rmd
+++ b/inst/doc/publication_analyses/tara_oceans/vignette_05_examples--tara_oceans.Rmd
@@ -4,26 +4,18 @@ output: html_document
 ---
 
 ```{r, echo=FALSE, warning=FALSE, message=FALSE}
-library(knitr)
-source("knitr_settings.R")
+library(metacoder)
 ```
 
 
-### Analyses parameters
+## Analyses parameters
 
 ```{r}
-output_folder <- "tara_oceans_results"
+output_folder <- getwd()
 output_format <- ".pdf"
 ```
 
-### Create output directory
-
-```{r}
-dir.create(output_folder, showWarnings = FALSE)
-```
-
-
-### Parsing taxonomic data
+## Parsing taxonomic data
 
 The code below downloads the compressed data to a temporary directory:
 
@@ -47,14 +39,11 @@ unpacked_file_path <- file.path(temp_dir_path,
 ```
 
 ```{r, warning=FALSE, message=FALSE}
-# Load the package
-library(metacoder)
-# Extract the taxonomic information of the sequences
 data <- parse_taxonomy_table(unpacked_file_path, taxon_col = c("class" = -9), class_sep = "\\|")
 ```
 
 
-### Getting sample data
+## Getting sample data
 
 ```{r}
 data_url <- "http://taraoceans.sb-roscoff.fr/EukDiv/data/Database_W1_Sample_parameters.xls"
@@ -65,7 +54,7 @@ sample_data <- readxl::read_excel(local_file_path)
 ```
 
 
-### Caluculate read abundance per taxon
+## Caluculate read abundance per taxon
 
 ```{r}
 sample_columns <- sample_data[["PANGAEA ACCESSION NUMBER"]]
@@ -75,7 +64,7 @@ data <- mutate_taxa(data,
 ```
 
 
-### Plot read and OTU abundance
+## Plot read and OTU abundance
 
 ```{r}
 seed = 9 #9, 10, 12 is good
@@ -114,159 +103,11 @@ data %>%
        background_color = background_color,
        maxiter = 50, fineiter = 50,
        margin_size = c(0.001, 0.001),
-       output_file = file.path(output_folder, paste0("sup_figure_1--tara_all_plankton_diversity--seed_", seed,  output_format)))
-```
-
-
-### Sampling depth differential
-
-#### Calculate depth read counts
-
-```{r}
-sur_columms <- sample_data[["PANGAEA ACCESSION NUMBER"]][sample_data[["SAMPLING DEPTH"]] == "SUR"]
-dcm_columms <- sample_data[["PANGAEA ACCESSION NUMBER"]][sample_data[["SAMPLING DEPTH"]] == "DCM"]
-
-data <- mutate_obs(data, sur_counts = apply(data$obs_data[, sur_columms], MARGIN = 1,
-                                              function(x) sum(as.numeric(x))))
-data <- mutate_obs(data, dcm_counts = apply(data$obs_data[, dcm_columms], MARGIN = 1,
-                                              function(x) sum(as.numeric(x))))
-
-data <- mutate_taxa(data, sur_taxon_counts = vapply(obs(data),
-                                                    function(x) sum(data$obs_data$sur_counts[x]), numeric(1)))
-data <- mutate_taxa(data, dcm_taxon_counts = vapply(obs(data),
-                                                    function(x) sum(data$obs_data$dcm_counts[x]), numeric(1)))
-
-data <- mutate_taxa(data, depth_prop_dcm = dcm_taxon_counts / read_counts)
-total_dcm_reads <- sum(data$taxon_data$dcm_taxon_counts[n_supertaxa(data) == 0])
-total_sur_reads <- sum(data$taxon_data$sur_taxon_counts[n_supertaxa(data) == 0])
-total_reads <- total_dcm_reads + total_sur_reads
-
-
-prop_diff_test_stat <- function(prop_1, prop_2, prop_total, count_1, count_2) {
-  (prop_1 - prop_2) / sqrt(prop_total * (1 - prop_total) * (1 / count_1 + 1 / count_2))
-}
-
-data <- mutate_taxa(data, depth_prop_diff = ((dcm_taxon_counts / total_dcm_reads) - (sur_taxon_counts / total_sur_reads)) / (read_counts / total_reads))
-data <- mutate_taxa(data, depth_diff_test_stat = prop_diff_test_stat(prop_1 = dcm_taxon_counts / total_dcm_reads,
-                                                                     prop_2 = sur_taxon_counts / total_sur_reads,
-                                                                     prop_total = read_counts / total_reads,
-                                                                     count_1 = total_dcm_reads,
-                                                                     count_2 = total_sur_reads))
-```
-
-#### Plot proportion of reads in DCM 
-
-```{r}
-seed = 9 #9, 10, 12 is good
-set.seed(seed)
-taxa_patterns_to_hide <- paste0("^", c("[X]+", "X\\+sp\\.", "NA", "root", "\\*", "sp\\.", "sp"), "$")
-taxa_patterns_to_remove <- paste0("^", c("X\\+sp\\.", "NA", "root", "\\*", "sp\\.", "sp"), "$")
-data %>%
-  filter_taxa(! Reduce(`|`, lapply(taxa_patterns_to_remove, grepl, x = name))) %>%
-  filter_taxa(read_counts >= 100) %>%
-  filter_taxa(!(taxon_ids %in% subtaxa(data, name == "Eukaryota", simplify = TRUE, include_input = TRUE) & n_supertaxa <= 1)) %>%
-  plot(title = "Location of taxa in water column",
-       title_size = 0.03,
-       node_color_axis_label = "DCM only        Both       Surface only",
-       node_size_axis_label = "Number of species (OTUs)",
-       node_size = n_obs,
-       node_size_range = c(0.0012, NA),
-       node_color = depth_prop_dcm,
-       node_color_range = diverging_palette(),
-       node_color_trans = "linear",
-       node_color_interval = c(0, 1),
-       node_label = ifelse(grepl(pattern = "^[a-zA-z\\-]{1,25}$", name) &
-                               ! Reduce(`|`, lapply(taxa_patterns_to_hide, grepl, x = name)),
-                             name, NA),
-       node_label_size = (n_obs / (n_supertaxa + 1)) ^ 0.5,
-       node_label_size_trans = "area",
-       node_label_size_range = c(0.001, NA),
-       node_label_max = 1000,
-       tree_label = name,
-       tree_label_max = 100,
-       initial_layout = "re", layout = "da",
-       overlap_avoidance = .65,
-       maxiter = 50, fineiter = 50,
-       margin_size = c(0.001, 0.001),
-       output_file = file.path(output_folder, paste0("sup_figure_2--tara_proportion_in_dcm--seed_", seed,  output_format)))
+       output_file = file.path(output_folder, paste0("sup_figure_1--tara_all_plankton_diversity",  output_format)))
 ```
 
 
-```{r}
-seed = 9 #9, 10, 12 is good
-set.seed(seed)
-taxa_patterns_to_hide <- paste0("^", c("[X]+", "X\\+sp\\.", "NA", "root", "\\*", "sp\\.", "sp"), "$")
-taxa_patterns_to_remove <- paste0("^", c("X\\+sp\\.", "NA", "root", "\\*", "sp\\.", "sp"), "$")
-data %>%
-  filter_taxa(! Reduce(`|`, lapply(taxa_patterns_to_remove, grepl, x = name))) %>%
-  filter_taxa(read_counts >= 100) %>%
-  filter_taxa(!(taxon_ids %in% subtaxa(data, name == "Eukaryota", simplify = TRUE, include_input = TRUE) & n_supertaxa <= 1)) %>%
-  plot(title = "Location of taxa in water column",
-       title_size = 0.03,
-       node_color_axis_label = "DCM only        Both       Surface only",
-       node_size_axis_label = "Number of species (OTUs)",
-       node_size = n_obs,
-       node_size_range = c(0.0012, NA),
-       node_color = depth_prop_diff,
-       node_color_range = diverging_palette(),
-       node_color_trans = "linear",
-       node_color_interval = c(-2, 2),
-       edge_color_interval = c(-2, 2),
-       node_label = ifelse(grepl(pattern = "^[a-zA-z\\-]{1,25}$", name) &
-                               ! Reduce(`|`, lapply(taxa_patterns_to_hide, grepl, x = name)),
-                             name, NA),
-       node_label_size = (n_obs / (n_supertaxa + 1)) ^ 0.5,
-       node_label_size_trans = "area",
-       node_label_size_range = c(0.001, NA),
-       node_label_max = 1000,
-       tree_label = name,
-       tree_label_max = 100,
-       initial_layout = "re", layout = "da",
-       overlap_avoidance = .65,
-       maxiter = 50, fineiter = 50,
-       margin_size = c(0.001, 0.001),
-       output_file = file.path(output_folder, paste0("sup_figure_2--tara_scaled_proportion_diff--seed_", seed,  output_format)))
-```
-
-```{r}
-seed = 9 #9, 10, 12 is good
-set.seed(seed)
-taxa_patterns_to_hide <- paste0("^", c("[X]+", "X\\+sp\\.", "NA", "root", "\\*", "sp\\.", "sp"), "$")
-taxa_patterns_to_remove <- paste0("^", c("X\\+sp\\.", "NA", "root", "\\*", "sp\\.", "sp"), "$")
-data %>%
-  filter_taxa(! Reduce(`|`, lapply(taxa_patterns_to_remove, grepl, x = name))) %>%
-  filter_taxa(read_counts >= 100) %>%
-  filter_taxa(!(taxon_ids %in% subtaxa(data, name == "Eukaryota", simplify = TRUE, include_input = TRUE) & n_supertaxa <= 1)) %>%
-  plot(title = "Location of taxa in water column",
-       title_size = 0.03,
-       node_color_axis_label = "DCM only        Both       Surface only",
-       node_size_axis_label = "Number of species (OTUs)",
-       node_size = n_obs,
-       node_size_range = c(0.0012, NA),
-       node_color = depth_diff_test_stat,
-       node_color_range = diverging_palette(),
-       node_color_trans = "area",
-       node_color_interval = c(-1000, 1000),
-       edge_color_interval = c(-1000, 1000),
-       node_label = ifelse(grepl(pattern = "^[a-zA-z\\-]{1,25}$", name) &
-                               ! Reduce(`|`, lapply(taxa_patterns_to_hide, grepl, x = name)),
-                             name, NA),
-       node_label_size = (n_obs / (n_supertaxa + 1)) ^ 0.5,
-       node_label_size_trans = "area",
-       node_label_size_range = c(0.001, NA),
-       node_label_max = 1000,
-       tree_label = name,
-       tree_label_max = 100,
-       initial_layout = "re", layout = "da",
-       overlap_avoidance = .65,
-       maxiter = 10, fineiter = 10,
-       margin_size = c(0.001, 0.001),
-       output_file = file.path(output_folder, paste0("sup_figure_2--tara_depth_test_stat--seed_", seed,  output_format)))
-```
-
-
-
-### Plot propotion of OTUs identified 
+## Plot propotion of OTUs identified 
 
 ```{r}
 data <- mutate_obs(data, pid = as.numeric(pid))
@@ -310,10 +151,10 @@ data %>%
        overlap_avoidance = .65,
        maxiter = 50, fineiter = 50,
        margin_size = c(0.001, 0.001),
-       output_file = file.path(output_folder, paste0("sup_figure_3--tara_proportion_identified--seed_", seed,  output_format)))
+       output_file = file.path(output_folder, paste0("sup_figure_2--tara_proportion_identified",  output_format)))
 ```
 
-#### Just Metazoa
+### Just Metazoa
 
 ```{r}
 seed = 1
@@ -340,6 +181,6 @@ data %>%
        initial_layout = "re", layout = "da",
        overlap_avoidance = .65,
        maxiter = 50, fineiter = 50,
-       output_file = file.path(output_folder, paste0("figure_1--tara_proportion_identified_metazoa--seed_", seed,  output_format)))
+       output_file = file.path(output_folder, paste0("figure_1--tara_proportion_identified_metazoa",  output_format)))
 ```