separated database comparisons

grunwaldlab · Aug 8, 2016 · b9cf970 · b9cf970
1 parent adf3659
commit b9cf970
Show file tree

Hide file tree

Showing 7 changed files with 438 additions and 525 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -6,9 +6,7 @@
 ^.+\.Rproj$
 ^\.Rproj\.user$
 ^quilt_config\.yml$
-^inst/doc/presentation/.*$
-^inst/doc/publication_analyses/.*$
-^inst/doc/website_source/.*\.html$
+^inst/doc/.*$
 ^big_data$
 ^scratch$
 ^inst/extdata/mothur\.summary$

diff --git a/inst/doc/publication_analyses/16s_database_comparison/01--silva.Rmd b/inst/doc/publication_analyses/16s_database_comparison/01--silva.Rmd
@@ -0,0 +1,116 @@
+---
+title: Graphing the SILVA database
+---
+
+```{r silva_setup, echo=FALSE, warning=FALSE, message=FALSE}
+source("settings.R")
+```
+
+## Requirements 
+
+**NOTE:** This analysis requires at least 10Gb of RAM to run. 
+
+## Parse database
+
+The code below parses and subsets the entire SILVA non-redundant reference database.
+The object made is quite large.
+
+```{r load_silva}
+file_path <- file.path(input_folder, "SILVA_123.1_SSURef_Nr99_tax_silva.fasta")
+system.time(silva <- extract_taxonomy(seqinr::read.fasta(file_path, as.string = TRUE), 
+                                      regex = "^(.*?) (.*)$",
+                                      key = c(id = "obs_info", "class"),
+                                      class_sep = ";"))
+print(silva)
+```
+
+## Subset
+
+```{r subset_silva}
+if (! is.null(max_taxonomy_depth)) {
+  system.time(silva <- filter_taxa(silva, n_supertaxa <= max_taxonomy_depth))
+  print(silva)
+}
+if (! is.null(min_seq_count)) {
+  system.time(silva <- filter_taxa(silva, n_obs >= min_seq_count))
+  print(silva)
+}
+if (just_bacteria) {
+  system.time(silva <- filter_taxa(silva, name == "Bacteria", subtaxa = TRUE))
+  print(silva)
+}
+```
+
+
+## Plot whole database
+
+Although SILVA is such a large database (`r nrow(silva$taxa_data)` taxa) that graphing everything can be a bit overwhelming, it gives an intuitive feel for the complexity of the database:
+
+```{r silva_plot_all}
+system.time(silva_plot_all <- plot(silva,
+                                   node_size = n_obs,
+                                   node_color = n_obs,
+                                   node_label = name,
+                                   node_label_max = 150,
+                                   node_color_axis_label = "Sequence count",
+                                   make_legend = FALSE,
+                                   output_file = file.path(output_folder, paste0("silva--all", output_format))))
+print(silva_plot_all)
+```
+
+
+## PCR
+
+```{r silva_pcr}
+# Replace all u with t so in silico PCR works
+system.time(silva$obs_data$sequence <- gsub(pattern = "u", replacement = "t", silva$obs_data$sequence))
+# in silico PCR
+system.time(silva_pcr <- primersearch(silva,
+                                      forward = forward_primer,
+                                      reverse = reverse_primer,
+                                      mismatch = max_mismatch))
+```
+
+```{r silva_plot_pcr_all}
+system.time(silva_plot_pcr_all <- plot(silva_pcr,
+                                       node_size = n_obs,
+                                       node_label = name,
+                                       node_color = prop_amplified,
+                                       node_color_range =  pcr_success_color_scale,
+                                       node_color_trans = "linear",
+                                       node_label_max = 150,
+                                       node_color_axis_label = "Proportion PCR success",
+                                       node_size_axis_label = "Sequence count",
+                                       output_file = file.path(output_folder, paste0("silva--pcr_all", output_format))))
+print(silva_plot_pcr_all)
+```
+
+```{r, silva_plot_pcr_fail}
+system.time(silva_plot_pcr_fail <- silva_pcr %>%
+              filter_taxa(prop_amplified < pcr_success_cutoff, supertaxa = TRUE) %>%
+              plot(node_size = n_obs - count_amplified,
+                   node_label = name,
+                   node_color = prop_amplified,
+                   node_color_range =  pcr_success_color_scale,
+                   node_color_trans = "linear",
+                   node_color_interval = c(0, 1),
+                   node_label_size_range = c(0.008, 0.015),
+                   node_label_max = 500,
+                   node_color_axis_label = "Proportion PCR success",
+                   node_size_axis_label = "Sequences not amplified",
+                   make_legend = FALSE,
+                   output_file = file.path(output_folder, paste0("silva--pcr_fail", output_format))))
+print(silva_plot_pcr_fail)
+```
+
+
+## Save outputs for composite figure
+
+Some results from this file will be combined with other to make a composite figure.
+Below, the needed objects are saved so that they can be loaded by another Rmd file.
+
+```{r silva_save}
+save(file = file.path(output_folder, "silva_data.RData"),
+     silva_plot_all, silva_plot_pcr_fail)
+```
+
diff --git a/inst/doc/publication_analyses/16s_database_comparison/02--rdp.Rmd b/inst/doc/publication_analyses/16s_database_comparison/02--rdp.Rmd
@@ -0,0 +1,115 @@
+---
+title: Graphing the RDP database
+---
+
+```{r rdp_setup, echo=FALSE, warning=FALSE, message=FALSE}
+source("settings.R")
+```
+
+## Requirements 
+
+**NOTE:** This analysis requires at least 16Gb of RAM to run. 
+
+## Parse database
+
+The code below parses and subsets the entire RDP non-redundant reference database.
+The object made is quite large.
+
+```{r load_rdp}
+file_path <- file.path(input_folder, "rdp_current_Bacteria_unaligned.fa")
+system.time(rdp <- extract_taxonomy(seqinr::read.fasta(file_path, as.string = TRUE),
+                                    regex = "\\tLineage(.*)",
+                                    key = c("class"),
+                                    class_regex = "[;=](.+?);(.+?)",
+                                    class_key = c("name", rdp_rank = "taxon_info")))
+print(rdp)
+```
+
+## Subset
+
+```{r subset_rdp}
+if (! is.null(max_taxonomy_depth)) {
+  system.time(rdp <- filter_taxa(rdp, n_supertaxa <= max_taxonomy_depth))
+  print(rdp)
+}
+if (! is.null(min_seq_count)) {
+  system.time(rdp <- filter_taxa(rdp, n_obs >= min_seq_count))
+  print(rdp)
+}
+if (just_bacteria) {
+  system.time(rdp <- filter_taxa(rdp, name == "Bacteria", subtaxa = TRUE))
+  print(rdp)
+}
+```
+
+
+
+## Plot whole database
+
+Although RDP is such a large database (`r nrow(rdp$taxa_data)` taxa) that graphing everything can be a bit overwhelming, it gives an intuitive feel for the complexity of the database:
+
+```{r rdp_plot_all}
+system.time(rdp_plot_all <- plot(rdp,
+                             node_size = n_obs,
+                             node_color = n_obs,
+                             node_label = name,
+                             node_label_max = 150,
+                             node_color_axis_label = "Sequence count",
+                             make_legend = FALSE,
+                             output_file = file.path(output_folder, paste0("rdp--all", output_format))))
+print(rdp_plot_all)
+```
+
+
+## PCR
+
+```{r rdp_pcr}
+system.time(rdp_pcr <- primersearch(rdp,
+                                      forward = forward_primer,
+                                      reverse = reverse_primer,
+                                      mismatch = max_mismatch))
+```
+
+```{r rdp_plot_pcr_all}
+system.time(rdp_plot_pcr_all <- plot(rdp_pcr,
+                                       node_size = n_obs,
+                                       node_label = name,
+                                       node_color = prop_amplified,
+                                       node_color_range =  pcr_success_color_scale,
+                                       node_color_trans = "linear",
+                                       node_label_max = 150,
+                                       node_color_axis_label = "Proportion PCR success",
+                                       node_size_axis_label = "Sequence count",
+                                       output_file = file.path(output_folder, paste0("rdp--pcr_all", output_format))))
+print(rdp_plot_pcr_all)
+```
+
+```{r, rdp_plot_pcr_fail}
+system.time(rdp_plot_pcr_fail <- rdp_pcr %>%
+              filter_taxa(prop_amplified < pcr_success_cutoff, supertaxa = TRUE) %>%
+              plot(node_size = n_obs - count_amplified,
+                   node_label = name,
+                   node_color = prop_amplified,
+                   node_color_range =  pcr_success_color_scale,
+                   node_color_trans = "linear",
+                   node_color_interval = c(0, 1),
+                   node_label_size_range = c(0.008, 0.015),
+                   node_label_max = 500,
+                   node_color_axis_label = "Proportion PCR success",
+                   node_size_axis_label = "Sequences not amplified",
+                   make_legend = FALSE,
+                   output_file = file.path(output_folder, paste0("rdp--pcr_fail", output_format))))
+print(rdp_plot_pcr_fail)
+```
+
+
+## Save outputs for composite figure
+
+Some results from this file will be combined with other to make a composite figure.
+Below, the needed objects are saved so that they can be loaded by another Rmd file.
+
+```{r rdp_save}
+save(file = file.path(output_folder,"rdp_data.RData"),
+     rdp_plot_all, rdp_plot_pcr_fail)
+```
+
diff --git a/inst/doc/publication_analyses/16s_database_comparison/03--greengenes.Rmd b/inst/doc/publication_analyses/16s_database_comparison/03--greengenes.Rmd
@@ -0,0 +1,154 @@
+---
+title: Graphing the Greengenes database
+---
+
+```{r greengenes_setup, echo=FALSE, warning=FALSE, message=FALSE}
+source("settings.R")
+```
+
+## Requirements 
+
+**NOTE:** This analysis requires at least 10Gb of RAM to run. 
+
+## Parse database
+
+The greengenes database stores sequences in one file and taxonomy information in another and the order of the two files differ making parseing more difficult than the other databases.
+Since taxonomy inforamtion is needed for creating the `taxmap` data structure, we will parse it first and add the sequence information on after.
+
+### Parse taxonomy file
+
+```{r greengenes_taxonomy}
+gg_taxonomy_path <- "databases/gg_13_5_taxonomy.txt"
+gg_taxonomy <- readLines(gg_taxonomy_path)
+print(gg_taxonomy[1:5])
+```
+
+Note that there are some ranks with no names. 
+These will be removed after parsing the file since they provide no information and an uniform-length taxonomy is not needed.
+
+```{r greengenes_taxonomy_parse}
+# Parse taxonomy file
+system.time(greengenes <- extract_taxonomy(input = gg_taxonomy, key = c(id = "obs_info", "class"), regex = "^([0-9]+)\t(.*)$", class_sep = "; ", class_regex = "^([a-z]{1})__(.*)$", class_key = c(rank = "taxon_info", "name")))
+# Remove data for ranks with no information
+greengenes <- filter_taxa(greengenes, name != "")
+print(greengenes)
+```
+
+
+### Parse sequence file
+
+Next we will parse the sequence file so we can add it to the `obs_data` table of the `greengenes` object.
+
+```{r greengenes_sequence}
+gg_sequence_path <- "databases/gg_13_5.fasta"
+substr(readLines(gg_sequence_path, n = 10), 1, 100)
+```
+
+This can be easily parsed using `seqinr`:
+
+```{r greengenes_sequence_read}
+gg_sequences <- seqinr::read.fasta(gg_sequence_path, as.string = TRUE)
+```
+
+
+### Integrating sequence and taxonomy
+
+We will need to use the Greengenes ID to match up which sequence goes with which row since they are in different orders.
+
+```{r greengenes_combine}
+greengenes <- mutate_obs(greengenes, sequence = unlist(gg_sequences)[as.character(id)])
+```
+
+
+## Subset
+
+```{r subset_greengenes}
+if (! is.null(max_taxonomy_depth)) {
+  system.time(greengenes <- filter_taxa(greengenes, n_supertaxa <= max_taxonomy_depth))
+  print(greengenes)
+}
+if (! is.null(min_seq_count)) {
+  system.time(greengenes <- filter_taxa(greengenes, n_obs >= min_seq_count))
+  print(greengenes)
+}
+if (just_bacteria) {
+  system.time(greengenes <- filter_taxa(greengenes, name == "Bacteria", subtaxa = TRUE))
+  print(greengenes)
+}
+```
+
+
+
+## Plot whole database
+
+Although Greengenes is such a large database (`r nrow(greengenes$taxa_data)` taxa) that graphing everything can be a bit overwhelming, it gives an intuitive feel for the complexity of the database:
+
+```{r greengenes_plot_all}
+system.time(greengenes_plot_all <- plot(greengenes,
+                                        node_size = n_obs,
+                                        node_size_range = c(0.001, 0.01),
+                                        node_size_interval = c(1, 3000000),
+                                        edge_size_range = c(0.001, 0.01),
+                                        edge_size_interval = c(1, 3000000),
+                                        node_color_interval = c(1, 3000000),
+                                        node_color = n_obs,
+                                        node_label = name,
+                                        node_label_max = 150,
+                                        node_color_axis_label = "Sequence count",
+                                        output_file = file.path(output_folder, paste0("greengenes--all", output_format))))
+print(greengenes_plot_all)
+```
+
+
+## PCR
+
+```{r greengenes_pcr}
+system.time(greengenes_pcr <- primersearch(greengenes,
+                                      forward = forward_primer,
+                                      reverse = reverse_primer,
+                                      mismatch = max_mismatch))
+```
+
+```{r greengenes_plot_pcr_all}
+system.time(greengenes_plot_pcr_all <- plot(greengenes_pcr,
+                                       node_size = n_obs,
+                                       node_label = name,
+                                       node_color = prop_amplified,
+                                       node_color_range =  pcr_success_color_scale,
+                                       node_color_trans = "linear",
+                                       node_label_max = 150,
+                                       node_color_axis_label = "Proportion PCR success",
+                                       node_size_axis_label = "Sequence count",
+                                       output_file = file.path(output_folder, paste0("greengenes--pcr_all", output_format))))
+print(greengenes_plot_pcr_all)
+```
+
+```{r, greengenes_plot_pcr_fail}
+system.time(greengenes_plot_pcr_fail <- greengenes_pcr %>%
+              filter_taxa(prop_amplified < pcr_success_cutoff, supertaxa = TRUE) %>%
+              plot(node_size = n_obs - count_amplified,
+                   node_label = name,
+                   node_color = prop_amplified,
+                   node_color_range =  pcr_success_color_scale,
+                   node_color_trans = "linear",
+                   node_color_interval = c(0, 1),
+                   # node_label_size_range = c(0.008, 0.015),
+                   node_label_max = 500,
+                   node_color_axis_label = "Proportion PCR success",
+                   node_size_axis_label = "Sequences not amplified",
+                   output_file = file.path(output_folder, paste0("greengenes--pcr_fail", output_format))))
+print(greengenes_plot_pcr_fail)
+```
+
+
+## Save outputs for composite figure
+
+Some results from this file will be combined with other to make a composite figure.
+Below, the needed objects are saved so that they can be loaded by another Rmd file.
+
+```{r greengenes_save}
+save(file = file.path(output_folder, "greengenes_data.RData"),
+     greengenes_plot_all, greengenes_plot_pcr_fail)
+```
+
+