diff --git a/.Rbuildignore b/.Rbuildignore index bd73ad6..62c5510 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -6,9 +6,7 @@ ^.+\.Rproj$ ^\.Rproj\.user$ ^quilt_config\.yml$ -^inst/doc/presentation/.*$ -^inst/doc/publication_analyses/.*$ -^inst/doc/website_source/.*\.html$ +^inst/doc/.*$ ^big_data$ ^scratch$ ^inst/extdata/mothur\.summary$ diff --git a/inst/doc/publication_analyses/16s_database_comparison/01--silva.Rmd b/inst/doc/publication_analyses/16s_database_comparison/01--silva.Rmd new file mode 100644 index 0000000..caff09a --- /dev/null +++ b/inst/doc/publication_analyses/16s_database_comparison/01--silva.Rmd @@ -0,0 +1,116 @@ +--- +title: Graphing the SILVA database +--- + +```{r silva_setup, echo=FALSE, warning=FALSE, message=FALSE} +source("settings.R") +``` + +## Requirements + +**NOTE:** This analysis requires at least 10Gb of RAM to run. + +## Parse database + +The code below parses and subsets the entire SILVA non-redundant reference database. +The object made is quite large. + +```{r load_silva} +file_path <- file.path(input_folder, "SILVA_123.1_SSURef_Nr99_tax_silva.fasta") +system.time(silva <- extract_taxonomy(seqinr::read.fasta(file_path, as.string = TRUE), + regex = "^(.*?) (.*)$", + key = c(id = "obs_info", "class"), + class_sep = ";")) +print(silva) +``` + +## Subset + +```{r subset_silva} +if (! is.null(max_taxonomy_depth)) { + system.time(silva <- filter_taxa(silva, n_supertaxa <= max_taxonomy_depth)) + print(silva) +} +if (! is.null(min_seq_count)) { + system.time(silva <- filter_taxa(silva, n_obs >= min_seq_count)) + print(silva) +} +if (just_bacteria) { + system.time(silva <- filter_taxa(silva, name == "Bacteria", subtaxa = TRUE)) + print(silva) +} +``` + + +## Plot whole database + +Although SILVA is such a large database (`r nrow(silva$taxa_data)` taxa) that graphing everything can be a bit overwhelming, it gives an intuitive feel for the complexity of the database: + +```{r silva_plot_all} +system.time(silva_plot_all <- plot(silva, + node_size = n_obs, + node_color = n_obs, + node_label = name, + node_label_max = 150, + node_color_axis_label = "Sequence count", + make_legend = FALSE, + output_file = file.path(output_folder, paste0("silva--all", output_format)))) +print(silva_plot_all) +``` + + +## PCR + +```{r silva_pcr} +# Replace all u with t so in silico PCR works +system.time(silva$obs_data$sequence <- gsub(pattern = "u", replacement = "t", silva$obs_data$sequence)) +# in silico PCR +system.time(silva_pcr <- primersearch(silva, + forward = forward_primer, + reverse = reverse_primer, + mismatch = max_mismatch)) +``` + +```{r silva_plot_pcr_all} +system.time(silva_plot_pcr_all <- plot(silva_pcr, + node_size = n_obs, + node_label = name, + node_color = prop_amplified, + node_color_range = pcr_success_color_scale, + node_color_trans = "linear", + node_label_max = 150, + node_color_axis_label = "Proportion PCR success", + node_size_axis_label = "Sequence count", + output_file = file.path(output_folder, paste0("silva--pcr_all", output_format)))) +print(silva_plot_pcr_all) +``` + +```{r, silva_plot_pcr_fail} +system.time(silva_plot_pcr_fail <- silva_pcr %>% + filter_taxa(prop_amplified < pcr_success_cutoff, supertaxa = TRUE) %>% + plot(node_size = n_obs - count_amplified, + node_label = name, + node_color = prop_amplified, + node_color_range = pcr_success_color_scale, + node_color_trans = "linear", + node_color_interval = c(0, 1), + node_label_size_range = c(0.008, 0.015), + node_label_max = 500, + node_color_axis_label = "Proportion PCR success", + node_size_axis_label = "Sequences not amplified", + make_legend = FALSE, + output_file = file.path(output_folder, paste0("silva--pcr_fail", output_format)))) +print(silva_plot_pcr_fail) +``` + + +## Save outputs for composite figure + +Some results from this file will be combined with other to make a composite figure. +Below, the needed objects are saved so that they can be loaded by another Rmd file. + +```{r silva_save} +save(file = file.path(output_folder, "silva_data.RData"), + silva_plot_all, silva_plot_pcr_fail) +``` + diff --git a/inst/doc/publication_analyses/16s_database_comparison/02--rdp.Rmd b/inst/doc/publication_analyses/16s_database_comparison/02--rdp.Rmd new file mode 100644 index 0000000..1bb3255 --- /dev/null +++ b/inst/doc/publication_analyses/16s_database_comparison/02--rdp.Rmd @@ -0,0 +1,115 @@ +--- +title: Graphing the RDP database +--- + +```{r rdp_setup, echo=FALSE, warning=FALSE, message=FALSE} +source("settings.R") +``` + +## Requirements + +**NOTE:** This analysis requires at least 16Gb of RAM to run. + +## Parse database + +The code below parses and subsets the entire RDP non-redundant reference database. +The object made is quite large. + +```{r load_rdp} +file_path <- file.path(input_folder, "rdp_current_Bacteria_unaligned.fa") +system.time(rdp <- extract_taxonomy(seqinr::read.fasta(file_path, as.string = TRUE), + regex = "\\tLineage(.*)", + key = c("class"), + class_regex = "[;=](.+?);(.+?)", + class_key = c("name", rdp_rank = "taxon_info"))) +print(rdp) +``` + +## Subset + +```{r subset_rdp} +if (! is.null(max_taxonomy_depth)) { + system.time(rdp <- filter_taxa(rdp, n_supertaxa <= max_taxonomy_depth)) + print(rdp) +} +if (! is.null(min_seq_count)) { + system.time(rdp <- filter_taxa(rdp, n_obs >= min_seq_count)) + print(rdp) +} +if (just_bacteria) { + system.time(rdp <- filter_taxa(rdp, name == "Bacteria", subtaxa = TRUE)) + print(rdp) +} +``` + + + +## Plot whole database + +Although RDP is such a large database (`r nrow(rdp$taxa_data)` taxa) that graphing everything can be a bit overwhelming, it gives an intuitive feel for the complexity of the database: + +```{r rdp_plot_all} +system.time(rdp_plot_all <- plot(rdp, + node_size = n_obs, + node_color = n_obs, + node_label = name, + node_label_max = 150, + node_color_axis_label = "Sequence count", + make_legend = FALSE, + output_file = file.path(output_folder, paste0("rdp--all", output_format)))) +print(rdp_plot_all) +``` + + +## PCR + +```{r rdp_pcr} +system.time(rdp_pcr <- primersearch(rdp, + forward = forward_primer, + reverse = reverse_primer, + mismatch = max_mismatch)) +``` + +```{r rdp_plot_pcr_all} +system.time(rdp_plot_pcr_all <- plot(rdp_pcr, + node_size = n_obs, + node_label = name, + node_color = prop_amplified, + node_color_range = pcr_success_color_scale, + node_color_trans = "linear", + node_label_max = 150, + node_color_axis_label = "Proportion PCR success", + node_size_axis_label = "Sequence count", + output_file = file.path(output_folder, paste0("rdp--pcr_all", output_format)))) +print(rdp_plot_pcr_all) +``` + +```{r, rdp_plot_pcr_fail} +system.time(rdp_plot_pcr_fail <- rdp_pcr %>% + filter_taxa(prop_amplified < pcr_success_cutoff, supertaxa = TRUE) %>% + plot(node_size = n_obs - count_amplified, + node_label = name, + node_color = prop_amplified, + node_color_range = pcr_success_color_scale, + node_color_trans = "linear", + node_color_interval = c(0, 1), + node_label_size_range = c(0.008, 0.015), + node_label_max = 500, + node_color_axis_label = "Proportion PCR success", + node_size_axis_label = "Sequences not amplified", + make_legend = FALSE, + output_file = file.path(output_folder, paste0("rdp--pcr_fail", output_format)))) +print(rdp_plot_pcr_fail) +``` + + +## Save outputs for composite figure + +Some results from this file will be combined with other to make a composite figure. +Below, the needed objects are saved so that they can be loaded by another Rmd file. + +```{r rdp_save} +save(file = file.path(output_folder,"rdp_data.RData"), + rdp_plot_all, rdp_plot_pcr_fail) +``` + diff --git a/inst/doc/publication_analyses/16s_database_comparison/03--greengenes.Rmd b/inst/doc/publication_analyses/16s_database_comparison/03--greengenes.Rmd new file mode 100644 index 0000000..f35e99d --- /dev/null +++ b/inst/doc/publication_analyses/16s_database_comparison/03--greengenes.Rmd @@ -0,0 +1,154 @@ +--- +title: Graphing the Greengenes database +--- + +```{r greengenes_setup, echo=FALSE, warning=FALSE, message=FALSE} +source("settings.R") +``` + +## Requirements + +**NOTE:** This analysis requires at least 10Gb of RAM to run. + +## Parse database + +The greengenes database stores sequences in one file and taxonomy information in another and the order of the two files differ making parseing more difficult than the other databases. +Since taxonomy inforamtion is needed for creating the `taxmap` data structure, we will parse it first and add the sequence information on after. + +### Parse taxonomy file + +```{r greengenes_taxonomy} +gg_taxonomy_path <- "databases/gg_13_5_taxonomy.txt" +gg_taxonomy <- readLines(gg_taxonomy_path) +print(gg_taxonomy[1:5]) +``` + +Note that there are some ranks with no names. +These will be removed after parsing the file since they provide no information and an uniform-length taxonomy is not needed. + +```{r greengenes_taxonomy_parse} +# Parse taxonomy file +system.time(greengenes <- extract_taxonomy(input = gg_taxonomy, key = c(id = "obs_info", "class"), regex = "^([0-9]+)\t(.*)$", class_sep = "; ", class_regex = "^([a-z]{1})__(.*)$", class_key = c(rank = "taxon_info", "name"))) +# Remove data for ranks with no information +greengenes <- filter_taxa(greengenes, name != "") +print(greengenes) +``` + + +### Parse sequence file + +Next we will parse the sequence file so we can add it to the `obs_data` table of the `greengenes` object. + +```{r greengenes_sequence} +gg_sequence_path <- "databases/gg_13_5.fasta" +substr(readLines(gg_sequence_path, n = 10), 1, 100) +``` + +This can be easily parsed using `seqinr`: + +```{r greengenes_sequence_read} +gg_sequences <- seqinr::read.fasta(gg_sequence_path, as.string = TRUE) +``` + + +### Integrating sequence and taxonomy + +We will need to use the Greengenes ID to match up which sequence goes with which row since they are in different orders. + +```{r greengenes_combine} +greengenes <- mutate_obs(greengenes, sequence = unlist(gg_sequences)[as.character(id)]) +``` + + +## Subset + +```{r subset_greengenes} +if (! is.null(max_taxonomy_depth)) { + system.time(greengenes <- filter_taxa(greengenes, n_supertaxa <= max_taxonomy_depth)) + print(greengenes) +} +if (! is.null(min_seq_count)) { + system.time(greengenes <- filter_taxa(greengenes, n_obs >= min_seq_count)) + print(greengenes) +} +if (just_bacteria) { + system.time(greengenes <- filter_taxa(greengenes, name == "Bacteria", subtaxa = TRUE)) + print(greengenes) +} +``` + + + +## Plot whole database + +Although Greengenes is such a large database (`r nrow(greengenes$taxa_data)` taxa) that graphing everything can be a bit overwhelming, it gives an intuitive feel for the complexity of the database: + +```{r greengenes_plot_all} +system.time(greengenes_plot_all <- plot(greengenes, + node_size = n_obs, + node_size_range = c(0.001, 0.01), + node_size_interval = c(1, 3000000), + edge_size_range = c(0.001, 0.01), + edge_size_interval = c(1, 3000000), + node_color_interval = c(1, 3000000), + node_color = n_obs, + node_label = name, + node_label_max = 150, + node_color_axis_label = "Sequence count", + output_file = file.path(output_folder, paste0("greengenes--all", output_format)))) +print(greengenes_plot_all) +``` + + +## PCR + +```{r greengenes_pcr} +system.time(greengenes_pcr <- primersearch(greengenes, + forward = forward_primer, + reverse = reverse_primer, + mismatch = max_mismatch)) +``` + +```{r greengenes_plot_pcr_all} +system.time(greengenes_plot_pcr_all <- plot(greengenes_pcr, + node_size = n_obs, + node_label = name, + node_color = prop_amplified, + node_color_range = pcr_success_color_scale, + node_color_trans = "linear", + node_label_max = 150, + node_color_axis_label = "Proportion PCR success", + node_size_axis_label = "Sequence count", + output_file = file.path(output_folder, paste0("greengenes--pcr_all", output_format)))) +print(greengenes_plot_pcr_all) +``` + +```{r, greengenes_plot_pcr_fail} +system.time(greengenes_plot_pcr_fail <- greengenes_pcr %>% + filter_taxa(prop_amplified < pcr_success_cutoff, supertaxa = TRUE) %>% + plot(node_size = n_obs - count_amplified, + node_label = name, + node_color = prop_amplified, + node_color_range = pcr_success_color_scale, + node_color_trans = "linear", + node_color_interval = c(0, 1), + # node_label_size_range = c(0.008, 0.015), + node_label_max = 500, + node_color_axis_label = "Proportion PCR success", + node_size_axis_label = "Sequences not amplified", + output_file = file.path(output_folder, paste0("greengenes--pcr_fail", output_format)))) +print(greengenes_plot_pcr_fail) +``` + + +## Save outputs for composite figure + +Some results from this file will be combined with other to make a composite figure. +Below, the needed objects are saved so that they can be loaded by another Rmd file. + +```{r greengenes_save} +save(file = file.path(output_folder, "greengenes_data.RData"), + greengenes_plot_all, greengenes_plot_pcr_fail) +``` + + diff --git a/inst/doc/publication_analyses/16s_database_comparison/04--16s_database_comparision.Rmd b/inst/doc/publication_analyses/16s_database_comparison/04--16s_database_comparision.Rmd new file mode 100644 index 0000000..692f9dd --- /dev/null +++ b/inst/doc/publication_analyses/16s_database_comparison/04--16s_database_comparision.Rmd @@ -0,0 +1,40 @@ +--- +title: "Comparing 16S databases" +--- + +```{r, echo=FALSE, warning=FALSE, message=FALSE} +source("settings.R") +``` + +## Run individual analyses + +```{r run_all} +library(rmarkdown) +render(input = "01--silva.Rmd") +render(input = "02--rdp.Rmd") +render(input = "03--greengenes.Rmd") +``` + + +## Load plots + +```{r load} +load(file.path(output_folder, "silva_data.RData")) +load(file.path(output_folder, "rdp_data.RData")) +load(file.path(output_folder, "greengenes_data.RData")) +``` + + +## Combine plots + +```{r combine} +combo_plot <- gridExtra::grid.arrange(ncol = 2, nrow = 3, + top = "All data Not amplified ", + left = "SILVA RDP Greengenes", + silva_plot_all, silva_plot_pcr_fail, + rdp_plot_all, rdp_plot_pcr_fail, + greengenes_plot_all, greengenes_plot_pcr_fail) +ggplot2::ggsave(file.path(output_folder, "figure_2--16s_database_comparison.pdf"), + combo_plot, width = 7.5, height = 10) +``` + diff --git a/inst/doc/publication_analyses/16s_database_comparison/vignette_06_examples--16s_database_comparision.Rmd b/inst/doc/publication_analyses/16s_database_comparison/vignette_06_examples--16s_database_comparision.Rmd deleted file mode 100644 index 857f70e..0000000 --- a/inst/doc/publication_analyses/16s_database_comparison/vignette_06_examples--16s_database_comparision.Rmd +++ /dev/null @@ -1,351 +0,0 @@ ---- -title: "Comparing 16S databases" -output: - html_document: - toc: false - css: custom.css ---- - -```{r, echo=FALSE, warning=FALSE, message=FALSE} -library(metacoder) -library(knitr) -source("knitr_settings.R") -``` - -## Analyses parameters - -```{r} -input_folder <- "../../../big_data" -output_folder <- "16s_database_comparison_results" -output_format <- ".pdf" -max_taxonomy_depth <- 7 -min_seq_count <- 10 -max_mismatch <- 10 # percentage mismatch tolerated in pcr -pcr_success_cutoff <- 0.90 # Used to subset for graphing -forward = c("515F" = "GTGYCAGCMGCCGCGGTAA") -reverse = c("806R" = "GGACTACNVGGGTWTCTAAT") -``` - -```{r} -dir.create(output_folder, showWarnings = FALSE) -``` - - -## SILVA - -### Parse file - -The code below parses and subsets the entire SILVA non-redundant reference database. -It is done in a single step so that no large intermediate files are generated. -Even so, the object made is quite large. - -```{r} -set.seed(1) -file_path <- file.path(input_folder, "SILVA_123.1_SSURef_Nr99_tax_silva.fasta") -system.time(silva <- seqinr::read.fasta(file_path, as.string = TRUE) %>% - extract_taxonomy(regex = "^(.*?) (.*)$", - key = c(id = "obs_info", "class"), - class_sep = ";") %>% - filter_taxa(n_supertaxa <= max_taxonomy_depth) %>% - filter_taxa(name == "Bacteria", subtaxa = TRUE) %>% - filter_taxa(n_obs >= min_seq_count)) -``` - -### Plot data - -```{r} -system.time(filter_taxa(silva, n_supertaxa <= 4) %>% - plot(node_size = n_obs, - node_color = n_obs, - node_label = name, - node_label_max = 150, - title = "SILVA sequence abundance", - title_size = 0.03, - node_color_axis_label = "Sequence count", - output_file = file.path(output_folder, paste0("silva--abundance_rank_5", output_format)))) -``` - -### PCR - -```{r} -silva$obs_data$sequence <- gsub(pattern = "u", replacement = "t", silva$obs_data$sequence) -system.time(pcr <- primersearch(silva, - forward = forward_primer, - reverse = reverse_primer, - mismatch = max_mismatch)) -``` - -```{r} -system.time(filter_taxa(pcr, n_supertaxa <= 4) %>% - plot(node_size = n_obs, - node_label = name, - node_color = prop_amplified, - node_color_range = c("red", "cyan"), - node_color_trans = "linear", - node_label_max = 150, - title = paste0("SILVA in silico pcr with ", pcr$obs_data$pair_name[1]), - title_size = 0.03, - node_color_axis_label = "Proportion PCR success", - node_size_axis_label = "Sequence count", - output_file = file.path(output_folder, paste0("silva--pcr_all_rank_5", output_format)))) -``` - -```{r} -system.time(filter_taxa(pcr, n_supertaxa <= 4) %>% - filter_taxa(prop_amplified < pcr_success_cutoff, supertaxa = TRUE) %>% - plot(node_size = n_obs, - node_label = name, - node_color = prop_amplified, - node_color_range = c("red", "cyan"), - node_color_trans = "linear", - node_label_max = 100, - title = paste0("SILVA in silico pcr with ", pcr$obs_data$pair_name[1], - " ( < ", pcr_success_cutoff * 100, "%)"), - title_size = 0.03, - node_color_axis_label = "Proportion PCR success", - node_size_axis_label = "Sequence count", - output_file = file.path(output_folder, paste0("silva--pcr_rank_5_less_than_", - pcr_success_cutoff * 100, - output_format)))) -``` - - - - -## RDP - -```{r} -set.seed(1) -file_path <- file.path(input_folder, "rdp_current_Bacteria_unaligned.fa") -system.time(rdp <- seqinr::read.fasta(file_path, as.string = TRUE) %>% - extract_taxonomy(regex = "\\tLineage(.*)", - key = c("class"), - class_regex = "[;=](.+?);(.+?)", - class_key = c("name", rdp_rank = "taxon_info")) %>% - filter_taxa(n_supertaxa <= max_taxonomy_depth) %>% - filter_taxa(name == "Bacteria", subtaxa = TRUE) %>% - filter_taxa(n_obs >= min_seq_count)) -``` - - -### Plot data - -```{r} -system.time(filter_taxa(rdp, n_supertaxa <= 4) %>% - plot(node_size = n_obs, - node_color = n_obs, - node_label = name, - node_label_max = 150, - title = "RDP sequence abundance", - title_size = 0.03, - node_color_axis_label = "Sequence count", - output_file = file.path(output_folder, paste0("rdp--abundance_rank_5", output_format)))) -``` - - -```{r} -system.time(plot(rdp, node_size = n_obs, - node_color = n_obs, - node_label = name, - node_label_max = 150, - title = "RDP sequence abundance", - title_size = 0.03, - node_color_axis_label = "Sequence count", - output_file = file.path(output_folder, paste0("rdp--abundance_all_taxa", output_format)))) -``` - - -### PCR - -```{r} -system.time(pcr <- primersearch(rdp, - forward = forward_primer, - reverse = reverse_primer, - mismatch = max_mismatch)) -``` - -```{r} -system.time(filter_taxa(pcr, n_supertaxa <= 4) %>% - plot(node_size = n_obs, - node_label = name, - node_color = prop_amplified, - node_color_range = c("red", "cyan"), - node_color_trans = "linear", - node_label_max = 150, - title = paste0("RDP in silico pcr with ", unique(pcr$obs_data$pair_name)[2]), - title_size = 0.03, - node_color_axis_label = "Proportion PCR success", - node_size_axis_label = "Sequence count", - output_file = file.path(output_folder, paste0("rdp--pcr_rank_5", output_format)))) -``` - -```{r} -system.time(plot(pcr, - node_size = n_obs, - node_label = name, - node_color = prop_amplified, - node_color_range = c("red", "cyan"), - node_color_trans = "linear", - node_label_max = 150, - title = paste0("RDP in silico pcr with ", unique(pcr$obs_data$pair_name)[2]), - title_size = 0.03, - node_color_axis_label = "Proportion PCR success", - node_size_axis_label = "Sequence count", - output_file = file.path(output_folder, paste0("rdp--pcr_all_taxa", output_format)))) -``` - - - -```{r} -system.time(filter_taxa(pcr, prop_amplified < pcr_success_cutoff, supertaxa = TRUE) %>% - filter_taxa(n_supertaxa <= 4) %>% - plot(node_size = n_obs, - node_label = name, - node_color = prop_amplified, - node_color_range = c("red", "cyan"), - node_color_trans = "linear", - node_label_max = 100, - title = paste0("RDP in silico pcr with ", unique(pcr$obs_data$pair_name)[2], - " ( < ", pcr_success_cutoff * 100, "%)"), - title_size = 0.03, - node_color_axis_label = "Proportion PCR success", - node_size_axis_label = "Sequence count", - output_file = file.path(output_folder, paste0("rdp--pcr_rank_5_less_than_", - pcr_success_cutoff * 100, - output_format)))) -``` - - -```{r} -system.time(filter_taxa(pcr, prop_amplified < pcr_success_cutoff, supertaxa = TRUE) %>% - plot(node_size = n_obs, - node_label = name, - node_color = prop_amplified, - node_color_range = c("red", "cyan"), - node_color_trans = "linear", - node_label_max = 100, - title = paste0("RDP in silico pcr with ", unique(pcr$obs_data$pair_name)[2], - " ( < ", pcr_success_cutoff * 100, "%)"), - title_size = 0.03, - node_color_axis_label = "Proportion PCR success", - node_size_axis_label = "Sequence count", - output_file = file.path(output_folder, paste0("rdp--pcr_all_taxa_less_than_", - pcr_success_cutoff * 100, - output_format)))) -``` - - - -## Greengenes - -### Parse data - -The greengenes database stores sequences in one file and taxonomy information in another and the order of the two files differ making parseing more difficult than the other databases. -Since taxonomy inforamtion is needed for creating the `taxmap` data structure, we will parse it first and add the sequence information on after. - -#### Parse taxonomy file - -```{r} -gg_taxonomy_path <- "data/gg_13_5_taxonomy.txt" -gg_taxonomy <- readLines(gg_taxonomy_path) -print(gg_taxonomy[1:5]) -``` - -Note that there are some ranks with no names. -These will be removed after parsing the file since they provide no information and an uniform-length taxonomy is not needed. - -```{r} -# Parse taxonomy file -system.time(greengenes <- extract_taxonomy(input = gg_taxonomy, key = c(id = "obs_info", "class"), regex = "^([0-9]+)\t(.*)$", class_sep = "; ", class_regex = "^([a-z]{1})__(.*)$", class_key = c(rank = "taxon_info", "name"))) -# Remove data for ranks with no information -greengenes <- filter_taxa(greengenes, name != "") -print(greengenes) -``` - - -#### Parse sequence file - -Next we will parse the sequence file so we can add it to the `obs_data` table of the `greengenes` object. - -```{r} -gg_sequence_path <- "data/gg_13_5.fasta" -substr(readLines(gg_sequence_path, n = 10), 1, 100) -``` - -This can be easily parsed using `seqinr`: - -```{r} -gg_sequences <- seqinr::read.fasta(gg_sequence_path, as.string = TRUE) -``` - - -#### Integrating sequence and taxonomy - -We will need to use the Greengenes ID to match up which sequence goes with which row since they are in different orders. - -```{r} -greengenes <- mutate_obs(greengenes, sequence = unlist(gg_sequences)[as.character(id)]) -``` - - -### Plot data - -```{r} -system.time(filter_taxa(greengenes, name == "Bacteria", subtaxa = TRUE) %>% - plot(node_size = n_obs, - node_color = n_obs, - node_label = name, - node_label_max = 150, - title = "Greengenes sequence abundance", - title_size = 0.03, - node_color_axis_label = "Sequence count", - output_file = file.path(output_folder, paste0("gg--abundance_all_taxa", output_format)))) -``` - - -### PCR - -```{r} -system.time(pcr <- primersearch(greengenes, - forward = forward, - reverse = reverse, - mismatch = max_mismatch)) -``` - -```{r} -system.time(filter_taxa(pcr, name == "Bacteria", subtaxa = TRUE) %>% - plot(node_size = n_obs, - node_label = name, - node_color = prop_amplified, - node_color_range = c("red", "cyan"), - node_color_trans = "linear", - node_label_max = 150, - title = paste0("Greengenes in silico pcr with ", unique(pcr$obs_data$pair_name)[2]), - title_size = 0.03, - node_color_axis_label = "Proportion PCR success", - node_size_axis_label = "Sequence count", - output_file = file.path(output_folder, paste0("gg--pcr_all_taxa", output_format)))) -``` - -```{r} -system.time(filter_taxa(pcr, name == "Bacteria", subtaxa = TRUE) %>% - filter_taxa(n_obs >= 3, supertaxa = TRUE) %>% - filter_taxa(prop_amplified < pcr_success_cutoff, supertaxa = TRUE) %>% - plot(node_size = n_obs - count_amplified, - node_label = name, - node_color = prop_amplified, - node_color_range = c("red", "orange", "yellow", "green", "cyan"), - node_color_trans = "linear", - node_color_interval = c(0, 1), - node_label_size_range = c(0.008, 0.015), - node_label_max = 500, - title = paste0("Greengenes in silico pcr with ", unique(pcr$obs_data$pair_name)[2], - " ( < ", pcr_success_cutoff * 100, "%)"), - title_size = 0.03, - node_color_axis_label = "Proportion PCR success", - node_size_axis_label = "Sequences not amplified", - output_file = file.path(output_folder, paste0("gg--pcr_all_taxa_less_than_", - pcr_success_cutoff * 100, - output_format)))) -``` - diff --git a/inst/doc/publication_analyses/tara_oceans/vignette_05_examples--tara_oceans.Rmd b/inst/doc/publication_analyses/tara_oceans/vignette_05_examples--tara_oceans.Rmd index bb0e8d1..683576c 100644 --- a/inst/doc/publication_analyses/tara_oceans/vignette_05_examples--tara_oceans.Rmd +++ b/inst/doc/publication_analyses/tara_oceans/vignette_05_examples--tara_oceans.Rmd @@ -4,26 +4,18 @@ output: html_document --- ```{r, echo=FALSE, warning=FALSE, message=FALSE} -library(knitr) -source("knitr_settings.R") +library(metacoder) ``` -### Analyses parameters +## Analyses parameters ```{r} -output_folder <- "tara_oceans_results" +output_folder <- getwd() output_format <- ".pdf" ``` -### Create output directory - -```{r} -dir.create(output_folder, showWarnings = FALSE) -``` - - -### Parsing taxonomic data +## Parsing taxonomic data The code below downloads the compressed data to a temporary directory: @@ -47,14 +39,11 @@ unpacked_file_path <- file.path(temp_dir_path, ``` ```{r, warning=FALSE, message=FALSE} -# Load the package -library(metacoder) -# Extract the taxonomic information of the sequences data <- parse_taxonomy_table(unpacked_file_path, taxon_col = c("class" = -9), class_sep = "\\|") ``` -### Getting sample data +## Getting sample data ```{r} data_url <- "http://taraoceans.sb-roscoff.fr/EukDiv/data/Database_W1_Sample_parameters.xls" @@ -65,7 +54,7 @@ sample_data <- readxl::read_excel(local_file_path) ``` -### Caluculate read abundance per taxon +## Caluculate read abundance per taxon ```{r} sample_columns <- sample_data[["PANGAEA ACCESSION NUMBER"]] @@ -75,7 +64,7 @@ data <- mutate_taxa(data, ``` -### Plot read and OTU abundance +## Plot read and OTU abundance ```{r} seed = 9 #9, 10, 12 is good @@ -114,159 +103,11 @@ data %>% background_color = background_color, maxiter = 50, fineiter = 50, margin_size = c(0.001, 0.001), - output_file = file.path(output_folder, paste0("sup_figure_1--tara_all_plankton_diversity--seed_", seed, output_format))) -``` - - -### Sampling depth differential - -#### Calculate depth read counts - -```{r} -sur_columms <- sample_data[["PANGAEA ACCESSION NUMBER"]][sample_data[["SAMPLING DEPTH"]] == "SUR"] -dcm_columms <- sample_data[["PANGAEA ACCESSION NUMBER"]][sample_data[["SAMPLING DEPTH"]] == "DCM"] - -data <- mutate_obs(data, sur_counts = apply(data$obs_data[, sur_columms], MARGIN = 1, - function(x) sum(as.numeric(x)))) -data <- mutate_obs(data, dcm_counts = apply(data$obs_data[, dcm_columms], MARGIN = 1, - function(x) sum(as.numeric(x)))) - -data <- mutate_taxa(data, sur_taxon_counts = vapply(obs(data), - function(x) sum(data$obs_data$sur_counts[x]), numeric(1))) -data <- mutate_taxa(data, dcm_taxon_counts = vapply(obs(data), - function(x) sum(data$obs_data$dcm_counts[x]), numeric(1))) - -data <- mutate_taxa(data, depth_prop_dcm = dcm_taxon_counts / read_counts) -total_dcm_reads <- sum(data$taxon_data$dcm_taxon_counts[n_supertaxa(data) == 0]) -total_sur_reads <- sum(data$taxon_data$sur_taxon_counts[n_supertaxa(data) == 0]) -total_reads <- total_dcm_reads + total_sur_reads - - -prop_diff_test_stat <- function(prop_1, prop_2, prop_total, count_1, count_2) { - (prop_1 - prop_2) / sqrt(prop_total * (1 - prop_total) * (1 / count_1 + 1 / count_2)) -} - -data <- mutate_taxa(data, depth_prop_diff = ((dcm_taxon_counts / total_dcm_reads) - (sur_taxon_counts / total_sur_reads)) / (read_counts / total_reads)) -data <- mutate_taxa(data, depth_diff_test_stat = prop_diff_test_stat(prop_1 = dcm_taxon_counts / total_dcm_reads, - prop_2 = sur_taxon_counts / total_sur_reads, - prop_total = read_counts / total_reads, - count_1 = total_dcm_reads, - count_2 = total_sur_reads)) -``` - -#### Plot proportion of reads in DCM - -```{r} -seed = 9 #9, 10, 12 is good -set.seed(seed) -taxa_patterns_to_hide <- paste0("^", c("[X]+", "X\\+sp\\.", "NA", "root", "\\*", "sp\\.", "sp"), "$") -taxa_patterns_to_remove <- paste0("^", c("X\\+sp\\.", "NA", "root", "\\*", "sp\\.", "sp"), "$") -data %>% - filter_taxa(! Reduce(`|`, lapply(taxa_patterns_to_remove, grepl, x = name))) %>% - filter_taxa(read_counts >= 100) %>% - filter_taxa(!(taxon_ids %in% subtaxa(data, name == "Eukaryota", simplify = TRUE, include_input = TRUE) & n_supertaxa <= 1)) %>% - plot(title = "Location of taxa in water column", - title_size = 0.03, - node_color_axis_label = "DCM only Both Surface only", - node_size_axis_label = "Number of species (OTUs)", - node_size = n_obs, - node_size_range = c(0.0012, NA), - node_color = depth_prop_dcm, - node_color_range = diverging_palette(), - node_color_trans = "linear", - node_color_interval = c(0, 1), - node_label = ifelse(grepl(pattern = "^[a-zA-z\\-]{1,25}$", name) & - ! Reduce(`|`, lapply(taxa_patterns_to_hide, grepl, x = name)), - name, NA), - node_label_size = (n_obs / (n_supertaxa + 1)) ^ 0.5, - node_label_size_trans = "area", - node_label_size_range = c(0.001, NA), - node_label_max = 1000, - tree_label = name, - tree_label_max = 100, - initial_layout = "re", layout = "da", - overlap_avoidance = .65, - maxiter = 50, fineiter = 50, - margin_size = c(0.001, 0.001), - output_file = file.path(output_folder, paste0("sup_figure_2--tara_proportion_in_dcm--seed_", seed, output_format))) + output_file = file.path(output_folder, paste0("sup_figure_1--tara_all_plankton_diversity", output_format))) ``` -```{r} -seed = 9 #9, 10, 12 is good -set.seed(seed) -taxa_patterns_to_hide <- paste0("^", c("[X]+", "X\\+sp\\.", "NA", "root", "\\*", "sp\\.", "sp"), "$") -taxa_patterns_to_remove <- paste0("^", c("X\\+sp\\.", "NA", "root", "\\*", "sp\\.", "sp"), "$") -data %>% - filter_taxa(! Reduce(`|`, lapply(taxa_patterns_to_remove, grepl, x = name))) %>% - filter_taxa(read_counts >= 100) %>% - filter_taxa(!(taxon_ids %in% subtaxa(data, name == "Eukaryota", simplify = TRUE, include_input = TRUE) & n_supertaxa <= 1)) %>% - plot(title = "Location of taxa in water column", - title_size = 0.03, - node_color_axis_label = "DCM only Both Surface only", - node_size_axis_label = "Number of species (OTUs)", - node_size = n_obs, - node_size_range = c(0.0012, NA), - node_color = depth_prop_diff, - node_color_range = diverging_palette(), - node_color_trans = "linear", - node_color_interval = c(-2, 2), - edge_color_interval = c(-2, 2), - node_label = ifelse(grepl(pattern = "^[a-zA-z\\-]{1,25}$", name) & - ! Reduce(`|`, lapply(taxa_patterns_to_hide, grepl, x = name)), - name, NA), - node_label_size = (n_obs / (n_supertaxa + 1)) ^ 0.5, - node_label_size_trans = "area", - node_label_size_range = c(0.001, NA), - node_label_max = 1000, - tree_label = name, - tree_label_max = 100, - initial_layout = "re", layout = "da", - overlap_avoidance = .65, - maxiter = 50, fineiter = 50, - margin_size = c(0.001, 0.001), - output_file = file.path(output_folder, paste0("sup_figure_2--tara_scaled_proportion_diff--seed_", seed, output_format))) -``` - -```{r} -seed = 9 #9, 10, 12 is good -set.seed(seed) -taxa_patterns_to_hide <- paste0("^", c("[X]+", "X\\+sp\\.", "NA", "root", "\\*", "sp\\.", "sp"), "$") -taxa_patterns_to_remove <- paste0("^", c("X\\+sp\\.", "NA", "root", "\\*", "sp\\.", "sp"), "$") -data %>% - filter_taxa(! Reduce(`|`, lapply(taxa_patterns_to_remove, grepl, x = name))) %>% - filter_taxa(read_counts >= 100) %>% - filter_taxa(!(taxon_ids %in% subtaxa(data, name == "Eukaryota", simplify = TRUE, include_input = TRUE) & n_supertaxa <= 1)) %>% - plot(title = "Location of taxa in water column", - title_size = 0.03, - node_color_axis_label = "DCM only Both Surface only", - node_size_axis_label = "Number of species (OTUs)", - node_size = n_obs, - node_size_range = c(0.0012, NA), - node_color = depth_diff_test_stat, - node_color_range = diverging_palette(), - node_color_trans = "area", - node_color_interval = c(-1000, 1000), - edge_color_interval = c(-1000, 1000), - node_label = ifelse(grepl(pattern = "^[a-zA-z\\-]{1,25}$", name) & - ! Reduce(`|`, lapply(taxa_patterns_to_hide, grepl, x = name)), - name, NA), - node_label_size = (n_obs / (n_supertaxa + 1)) ^ 0.5, - node_label_size_trans = "area", - node_label_size_range = c(0.001, NA), - node_label_max = 1000, - tree_label = name, - tree_label_max = 100, - initial_layout = "re", layout = "da", - overlap_avoidance = .65, - maxiter = 10, fineiter = 10, - margin_size = c(0.001, 0.001), - output_file = file.path(output_folder, paste0("sup_figure_2--tara_depth_test_stat--seed_", seed, output_format))) -``` - - - -### Plot propotion of OTUs identified +## Plot propotion of OTUs identified ```{r} data <- mutate_obs(data, pid = as.numeric(pid)) @@ -310,10 +151,10 @@ data %>% overlap_avoidance = .65, maxiter = 50, fineiter = 50, margin_size = c(0.001, 0.001), - output_file = file.path(output_folder, paste0("sup_figure_3--tara_proportion_identified--seed_", seed, output_format))) + output_file = file.path(output_folder, paste0("sup_figure_2--tara_proportion_identified", output_format))) ``` -#### Just Metazoa +### Just Metazoa ```{r} seed = 1 @@ -340,6 +181,6 @@ data %>% initial_layout = "re", layout = "da", overlap_avoidance = .65, maxiter = 50, fineiter = 50, - output_file = file.path(output_folder, paste0("figure_1--tara_proportion_identified_metazoa--seed_", seed, output_format))) + output_file = file.path(output_folder, paste0("figure_1--tara_proportion_identified_metazoa", output_format))) ```