-
Notifications
You must be signed in to change notification settings - Fork 28
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
adf3659
commit b9cf970
Showing
7 changed files
with
438 additions
and
525 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
116 changes: 116 additions & 0 deletions
116
inst/doc/publication_analyses/16s_database_comparison/01--silva.Rmd
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
--- | ||
title: Graphing the SILVA database | ||
--- | ||
|
||
```{r silva_setup, echo=FALSE, warning=FALSE, message=FALSE} | ||
source("settings.R") | ||
``` | ||
|
||
## Requirements | ||
|
||
**NOTE:** This analysis requires at least 10Gb of RAM to run. | ||
|
||
## Parse database | ||
|
||
The code below parses and subsets the entire SILVA non-redundant reference database. | ||
The object made is quite large. | ||
|
||
```{r load_silva} | ||
file_path <- file.path(input_folder, "SILVA_123.1_SSURef_Nr99_tax_silva.fasta") | ||
system.time(silva <- extract_taxonomy(seqinr::read.fasta(file_path, as.string = TRUE), | ||
regex = "^(.*?) (.*)$", | ||
key = c(id = "obs_info", "class"), | ||
class_sep = ";")) | ||
print(silva) | ||
``` | ||
|
||
## Subset | ||
|
||
```{r subset_silva} | ||
if (! is.null(max_taxonomy_depth)) { | ||
system.time(silva <- filter_taxa(silva, n_supertaxa <= max_taxonomy_depth)) | ||
print(silva) | ||
} | ||
if (! is.null(min_seq_count)) { | ||
system.time(silva <- filter_taxa(silva, n_obs >= min_seq_count)) | ||
print(silva) | ||
} | ||
if (just_bacteria) { | ||
system.time(silva <- filter_taxa(silva, name == "Bacteria", subtaxa = TRUE)) | ||
print(silva) | ||
} | ||
``` | ||
|
||
|
||
## Plot whole database | ||
|
||
Although SILVA is such a large database (`r nrow(silva$taxa_data)` taxa) that graphing everything can be a bit overwhelming, it gives an intuitive feel for the complexity of the database: | ||
|
||
```{r silva_plot_all} | ||
system.time(silva_plot_all <- plot(silva, | ||
node_size = n_obs, | ||
node_color = n_obs, | ||
node_label = name, | ||
node_label_max = 150, | ||
node_color_axis_label = "Sequence count", | ||
make_legend = FALSE, | ||
output_file = file.path(output_folder, paste0("silva--all", output_format)))) | ||
print(silva_plot_all) | ||
``` | ||
|
||
|
||
## PCR | ||
|
||
```{r silva_pcr} | ||
# Replace all u with t so in silico PCR works | ||
system.time(silva$obs_data$sequence <- gsub(pattern = "u", replacement = "t", silva$obs_data$sequence)) | ||
# in silico PCR | ||
system.time(silva_pcr <- primersearch(silva, | ||
forward = forward_primer, | ||
reverse = reverse_primer, | ||
mismatch = max_mismatch)) | ||
``` | ||
|
||
```{r silva_plot_pcr_all} | ||
system.time(silva_plot_pcr_all <- plot(silva_pcr, | ||
node_size = n_obs, | ||
node_label = name, | ||
node_color = prop_amplified, | ||
node_color_range = pcr_success_color_scale, | ||
node_color_trans = "linear", | ||
node_label_max = 150, | ||
node_color_axis_label = "Proportion PCR success", | ||
node_size_axis_label = "Sequence count", | ||
output_file = file.path(output_folder, paste0("silva--pcr_all", output_format)))) | ||
print(silva_plot_pcr_all) | ||
``` | ||
|
||
```{r, silva_plot_pcr_fail} | ||
system.time(silva_plot_pcr_fail <- silva_pcr %>% | ||
filter_taxa(prop_amplified < pcr_success_cutoff, supertaxa = TRUE) %>% | ||
plot(node_size = n_obs - count_amplified, | ||
node_label = name, | ||
node_color = prop_amplified, | ||
node_color_range = pcr_success_color_scale, | ||
node_color_trans = "linear", | ||
node_color_interval = c(0, 1), | ||
node_label_size_range = c(0.008, 0.015), | ||
node_label_max = 500, | ||
node_color_axis_label = "Proportion PCR success", | ||
node_size_axis_label = "Sequences not amplified", | ||
make_legend = FALSE, | ||
output_file = file.path(output_folder, paste0("silva--pcr_fail", output_format)))) | ||
print(silva_plot_pcr_fail) | ||
``` | ||
|
||
|
||
## Save outputs for composite figure | ||
|
||
Some results from this file will be combined with other to make a composite figure. | ||
Below, the needed objects are saved so that they can be loaded by another Rmd file. | ||
|
||
```{r silva_save} | ||
save(file = file.path(output_folder, "silva_data.RData"), | ||
silva_plot_all, silva_plot_pcr_fail) | ||
``` | ||
|
115 changes: 115 additions & 0 deletions
115
inst/doc/publication_analyses/16s_database_comparison/02--rdp.Rmd
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
--- | ||
title: Graphing the RDP database | ||
--- | ||
|
||
```{r rdp_setup, echo=FALSE, warning=FALSE, message=FALSE} | ||
source("settings.R") | ||
``` | ||
|
||
## Requirements | ||
|
||
**NOTE:** This analysis requires at least 16Gb of RAM to run. | ||
|
||
## Parse database | ||
|
||
The code below parses and subsets the entire RDP non-redundant reference database. | ||
The object made is quite large. | ||
|
||
```{r load_rdp} | ||
file_path <- file.path(input_folder, "rdp_current_Bacteria_unaligned.fa") | ||
system.time(rdp <- extract_taxonomy(seqinr::read.fasta(file_path, as.string = TRUE), | ||
regex = "\\tLineage(.*)", | ||
key = c("class"), | ||
class_regex = "[;=](.+?);(.+?)", | ||
class_key = c("name", rdp_rank = "taxon_info"))) | ||
print(rdp) | ||
``` | ||
|
||
## Subset | ||
|
||
```{r subset_rdp} | ||
if (! is.null(max_taxonomy_depth)) { | ||
system.time(rdp <- filter_taxa(rdp, n_supertaxa <= max_taxonomy_depth)) | ||
print(rdp) | ||
} | ||
if (! is.null(min_seq_count)) { | ||
system.time(rdp <- filter_taxa(rdp, n_obs >= min_seq_count)) | ||
print(rdp) | ||
} | ||
if (just_bacteria) { | ||
system.time(rdp <- filter_taxa(rdp, name == "Bacteria", subtaxa = TRUE)) | ||
print(rdp) | ||
} | ||
``` | ||
|
||
|
||
|
||
## Plot whole database | ||
|
||
Although RDP is such a large database (`r nrow(rdp$taxa_data)` taxa) that graphing everything can be a bit overwhelming, it gives an intuitive feel for the complexity of the database: | ||
|
||
```{r rdp_plot_all} | ||
system.time(rdp_plot_all <- plot(rdp, | ||
node_size = n_obs, | ||
node_color = n_obs, | ||
node_label = name, | ||
node_label_max = 150, | ||
node_color_axis_label = "Sequence count", | ||
make_legend = FALSE, | ||
output_file = file.path(output_folder, paste0("rdp--all", output_format)))) | ||
print(rdp_plot_all) | ||
``` | ||
|
||
|
||
## PCR | ||
|
||
```{r rdp_pcr} | ||
system.time(rdp_pcr <- primersearch(rdp, | ||
forward = forward_primer, | ||
reverse = reverse_primer, | ||
mismatch = max_mismatch)) | ||
``` | ||
|
||
```{r rdp_plot_pcr_all} | ||
system.time(rdp_plot_pcr_all <- plot(rdp_pcr, | ||
node_size = n_obs, | ||
node_label = name, | ||
node_color = prop_amplified, | ||
node_color_range = pcr_success_color_scale, | ||
node_color_trans = "linear", | ||
node_label_max = 150, | ||
node_color_axis_label = "Proportion PCR success", | ||
node_size_axis_label = "Sequence count", | ||
output_file = file.path(output_folder, paste0("rdp--pcr_all", output_format)))) | ||
print(rdp_plot_pcr_all) | ||
``` | ||
|
||
```{r, rdp_plot_pcr_fail} | ||
system.time(rdp_plot_pcr_fail <- rdp_pcr %>% | ||
filter_taxa(prop_amplified < pcr_success_cutoff, supertaxa = TRUE) %>% | ||
plot(node_size = n_obs - count_amplified, | ||
node_label = name, | ||
node_color = prop_amplified, | ||
node_color_range = pcr_success_color_scale, | ||
node_color_trans = "linear", | ||
node_color_interval = c(0, 1), | ||
node_label_size_range = c(0.008, 0.015), | ||
node_label_max = 500, | ||
node_color_axis_label = "Proportion PCR success", | ||
node_size_axis_label = "Sequences not amplified", | ||
make_legend = FALSE, | ||
output_file = file.path(output_folder, paste0("rdp--pcr_fail", output_format)))) | ||
print(rdp_plot_pcr_fail) | ||
``` | ||
|
||
|
||
## Save outputs for composite figure | ||
|
||
Some results from this file will be combined with other to make a composite figure. | ||
Below, the needed objects are saved so that they can be loaded by another Rmd file. | ||
|
||
```{r rdp_save} | ||
save(file = file.path(output_folder,"rdp_data.RData"), | ||
rdp_plot_all, rdp_plot_pcr_fail) | ||
``` | ||
|
154 changes: 154 additions & 0 deletions
154
inst/doc/publication_analyses/16s_database_comparison/03--greengenes.Rmd
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,154 @@ | ||
--- | ||
title: Graphing the Greengenes database | ||
--- | ||
|
||
```{r greengenes_setup, echo=FALSE, warning=FALSE, message=FALSE} | ||
source("settings.R") | ||
``` | ||
|
||
## Requirements | ||
|
||
**NOTE:** This analysis requires at least 10Gb of RAM to run. | ||
|
||
## Parse database | ||
|
||
The greengenes database stores sequences in one file and taxonomy information in another and the order of the two files differ making parseing more difficult than the other databases. | ||
Since taxonomy inforamtion is needed for creating the `taxmap` data structure, we will parse it first and add the sequence information on after. | ||
|
||
### Parse taxonomy file | ||
|
||
```{r greengenes_taxonomy} | ||
gg_taxonomy_path <- "databases/gg_13_5_taxonomy.txt" | ||
gg_taxonomy <- readLines(gg_taxonomy_path) | ||
print(gg_taxonomy[1:5]) | ||
``` | ||
|
||
Note that there are some ranks with no names. | ||
These will be removed after parsing the file since they provide no information and an uniform-length taxonomy is not needed. | ||
|
||
```{r greengenes_taxonomy_parse} | ||
# Parse taxonomy file | ||
system.time(greengenes <- extract_taxonomy(input = gg_taxonomy, key = c(id = "obs_info", "class"), regex = "^([0-9]+)\t(.*)$", class_sep = "; ", class_regex = "^([a-z]{1})__(.*)$", class_key = c(rank = "taxon_info", "name"))) | ||
# Remove data for ranks with no information | ||
greengenes <- filter_taxa(greengenes, name != "") | ||
print(greengenes) | ||
``` | ||
|
||
|
||
### Parse sequence file | ||
|
||
Next we will parse the sequence file so we can add it to the `obs_data` table of the `greengenes` object. | ||
|
||
```{r greengenes_sequence} | ||
gg_sequence_path <- "databases/gg_13_5.fasta" | ||
substr(readLines(gg_sequence_path, n = 10), 1, 100) | ||
``` | ||
|
||
This can be easily parsed using `seqinr`: | ||
|
||
```{r greengenes_sequence_read} | ||
gg_sequences <- seqinr::read.fasta(gg_sequence_path, as.string = TRUE) | ||
``` | ||
|
||
|
||
### Integrating sequence and taxonomy | ||
|
||
We will need to use the Greengenes ID to match up which sequence goes with which row since they are in different orders. | ||
|
||
```{r greengenes_combine} | ||
greengenes <- mutate_obs(greengenes, sequence = unlist(gg_sequences)[as.character(id)]) | ||
``` | ||
|
||
|
||
## Subset | ||
|
||
```{r subset_greengenes} | ||
if (! is.null(max_taxonomy_depth)) { | ||
system.time(greengenes <- filter_taxa(greengenes, n_supertaxa <= max_taxonomy_depth)) | ||
print(greengenes) | ||
} | ||
if (! is.null(min_seq_count)) { | ||
system.time(greengenes <- filter_taxa(greengenes, n_obs >= min_seq_count)) | ||
print(greengenes) | ||
} | ||
if (just_bacteria) { | ||
system.time(greengenes <- filter_taxa(greengenes, name == "Bacteria", subtaxa = TRUE)) | ||
print(greengenes) | ||
} | ||
``` | ||
|
||
|
||
|
||
## Plot whole database | ||
|
||
Although Greengenes is such a large database (`r nrow(greengenes$taxa_data)` taxa) that graphing everything can be a bit overwhelming, it gives an intuitive feel for the complexity of the database: | ||
|
||
```{r greengenes_plot_all} | ||
system.time(greengenes_plot_all <- plot(greengenes, | ||
node_size = n_obs, | ||
node_size_range = c(0.001, 0.01), | ||
node_size_interval = c(1, 3000000), | ||
edge_size_range = c(0.001, 0.01), | ||
edge_size_interval = c(1, 3000000), | ||
node_color_interval = c(1, 3000000), | ||
node_color = n_obs, | ||
node_label = name, | ||
node_label_max = 150, | ||
node_color_axis_label = "Sequence count", | ||
output_file = file.path(output_folder, paste0("greengenes--all", output_format)))) | ||
print(greengenes_plot_all) | ||
``` | ||
|
||
|
||
## PCR | ||
|
||
```{r greengenes_pcr} | ||
system.time(greengenes_pcr <- primersearch(greengenes, | ||
forward = forward_primer, | ||
reverse = reverse_primer, | ||
mismatch = max_mismatch)) | ||
``` | ||
|
||
```{r greengenes_plot_pcr_all} | ||
system.time(greengenes_plot_pcr_all <- plot(greengenes_pcr, | ||
node_size = n_obs, | ||
node_label = name, | ||
node_color = prop_amplified, | ||
node_color_range = pcr_success_color_scale, | ||
node_color_trans = "linear", | ||
node_label_max = 150, | ||
node_color_axis_label = "Proportion PCR success", | ||
node_size_axis_label = "Sequence count", | ||
output_file = file.path(output_folder, paste0("greengenes--pcr_all", output_format)))) | ||
print(greengenes_plot_pcr_all) | ||
``` | ||
|
||
```{r, greengenes_plot_pcr_fail} | ||
system.time(greengenes_plot_pcr_fail <- greengenes_pcr %>% | ||
filter_taxa(prop_amplified < pcr_success_cutoff, supertaxa = TRUE) %>% | ||
plot(node_size = n_obs - count_amplified, | ||
node_label = name, | ||
node_color = prop_amplified, | ||
node_color_range = pcr_success_color_scale, | ||
node_color_trans = "linear", | ||
node_color_interval = c(0, 1), | ||
# node_label_size_range = c(0.008, 0.015), | ||
node_label_max = 500, | ||
node_color_axis_label = "Proportion PCR success", | ||
node_size_axis_label = "Sequences not amplified", | ||
output_file = file.path(output_folder, paste0("greengenes--pcr_fail", output_format)))) | ||
print(greengenes_plot_pcr_fail) | ||
``` | ||
|
||
|
||
## Save outputs for composite figure | ||
|
||
Some results from this file will be combined with other to make a composite figure. | ||
Below, the needed objects are saved so that they can be loaded by another Rmd file. | ||
|
||
```{r greengenes_save} | ||
save(file = file.path(output_folder, "greengenes_data.RData"), | ||
greengenes_plot_all, greengenes_plot_pcr_fail) | ||
``` | ||
|
||
|
Oops, something went wrong.