Skip to content

Commit

Permalink
separated database comparisons
Browse files Browse the repository at this point in the history
  • Loading branch information
zachary-foster committed Aug 8, 2016
1 parent adf3659 commit b9cf970
Show file tree
Hide file tree
Showing 7 changed files with 438 additions and 525 deletions.
4 changes: 1 addition & 3 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,7 @@
^.+\.Rproj$
^\.Rproj\.user$
^quilt_config\.yml$
^inst/doc/presentation/.*$
^inst/doc/publication_analyses/.*$
^inst/doc/website_source/.*\.html$
^inst/doc/.*$
^big_data$
^scratch$
^inst/extdata/mothur\.summary$
Expand Down
116 changes: 116 additions & 0 deletions inst/doc/publication_analyses/16s_database_comparison/01--silva.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
---
title: Graphing the SILVA database
---

```{r silva_setup, echo=FALSE, warning=FALSE, message=FALSE}
source("settings.R")
```

## Requirements

**NOTE:** This analysis requires at least 10Gb of RAM to run.

## Parse database

The code below parses and subsets the entire SILVA non-redundant reference database.
The object made is quite large.

```{r load_silva}
file_path <- file.path(input_folder, "SILVA_123.1_SSURef_Nr99_tax_silva.fasta")
system.time(silva <- extract_taxonomy(seqinr::read.fasta(file_path, as.string = TRUE),
regex = "^(.*?) (.*)$",
key = c(id = "obs_info", "class"),
class_sep = ";"))
print(silva)
```

## Subset

```{r subset_silva}
if (! is.null(max_taxonomy_depth)) {
system.time(silva <- filter_taxa(silva, n_supertaxa <= max_taxonomy_depth))
print(silva)
}
if (! is.null(min_seq_count)) {
system.time(silva <- filter_taxa(silva, n_obs >= min_seq_count))
print(silva)
}
if (just_bacteria) {
system.time(silva <- filter_taxa(silva, name == "Bacteria", subtaxa = TRUE))
print(silva)
}
```


## Plot whole database

Although SILVA is such a large database (`r nrow(silva$taxa_data)` taxa) that graphing everything can be a bit overwhelming, it gives an intuitive feel for the complexity of the database:

```{r silva_plot_all}
system.time(silva_plot_all <- plot(silva,
node_size = n_obs,
node_color = n_obs,
node_label = name,
node_label_max = 150,
node_color_axis_label = "Sequence count",
make_legend = FALSE,
output_file = file.path(output_folder, paste0("silva--all", output_format))))
print(silva_plot_all)
```


## PCR

```{r silva_pcr}
# Replace all u with t so in silico PCR works
system.time(silva$obs_data$sequence <- gsub(pattern = "u", replacement = "t", silva$obs_data$sequence))
# in silico PCR
system.time(silva_pcr <- primersearch(silva,
forward = forward_primer,
reverse = reverse_primer,
mismatch = max_mismatch))
```

```{r silva_plot_pcr_all}
system.time(silva_plot_pcr_all <- plot(silva_pcr,
node_size = n_obs,
node_label = name,
node_color = prop_amplified,
node_color_range = pcr_success_color_scale,
node_color_trans = "linear",
node_label_max = 150,
node_color_axis_label = "Proportion PCR success",
node_size_axis_label = "Sequence count",
output_file = file.path(output_folder, paste0("silva--pcr_all", output_format))))
print(silva_plot_pcr_all)
```

```{r, silva_plot_pcr_fail}
system.time(silva_plot_pcr_fail <- silva_pcr %>%
filter_taxa(prop_amplified < pcr_success_cutoff, supertaxa = TRUE) %>%
plot(node_size = n_obs - count_amplified,
node_label = name,
node_color = prop_amplified,
node_color_range = pcr_success_color_scale,
node_color_trans = "linear",
node_color_interval = c(0, 1),
node_label_size_range = c(0.008, 0.015),
node_label_max = 500,
node_color_axis_label = "Proportion PCR success",
node_size_axis_label = "Sequences not amplified",
make_legend = FALSE,
output_file = file.path(output_folder, paste0("silva--pcr_fail", output_format))))
print(silva_plot_pcr_fail)
```


## Save outputs for composite figure

Some results from this file will be combined with other to make a composite figure.
Below, the needed objects are saved so that they can be loaded by another Rmd file.

```{r silva_save}
save(file = file.path(output_folder, "silva_data.RData"),
silva_plot_all, silva_plot_pcr_fail)
```

115 changes: 115 additions & 0 deletions inst/doc/publication_analyses/16s_database_comparison/02--rdp.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
---
title: Graphing the RDP database
---

```{r rdp_setup, echo=FALSE, warning=FALSE, message=FALSE}
source("settings.R")
```

## Requirements

**NOTE:** This analysis requires at least 16Gb of RAM to run.

## Parse database

The code below parses and subsets the entire RDP non-redundant reference database.
The object made is quite large.

```{r load_rdp}
file_path <- file.path(input_folder, "rdp_current_Bacteria_unaligned.fa")
system.time(rdp <- extract_taxonomy(seqinr::read.fasta(file_path, as.string = TRUE),
regex = "\\tLineage(.*)",
key = c("class"),
class_regex = "[;=](.+?);(.+?)",
class_key = c("name", rdp_rank = "taxon_info")))
print(rdp)
```

## Subset

```{r subset_rdp}
if (! is.null(max_taxonomy_depth)) {
system.time(rdp <- filter_taxa(rdp, n_supertaxa <= max_taxonomy_depth))
print(rdp)
}
if (! is.null(min_seq_count)) {
system.time(rdp <- filter_taxa(rdp, n_obs >= min_seq_count))
print(rdp)
}
if (just_bacteria) {
system.time(rdp <- filter_taxa(rdp, name == "Bacteria", subtaxa = TRUE))
print(rdp)
}
```



## Plot whole database

Although RDP is such a large database (`r nrow(rdp$taxa_data)` taxa) that graphing everything can be a bit overwhelming, it gives an intuitive feel for the complexity of the database:

```{r rdp_plot_all}
system.time(rdp_plot_all <- plot(rdp,
node_size = n_obs,
node_color = n_obs,
node_label = name,
node_label_max = 150,
node_color_axis_label = "Sequence count",
make_legend = FALSE,
output_file = file.path(output_folder, paste0("rdp--all", output_format))))
print(rdp_plot_all)
```


## PCR

```{r rdp_pcr}
system.time(rdp_pcr <- primersearch(rdp,
forward = forward_primer,
reverse = reverse_primer,
mismatch = max_mismatch))
```

```{r rdp_plot_pcr_all}
system.time(rdp_plot_pcr_all <- plot(rdp_pcr,
node_size = n_obs,
node_label = name,
node_color = prop_amplified,
node_color_range = pcr_success_color_scale,
node_color_trans = "linear",
node_label_max = 150,
node_color_axis_label = "Proportion PCR success",
node_size_axis_label = "Sequence count",
output_file = file.path(output_folder, paste0("rdp--pcr_all", output_format))))
print(rdp_plot_pcr_all)
```

```{r, rdp_plot_pcr_fail}
system.time(rdp_plot_pcr_fail <- rdp_pcr %>%
filter_taxa(prop_amplified < pcr_success_cutoff, supertaxa = TRUE) %>%
plot(node_size = n_obs - count_amplified,
node_label = name,
node_color = prop_amplified,
node_color_range = pcr_success_color_scale,
node_color_trans = "linear",
node_color_interval = c(0, 1),
node_label_size_range = c(0.008, 0.015),
node_label_max = 500,
node_color_axis_label = "Proportion PCR success",
node_size_axis_label = "Sequences not amplified",
make_legend = FALSE,
output_file = file.path(output_folder, paste0("rdp--pcr_fail", output_format))))
print(rdp_plot_pcr_fail)
```


## Save outputs for composite figure

Some results from this file will be combined with other to make a composite figure.
Below, the needed objects are saved so that they can be loaded by another Rmd file.

```{r rdp_save}
save(file = file.path(output_folder,"rdp_data.RData"),
rdp_plot_all, rdp_plot_pcr_fail)
```

Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
---
title: Graphing the Greengenes database
---

```{r greengenes_setup, echo=FALSE, warning=FALSE, message=FALSE}
source("settings.R")
```

## Requirements

**NOTE:** This analysis requires at least 10Gb of RAM to run.

## Parse database

The greengenes database stores sequences in one file and taxonomy information in another and the order of the two files differ making parseing more difficult than the other databases.
Since taxonomy inforamtion is needed for creating the `taxmap` data structure, we will parse it first and add the sequence information on after.

### Parse taxonomy file

```{r greengenes_taxonomy}
gg_taxonomy_path <- "databases/gg_13_5_taxonomy.txt"
gg_taxonomy <- readLines(gg_taxonomy_path)
print(gg_taxonomy[1:5])
```

Note that there are some ranks with no names.
These will be removed after parsing the file since they provide no information and an uniform-length taxonomy is not needed.

```{r greengenes_taxonomy_parse}
# Parse taxonomy file
system.time(greengenes <- extract_taxonomy(input = gg_taxonomy, key = c(id = "obs_info", "class"), regex = "^([0-9]+)\t(.*)$", class_sep = "; ", class_regex = "^([a-z]{1})__(.*)$", class_key = c(rank = "taxon_info", "name")))
# Remove data for ranks with no information
greengenes <- filter_taxa(greengenes, name != "")
print(greengenes)
```


### Parse sequence file

Next we will parse the sequence file so we can add it to the `obs_data` table of the `greengenes` object.

```{r greengenes_sequence}
gg_sequence_path <- "databases/gg_13_5.fasta"
substr(readLines(gg_sequence_path, n = 10), 1, 100)
```

This can be easily parsed using `seqinr`:

```{r greengenes_sequence_read}
gg_sequences <- seqinr::read.fasta(gg_sequence_path, as.string = TRUE)
```


### Integrating sequence and taxonomy

We will need to use the Greengenes ID to match up which sequence goes with which row since they are in different orders.

```{r greengenes_combine}
greengenes <- mutate_obs(greengenes, sequence = unlist(gg_sequences)[as.character(id)])
```


## Subset

```{r subset_greengenes}
if (! is.null(max_taxonomy_depth)) {
system.time(greengenes <- filter_taxa(greengenes, n_supertaxa <= max_taxonomy_depth))
print(greengenes)
}
if (! is.null(min_seq_count)) {
system.time(greengenes <- filter_taxa(greengenes, n_obs >= min_seq_count))
print(greengenes)
}
if (just_bacteria) {
system.time(greengenes <- filter_taxa(greengenes, name == "Bacteria", subtaxa = TRUE))
print(greengenes)
}
```



## Plot whole database

Although Greengenes is such a large database (`r nrow(greengenes$taxa_data)` taxa) that graphing everything can be a bit overwhelming, it gives an intuitive feel for the complexity of the database:

```{r greengenes_plot_all}
system.time(greengenes_plot_all <- plot(greengenes,
node_size = n_obs,
node_size_range = c(0.001, 0.01),
node_size_interval = c(1, 3000000),
edge_size_range = c(0.001, 0.01),
edge_size_interval = c(1, 3000000),
node_color_interval = c(1, 3000000),
node_color = n_obs,
node_label = name,
node_label_max = 150,
node_color_axis_label = "Sequence count",
output_file = file.path(output_folder, paste0("greengenes--all", output_format))))
print(greengenes_plot_all)
```


## PCR

```{r greengenes_pcr}
system.time(greengenes_pcr <- primersearch(greengenes,
forward = forward_primer,
reverse = reverse_primer,
mismatch = max_mismatch))
```

```{r greengenes_plot_pcr_all}
system.time(greengenes_plot_pcr_all <- plot(greengenes_pcr,
node_size = n_obs,
node_label = name,
node_color = prop_amplified,
node_color_range = pcr_success_color_scale,
node_color_trans = "linear",
node_label_max = 150,
node_color_axis_label = "Proportion PCR success",
node_size_axis_label = "Sequence count",
output_file = file.path(output_folder, paste0("greengenes--pcr_all", output_format))))
print(greengenes_plot_pcr_all)
```

```{r, greengenes_plot_pcr_fail}
system.time(greengenes_plot_pcr_fail <- greengenes_pcr %>%
filter_taxa(prop_amplified < pcr_success_cutoff, supertaxa = TRUE) %>%
plot(node_size = n_obs - count_amplified,
node_label = name,
node_color = prop_amplified,
node_color_range = pcr_success_color_scale,
node_color_trans = "linear",
node_color_interval = c(0, 1),
# node_label_size_range = c(0.008, 0.015),
node_label_max = 500,
node_color_axis_label = "Proportion PCR success",
node_size_axis_label = "Sequences not amplified",
output_file = file.path(output_folder, paste0("greengenes--pcr_fail", output_format))))
print(greengenes_plot_pcr_fail)
```


## Save outputs for composite figure

Some results from this file will be combined with other to make a composite figure.
Below, the needed objects are saved so that they can be loaded by another Rmd file.

```{r greengenes_save}
save(file = file.path(output_folder, "greengenes_data.RData"),
greengenes_plot_all, greengenes_plot_pcr_fail)
```


Loading

0 comments on commit b9cf970

Please sign in to comment.