You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
The length filtering is arbitrary: it would be interesting to have a plot showing the length disperstion with the taxonomic assignement. It would allow for instance to see if we could exlcue more sequence based on their length. I had made one previously (draft of code hereunder) but not adapted to the current output of the pipeline.
Dada2 sequences
Filter to have the sequences of all unassigned features to blast them.
Blast commands in the ipynb in the same folder
## Obtain summary by otu of the frequencey
### read the table created with the barplot function where we kept all unassigned features of all samples
all_unassigned_table <- read.table("r_figures/quantitative_barplots/Absolute/Absolute/Table/Kingdom_Unclassified_Absolute>20_study_name_noyes_CURML_sample_source_abundancy_table.tsv", sep = "\t", header = T)
### Use summarise to have only once each OTU and the sum of the abundance for each
summarized_unassiged_table <- all_unassigned_table %>%
group_by(OTU) %>%
summarise(sum_abudance = sum(Abundance))
### Filter to keep only the ones with more than 20 reads, to reduce the number of sequences to blast
filtered_unassigned_table <- summarized_unassiged_table %>%
filter(sum_abudance >= 20)
### Read the fasta file of all sequences denoised by dada
all_fasta_seq <- read_fasta(file_path = "1_dada_denoised/dna-sequences.fasta")
### Transform the rownames as first column, needed to join with deplyr
all_fasta_seq <- as.data.frame(all_fasta_seq)
all_fasta_seq <- rownames_to_column(all_fasta_seq, var = "OTU")
### Keep only the sequences in the filtered table of all unassigned sequences
filtered_seq <- semi_join(all_fasta_seq, filtered_unassigned_table, by = c("OTU"))
### Write in into a fasta format table
df.fasta = dataframe2fas(filtered_seq, file="filtered_seq.fasta")
Create a function to have the length of the sequences
## Rename the sequence column in the all_fasta_seq table
names(all_fasta_seq)[2]<-paste("sequences")
## Calculate the sequences length
sequence_length <- all_fasta_seq %>%
mutate(length = nchar(as.character(all_fasta_seq$sequences)))
## Get a table with the overall abundance of all features et their assignment
all_sequences_counts <- physeq_df %>%
group_by(OTU, Kingdom) %>%
summarise(sum_abundance = sum(Abundance))
## Join the two just created table with all sequences and abundance
joined_sequences_tables <- full_join(all_sequences_counts, sequence_length, by = "OTU")
## Join by length and by kingdom retaining the sum of the abundance for plotting
joined_sequences_tables_length_sum <- joined_sequences_tables %>%
group_by(length, Kingdom) %>%
summarise(length_sum_abundance = sum(sum_abundance))
## Create a version without the 0
joined_sequences_no_zero <- joined_sequences_tables_length_sum %>%
ungroup() %>%
filter(length_sum_abundance > 0)
### Write it as tsv
write.table(x = joined_sequences_no_zero, file = "length_of_sequences.tsv", append = F, quote = F, sep = "\t", row.names = F, col.names = TRUE)
## Plot the size by Kingdom
read_length_plot <- joined_sequences_tables_length_sum %>%
ggplot(aes(x = length, y = length_sum_abundance, fill = Kingdom)) +
geom_col(position = "stack", width = 1)
##Save this plots
ggsave(read_length_plot, filename = "amplicons_length_distribution.png", width = 10, height = 5)
## Zoom to lower < values for better visualization
read_length_plot_zoomed <- read_length_plot + coord_cartesian(
xlim = c(250, 450), ylim = c(0, 50000))
##Save this plots
ggsave(read_length_plot_zoomed, filename = "amplicons_length_distribution_zoomed.png", width = 10, height = 5)
To compare sequences length and results, same analysis with vsearch join paired sequences
Load objects into R
## Function to load in R environnement the taxonomy table, the metadata table, the features counts table, the tree, the Shannon and the PCO objects
load_objects_fct <- function(taxonomy_class_table, replace_empty_tax = TRUE, features_counts_table , metadata_table, tree, shannon, pco) {
## Qiime2 .qza features counts table
### Load Qiime2 .qza counts table into R environment
features_counts_table<-read_qza(features_counts_table)
### Shows the unique identifier of the file
print("features counts table")
print(features_counts_table$uuid)
### Print information about the object
names(features_counts_table)
### Shows the unique identifier of the file
print("features count table")
print(features_counts_table$uuid)
### Show the first 5 samples and features
print(features_counts_table$data[1:5,1:5])
### Save this table in global environnement
features_counts_table <<- features_counts_table
}
## Run the function to load all needed object into R
load_objects_fct( features_counts_table = "1b_vsearch_joining/dereplicate/dereplicated_table.qza")
## Create a phyloseq object from what has been loaded in the previous chunks
### Generate a function to create a phyloseq object
create_phyloseq_fct <- function(physeq_name = "physeq", melted_df_name = "physeq_df", otu_table){
### Create the phyloseq object
physeq_obj<-phyloseq(
otu_table(otu_table$data, taxa_are_rows = T))
### Assign the object into the environnement
assign(value = physeq_obj, x = physeq_name, envir = .GlobalEnv)
### Melt the physeq objet into a dataframe with one row per feature.id and per sample, needed later
physeq_df <- psmelt(physeq_obj)
### Assign the object into the environnement
assign(value = physeq_df, x = melted_df_name, envir = .GlobalEnv)
}
### Use the function
create_phyloseq_fct(physeq_name = "physeq_vsearch", melted_df_name = "physeq_df_vsearch", otu_table = features_counts_table)
Filter to have the sequences of all unassigned features
### Read the fasta file of all sequences denoised by dada
all_fasta_seq_vsearch <- read_fasta(file_path = "1b_vsearch_joining/dereplicate/dna-sequences.fasta")
### Transform the rownames as first column, needed to join with deplyr
all_fasta_seq_vsearch <- as.data.frame(all_fasta_seq_vsearch)
all_fasta_seq_vsearch <- rownames_to_column(all_fasta_seq_vsearch, var = "OTU")
## Rename the sequence column in the all_fasta_seq table
names(all_fasta_seq_vsearch)[2]<-paste("sequences")
## Separate the first column where the feature ID and the sample name are merged
all_fasta_seq_vsearch <- separate(all_fasta_seq_vsearch, sep = " ", col = OTU, into = "OTU")
Create a function to have the length of the sequences
## Calculate the sequences length
sequence_length_vsearch <- all_fasta_seq_vsearch %>%
mutate(length = nchar(as.character(all_fasta_seq_vsearch$sequences)))
## Get a table with the overall abundance of all features et their assignment
all_sequences_counts_vsearch <- physeq_df_vsearch %>%
group_by(OTU) %>%
summarise(sum_abundance = sum(Abundance))
## Join the two just created table with all sequences and abundance
joined_sequences_tables_vsearch <- full_join(all_sequences_counts_vsearch, sequence_length_vsearch, by = "OTU")
### Write it as tsv
write.table(x = joined_sequences_tables_vsearch, file = "length_of_sequences_vsearch.tsv", append = F, quote = F, sep = "\t", row.names = F, col.names = TRUE)
## Join by length and by kingdom retaining the sum of the abundance for plotting
joined_sequences_tables_length_sum_vsearch <- joined_sequences_tables_vsearch %>%
group_by(length) %>%
summarise(length_sum_abundance = sum(sum_abundance))
## Plot the size by Kingdom
read_length_plot <- joined_sequences_tables_length_sum_vsearch %>%
ggplot(aes(x = length, y = length_sum_abundance)) +
geom_col(position = "stack", width = 1)
##Save this plots
ggsave(read_length_plot, filename = "amplicons_length_distribution_vsearch.png", width = 10, height = 5)
## Zoom to lower < values for better visualization
read_length_plot_zoomed <- read_length_plot + coord_cartesian(
xlim = c(0, 600), ylim = c(0, 50000))
##Save this plots
ggsave(read_length_plot_zoomed, filename = "amplicons_length_distribution_zoomed_vsearch.png", width = 10, height = 5)
The text was updated successfully, but these errors were encountered:
The length filtering is arbitrary: it would be interesting to have a plot showing the length disperstion with the taxonomic assignement. It would allow for instance to see if we could exlcue more sequence based on their length. I had made one previously (draft of code hereunder) but not adapted to the current output of the pipeline.
Dada2 sequences
Filter to have the sequences of all unassigned features to blast them.
Blast commands in the ipynb in the same folder
Create a function to have the length of the sequences
To compare sequences length and results, same analysis with vsearch join paired sequences
Load objects into R
Filter to have the sequences of all unassigned features
Create a function to have the length of the sequences
The text was updated successfully, but these errors were encountered: