v0.3.0

frankandreace · Feb 16, 2024 · bc8021e · bc8021e
1 parent 0a3ff9f
commit bc8021e
Show file tree

Hide file tree

Showing 7 changed files with 65 additions and 30 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "back_to_sequences"
-version = "0.2.13"
+version = "0.3.0"
 edition = "2021"
 
 authors = [

diff --git a/README.md b/README.md
@@ -70,20 +70,43 @@ We queried: from 10,000 reads to 200 million reads (+ 1 billion on the cluster),
 ```	
 Extract sequences that contain some kmers
 
-Usage: back_to_sequences [OPTIONS] --in-sequences <IN_SEQUENCES> --in-kmers <IN_KMERS> --out-sequences <OUT_SEQUENCES>
+Usage: back_to_sequences [OPTIONS] --in-kmers <IN_KMERS> [--in-sequences <IN_SEQUENCES>]
 
 Options:
-      --in-sequences <IN_SEQUENCES>    Input fasta or fastq [.gz] file containing the original sequences (eg. reads). THe stdin is used if not provided [default: ]
-      --in-kmers <IN_KMERS>            Input fasta file containing the original kmers
-      --out-sequences <OUT_SEQUENCES>  Output file containing the filtered original sequences (eg. reads). It will be automatically in fasta or fastq format depending on the input file
-      --out-kmers <OUT_KMERS>          If provided, output text file containing the kmers that occur in the reads with their number of occurrences [default: ]
-  -k, --kmer-size <KMER_SIZE>          Size of the kmers to index and search [default: 31]
-  -m, --min-threshold <MIN_THRESHOLD>  Minimal threshold of the ratio  (%) of kmers that must be found in a sequence to keep it (default 0%). Thus by default, if no kmer is found in a sequence, it is not output [default: 0]
-      --max-threshold <MAX_THRESHOLD>  Maximal threshold of the ratio (%) of kmers that must be found in a sequence to keep it (default 100%). Thus by default, there is no limitation on the maximal number of kmers found in a sequence [default: 100]
-      --stranded                       Used original kmer strand (else canonical kmers are considered)
-      --query-reverse                  Query the reverse complement of reads. Useless without the --stranded option
-  -h, --help                           Print help
-  -V, --version                        Print version
+      --in-sequences <IN_SEQUENCES>
+          Input fasta or fastq [.gz] file containing the original sequences (eg. reads). 
+          The stdin is used if not provided [default: ]
+      --in-kmers <IN_KMERS>
+          Input fasta file containing the original kmers
+      --out-sequences <OUT_SEQUENCES>
+          Output file containing the filtered original sequences (eg. reads). 
+          It will be automatically in fasta or fastq format depending on the input file.
+          If not provided, only the in_kmers with their count is output [default: ]
+      --out-kmers <OUT_KMERS>
+          If provided, output a text file containing the kmers that occur in the reads with their number of occurrences [default: ]
+      --counted-kmer-threshold <COUNTED_KMER_THRESHOLD>
+          If out_kmers is provided, output only reference kmers whose number of occurrences is at least equal to this value\n
+          If out_kmers is not provided, this option is ignored [default: 0]
+  -k, --kmer-size <KMER_SIZE>
+          Size of the kmers to index and search [default: 31]
+  -m, --min-threshold <MIN_THRESHOLD>
+          Output sequences are those whose ratio of indexed kmers is in ]min_threshold; max_threshold]
+          Minimal threshold of the ratio  (%) of kmers that must be found in a sequence to keep it (default 0%).
+          Thus by default, if no kmer is found in a sequence, it is not output. [default: 0]
+      --max-threshold <MAX_THRESHOLD>
+          Output sequences are those whose ratio of indexed kmers is in ]min_threshold; max_threshold]
+          Maximal threshold of the ratio (%) of kmers that must be found in a sequence to keep it (default 100%).
+          Thus by default, there is no limitation on the maximal number of kmers found in a sequence. [default: 100]
+      --stranded
+          Used original kmer strand (else canonical kmers are considered)
+      --query-reverse
+          Query the reverse complement of reads. Useless without the --stranded option
+      --no-low-complexity
+          Do not index low complexity kmers (ie. with a Shannon entropy < 1.0)
+  -h, --help
+          Print help
+  -V, --version
+          Print version
 ```
 
 ### Examples
@@ -149,3 +172,4 @@ python3 scripts/extract_random_sequences.py --input reads.fasta --min_size 31 --
 * [ ] Thinks about a way to adapt this to protein sequences
 * [X] Add an option to set the size of the bloom filter used by kmindex
 * [ ] Provide a way to index and query more than one set $K$ of kmers
+* [ ] Output the strand of matched kmers
diff --git a/VERSIONS.md b/VERSIONS.md
@@ -27,4 +27,7 @@
 * 0.2.12. 23/01/2024:
     * possiblity to not output filtered reads. In this case only counted kmers are output. 
 * 0.2.13. 23/01/2024:
-    * prints all kmers from kmer_set, whaterver their counts count
+    * prints all kmers from kmer_set, whaterver their counts count
+* 0.3.0 16/02/2024
+    * Add the counted_kmer_threshold option
+
diff --git a/src/cli.rs b/src/cli.rs
@@ -11,8 +11,9 @@ use clap::Parser;
 #[derive(Parser, Debug)]
 #[command(author, version, about, long_about = None)]
 pub struct Args {
-    /// Input fasta or fastq [.gz] file containing the original sequences (eg. reads). THe stdin is used if not provided
-    #[arg(long, default_value_t = String::from(""))]
+    /// Input fasta or fastq [.gz] file containing the original sequences (eg. reads). 
+    /// The stdin is used if not provided
+    #[arg(long, default_value_t = String::from(""), verbatim_doc_comment)]
     pub in_sequences: String,
 
     /// Input fasta file containing the original kmers
@@ -22,13 +23,18 @@ pub struct Args {
     /// Output file containing the filtered original sequences (eg. reads). 
     /// It will be automatically in fasta or fastq format depending on the input file.
     /// If not provided, only the in_kmers with their count is output
-    #[arg(long, default_value_t = String::from(""))]
+    #[arg(long, default_value_t = String::from(""), verbatim_doc_comment)]
     pub out_sequences: String, 
 
 
-    /// If provided, output text file containing the kmers that occur in the reads with their number of occurrences
+    /// If provided, output a text file containing the kmers that occur in the reads with their number of occurrences
     #[arg(long, default_value_t = String::from(""))]
     pub out_kmers: String,
+
+    /// If out_kmers is provided, output only reference kmers whose number of occurrences is at least equal to this value\n
+    /// If out_kmers is not provided, this option is ignored
+    #[arg(long, default_value_t = 0, verbatim_doc_comment)]
+    pub counted_kmer_threshold: usize,
 
     /// Size of the kmers to index and search
     #[arg(short, long, default_value_t = 31)]
@@ -37,13 +43,13 @@ pub struct Args {
     /// Output sequences are those whose ratio of indexed kmers is in ]min_threshold; max_threshold]
     /// Minimal threshold of the ratio  (%) of kmers that must be found in a sequence to keep it (default 0%).
     /// Thus by default, if no kmer is found in a sequence, it is not output.
-    #[arg(short, long, default_value_t = 0.0)]
+    #[arg(short, long, default_value_t = 0.0, verbatim_doc_comment)]
     pub min_threshold: f32,
 
     /// Output sequences are those whose ratio of indexed kmers is in ]min_threshold; max_threshold]
     /// Maximal threshold of the ratio (%) of kmers that must be found in a sequence to keep it (default 100%).
     /// Thus by default, there is no limitation on the maximal number of kmers found in a sequence.
-    #[arg(long, default_value_t = 100.0)]
+    #[arg(long, default_value_t = 100.0, verbatim_doc_comment)]
     pub max_threshold: f32,
 
     /// Used original kmer strand (else canonical kmers are considered)

diff --git a/src/count.rs b/src/count.rs
@@ -33,7 +33,7 @@ pub fn kmers_in_fasta_file_par(
     stranded: bool,
     query_reverse: bool,
 ) -> Result<(), ()> {
-    const CHUNK_SIZE: usize = 32; // number of records
+    const CHUNK_SIZE: usize = 32; // number of records  
     const INPUT_CHANNEL_SIZE: usize = 8; // in units of CHUNK_SIZE records
     const OUTPUT_CHANNEL_SIZE: usize = 8; // in units of CHUNK_SIZE records
 
@@ -181,7 +181,7 @@ pub fn only_kmers_in_fasta_file_par(
     }
 
     let (input_tx, input_rx) = std::sync::mpsc::sync_channel::<Chunk>(INPUT_CHANNEL_SIZE);
-    let (output_tx, output_rx) = std::sync::mpsc::sync_channel::<Chunk>(OUTPUT_CHANNEL_SIZE);
+    let (_output_tx, _output_rx) = std::sync::mpsc::sync_channel::<Chunk>(OUTPUT_CHANNEL_SIZE);
 
 
 
@@ -228,10 +228,10 @@ pub fn only_kmers_in_fasta_file_par(
         });
 
 
-    reader_thread
-        .join()
-        .unwrap()
-        .map_err(|e| eprintln!("Error reading the sequences: {}", e));
+        let _ = reader_thread
+                .join()
+                .unwrap()
+                .map_err(|e| eprintln!("Error reading the sequences: {}", e));
 }
 
 /// count the number of indexed kmers in a given read

diff --git a/src/lib.rs b/src/lib.rs
@@ -27,6 +27,7 @@ pub fn back_to_sequences(
     out_fasta_reads: String,
     out_txt_kmers: String,
     kmer_size: usize,
+    counted_kmer_threshold: usize,
     min_threshold: f32,
     max_threshold: f32,
     stranded: bool,
@@ -76,10 +77,10 @@ pub fn back_to_sequences(
             // prints all kmers from kmer_set, whaterver their counts count
             let mut output = std::fs::File::create(&out_txt_kmers)?;
             for (kmer, count) in kmer_set.iter() {
-                // if count.get() > 0 {
-                output.write_all(kmer)?;
-                writeln!(output, " {}", count.get())?;
-                // }
+                if count.get() >= counted_kmer_threshold {
+                    output.write_all(kmer)?;
+                    writeln!(output, " {}", count.get())?;
+                }
             }
             Ok(())
         })()

diff --git a/src/main.rs b/src/main.rs
@@ -39,6 +39,7 @@ fn main() {
             args.out_sequences,
             args.out_kmers,
             args.kmer_size,
+            args.counted_kmer_threshold,
             args.min_threshold,
             args.max_threshold,
             args.stranded,