Skip to content

Commit

Permalink
v0.3.0
Browse files Browse the repository at this point in the history
  • Loading branch information
pierrepeterlongo committed Feb 16, 2024
1 parent 0a3ff9f commit bc8021e
Show file tree
Hide file tree
Showing 7 changed files with 65 additions and 30 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "back_to_sequences"
version = "0.2.13"
version = "0.3.0"
edition = "2021"

authors = [
Expand Down
48 changes: 36 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,20 +70,43 @@ We queried: from 10,000 reads to 200 million reads (+ 1 billion on the cluster),
```
Extract sequences that contain some kmers
Usage: back_to_sequences [OPTIONS] --in-sequences <IN_SEQUENCES> --in-kmers <IN_KMERS> --out-sequences <OUT_SEQUENCES>
Usage: back_to_sequences [OPTIONS] --in-kmers <IN_KMERS> [--in-sequences <IN_SEQUENCES>]
Options:
--in-sequences <IN_SEQUENCES> Input fasta or fastq [.gz] file containing the original sequences (eg. reads). THe stdin is used if not provided [default: ]
--in-kmers <IN_KMERS> Input fasta file containing the original kmers
--out-sequences <OUT_SEQUENCES> Output file containing the filtered original sequences (eg. reads). It will be automatically in fasta or fastq format depending on the input file
--out-kmers <OUT_KMERS> If provided, output text file containing the kmers that occur in the reads with their number of occurrences [default: ]
-k, --kmer-size <KMER_SIZE> Size of the kmers to index and search [default: 31]
-m, --min-threshold <MIN_THRESHOLD> Minimal threshold of the ratio (%) of kmers that must be found in a sequence to keep it (default 0%). Thus by default, if no kmer is found in a sequence, it is not output [default: 0]
--max-threshold <MAX_THRESHOLD> Maximal threshold of the ratio (%) of kmers that must be found in a sequence to keep it (default 100%). Thus by default, there is no limitation on the maximal number of kmers found in a sequence [default: 100]
--stranded Used original kmer strand (else canonical kmers are considered)
--query-reverse Query the reverse complement of reads. Useless without the --stranded option
-h, --help Print help
-V, --version Print version
--in-sequences <IN_SEQUENCES>
Input fasta or fastq [.gz] file containing the original sequences (eg. reads).
The stdin is used if not provided [default: ]
--in-kmers <IN_KMERS>
Input fasta file containing the original kmers
--out-sequences <OUT_SEQUENCES>
Output file containing the filtered original sequences (eg. reads).
It will be automatically in fasta or fastq format depending on the input file.
If not provided, only the in_kmers with their count is output [default: ]
--out-kmers <OUT_KMERS>
If provided, output a text file containing the kmers that occur in the reads with their number of occurrences [default: ]
--counted-kmer-threshold <COUNTED_KMER_THRESHOLD>
If out_kmers is provided, output only reference kmers whose number of occurrences is at least equal to this value\n
If out_kmers is not provided, this option is ignored [default: 0]
-k, --kmer-size <KMER_SIZE>
Size of the kmers to index and search [default: 31]
-m, --min-threshold <MIN_THRESHOLD>
Output sequences are those whose ratio of indexed kmers is in ]min_threshold; max_threshold]
Minimal threshold of the ratio (%) of kmers that must be found in a sequence to keep it (default 0%).
Thus by default, if no kmer is found in a sequence, it is not output. [default: 0]
--max-threshold <MAX_THRESHOLD>
Output sequences are those whose ratio of indexed kmers is in ]min_threshold; max_threshold]
Maximal threshold of the ratio (%) of kmers that must be found in a sequence to keep it (default 100%).
Thus by default, there is no limitation on the maximal number of kmers found in a sequence. [default: 100]
--stranded
Used original kmer strand (else canonical kmers are considered)
--query-reverse
Query the reverse complement of reads. Useless without the --stranded option
--no-low-complexity
Do not index low complexity kmers (ie. with a Shannon entropy < 1.0)
-h, --help
Print help
-V, --version
Print version
```

### Examples
Expand Down Expand Up @@ -149,3 +172,4 @@ python3 scripts/extract_random_sequences.py --input reads.fasta --min_size 31 --
* [ ] Thinks about a way to adapt this to protein sequences
* [X] Add an option to set the size of the bloom filter used by kmindex
* [ ] Provide a way to index and query more than one set $K$ of kmers
* [ ] Output the strand of matched kmers
5 changes: 4 additions & 1 deletion VERSIONS.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,7 @@
* 0.2.12. 23/01/2024:
* possiblity to not output filtered reads. In this case only counted kmers are output.
* 0.2.13. 23/01/2024:
* prints all kmers from kmer_set, whaterver their counts count
* prints all kmers from kmer_set, whaterver their counts count
* 0.3.0 16/02/2024
* Add the counted_kmer_threshold option

18 changes: 12 additions & 6 deletions src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,9 @@ use clap::Parser;
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
pub struct Args {
/// Input fasta or fastq [.gz] file containing the original sequences (eg. reads). THe stdin is used if not provided
#[arg(long, default_value_t = String::from(""))]
/// Input fasta or fastq [.gz] file containing the original sequences (eg. reads).
/// The stdin is used if not provided
#[arg(long, default_value_t = String::from(""), verbatim_doc_comment)]
pub in_sequences: String,

/// Input fasta file containing the original kmers
Expand All @@ -22,13 +23,18 @@ pub struct Args {
/// Output file containing the filtered original sequences (eg. reads).
/// It will be automatically in fasta or fastq format depending on the input file.
/// If not provided, only the in_kmers with their count is output
#[arg(long, default_value_t = String::from(""))]
#[arg(long, default_value_t = String::from(""), verbatim_doc_comment)]
pub out_sequences: String,


/// If provided, output text file containing the kmers that occur in the reads with their number of occurrences
/// If provided, output a text file containing the kmers that occur in the reads with their number of occurrences
#[arg(long, default_value_t = String::from(""))]
pub out_kmers: String,

/// If out_kmers is provided, output only reference kmers whose number of occurrences is at least equal to this value\n
/// If out_kmers is not provided, this option is ignored
#[arg(long, default_value_t = 0, verbatim_doc_comment)]
pub counted_kmer_threshold: usize,

/// Size of the kmers to index and search
#[arg(short, long, default_value_t = 31)]
Expand All @@ -37,13 +43,13 @@ pub struct Args {
/// Output sequences are those whose ratio of indexed kmers is in ]min_threshold; max_threshold]
/// Minimal threshold of the ratio (%) of kmers that must be found in a sequence to keep it (default 0%).
/// Thus by default, if no kmer is found in a sequence, it is not output.
#[arg(short, long, default_value_t = 0.0)]
#[arg(short, long, default_value_t = 0.0, verbatim_doc_comment)]
pub min_threshold: f32,

/// Output sequences are those whose ratio of indexed kmers is in ]min_threshold; max_threshold]
/// Maximal threshold of the ratio (%) of kmers that must be found in a sequence to keep it (default 100%).
/// Thus by default, there is no limitation on the maximal number of kmers found in a sequence.
#[arg(long, default_value_t = 100.0)]
#[arg(long, default_value_t = 100.0, verbatim_doc_comment)]
pub max_threshold: f32,

/// Used original kmer strand (else canonical kmers are considered)
Expand Down
12 changes: 6 additions & 6 deletions src/count.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ pub fn kmers_in_fasta_file_par(
stranded: bool,
query_reverse: bool,
) -> Result<(), ()> {
const CHUNK_SIZE: usize = 32; // number of records
const CHUNK_SIZE: usize = 32; // number of records
const INPUT_CHANNEL_SIZE: usize = 8; // in units of CHUNK_SIZE records
const OUTPUT_CHANNEL_SIZE: usize = 8; // in units of CHUNK_SIZE records

Expand Down Expand Up @@ -181,7 +181,7 @@ pub fn only_kmers_in_fasta_file_par(
}

let (input_tx, input_rx) = std::sync::mpsc::sync_channel::<Chunk>(INPUT_CHANNEL_SIZE);
let (output_tx, output_rx) = std::sync::mpsc::sync_channel::<Chunk>(OUTPUT_CHANNEL_SIZE);
let (_output_tx, _output_rx) = std::sync::mpsc::sync_channel::<Chunk>(OUTPUT_CHANNEL_SIZE);



Expand Down Expand Up @@ -228,10 +228,10 @@ pub fn only_kmers_in_fasta_file_par(
});


reader_thread
.join()
.unwrap()
.map_err(|e| eprintln!("Error reading the sequences: {}", e));
let _ = reader_thread
.join()
.unwrap()
.map_err(|e| eprintln!("Error reading the sequences: {}", e));
}

/// count the number of indexed kmers in a given read
Expand Down
9 changes: 5 additions & 4 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ pub fn back_to_sequences(
out_fasta_reads: String,
out_txt_kmers: String,
kmer_size: usize,
counted_kmer_threshold: usize,
min_threshold: f32,
max_threshold: f32,
stranded: bool,
Expand Down Expand Up @@ -76,10 +77,10 @@ pub fn back_to_sequences(
// prints all kmers from kmer_set, whaterver their counts count
let mut output = std::fs::File::create(&out_txt_kmers)?;
for (kmer, count) in kmer_set.iter() {
// if count.get() > 0 {
output.write_all(kmer)?;
writeln!(output, " {}", count.get())?;
// }
if count.get() >= counted_kmer_threshold {
output.write_all(kmer)?;
writeln!(output, " {}", count.get())?;
}
}
Ok(())
})()
Expand Down
1 change: 1 addition & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ fn main() {
args.out_sequences,
args.out_kmers,
args.kmer_size,
args.counted_kmer_threshold,
args.min_threshold,
args.max_threshold,
args.stranded,
Expand Down

0 comments on commit bc8021e

Please sign in to comment.