From ee00e3f7d43423f087f59b0fa49c4e56dcbe22d5 Mon Sep 17 00:00:00 2001 From: Joseph Livesey Date: Sun, 12 Jun 2022 12:10:47 -0400 Subject: [PATCH] add `Complementary`, `Pack`, and `Unpack` traits --- Cargo.lock | 7 +++++ Cargo.toml | 3 +- src/bitpacked_kmer.rs | 56 ++++++++++++++++----------------- src/canonical_kmer.rs | 15 --------- src/kmer.rs | 66 +++++++++++++++++++++----------------- src/lib.rs | 1 - src/revcomp_kmer.rs | 57 ++++++++++++++++++++++++--------- src/startup.rs | 24 ++++++-------- src/unpacked_kmer.rs | 73 ++++++++++++++++++++++++++++--------------- 9 files changed, 174 insertions(+), 128 deletions(-) delete mode 100644 src/canonical_kmer.rs diff --git a/Cargo.lock b/Cargo.lock index 9add73f..d9b030e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -225,6 +225,12 @@ version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef8ae57c4978a2acd8b869ce6b9ca1dfe817bff704c220209fdef2c0b75a01b9" +[[package]] +name = "custom_error" +version = "1.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f8a51dd197fa6ba5b4dc98a990a43cc13693c23eb0089ebb0fcc1f04152bca6" + [[package]] name = "dashmap" version = "4.0.2" @@ -411,6 +417,7 @@ name = "krust" version = "0.1.0" dependencies = [ "bio", + "custom_error", "dashmap", "fxhash", "insta", diff --git a/Cargo.toml b/Cargo.toml index c09edaa..8775c70 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,9 +7,10 @@ edition = "2021" [dependencies] bio = "*" +custom_error = "1.9.2" dashmap = "4.0.2" fxhash = "0.2.1" rayon = "*" [dev-dependencies] -insta = "1.14.1" \ No newline at end of file +insta = "1.14.1" \ No newline at end of file diff --git a/src/bitpacked_kmer.rs b/src/bitpacked_kmer.rs index 10dc580..d2034e9 100644 --- a/src/bitpacked_kmer.rs +++ b/src/bitpacked_kmer.rs @@ -7,46 +7,44 @@ impl BitpackedKmer { } fn pack(&mut self, elem: u8) { - self.0 <<= 2; - let mask = match elem { - b'A' => 0, - b'C' => 1, - b'G' => 2, - b'T' => 3, - _ => panic!("`BitpackerKmer` handling an invalid k-mer bytestring is unexpected behavior"), - }; - self.0 |= mask; + self.shift(); + let mask = elem.pack_convert(); + self.0 |= mask + } + + fn shift(&mut self) { + self.0 <<= 2 } } impl FromIterator for BitpackedKmer { - fn from_iter>(iter: I) -> Self { + fn from_iter>(iter: I) -> Self { let mut c = BitpackedKmer::new(); for i in iter { - c.pack(i); + c.pack(i) } c } } -impl From<&Vec> for BitpackedKmer { - fn from(sub: &Vec) -> Self { - let bitpacked_kmer: u64 = { - let mut k: u64 = 0; - for byte in sub.iter() { - k <<= 2; - let mask = match *byte { - b'A' => 0, - b'C' => 1, - b'G' => 2, - b'T' => 3, - _ => panic!("`BitpackerKmer` handling an invalid k-mer bytestring is unexpected behavior"), - }; - k |= mask; - } - k - }; - BitpackedKmer(bitpacked_kmer) +trait Pack { + fn pack_convert(self) -> u64 + where + Self: Sized; +} + +impl Pack for u8 { + fn pack_convert(self) -> u64 { + if self == b'A' { + 0 + } else if self == b'C' { + 1 + } else if self == b'G' { + 2 + } else { + // can only be b'T' + 3 + } } } diff --git a/src/canonical_kmer.rs b/src/canonical_kmer.rs deleted file mode 100644 index bbc0c00..0000000 --- a/src/canonical_kmer.rs +++ /dev/null @@ -1,15 +0,0 @@ -/// Find the canonical kmer -/// --the alphabetically smaller of the substring and its reverse complement. -pub struct CanonicalKmer(pub Vec); - -impl From<(Vec, Vec)> for CanonicalKmer { - fn from(comp: (Vec, Vec)) -> Self { - let (reverse_complement, kmer) = (comp.0, comp.1); - let canonical_kmer = if reverse_complement.cmp(&kmer) == std::cmp::Ordering::Less { - reverse_complement - } else { - kmer - }; - CanonicalKmer(canonical_kmer) - } -} diff --git a/src/kmer.rs b/src/kmer.rs index 3e0fae4..9366c73 100644 --- a/src/kmer.rs +++ b/src/kmer.rs @@ -1,7 +1,15 @@ +custom_error::custom_error! { pub ValidityError + InvalidByte = "", +} + /// Creating a valid k-mer bytestring. #[derive(Debug, PartialEq)] pub struct Kmer(pub Vec); +/// Find the canonical kmer +/// --the alphabetically smaller of the substring and its reverse complement. +pub struct CanonicalKmer(pub Vec); + impl Kmer { pub fn new() -> Kmer { Kmer(Vec::new()) @@ -11,21 +19,25 @@ impl Kmer { self.0.push(elem) } - pub fn from_substring(sub: &[u8]) -> Result { - sub.iter() - .map(|b| b.parse_valid_byte()) - .collect() + pub fn from_substring(sub: &[u8]) -> Result { + sub.iter().map(|b| b.parse_valid_byte()).collect() } - /// Find the index of the rightmost invalid byte in an invalid bytestring. - pub fn find_invalid(sub: &[u8]) -> usize { - match sub - .iter() - .rposition(|byte| byte.parse_valid_byte().is_err()) - { - Some(rightmost_invalid_byte_index) => rightmost_invalid_byte_index, - None => panic!("Valid bytestring passed to `find_invalid`, which is a bug."), - } + pub fn get_canonical_kmer(reverse_complement: Vec, kmer: Vec) -> Self { + let rc = { + if reverse_complement.cmp(&kmer) == std::cmp::Ordering::Less { + reverse_complement + } else { + kmer + } + }; + rc.into_iter().collect() + } + + pub fn find_invalid_byte_index(sub: &[u8]) -> usize { + sub.iter() + .rposition(|byte| byte.parse_valid_byte().is_err()) + .unwrap() } } @@ -47,17 +59,17 @@ impl FromIterator for Kmer { } trait Validity { - fn parse_valid_byte(self) -> Result + fn parse_valid_byte(self) -> Result where Self: Sized; } impl Validity for u8 { - fn parse_valid_byte(self) -> Result { + fn parse_valid_byte(self) -> Result { if [b'A', b'C', b'G', b'T'].contains(&self) { Ok(self) - } else { - Err(()) + } else { + Err(ValidityError::InvalidByte) } } } @@ -68,23 +80,21 @@ pub mod test { #[test] fn test_from_substring() { - let sub = &[b'C', b'A', b'G', b'T']; - match Kmer::from_substring(sub) { - Ok(k) => insta::assert_snapshot!(format!("{:?}", k), @"Kmer([67, 65, 71, 84])"), - Err(()) => panic!("this should not happen"), - } + let sub = &[b'C', b'A', b'G', b'T', b'G']; + let k = Kmer::from_substring(sub).unwrap(); + insta::assert_snapshot!(format!("{:?}", k), @"Kmer([67, 65, 71, 84])"); } - #[test] + #[test] fn test_parse_valid_byte() { - let sub = &[b'C', b'N', b'G', b'T']; - assert!(sub[1].parse_valid_byte().is_err()); + let sub = &[b'C', b'N', b'G', b'T', b'G']; + assert!(sub[1].parse_valid_byte().is_err()); } #[test] fn test_from_substring_returns_err_for_invalid_substring() { - let sub = &[b'C', b'N', b'G', b'T']; - let k = Kmer::from_substring(sub); - assert!(k.is_err()); + let sub = &[b'C', b'N', b'G', b'T']; + let k = Kmer::from_substring(sub); + assert!(k.is_err()); } } diff --git a/src/lib.rs b/src/lib.rs index 6f9e07f..c9517c9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -26,7 +26,6 @@ //! - Testing! pub mod bitpacked_kmer; -pub mod canonical_kmer; pub mod configuration; pub mod dashmaps; pub mod kmer; diff --git a/src/revcomp_kmer.rs b/src/revcomp_kmer.rs index 0a712c7..5a48d94 100644 --- a/src/revcomp_kmer.rs +++ b/src/revcomp_kmer.rs @@ -1,26 +1,53 @@ +use crate::kmer::Kmer; + +trait Complementary { + fn parse_complement_byte(self) -> Self + where + Self: Sized; +} + +impl Complementary for u8 { + fn parse_complement_byte(self) -> Self { + if self == b'A' { + b'T' + } else if self == b'C' { + b'G' + } else if self == b'G' { + b'C' + } else { + b'A' + } + } +} + /// Converting a DNA string slice into its [reverse compliment](https://en.wikipedia.org/wiki/Complementarity_(molecular_biology)#DNA_and_RNA_base_pair_complementarity). pub struct RevCompKmer(pub Vec); -impl From<&Vec> for RevCompKmer { - fn from(sub: &Vec) -> Self { - let mut revcomp = Vec::with_capacity(sub.len()); +impl FromIterator for RevCompKmer { + fn from_iter>(iter: I) -> Self { + let mut c = RevCompKmer::new(); - for byte in sub.iter().rev() { - let comp = RevCompKmer::complement(*byte); - revcomp.push(comp); + for i in iter { + c.add(i) } - RevCompKmer(revcomp) + c } } impl RevCompKmer { - fn complement(byte: u8) -> u8 { - match byte { - b'A' => b'T', - b'C' => b'G', - b'G' => b'C', - b'T' => b'A', - _ => panic!("`RevCompKmer::from` should only be passed valid k-mers"), - } + fn new() -> Self { + Self(vec![]) + } + + fn add(&mut self, elem: u8) { + self.0.push(elem) + } + + pub fn from_kmer(kmer: &Kmer) -> Self { + kmer.0 + .iter() + .rev() + .map(|byte| byte.parse_complement_byte()) + .collect() } } diff --git a/src/startup.rs b/src/startup.rs index 2ef90d3..8a437d7 100644 --- a/src/startup.rs +++ b/src/startup.rs @@ -1,5 +1,4 @@ use crate::bitpacked_kmer::BitpackedKmer; -use crate::canonical_kmer::CanonicalKmer; use crate::dashmaps::DashFx; use crate::kmer::Kmer; use crate::revcomp_kmer::RevCompKmer; @@ -21,7 +20,7 @@ pub fn run(filepath: String, k: usize) -> Result<(), Box> { let _print_results = build_kmer_map(filepath, k)? .into_iter() .par_bridge() - .map(|(bitpacked_kmer, freq)| (UnpackedKmer::from((bitpacked_kmer, k)).0, freq)) + .map(|(bitpacked_kmer, freq)| (UnpackedKmer::from_kmer_data(bitpacked_kmer, k).0, freq)) .map(|(unpacked_kmer, freq)| { let kmer_str = String::from_utf8(unpacked_kmer).unwrap(); (kmer_str, freq) @@ -61,35 +60,32 @@ fn process_seq(seq: &[u8], k: &usize, kmer_map: &DashFx) { let mut i = 0; while i <= seq.len() - k { let sub = &seq[i..i + k]; - let bytestring = Kmer::from_substring(sub); - //let bytestring = Kmer(sub.to_vec()); - if let Ok(Kmer(valid_bytestring)) = bytestring { - process_valid_bytes(kmer_map, valid_bytestring); + if let Ok(kmer) = Kmer::from_substring(sub) { + process_valid_bytes(kmer_map, kmer); i += 1; } else { - let invalid_byte_index = Kmer::find_invalid(sub); + let invalid_byte_index = Kmer::find_invalid_byte_index(sub); i += invalid_byte_index + 1; } } } /// Converts a valid sequence substring from a bytes string to a u64. -fn process_valid_bytes(kmer_map: &DashFx, valid_bytestring: Vec) { - let BitpackedKmer(bitpacked_kmer) = valid_bytestring.iter().cloned().collect(); - //let BitpackedKmer(bitpacked_kmer) = BitpackedKmer::from(&valid_bytestring); +fn process_valid_bytes(kmer_map: &DashFx, kmer: Kmer) { + let BitpackedKmer(bitpacked_kmer) = kmer.0.iter().cloned().collect(); // If the k-mer as found in the sequence is already a key in the `Dashmap`, // increment its value and move on. if let Some(mut freq) = kmer_map.get_mut(&bitpacked_kmer) { *freq += 1; } else { // Initialize the reverse complement of this so-far unrecorded k-mer. - let RevCompKmer(revcompkmer) = RevCompKmer::from(&valid_bytestring); + let RevCompKmer(revcompkmer) = RevCompKmer::from_kmer(&kmer); // Find the alphabetically less of the k-mer substring and its reverse complement. - let CanonicalKmer(canonical_kmer) = CanonicalKmer::from((revcompkmer, valid_bytestring)); + let canonical_kmer = Kmer::get_canonical_kmer(revcompkmer, kmer.0); // Compress the canonical k-mer into a bitpacked 64-bit unsigned integer. - let BitpackedKmer(kmer) = BitpackedKmer::from(&canonical_kmer); + let kmer: BitpackedKmer = canonical_kmer.0.into_iter().collect(); // Add k-mer key and initial value to results. - *kmer_map.entry(kmer).or_insert(0) += 1; + *kmer_map.entry(kmer.0).or_insert(0) += 1; } } diff --git a/src/unpacked_kmer.rs b/src/unpacked_kmer.rs index 79817cc..9054c69 100644 --- a/src/unpacked_kmer.rs +++ b/src/unpacked_kmer.rs @@ -3,41 +3,64 @@ pub struct UnpackedKmer(pub Vec); impl UnpackedKmer { - fn new(k: usize) -> UnpackedKmer { - UnpackedKmer(Vec::with_capacity(k)) + fn new() -> UnpackedKmer { + UnpackedKmer(Vec::new()) } fn add(&mut self, elem: u8) { self.0.push(elem); } + + pub fn from_kmer_data(kmer: u64, k: usize) -> Self { + (0..k) + .into_iter() + .map(|i| kmer.isolate_bits(i, k).replace_bits().unpack_bits()) + .collect() + } +} + +trait Unpack { + fn unpack_bits(self) -> u8 + where + Self: Sized; + + fn isolate_bits(self, i: usize, k: usize) -> Self + where + Self: Sized; + + fn replace_bits(self) -> Self + where + Self: Sized; } -impl From<(u64, usize)> for UnpackedKmer { - fn from(kmer_data: (u64, usize)) -> Self { - let (kmer, k) = (kmer_data.0, kmer_data.1); - let mut unpacked_kmer = UnpackedKmer::new(k); - for i in 0..k { - let isolate = kmer << ((i * 2) + 64 - (k * 2)); - let base = isolate >> 62; - let byte = UnpackedKmerByte::from(base); - unpacked_kmer.add(byte.0); +impl Unpack for u64 { + fn unpack_bits(self: u64) -> u8 { + if self == 0 { + b'A' + } else if self == 1 { + b'C' + } else if self == 2 { + b'G' + } else { + b'T' } - unpacked_kmer + } + fn isolate_bits(self: u64, i: usize, k: usize) -> Self { + self << ((i * 2) + 64 - (k * 2)) + } + + fn replace_bits(self) -> Self { + self >> 62 } } -/// Unpacking compressed, bitpacked k-mer data. -struct UnpackedKmerByte(u8); - -impl From for UnpackedKmerByte { - fn from(base: u64) -> Self { - let unpacked_byte = match base { - 0 => b'A', - 1 => b'C', - 2 => b'G', - 3 => b'T', - _ => panic!("An invalid k-mer passed to here means we have a serious bug"), - }; - UnpackedKmerByte(unpacked_byte) +impl FromIterator for UnpackedKmer { + fn from_iter>(iter: I) -> Self { + let mut c = UnpackedKmer::new(); + + for i in iter { + c.add(i) + } + c } }