Skip to content

Commit

Permalink
add Complementary, Pack, and Unpack traits
Browse files Browse the repository at this point in the history
  • Loading branch information
suchapalaver committed Jun 12, 2022
1 parent 427ab37 commit ee00e3f
Show file tree
Hide file tree
Showing 9 changed files with 174 additions and 128 deletions.
7 changes: 7 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@ edition = "2021"

[dependencies]
bio = "*"
custom_error = "1.9.2"
dashmap = "4.0.2"
fxhash = "0.2.1"
rayon = "*"

[dev-dependencies]
insta = "1.14.1"
insta = "1.14.1"
56 changes: 27 additions & 29 deletions src/bitpacked_kmer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,46 +7,44 @@ impl BitpackedKmer {
}

fn pack(&mut self, elem: u8) {
self.0 <<= 2;
let mask = match elem {
b'A' => 0,
b'C' => 1,
b'G' => 2,
b'T' => 3,
_ => panic!("`BitpackerKmer` handling an invalid k-mer bytestring is unexpected behavior"),
};
self.0 |= mask;
self.shift();
let mask = elem.pack_convert();
self.0 |= mask
}

/// Shifts the packed value left by two bits.
///
/// `pack` calls this before OR-ing in the next 2-bit code, so the vacated
/// low bits are where that code lands.
fn shift(&mut self) {
self.0 <<= 2
}
}

impl FromIterator<u8> for BitpackedKmer {
fn from_iter<I: IntoIterator<Item=u8>>(iter: I) -> Self {
fn from_iter<I: IntoIterator<Item = u8>>(iter: I) -> Self {
let mut c = BitpackedKmer::new();

for i in iter {
c.pack(i);
c.pack(i)
}
c
}
}

impl From<&Vec<u8>> for BitpackedKmer {
fn from(sub: &Vec<u8>) -> Self {
let bitpacked_kmer: u64 = {
let mut k: u64 = 0;
for byte in sub.iter() {
k <<= 2;
let mask = match *byte {
b'A' => 0,
b'C' => 1,
b'G' => 2,
b'T' => 3,
_ => panic!("`BitpackerKmer` handling an invalid k-mer bytestring is unexpected behavior"),
};
k |= mask;
}
k
};
BitpackedKmer(bitpacked_kmer)
/// Conversion of a nucleotide byte into its 2-bit code.
trait Pack {
    /// Returns the 2-bit code for `self`, widened to `u64`.
    fn pack_convert(self) -> u64
    where
        Self: Sized;
}

impl Pack for u8 {
    /// Maps `b'A'`, `b'C'` and `b'G'` to `0`, `1` and `2`; every other byte
    /// yields `3`.
    ///
    /// No validation happens here: the fallback arm only means `b'T'` when
    /// the caller passes bytes drawn from `A`/`C`/`G`/`T`.
    fn pack_convert(self) -> u64 {
        match self {
            b'A' => 0,
            b'C' => 1,
            b'G' => 2,
            // can only be b'T' for validated input
            _ => 3,
        }
    }
}
15 changes: 0 additions & 15 deletions src/canonical_kmer.rs

This file was deleted.

66 changes: 38 additions & 28 deletions src/kmer.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,15 @@
// Error returned when a byte outside the DNA alphabet is encountered.
// The variant carries a non-empty message so that the error's `Display`
// output is not blank when it is logged or propagated.
custom_error::custom_error! { pub ValidityError
    InvalidByte = "invalid byte in k-mer: expected one of A, C, G or T",
}

/// A k-mer held as a vector of bytes.
///
/// `Kmer::from_substring` only returns `Ok` when every byte is one of
/// `A`, `C`, `G` or `T`. The inner vector is public, so the tuple
/// constructor itself performs no such check.
#[derive(Debug, PartialEq)]
pub struct Kmer(pub Vec<u8>);

/// The canonical form of a k-mer: the alphabetically smaller of the
/// substring and its reverse complement.
pub struct CanonicalKmer(pub Vec<u8>);

impl Kmer {
pub fn new() -> Kmer {
Kmer(Vec::new())
Expand All @@ -11,21 +19,25 @@ impl Kmer {
self.0.push(elem)
}

pub fn from_substring(sub: &[u8]) -> Result<Self, ()> {
sub.iter()
.map(|b| b.parse_valid_byte())
.collect()
pub fn from_substring(sub: &[u8]) -> Result<Kmer, ValidityError> {
sub.iter().map(|b| b.parse_valid_byte()).collect()
}

/// Find the index of the rightmost invalid byte in an invalid bytestring.
pub fn find_invalid(sub: &[u8]) -> usize {
match sub
.iter()
.rposition(|byte| byte.parse_valid_byte().is_err())
{
Some(rightmost_invalid_byte_index) => rightmost_invalid_byte_index,
None => panic!("Valid bytestring passed to `find_invalid`, which is a bug."),
}
/// Picks the canonical form of a k-mer.
///
/// Returns `reverse_complement` when it orders strictly before `kmer`
/// (lexicographic `Vec<u8>` ordering) and `kmer` otherwise; the chosen
/// bytes are then collected into `Self`.
pub fn get_canonical_kmer(reverse_complement: Vec<u8>, kmer: Vec<u8>) -> Self {
    // `min` keeps its first argument unless the second is strictly smaller,
    // so `kmer` wins ties and `reverse_complement` is chosen only when it
    // compares `Less`.
    std::cmp::min(kmer, reverse_complement)
        .into_iter()
        .collect()
}

/// Returns the index of the rightmost invalid byte in `sub`.
///
/// # Panics
///
/// Panics if every byte of `sub` is valid. Callers are expected to invoke
/// this only after `from_substring` has rejected `sub`, so a valid input
/// here indicates a bug in the caller.
pub fn find_invalid_byte_index(sub: &[u8]) -> usize {
    sub.iter()
        .rposition(|byte| byte.parse_valid_byte().is_err())
        .expect(
            "`find_invalid_byte_index` was passed a bytestring with no invalid byte, which is a bug",
        )
}
}

Expand All @@ -47,17 +59,17 @@ impl FromIterator<u8> for Kmer {
}

trait Validity {
fn parse_valid_byte(self) -> Result<Self, ()>
fn parse_valid_byte(self) -> Result<Self, ValidityError>
where
Self: Sized;
}

impl Validity for u8 {
fn parse_valid_byte(self) -> Result<Self, ()> {
fn parse_valid_byte(self) -> Result<Self, ValidityError> {
if [b'A', b'C', b'G', b'T'].contains(&self) {
Ok(self)
} else {
Err(())
} else {
Err(ValidityError::InvalidByte)
}
}
}
Expand All @@ -68,23 +80,21 @@ pub mod test {

#[test]
fn test_from_substring() {
let sub = &[b'C', b'A', b'G', b'T'];
match Kmer::from_substring(sub) {
Ok(k) => insta::assert_snapshot!(format!("{:?}", k), @"Kmer([67, 65, 71, 84])"),
Err(()) => panic!("this should not happen"),
}
let sub = &[b'C', b'A', b'G', b'T', b'G'];
let k = Kmer::from_substring(sub).unwrap();
insta::assert_snapshot!(format!("{:?}", k), @"Kmer([67, 65, 71, 84])");
}

#[test]
#[test]
fn test_parse_valid_byte() {
let sub = &[b'C', b'N', b'G', b'T'];
assert!(sub[1].parse_valid_byte().is_err());
let sub = &[b'C', b'N', b'G', b'T', b'G'];
assert!(sub[1].parse_valid_byte().is_err());
}

#[test]
fn test_from_substring_returns_err_for_invalid_substring() {
let sub = &[b'C', b'N', b'G', b'T'];
let k = Kmer::from_substring(sub);
assert!(k.is_err());
let sub = &[b'C', b'N', b'G', b'T'];
let k = Kmer::from_substring(sub);
assert!(k.is_err());
}
}
1 change: 0 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
//! - Testing!
pub mod bitpacked_kmer;
pub mod canonical_kmer;
pub mod configuration;
pub mod dashmaps;
pub mod kmer;
Expand Down
57 changes: 42 additions & 15 deletions src/revcomp_kmer.rs
Original file line number Diff line number Diff line change
@@ -1,26 +1,53 @@
use crate::kmer::Kmer;

/// Base-pair complement of a nucleotide byte.
trait Complementary {
    /// Returns the complementary base for `self`.
    fn parse_complement_byte(self) -> Self
    where
        Self: Sized;
}

impl Complementary for u8 {
    /// Maps `b'A'` to `b'T'`, `b'C'` to `b'G'` and `b'G'` to `b'C'`; every
    /// other byte yields `b'A'`.
    ///
    /// No validation happens here: the fallback arm only means "complement
    /// of `b'T'`" when the caller passes bytes drawn from `A`/`C`/`G`/`T`.
    fn parse_complement_byte(self) -> Self {
        match self {
            b'A' => b'T',
            b'C' => b'G',
            b'G' => b'C',
            _ => b'A',
        }
    }
}

/// The [reverse complement](https://en.wikipedia.org/wiki/Complementarity_(molecular_biology)#DNA_and_RNA_base_pair_complementarity) of a DNA k-mer, stored as bytes.
pub struct RevCompKmer(pub Vec<u8>);

impl From<&Vec<u8>> for RevCompKmer {
fn from(sub: &Vec<u8>) -> Self {
let mut revcomp = Vec::with_capacity(sub.len());
impl FromIterator<u8> for RevCompKmer {
fn from_iter<I: IntoIterator<Item = u8>>(iter: I) -> Self {
let mut c = RevCompKmer::new();

for byte in sub.iter().rev() {
let comp = RevCompKmer::complement(*byte);
revcomp.push(comp);
for i in iter {
c.add(i)
}
RevCompKmer(revcomp)
c
}
}

impl RevCompKmer {
fn complement(byte: u8) -> u8 {
match byte {
b'A' => b'T',
b'C' => b'G',
b'G' => b'C',
b'T' => b'A',
_ => panic!("`RevCompKmer::from` should only be passed valid k-mers"),
}
fn new() -> Self {
Self(vec![])
}

fn add(&mut self, elem: u8) {
self.0.push(elem)
}

pub fn from_kmer(kmer: &Kmer) -> Self {
kmer.0
.iter()
.rev()
.map(|byte| byte.parse_complement_byte())
.collect()
}
}
24 changes: 10 additions & 14 deletions src/startup.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use crate::bitpacked_kmer::BitpackedKmer;
use crate::canonical_kmer::CanonicalKmer;
use crate::dashmaps::DashFx;
use crate::kmer::Kmer;
use crate::revcomp_kmer::RevCompKmer;
Expand All @@ -21,7 +20,7 @@ pub fn run(filepath: String, k: usize) -> Result<(), Box<dyn Error>> {
let _print_results = build_kmer_map(filepath, k)?
.into_iter()
.par_bridge()
.map(|(bitpacked_kmer, freq)| (UnpackedKmer::from((bitpacked_kmer, k)).0, freq))
.map(|(bitpacked_kmer, freq)| (UnpackedKmer::from_kmer_data(bitpacked_kmer, k).0, freq))
.map(|(unpacked_kmer, freq)| {
let kmer_str = String::from_utf8(unpacked_kmer).unwrap();
(kmer_str, freq)
Expand Down Expand Up @@ -61,35 +60,32 @@ fn process_seq(seq: &[u8], k: &usize, kmer_map: &DashFx) {
let mut i = 0;
while i <= seq.len() - k {
let sub = &seq[i..i + k];
let bytestring = Kmer::from_substring(sub);
//let bytestring = Kmer(sub.to_vec());
if let Ok(Kmer(valid_bytestring)) = bytestring {
process_valid_bytes(kmer_map, valid_bytestring);
if let Ok(kmer) = Kmer::from_substring(sub) {
process_valid_bytes(kmer_map, kmer);
i += 1;
} else {
let invalid_byte_index = Kmer::find_invalid(sub);
let invalid_byte_index = Kmer::find_invalid_byte_index(sub);
i += invalid_byte_index + 1;
}
}
}

/// Converts a valid sequence substring from a bytes string to a u64.
fn process_valid_bytes(kmer_map: &DashFx, valid_bytestring: Vec<u8>) {
let BitpackedKmer(bitpacked_kmer) = valid_bytestring.iter().cloned().collect();
//let BitpackedKmer(bitpacked_kmer) = BitpackedKmer::from(&valid_bytestring);
fn process_valid_bytes(kmer_map: &DashFx, kmer: Kmer) {
let BitpackedKmer(bitpacked_kmer) = kmer.0.iter().cloned().collect();
// If the k-mer as found in the sequence is already a key in the `Dashmap`,
// increment its value and move on.
if let Some(mut freq) = kmer_map.get_mut(&bitpacked_kmer) {
*freq += 1;
} else {
// Initialize the reverse complement of this so-far unrecorded k-mer.
let RevCompKmer(revcompkmer) = RevCompKmer::from(&valid_bytestring);
let RevCompKmer(revcompkmer) = RevCompKmer::from_kmer(&kmer);
// Find the alphabetically less of the k-mer substring and its reverse complement.
let CanonicalKmer(canonical_kmer) = CanonicalKmer::from((revcompkmer, valid_bytestring));
let canonical_kmer = Kmer::get_canonical_kmer(revcompkmer, kmer.0);
// Compress the canonical k-mer into a bitpacked 64-bit unsigned integer.
let BitpackedKmer(kmer) = BitpackedKmer::from(&canonical_kmer);
let kmer: BitpackedKmer = canonical_kmer.0.into_iter().collect();
// Add k-mer key and initial value to results.
*kmer_map.entry(kmer).or_insert(0) += 1;
*kmer_map.entry(kmer.0).or_insert(0) += 1;
}
}

Expand Down
Loading

0 comments on commit ee00e3f

Please sign in to comment.