From 24b618c6ac3332806b79ab579a542536d7414ee4 Mon Sep 17 00:00:00 2001 From: Jason Fan Date: Mon, 19 Aug 2024 12:51:37 -0400 Subject: [PATCH 1/2] fix: early return and counting of mismatches when n in barcodes. --- src/bin/commands/demux.rs | 120 ++++++++++++++++++++++++++++++++++++ src/lib/barcode_matching.rs | 19 +++++- 2 files changed, 137 insertions(+), 2 deletions(-) diff --git a/src/bin/commands/demux.rs b/src/bin/commands/demux.rs index ab727db..df36ee2 100644 --- a/src/bin/commands/demux.rs +++ b/src/bin/commands/demux.rs @@ -1371,6 +1371,126 @@ mod tests { ); } + #[test] + fn test_demux_with_catchall_barcode() { + let tmp = TempDir::new().unwrap(); + let read_structures = vec![ReadStructure::from_str("7B+T").unwrap()]; + let s1_barcode = "NNNNNNN"; + let sample_metadata = metadata_file(&tmp, &[s1_barcode]); + let input_files = + vec![fastq_file(&tmp, "ex", "ex", &[&(s1_barcode.to_owned() + &"A".repeat(100))])]; + + let output_dir = tmp.path().to_path_buf().join("output"); + + let demux_inputs = Demux { + inputs: input_files, + read_structures, + sample_metadata, + output_types: vec!['T'], + output: output_dir.clone(), + unmatched_prefix: "unmatched".to_owned(), + max_mismatches: 0, + min_mismatch_delta: 2, + threads: 5, + compression_level: 5, + skip_reasons: vec![], + }; + demux_inputs.execute().unwrap(); + + let unmatched_path = output_dir.join("unmatched.R1.fq.gz"); + let unmatched_reads = read_fastq(&unmatched_path); + assert_eq!(unmatched_reads.len(), 0); + + let output_path = output_dir.join("Sample0000.R1.fq.gz"); + let fq_reads = read_fastq(&output_path); + + assert_eq!(fq_reads.len(), 1); + assert_equal( + &fq_reads[0], + &OwnedRecord { + head: b"ex_0 1:N:0:NNNNNNN".to_vec(), + seq: "A".repeat(100).as_bytes().to_vec(), + qual: ";".repeat(100).as_bytes().to_vec(), + }, + ); + } + + #[test] + fn test_demux_with_ns_in_barcode() { + let tmp = TempDir::new().unwrap(); + let read_structures = vec![ReadStructure::from_str("7B+T").unwrap()]; + let s1_barcode = "NNAAAAA"; + let s2_barcode = "NNCCCCC"; + let sample_metadata = metadata_file(&tmp, &[s1_barcode, s2_barcode]); + let input_files = vec![fastq_file( + &tmp, + "ex", + "ex", + &[ + &("ANAAAAA".to_owned() + &"A".repeat(5)), + &("ANCCCCC".to_owned() + &"C".repeat(5)), + &("NNNAAAA".to_owned() + &"T".repeat(5)), + ], + )]; + + let output_dir = tmp.path().to_path_buf().join("output"); + + let demux_inputs = Demux { + inputs: input_files, + read_structures, + sample_metadata, + output_types: vec!['T'], + output: output_dir.clone(), + unmatched_prefix: "unmatched".to_owned(), + max_mismatches: 0, + min_mismatch_delta: 0, + threads: 5, + compression_level: 5, + skip_reasons: vec![], + }; + demux_inputs.execute().unwrap(); + + let output_path = output_dir.join("Sample0000.R1.fq.gz"); + let fq_reads = read_fastq(&output_path); + + assert_eq!(fq_reads.len(), 1); + assert_equal( + &fq_reads[0], + &OwnedRecord { + head: b"ex_0 1:N:0:ANAAAAA".to_vec(), + seq: "A".repeat(5).as_bytes().to_vec(), + qual: ";".repeat(5).as_bytes().to_vec(), + }, + ); + + let output_path = output_dir.join("Sample0001.R1.fq.gz"); + let fq_reads = read_fastq(&output_path); + + assert_eq!(fq_reads.len(), 1); + assert_equal( + &fq_reads[0], + &OwnedRecord { + head: b"ex_1 1:N:0:ANCCCCC".to_vec(), + seq: "C".repeat(5).as_bytes().to_vec(), + qual: ";".repeat(5).as_bytes().to_vec(), + }, + ); + + // Should not match since it has 3 no calls, and barcodes have at maximum 1 no-call + let unmatched_path = output_dir.join("unmatched.R1.fq.gz"); + let unmatched_reads = read_fastq(&unmatched_path); + assert_eq!(unmatched_reads.len(), 1); + + assert_equal( + &unmatched_reads[0], + &OwnedRecord { + head: b"ex_2 1:N:0:NNNAAAA".to_vec(), + seq: "T".repeat(5).as_bytes().to_vec(), + qual: ";".repeat(5).as_bytes().to_vec(), + }, + ); + } + #[test] fn test_demux_paired_reads_with_in_line_sample_barcodes() { let tmp = TempDir::new().unwrap(); diff --git a/src/lib/barcode_matching.rs b/src/lib/barcode_matching.rs index 4372c7c..02648bb 100644 --- a/src/lib/barcode_matching.rs +++ b/src/lib/barcode_matching.rs @@ -25,6 +25,8 @@ pub struct BarcodeMatcher { /// Note - this is to be replaced by a sample struct in task 3. For now we're keeping things /// very simple. sample_barcodes: Vec, + /// The maxium number of Ns in any barcode in set of sample barcodes + max_ns_in_barcodes: usize, /// The maximum mismatches to match a sample barcode. max_mismatches: u8, /// The minimum difference between number of mismatches in the best and second best barcodes @@ -56,12 +58,20 @@ impl BarcodeMatcher { "Sample barcode cannot be empty string" ); + let mut max_ns_in_barcodes = 0; let modified_sample_barcodes = sample_barcodes .iter() - .map(|barcode| BString::from(barcode.to_ascii_uppercase())) + .map(|barcode| { + let barcode = BString::from(barcode.to_ascii_uppercase()); + let num_ns = barcode.iter().filter(|&b| byte_is_nocall(*b)).count(); + max_ns_in_barcodes = max_ns_in_barcodes.max(num_ns); + barcode + }) .collect::>(); + Self { sample_barcodes: modified_sample_barcodes, + max_ns_in_barcodes, max_mismatches, min_mismatch_delta, use_cache, @@ -132,7 +142,7 @@ impl BarcodeMatcher { return None; } let num_no_calls = read_bases.iter().filter(|&&b| byte_is_nocall(b)).count(); - if num_no_calls > self.max_mismatches as usize { + if num_no_calls > (self.max_mismatches as usize) + self.max_ns_in_barcodes { None } else if self.use_cache { if let Some(cached_match) = self.cache.get(read_bases) { @@ -207,6 +217,11 @@ mod tests { assert_eq!(BarcodeMatcher::count_mismatches("GATTACA".as_bytes(), "GANNACA".as_bytes()), 0,); } + #[test] + fn all_ns_barcode_have_no_mismatches() { + assert_eq!(BarcodeMatcher::count_mismatches("GANNACA".as_bytes(), "NNNNNNN".as_bytes()), 0,); + } + #[test] fn find_two_mismatches() { assert_eq!(BarcodeMatcher::count_mismatches("GATTACA".as_bytes(), "GACCACA".as_bytes()), 2,); From 62d7e8cbec5b3a1e63856c8a10abaac07f237144 Mon Sep 17 00:00:00 2001 From: Jason Fan Date: Mon, 19 Aug 2024 13:05:35 -0400 Subject: [PATCH 2/2] style: whitespace in new test --- src/bin/commands/demux.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/bin/commands/demux.rs b/src/bin/commands/demux.rs index df36ee2..02c3717 100644 --- a/src/bin/commands/demux.rs +++ b/src/bin/commands/demux.rs @@ -1452,7 +1452,6 @@ mod tests { let output_path = output_dir.join("Sample0000.R1.fq.gz"); let fq_reads = read_fastq(&output_path); - assert_eq!(fq_reads.len(), 1); assert_equal( &fq_reads[0], @@ -1465,7 +1464,6 @@ mod tests { let output_path = output_dir.join("Sample0001.R1.fq.gz"); let fq_reads = read_fastq(&output_path); - assert_eq!(fq_reads.len(), 1); assert_equal( &fq_reads[0], @@ -1480,7 +1478,6 @@ mod tests { let unmatched_path = output_dir.join("unmatched.R1.fq.gz"); let unmatched_reads = read_fastq(&unmatched_path); assert_eq!(unmatched_reads.len(), 1); - assert_equal( &unmatched_reads[0], &OwnedRecord {