move all computations to samplesheet validation

TORCH-Consortium · Jun 23, 2024 · c99cf80 · c99cf80
1 parent 7dd1f74
commit c99cf80
Show file tree

Hide file tree

Showing 2 changed files with 42 additions and 44 deletions.
diff --git a/bin/samplesheet_validation.py b/bin/samplesheet_validation.py
@@ -2,7 +2,7 @@
 
 import re
 import argparse
-import pandas as pd
+import csv
 
 from sys import exit
 
@@ -14,43 +14,40 @@
 
     name_re = re.compile('^[a-zA-Z0-9\-_]*$')
 
-    ss = pd.read_csv(args['input_file'])
-
-    # Create another column by adding Sample and Attempt columns
-    ss['magma_sample_name'] = ss['Study'].astype(str) + \
-                              "." + ss['Sample'].astype(str) + \
-                              ".L" + ss['Library'].astype(str) + \
-                              ".A" + ss['Attempt'].astype(str) + \
-                              "." + ss['Flowcell'].astype(str) + \
-                              "." + ss['Lane'].astype(str) + \
-                              "." + ss['Index Sequence'].astype(str)  # Corrected column name
-
-    # FIXME Add the info for BWA MEM mapipng using a derived column into the dataframe
-    # bam_rg_string ="@RG\\tID:${flowcell}.${lane}\\tSM:${study}.${sample}\\tPL:illumina\\tLB:lib${library}\\tPU:${flowcell}.${lane}.${index_sequence}"
-
-    ss['magma_bwa_rg_string'] = "@RG\\tID:" + ss['Flowcell'].astype(str) + "." + ss['Lane'].astype(str) + \
-                                "\\tSM:" + ss['Study'].astype(str) + "." + ss['Sample'].astype(str) + \
-                                "\\tPL:illumina" + \
-                                "\\tLB:lib" + ss['Library'].astype(str) + \
-                                "\\tPU:" + ss['Flowcell'].astype(str) + "." + ss['Lane'].astype(str) + "." + ss['Index Sequence'].astype(str)
-
-
-    fail = False
-    for idx, row in ss.iterrows():
-        if not name_re.match(row['Study']):
-            print('Row {}: {}  Illegal character in STUDY id'.format(idx, row['Study']))
-            fail = True
-        if not name_re.match(row['Sample']):
-            print('Row {}: {} - Illegal character in SAMPLE id'.format(idx, row['Sample']))
-            fail = True
-        if row['R1'] == row['R2']:
-            print('Row {}: {}, {} - DUPLICATED fastq file specified'.format(idx, row['R1'], row['R2']))
-            fail = True
-
-    if not fail:
-        ss.to_csv(args['output_file'], index=False)
-        print('Samplesheet format validation checks PASSED')
-        exit(0)
-    else:
-        print('Samplesheet format validation checks FAILED')
-        exit(1)
+    # Read the CSV file using the csv library
+    with open(args['input_file'], 'r') as infile, open(args['output_file'], 'w', newline='') as outfile:
+        reader = csv.DictReader(infile)
+        writer = csv.DictWriter(outfile, fieldnames=reader.fieldnames + ['magma_sample_name', 'magma_bam_rg_string'])
+        writer.writeheader()
+
+        fail = False
+        for row in reader:
+
+            # Perform validation checks
+            if not name_re.match(row['Study']):
+                print(f'Row {reader.line_num}: {row["Study"]}  Illegal character in STUDY id')
+                fail = True
+            if not name_re.match(row['Sample']):
+                print(f'Row {reader.line_num}: {row["Sample"]} - Illegal character in SAMPLE id')
+                fail = True
+            if row['R1'] == row['R2']:
+                print(f'Row {reader.line_num}: {row["R1"]}, {row["R2"]} - DUPLICATED fastq file specified')
+                fail = True
+
+
+            # Create the magma_sample_name column
+            row['magma_sample_name'] = f"{row['Study']}.{row['Sample']}.L{row['Library']}.A{row['Attempt']}.{row['Flowcell']}.{row['Lane']}.{row['Index Sequence']}"
+
+            # Create the magma_bam_rg_string column
+            row['magma_bam_rg_string'] = f"@RG\\tID:{row['Flowcell']}.{row['Lane']}\\tSM:{row['Study']}.{row['Sample']}\\tPL:illumina\\tLB:lib{row['Library']}\\tPU:{row['Flowcell']}.{row['Lane']}.{row['Index Sequence']}"
+
+
+            # Write the validated row to the output file
+            writer.writerow(row)
+
+        if not fail:
+            print('Samplesheet format validation checks PASSED')
+            exit(0)
+        else:
+            print('Samplesheet format validation checks FAILED')
+            exit(1)
diff --git a/workflows/validate_fastqs_wf.nf b/workflows/validate_fastqs_wf.nf
@@ -28,17 +28,18 @@ workflow VALIDATE_FASTQS_WF {
                                 flowcell        = row[6]
                                 lane            = row[7]
                                 index_sequence  = row[8]
-                                magma_derived_name = row[9]
+                                magma_sample_name = row[9]
+                                magma_bam_rg_string = row[10]
 
 
                 //Accomodate single/multi reads
                 if (read1 && read2) {
 
-                    return [magma_derived_name, [file(read1, checkIfExists: true), file(read2, checkIfExists: true)]]
+                    return [[id: magma_sample_name, paired: true, bam_rg_string:magma_bam_rg_string ], [file(read1, checkIfExists: true), file(read2, checkIfExists: true)]]
 
                 } else {
 
-                    return [magma_derived_name,  [file(read1, checkIfExists: true)]]
+                    return [[id: magma_sample_name, paired: true, bam_rg_string:magma_bam_rg_string ],  [file(read1, checkIfExists: true)]]
 
                     }
                 }