Skip to content

Commit

Permalink
move all computations to samplesheet validation
Browse files Browse the repository at this point in the history
  • Loading branch information
abhi18av committed Jun 23, 2024
1 parent 7dd1f74 commit c99cf80
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 44 deletions.
79 changes: 38 additions & 41 deletions bin/samplesheet_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import re
import argparse
import pandas as pd
import csv

from sys import exit

Expand All @@ -14,43 +14,40 @@

# Identifiers may contain only ASCII letters, digits, hyphens and underscores.
# A raw string is used so that `\-` reaches the regex engine untouched instead
# of being treated as an (invalid) Python string escape, which triggers a
# SyntaxWarning on recent interpreters.
# NOTE(review): `*` also accepts an empty identifier - confirm that is intended.
name_re = re.compile(r'^[a-zA-Z0-9\-_]*$')

# Load the samplesheet into a dataframe.
ss = pd.read_csv(args['input_file'])


def _text(column):
    """Return the named samplesheet column converted to strings."""
    return ss[column].astype(str)


# Derived sample identifier, laid out as:
#   <Study>.<Sample>.L<Library>.A<Attempt>.<Flowcell>.<Lane>.<Index Sequence>
name_prefix = _text('Study') + "." + _text('Sample')
name_library = name_prefix + ".L" + _text('Library')
name_attempt = name_library + ".A" + _text('Attempt')
name_flowcell = name_attempt + "." + _text('Flowcell')
name_lane = name_flowcell + "." + _text('Lane')
ss['magma_sample_name'] = name_lane + "." + _text('Index Sequence')

# Derived read-group string for BWA MEM, laid out as:
#   @RG\tID:<flowcell>.<lane>\tSM:<study>.<sample>\tPL:illumina\tLB:lib<library>\tPU:<flowcell>.<lane>.<index_sequence>
# The separators are stored as a literal backslash followed by "t", not as
# real tab characters.
rg_id = "@RG\\tID:" + _text('Flowcell') + "." + _text('Lane')
rg_sm = rg_id + "\\tSM:" + _text('Study') + "." + _text('Sample')
rg_pl = rg_sm + "\\tPL:illumina"
rg_lb = rg_pl + "\\tLB:lib" + _text('Library')
ss['magma_bwa_rg_string'] = (
    rg_lb + "\\tPU:" + _text('Flowcell') + "." + _text('Lane') + "." + _text('Index Sequence')
)

# Check every row; report each problem found rather than stopping at the first.
fail = False
for idx, row in ss.iterrows():
    if name_re.match(row['Study']) is None:
        print('Row {}: {} Illegal character in STUDY id'.format(idx, row['Study']))
        fail = True
    if name_re.match(row['Sample']) is None:
        print('Row {}: {} - Illegal character in SAMPLE id'.format(idx, row['Sample']))
        fail = True
    if row['R1'] == row['R2']:
        print('Row {}: {}, {} - DUPLICATED fastq file specified'.format(idx, row['R1'], row['R2']))
        fail = True

# Any failed check aborts with a non-zero status and writes nothing.
if fail:
    print('Samplesheet format validation checks FAILED')
    exit(1)

# All checks passed: write the samplesheet including the derived columns.
ss.to_csv(args['output_file'], index=False)
print('Samplesheet format validation checks PASSED')
exit(0)
# Validate the samplesheet row by row and, only if every row passes, write a
# copy of it with two derived columns appended (magma_sample_name and
# magma_bam_rg_string). Exits with status 0 on success and 1 on failure.

# Columns read by the checks and by the derived fields below; a samplesheet
# lacking any of them cannot be processed.
required_columns = ['Study', 'Sample', 'Library', 'Attempt', 'Flowcell',
                    'Lane', 'Index Sequence', 'R1', 'R2']

fail = False
validated_rows = []

# Read the CSV file using the csv library. newline='' leaves newline handling
# to the csv module, as its documentation recommends.
with open(args['input_file'], 'r', newline='') as infile:
    reader = csv.DictReader(infile)

    # fieldnames is None when the input is empty; treat that as "no columns"
    # so the missing-column report below covers it.
    fieldnames = list(reader.fieldnames or [])
    missing_columns = [name for name in required_columns if name not in fieldnames]

    if missing_columns:
        print(f'Samplesheet is missing required column(s): {", ".join(missing_columns)}')
        fail = True
    else:
        for row in reader:

            # Perform validation checks; keep going after a failure so that
            # every problem in the sheet is reported in a single run.
            if not name_re.match(row['Study']):
                print(f'Row {reader.line_num}: {row["Study"]} Illegal character in STUDY id')
                fail = True
            if not name_re.match(row['Sample']):
                print(f'Row {reader.line_num}: {row["Sample"]} - Illegal character in SAMPLE id')
                fail = True
            if row['R1'] == row['R2']:
                print(f'Row {reader.line_num}: {row["R1"]}, {row["R2"]} - DUPLICATED fastq file specified')
                fail = True

            # Create the magma_sample_name column
            row['magma_sample_name'] = f"{row['Study']}.{row['Sample']}.L{row['Library']}.A{row['Attempt']}.{row['Flowcell']}.{row['Lane']}.{row['Index Sequence']}"

            # Create the magma_bam_rg_string column. The \\t separators are
            # emitted as a literal backslash followed by "t", not as tabs.
            row['magma_bam_rg_string'] = f"@RG\\tID:{row['Flowcell']}.{row['Lane']}\\tSM:{row['Study']}.{row['Sample']}\\tPL:illumina\\tLB:lib{row['Library']}\\tPU:{row['Flowcell']}.{row['Lane']}.{row['Index Sequence']}"

            validated_rows.append(row)

if fail:
    # Nothing is written on failure, so no partially validated samplesheet
    # is left behind for a later step to pick up.
    print('Samplesheet format validation checks FAILED')
    exit(1)

# Write the validated rows; the derived columns go after the original ones so
# existing column positions are preserved.
with open(args['output_file'], 'w', newline='') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=fieldnames + ['magma_sample_name', 'magma_bam_rg_string'])
    writer.writeheader()
    writer.writerows(validated_rows)

print('Samplesheet format validation checks PASSED')
exit(0)
7 changes: 4 additions & 3 deletions workflows/validate_fastqs_wf.nf
Original file line number Diff line number Diff line change
Expand Up @@ -28,17 +28,18 @@ workflow VALIDATE_FASTQS_WF {
flowcell = row[6]
lane = row[7]
index_sequence = row[8]
magma_derived_name = row[9]
magma_sample_name = row[9]
magma_bam_rg_string = row[10]


//Accommodate single/multi reads
if (read1 && read2) {

return [magma_derived_name, [file(read1, checkIfExists: true), file(read2, checkIfExists: true)]]
return [[id: magma_sample_name, paired: true, bam_rg_string:magma_bam_rg_string ], [file(read1, checkIfExists: true), file(read2, checkIfExists: true)]]

} else {

return [magma_derived_name, [file(read1, checkIfExists: true)]]
return [[id: magma_sample_name, paired: true, bam_rg_string:magma_bam_rg_string ], [file(read1, checkIfExists: true)]]

}
}
Expand Down

0 comments on commit c99cf80

Please sign in to comment.