Skip to content

Commit

Permalink
dev iteration
Browse files Browse the repository at this point in the history
  • Loading branch information
abhi18av committed Jun 19, 2024
1 parent 91ffa02 commit da83561
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 0 deletions.
62 changes: 62 additions & 0 deletions bin/fastq_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#!/usr/bin/env python3

import ast
import argparse
import re

import pandas as pd

re_mapped_p = re.compile(r'\d* mapped \((.*)%\)')

if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Process the sample stats')
parser.add_argument('--sample_name', dest='sample_name', required=True, metavar='sample_name', type=str, help='The sample name')
parser.add_argument('--seqkit_stats_file', dest='seqkit_stats_file', required=True, metavar='seqkit_stats_file', type=str, help='The seqkit stats file')
parser.add_argument('--du_stats_file', dest='du_stats_file', required=True, metavar='du_stats_file', type=str, help='The du stats file')
parser.add_argument('--md5sum_stats_file', dest='md5sum_stats_file', required=True, metavar='md5sum_stats_file', type=str, help='The md5sum metrics file')
args = vars(parser.parse_args())

with open(args['wgsmetrics_file']) as f:
for line in f:
if '## METRICS CLASS' in line:
rows = [f.readline().strip(), f.readline().strip()]
wgsmetrics = pd.DataFrame([rows[1].split('\t')], columns=rows[0].split('\t'))
with open(args['ntmfraction_file']) as f:
ntm_fraction = float(f.read().strip())
with open(args['samtoolsstats_file']) as f:
for line in f:
if 'insert size average' in line:
ins_size = float(line.strip().split('\t')[2])
if 'raw total sequences' in line:
total_seqs = int(line.strip().split('\t')[2])
if 'average quality' in line:
avg_qual = float(line.strip().split('\t')[2])
with open(args['flagstat_file']) as f:
for line in f:
m = re_mapped_p.match(line)
if m:
mapped_p = float(m[1])

if int(wgsmetrics.loc[0, 'MEDIAN_COVERAGE']) >= args['median_coverage_cutoff']:
coverage_threshold_met = 1
else:
coverage_threshold_met = 0

if float(wgsmetrics.loc[0, 'PCT_1X']) >= args['breadth_of_coverage_cutoff']:
breadth_of_coverage_threshold_met = 1
else:
breadth_of_coverage_threshold_met = 0

if ntm_fraction <= args['ntm_fraction_cutoff']:
ntm_fraction_threshold_met = 1
else:
ntm_fraction_threshold_met = 0

if coverage_threshold_met and breadth_of_coverage_threshold_met and ntm_fraction_threshold_met:
all_thresholds_met = 1
else:
all_thresholds_met = 0

with open('{}.fastq_stats.tsv'.format(args['sample_name']), 'w') as f:
f.write('\t'.join([str(i) for i in [args['sample_name'], ins_size, mapped_p, total_seqs, avg_qual] + list(wgsmetrics.loc[0, ['MEAN_COVERAGE', 'SD_COVERAGE', 'MEDIAN_COVERAGE', 'MAD_COVERAGE', 'PCT_EXC_ADAPTER', 'PCT_EXC_MAPQ', 'PCT_EXC_DUPE', 'PCT_EXC_UNPAIRED', 'PCT_EXC_BASEQ', 'PCT_EXC_OVERLAP', 'PCT_EXC_CAPPED', 'PCT_EXC_TOTAL', 'PCT_1X', 'PCT_5X', 'PCT_10X', 'PCT_30X', 'PCT_50X', 'PCT_100X']]) + [ntm_fraction, ntm_fraction_threshold_met, coverage_threshold_met, breadth_of_coverage_threshold_met, all_thresholds_met]]))
f.write('\n')
9 changes: 9 additions & 0 deletions modules/utils/fastq_stats.nf
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,15 @@ process UTILS_FASTQ_STATS {
du -shL *fastq* > ${sampleName}.du.txt
cat ${sampleName}.du.txt | csvtk tab2csv | csvtk add-header -n size,file > ${sampleName}.du_stats.csv
csvtk join -f file \\
${sampleName}.seqkit_stats.final.csv \\
${sampleName}.md5sum_stats.csv \\
${sampleName}.du_stats.csv \\
> ${sampleName}.fastq_stats.csv \\
"""

stub:
Expand Down

0 comments on commit da83561

Please sign in to comment.