-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcompare.two.FastQ.files.py
executable file
·209 lines (181 loc) · 6.69 KB
/
compare.two.FastQ.files.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
#!/usr/bin/env python3
import gzip
import os
import shutil
import sys
from argparse import ArgumentParser
from tempfile import mkdtemp
try:
from StringIO import StringIO
except ImportError:
from io import StringIO
import numpy as np
from Bio import SeqIO
from Bio.SeqIO.QualityIO import FastqGeneralIterator
def parse_args():
    """Define the command-line interface and parse the process arguments.

    Two required options name the FastQ files to compare (``-a/--after``
    and ``-b/--before``); ``-o/--outdir`` is optional and defaults to
    ``None``.  The built-in help flag is disabled on the parser and added
    back by hand so that it is listed under the 'Optional' group.

    Returns:
        argparse.Namespace with the attributes ``after``, ``before`` and
        ``outdir``.
    """
    summary = (
        'Compares two FastQ files and reports differences between the two.'
        ' Initially developed to audit mutated FastQ files and report the'
        ' number of shared and unshared sequence headers as well as the SNP'
        ' frequency of artificially mutated sequences.'
    )
    footnote = (
        'NOTE: order between the two files is expected to be sorted based on'
        ' the header names with no missing sequences between the two'
    )
    cli = ArgumentParser(description=summary, epilog=footnote, add_help=False)

    # Both input files are mandatory and share every setting but the help.
    required = cli.add_argument_group('Required')
    required_inputs = (
        (
            '-a',
            '--after',
            'modified or processed FastQ sequence file ,'
            ' optionally gunzip compressed',
        ),
        (
            '-b',
            '--before',
            'initial FastQ sequence file , optionally gunzip compressed',
        ),
    )
    for short_flag, long_flag, help_text in required_inputs:
        required.add_argument(
            short_flag,
            long_flag,
            required=True,
            metavar='FILE',
            help=help_text,
        )

    optional = cli.add_argument_group('Optional')
    optional.add_argument(
        '-o',
        '--outdir',
        metavar='PATH',
        type=str,
        default=None,
        help='outpath to save two files (after and before) with differing'
        ' sequence records [None]',
    )
    optional.add_argument(
        '-h',
        '--help',
        action='help',
        help='show this help message and exit',
    )

    return cli.parse_args()
def decompress_file(infile, outdir):
    """Decompress a gzip-compressed file into ``outdir``.

    Args:
        infile: path to the gzip-compressed input file.
        outdir: existing directory that receives the decompressed copy.

    Returns:
        Full path of the decompressed file, named after ``infile`` with a
        single trailing ``.gz`` extension removed.
    """
    # Remove exactly one '.gz' suffix.  str.rstrip('.gz') must not be used
    # here: it strips any trailing run of the characters '.', 'g' and 'z',
    # which mangles names such as 'log.gz' (-> 'lo').
    suffix = '.gz'
    uncompressed_file = os.path.basename(infile)
    if uncompressed_file.endswith(suffix):
        uncompressed_file = uncompressed_file[: -len(suffix)]
    outfile = os.path.join(outdir, uncompressed_file)

    # Stream the decompressed bytes so large files are never held in memory
    with gzip.open(infile, 'rb') as ifh, open(outfile, 'wb') as ofh:
        shutil.copyfileobj(ifh, ofh)

    # Send the full path of the decompressed file back
    return outfile
def compare_two_fastq_files(after, before, outdir):
    """Count per-record SNPs between two FastQ files read in lockstep.

    Records are paired by position.  A pair is skipped (with a note on
    stderr) when the two headers differ or when the two sequences have
    different lengths.  Pairing stops at the end of the shorter file.

    Args:
        after: path to the modified/processed (uncompressed) FastQ file.
        before: path to the initial (uncompressed) FastQ file.
        outdir: directory that receives 'different_sequences.after.fastq'
            and 'different_sequences.before.fastq', holding only the record
            pairs whose sequences differ; ``None`` disables this output.

    Returns:
        List with one integer per compared record pair: the number of
        positions at which the two sequences differ (0 when identical).
    """
    comparisons_counted_snps = []
    differing_records_after = []
    differing_records_before = []
    after_name = os.path.basename(after)
    before_name = os.path.basename(before)
    fastq_record = '@{}\n{}\n+\n{}\n'

    # Open both inputs explicitly so the handles are closed deterministically
    # and the parser is always handed a text-mode handle.
    with open(after) as after_handle, open(before) as before_handle:
        paired_records = zip(
            FastqGeneralIterator(after_handle),
            FastqGeneralIterator(before_handle),
        )
        # zip() yields a pair of (title, sequence, quality) tuples, so the
        # loop target has to be two nested 3-tuples rather than six names.
        for (
            (title_after, sequence_after, quality_after),
            (title_before, sequence_before, quality_before),
        ) in paired_records:
            # Verify these two records have the same sequence header
            if str(title_after) != str(title_before):
                sys.stderr.write(
                    'INFO: skipping {} in {} and {} in {} due to different'
                    ' sequence headers.\n'.format(
                        title_after, after_name, title_before, before_name
                    )
                )
                continue

            # Verify these two records have the same sequence length
            if len(sequence_after) != len(sequence_before):
                sys.stderr.write(
                    'INFO: skipping {} in {} and {} in {} due to different'
                    ' sequence lengths.\n'.format(
                        title_after, after_name, title_before, before_name
                    )
                )
                continue

            # Compare the sequence composition between the two for SNPs
            cnt_snps = sum(
                1
                for nucleotide_after, nucleotide_before in zip(
                    sequence_after, sequence_before
                )
                if nucleotide_after != nucleotide_before
            )
            comparisons_counted_snps.append(cnt_snps)

            # Only records that actually differ belong in the optional
            # output; identical pairs still count toward the statistics.
            if outdir is not None and cnt_snps > 0:
                differing_records_after.append(
                    fastq_record.format(
                        title_after, sequence_after, quality_after
                    )
                )
                differing_records_before.append(
                    fastq_record.format(
                        title_before, sequence_before, quality_before
                    )
                )

    # Optionally, write only mutated output
    if outdir is not None:
        if not os.path.isdir(outdir):
            os.makedirs(outdir)
        out_after = os.path.join(outdir, 'different_sequences.after.fastq')
        out_before = os.path.join(outdir, 'different_sequences.before.fastq')
        with open(out_after, 'w') as ofh:
            ofh.writelines(differing_records_after)
        with open(out_before, 'w') as ofh:
            ofh.writelines(differing_records_before)
        print(
            'INFO: saved all {} sequence records with different'
            ' sequences'.format(len(differing_records_after))
        )

    return comparisons_counted_snps
def calculate_stats(list_of_integers):
    """Summarize a list of per-record SNP counts.

    Args:
        list_of_integers: sequence of integer counts; may be empty.

    Returns:
        Dict with the integer-valued keys 'q1', 'median', 'q3', 'mean',
        'stdev', 'total_sequence_records', 'total_SNPs',
        'most_SNPs_in_a_record' and 'fewest_SNPs_in_a_record'.  Fractional
        results are truncated toward zero by ``int()``.  For empty input
        every value is 0.
    """
    counts = np.asarray(list_of_integers)

    # NumPy cannot reduce an empty array (percentile/amax/amin fail), so an
    # empty comparison set is reported as all zeros instead of crashing.
    if counts.size == 0:
        return {
            'q1': 0,
            'median': 0,
            'q3': 0,
            'mean': 0,
            'stdev': 0,
            'total_sequence_records': 0,
            'total_SNPs': 0,
            'most_SNPs_in_a_record': 0,
            'fewest_SNPs_in_a_record': 0,
        }

    # The 'interpolation' keyword was renamed to 'method' in NumPy 1.22 and
    # the old spelling is deprecated; prefer the new one and fall back only
    # for NumPy releases that do not know it yet.
    try:
        q1, q3 = np.percentile(counts, [25, 75], method='midpoint')
    except TypeError:
        q1, q3 = np.percentile(counts, [25, 75], interpolation='midpoint')

    stats = {}
    stats['q1'] = int(q1)
    stats['median'] = int(np.median(counts))
    stats['q3'] = int(q3)
    stats['mean'] = int(np.mean(counts))
    stats['stdev'] = int(np.std(counts))
    stats['total_sequence_records'] = len(list_of_integers)
    stats['total_SNPs'] = int(np.sum(counts))
    stats['most_SNPs_in_a_record'] = int(np.amax(counts))
    stats['fewest_SNPs_in_a_record'] = int(np.amin(counts))
    return stats
def main():
    """Run the comparison end to end and print SNP summary statistics.

    Resolves the input/output paths from the command line, transparently
    decompresses '.gz' inputs into a temporary directory, compares the two
    FastQ files, and prints one 'name = value' line per statistic.  Exits
    with an error message when no record pair could be compared.
    """
    opt = parse_args()

    # I/O handling
    after = os.path.realpath(os.path.expanduser(opt.after))
    before = os.path.realpath(os.path.expanduser(opt.before))
    if opt.outdir is None:
        outdir = None
    else:
        outdir = os.path.realpath(os.path.expanduser(opt.outdir))

    tmp = mkdtemp()
    try:
        # Auto-handle gunzip compressed input
        if after.endswith('.gz'):
            after = decompress_file(after, tmp)
        if before.endswith('.gz'):
            before = decompress_file(before, tmp)

        # Compare the two sequence files
        comparisons_counted_snps = compare_two_fastq_files(
            after, before, outdir
        )
    finally:
        # Cleanup, even when decompression or the comparison fails, so the
        # temporary directory is never left behind.
        shutil.rmtree(tmp)

    # Summary statistics are undefined without at least one compared pair;
    # stop with a clear message rather than an obscure numeric error.
    if not comparisons_counted_snps:
        sys.exit(
            'ERROR: no sequence records could be compared between {} and'
            ' {}'.format(os.path.basename(after), os.path.basename(before))
        )

    # Report statistics on SNPs observed between the two sequence sets
    stats = calculate_stats(comparisons_counted_snps)
    for k, v in stats.items():
        print('{} = {}'.format(k.replace('_', ' '), v))


if __name__ == '__main__':
    main()