-
Notifications
You must be signed in to change notification settings - Fork 19
/
consensus.py
executable file
·139 lines (124 loc) · 4.59 KB
/
consensus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#!/usr/bin/env python
import sys, os, array, random, subprocess
from optparse import OptionParser
from Bio import SeqIO
from Bio.SeqIO.QualityIO import FastqGeneralIterator
a = array.array('L')
c = array.array('L')
g = array.array('L')
t = array.array('L')
n = array.array('L')
def seq_iter(file_hdl, stype):
if stype == 'fastq':
return FastqGeneralIterator(file_hdl)
else:
return SeqIO.parse(file_hdl, stype)
def split_rec(rec, stype):
if stype == 'fastq':
return rec[0].split()[0], rec[1].upper(), rec[2]
else:
return rec.id, str(rec.seq).upper(), None
def determinetype(infile):
cmd = ["head", "-n", "1", infile]
p1 = subprocess.check_output(cmd)
firstchar = p1[0]
if firstchar == "@":
return "fastq"
elif firstchar == ">":
return "fasta"
sys.stderr.write("Cannot determine file type of %s\n"%(infile))
exit(1)
def bp_max_from_stats(infile):
fhdl = open(infile, 'r')
maxl = 600
hasmax = False
for line in fhdl:
if line.startswith('length_max'):
parts = line.split("\t")
maxl = min(maxl, int(parts[1]))
hasmax = True
if not hasmax:
maxl = 100
return maxl
def countseqs(infile, stype):
if stype == 'fasta':
cmd = ['grep', '-c', '^>', infile]
elif stype == 'fastq':
cmd = ['wc', '-l', infile]
else:
sys.stderr.write("%s is invalid %s file\n"%(infile, stype))
exit(1)
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = proc.communicate()
if proc.returncode != 0:
raise IOError("%s\n%s"%(" ".join(cmd), stderr))
slen = stdout.strip()
if not slen:
sys.stderr.write("%s is invalid %s file\n"%(infile, stype))
exit(1)
if stype == 'fastq':
slenNum = int( slen.split()[0] ) / 4
else:
slenNum = int(slen)
return slenNum
def initialize(Nmax):
for i in range(0, Nmax):
a.append(0)
c.append(0)
g.append(0)
t.append(0)
n.append(0)
def populate(infile, stype, Nmax, Sratio):
"""puts nucleotide data into matrix."""
seqnum = 0
in_hdl = open(infile, 'rU')
for i, rec in enumerate(seq_iter(in_hdl, stype)):
head, seq, qual = split_rec(rec, stype)
if Sratio < random.random():
continue
seqnum += 1
for i in range(0, min(len(seq), Nmax)):
if seq[i] == "A":
a[i] += 1
elif seq[i] == "C":
c[i] += 1
elif seq[i] == "G":
g[i] += 1
elif seq[i] == "T":
t[i] += 1
elif seq[i] == "N":
n[i] += 1
in_hdl.close()
return seqnum
def printtable(outfile, Nmax):
outhdl = open(outfile, 'w')
outhdl.write("\t".join(['#', 'A', 'C', 'G', 'T', 'N', 'total'])+"\n")
for i in range(0, Nmax):
outhdl.write("\t".join(map(str,[i, a[i], c[i], g[i], t[i], n[i], (a[i]+c[i]+g[i]+t[i]+n[i])]))+"\n")
outhdl.close()
if __name__ == '__main__':
usage = "usage: %prog -i <input sequence file> -o <output file>"
parser = OptionParser(usage)
parser.add_option("-i", "--input", dest="input", default=None, help="Input sequence file.")
parser.add_option("-o", "--output", dest="output", default=None, help="Output file.")
parser.add_option("-t", "--type", dest="type", default=None, help="file type: fasta, fastq")
parser.add_option("-b", "--bp_max", dest="bp_max", default=100, type="int", help="max number of bps to process [default 100]")
parser.add_option("-s", "--seq_max", dest="seq_max", default=100000, type="int", help="max number of seqs to process [default 100000]")
parser.add_option("-v", "--verbose", dest="verbose", action="store_true", default=False, help="Wordy [default off]")
parser.add_option("--stats", dest="stats", default=None, help="optional: sequence stats for input file, overrides --bp_max")
(opts, args) = parser.parse_args()
if not (opts.input and os.path.isfile(opts.input) and opts.output):
parser.error("Missing input/output files")
if not opts.type:
opts.type = determinetype(opts.input)
if opts.stats and os.path.isfile(opts.stats):
opts.bp_max = bp_max_from_stats(opts.stats)
if opts.verbose: sys.stdout.write("Counting sequences in %s ... "%opts.input)
seqnum = countseqs(opts.input, opts.type)
seqper = (opts.seq_max * 1.0) / seqnum
if opts.verbose: sys.stdout.write("Done: %d seqs found, %f %% of sequences will be processed\n"%(seqnum, (seqper*100)))
if opts.verbose: sys.stdout.write("Populating bp matrixes ... ")
initialize(opts.bp_max)
seqs = populate(opts.input, opts.type, opts.bp_max, seqper)
printtable(opts.output, opts.bp_max)
if opts.verbose: sys.stdout.write("Done: %d of %d sequences processed\n"%(seqs, seqnum))