-
Notifications
You must be signed in to change notification settings - Fork 19
/
bleachsims
executable file
·124 lines (105 loc) · 4.17 KB
/
bleachsims
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/env python
import os, sys, operator, re, gzip
from optparse import OptionParser
usage = "usage: %prog -s <sims_file> [options]"
parser = OptionParser(usage)
parser.add_option('-s', '--sims', dest="sims_file", help="sorted sims input to be bleached. use '-' or 'stdin' to read from STDIN")
parser.add_option('-o', '--out', dest="out_file", help="bleached sims output. use '-' or 'stdout' to write to STDOUT")
parser.add_option('-m', '--min', type="int", dest="min", help="minimum # of results per query, default=20", default=20)
parser.add_option('-r', '--range', type="int", dest="range", help="best evalue plus this exponent that will be returned (0 means no range), default=10", default=10)
parser.add_option('-c', '--cutoff', type="int", dest="cutoff", help="remove all evalues with an exponent lower than cutoff, default=3", default=3)
parser.add_option('', '--eval_only', dest="eval_only", action="store_true", default=False, help="only filter by evalue cutoff")
parser.add_option('', '--min_hit_only', dest="min_hit_only", action="store_true", default=False, help="only filter by minimum hits per query")
ID_RE = re.compile(r'^(\S+)\t')
EVAL_RE = re.compile(r'^(\d\.\d)e([-+])(\d+)$')
(options, args) = parser.parse_args()
if not ( options.sims_file and options.out_file ):
parser.print_help()
print "sims file and/or output file name required"
sys.exit(0)
def minHit(sims, out):
min_num = options.min
if len(sims) > min_num:
sims = sims[:min_num]
for s in sims:
out.write(s)
def bleach(sims, out):
count = 0
no_zero = 1
min_num = options.min
cutoff = options.cutoff
e_range = 0
if options.range > 0:
e_range = options.range
# split columns than re-sort, remove bad lines
sims = map(lambda x: x.strip().split("\t"), sims)
sims = filter(lambda x: len(x) == 12, sims)
if len(sims) == 0:
return
sims.sort(cmp=lambda a,b: cmp(float(a), float(b)), key=operator.itemgetter(11), reverse=True)
# get top eval
e_match = EVAL_RE.match(sims[0][10])
if not e_match:
return
base_num = e_match.group(1)
best_exp = e_match.group(3)
if (base_num == '0.0') and (best_exp == '00'):
no_zero = 0
min_num = 2 * min_num
best_exp = int(best_exp)
for s in sims:
cur_match = EVAL_RE.match(s[10])
if not cur_match:
continue
cur_exp = int(cur_match.group(3))
if no_zero and ((cur_match.group(2) == '+') or (cur_exp < cutoff)):
continue
elif count < min_num:
out.write("%s\n" % ("\t".join(s)))
count += 1
elif no_zero and (e_range > 0) and ((best_exp - cur_exp) <= e_range):
out.write("%s\n" % ("\t".join(s)))
if __name__ == "__main__":
if (options.sims_file == '-') or (options.sims_file == 'stdin'):
sims_handle = sys.stdin
else:
sims_handle = open(options.sims_file, "Ur")
if (options.out_file == '-') or (options.out_file == 'stdout'):
out_handle = sys.stdout
else:
out_handle = open(options.out_file, "w")
curid = None
cursims = []
eval_max = float("1e-%d"%(options.cutoff))
if options.eval_only:
# evalue only filter
for l in sims_handle:
v = l.strip().split("\t")
if float(v[10]) <= eval_max:
out_handle.write(l)
else:
for l in sims_handle:
# group by ID blocks
idmatch = ID_RE.match(l)
if idmatch == None:
continue
thisid = idmatch.group(1)
if curid == None:
curid = thisid
if thisid == curid:
cursims.append(l)
else:
if options.min_hit_only:
# min hit only filter
minHit(cursims, out_handle)
else:
# full bleach
bleach(cursims, out_handle)
curid = thisid
cursims = [l]
if options.min_hit_only:
minHit(cursims, out_handle)
else:
bleach(cursims, out_handle)
sims_handle.close()
out_handle.close()