ildemux.py
from io import BytesIO
from pathlib import Path
import sys
from sys import stdin
from tqdm import tqdm
from collections import Counter
from ._utils import fqparse


def hamming_cloud(seq, distance=1):
    """Generate DNA seqs whose Hamming distance from seq is <= distance

    >>> sorted(hamming_cloud('GAT', 0))
    ['GAT']
    >>> sorted(hamming_cloud('GAT', 1))
    ['AAT', 'CAT', 'GAA', 'GAC', 'GAG', 'GAN', 'GAT', 'GCT', 'GGT', 'GNT', 'GTT', 'NAT', 'TAT']
    >>> max(sum(a != b for a, b in zip(x, 'GAT')) for x in hamming_cloud('GAT', 2))
    2
    """
    from itertools import combinations, product
    seqs = {seq}
    if distance == 0:
        return seqs
    elif distance > 0:
        seqs.update(hamming_cloud(seq, distance=distance - 1))
        ACGTN = "ACGTN"
        for idx in combinations(range(len(seq)), distance):
            for rep in product(range(len(ACGTN) - 1), repeat=distance):
                mutated = list(seq)
                for p, r in zip(idx, rep):
                    if mutated[p] == ACGTN[r]:
                        # There are N-1 possible replacements per base (hence
                        # the -1 above), so the product only covers N-1
                        # letters. When the "replacement" would leave the base
                        # unchanged, it actually stands for the final letter
                        # of the alphabet.
                        r = -1
                    mutated[p] = ACGTN[r]
                seqs.add(''.join(mutated))
    return seqs
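

# Sanity check on cloud sizes (a sketch; the arithmetic assumes the 5-letter
# ACGTN alphabet above, i.e. 4 alternative letters per position): for a
# sequence of length L and distance d, the cloud holds
# sum(C(L, k) * 4**k for k in 0..d) sequences. For 'GAT' (L = 3):
#
#   >>> len(hamming_cloud('GAT', 1))   # 1 + 3*4
#   13
#   >>> len(hamming_cloud('GAT', 2))   # 1 + 3*4 + 3*4**2
#   61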


class ByteBucket(object):
    """In-memory write buffer that opens at most one file at a time.

    This workaround is needed to avoid having too many files open (Linux
    typically limits a process to 1024 descriptors). It buffers lines in
    memory and only opens the output file for the duration of each flush.

    Note that compressed output needs pigz installed; if you're reading this
    after a crash, that's probably why it crashed.
    """
    threads = 32
    ziplevel = 6

    def biff2pigz(self, somebytes, outfile):
        from subprocess import Popen, PIPE
        with Popen(["pigz", "-n", "-p", str(self.threads), f"-{self.ziplevel}"],
                   stdin=PIPE, stdout=outfile, bufsize=0) as proc:
            proc.stdin.write(somebytes)

    def __init__(self, outfile, size=2**27, sname=""):
        self.outfile = str(outfile)
        self.buf = BytesIO()
        self.maxsize = size
        self.first = True
        self.sname = sname

    def writelines(self, lines):
        self.write("\n".join(lines).encode('ascii'))
        self.write(b"\n")

    def write(self, somebytes):
        if self.first:
            # Test-write so we fail early if the output path is unwritable
            # (this also truncates any pre-existing file).
            with open(self.outfile, "wb"):
                pass
            self.first = False
        if isinstance(somebytes, str):
            somebytes = somebytes.encode('ascii')
        self.buf.write(somebytes)
        if len(self.buf.getbuffer()) > self.maxsize:
            self.flush()

    def flush(self):
        if len(self.buf.getbuffer()) > 0:
            mode = "wb" if self.first else "ab"
            self.first = False
            with open(self.outfile, mode, buffering=0) as fh:
                if self.outfile.endswith(".gz"):
                    self.biff2pigz(self.buf.getbuffer(), fh)
                else:
                    fh.write(self.buf.getbuffer())
            self.buf = BytesIO()

    def __del__(self):
        self.flush()
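

# Minimal ByteBucket usage sketch (the path "demo.fq.gz" is hypothetical):
#
#   bucket = ByteBucket("demo.fq.gz", size=2**20, sname="demo")
#   bucket.writelines(["@read1", "ACGT", "+", "IIII"])  # buffered in memory
#   bucket.flush()  # opens the file once, streams the buffer through pigz
#
# Because each bucket only holds a file descriptor during flush(), thousands
# of samples can be demultiplexed without hitting the open-file limit.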


def gethdrdat(hdr):
    """Split a FastQ header into (spot, index field, i7+i5 index pair).

    Expects a modern Illumina header, e.g. (hypothetical spot name):

    >>> gethdrdat("@M001:1:FC:1:1101:1234:5678 1:N:0:ACGT+TGCA\\n")
    ('@M001:1:FC:1:1101:1234:5678', '1:N:0:ACGT+TGCA', 'ACGT+TGCA')
    """
    spot, idx = hdr.rstrip("\n").split(" ")
    idxpair = idx.split(":")[3]
    return spot, idx, idxpair


def fqp2idx(fqp):
    """fastq pair to index pair ACGT+ACGT"""
    if len(fqp) == 4:
        s1, _, ip1 = gethdrdat(fqp[0])
        return ip1
    else:
        assert len(fqp) == 8
        s1, _, ip1 = gethdrdat(fqp[0])
        s2, _, ip2 = gethdrdat(fqp[4])
        assert s1 == s2 and ip1 == ip2
        return ip1
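

# The keyfile consumed by make_sample_map() below is a tab-separated table
# with the columns "sample", "i7", and "i5" (column order is irrelevant;
# DictReader matches by header). A sketch with hypothetical names/barcodes:
#
#   sample    i7          i5
#   wheat1    ACGTACGT    TGCATGCA
#   wheat2    GGTTAACC    CCAATTGG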


def make_sample_map(tablefile, outdir, fileext="fq", distance=1):
    from csv import DictReader
    from itertools import product
    smap = {}
    with open(tablefile) as tabfh:
        for sample in DictReader(tabfh, dialect="excel-tab"):
            sname = sample["sample"]
            opath = outdir / f"{sname}.{fileext}"
            bucket = ByteBucket(opath, sname=sname)
            for i7, i5 in product(hamming_cloud(sample["i7"], distance=distance),
                                  hamming_cloud(sample["i5"], distance=distance)):
                i7plusi5 = f"{i7}+{i5}"
                if i7plusi5 in smap:
                    print("ERROR: duplicate i7+i5. Reduce -m or cry into your beer.",
                          file=sys.stderr)
                    sys.exit(1)
                smap[i7plusi5] = bucket
    return smap
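

# Why clouds collide: if two samples' i7 barcodes and their i5 barcodes are
# each within 2 * distance mismatches of one another, the hamming_cloud()
# expansions overlap and one observed i7+i5 string would map to two buckets;
# that ambiguity is what the duplicate check above refuses to tolerate.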


def main(argv=None):
    """Demultiplex modern Illumina reads based on their read headers.

    Supports *ONLY* out-of-read indices (in @header lines, e.g.
    1:N:0:ACGT+TGCA), and only reads with both i7 and i5 indices.
    """
    import argparse
    ap = argparse.ArgumentParser()
    ap.add_argument("-o", "--outdir", required=True, type=Path,
                    help="Output directory (must exist)")
    ap.add_argument("-c", "--justcount", action="store_true",
                    help="Write nothing, only tabulate (largely useless)")
    ap.add_argument("-g", "--guess", action="store_true",
                    help="Write nothing, tabulate all indices found (no keyfile needed)")
    ap.add_argument("-z", "--zip", nargs="?", default=None, const=6, type=int,
                    help="Compress outputs with pigz (must be installed).")
    ap.add_argument("-j", "--threads", default=8, type=int,
                    help="Number of compression threads")
    ap.add_argument("-m", "--mismatch", default=1, type=int,
                    help="Number of tolerable mismatches (Hamming distance).")
    ap.add_argument("-s", "--singleend", action="store_const", const=4, default=8,
                    dest="nlines",
                    help="Single-end reads (default: paired-end, interleaved)")
    ap.add_argument("-k", "--keyfile",
                    help="Mapping of i7/i5 -> sample as TSV (with columns i7, i5, sample)")
    args = ap.parse_args(argv)

    ByteBucket.threads = args.threads
    if args.zip is not None:
        ByteBucket.ziplevel = args.zip
    if args.guess:
        args.justcount = True
    fileext = "il.fastq.gz" if args.zip is not None else "il.fastq"
    if not args.guess:
        samps = make_sample_map(args.keyfile, args.outdir, fileext=fileext,
                                distance=args.mismatch)
        print("set up sample map with", len(samps), "mappings of barcodes to files")
    file_undef = ByteBucket(args.outdir / f"undefined.{fileext}", sname="undefined")
    stats = Counter()
    try:
        for pair in tqdm(fqparse(stdin, n=args.nlines)):
            idxpair = fqp2idx(pair)
            if args.guess:
                stats[idxpair] += 1
                continue
            ofile = samps.get(idxpair, file_undef)
            stats[ofile.sname] += 1
            if not args.justcount:
                ofile.writelines(pair)
    finally:
        with open(args.outdir / "stats.tsv", "w") as fh:
            print("sample", "nspots", file=fh, sep="\t")
            for samp, n in stats.most_common():
                print(samp, n, file=fh, sep="\t")


if __name__ == "__main__":
    main()
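
# Example invocation (a sketch; the package path and file names are
# hypothetical, and seqtk mergepe is just one way to interleave the pairs):
#
#   seqtk mergepe run_R1.fastq.gz run_R2.fastq.gz \
#       | python -m mypkg.ildemux -o demux/ -k keyfile.tsv -z 6 -m 1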