diff --git a/bin/demultiplex2index.py b/bin/demultiplex2index.py new file mode 100755 index 0000000..fd03129 --- /dev/null +++ b/bin/demultiplex2index.py @@ -0,0 +1,186 @@ +#!/usr/bin/env python + +import os +import re +import sys +from collections import defaultdict +from optparse import OptionParser +from Bio import SeqIO +from Bio.Seq import Seq +from Bio.Alphabet import generic_dna +from Bio.SeqIO.QualityIO import FastqGeneralIterator +from pprint import pprint + +__doc__ = """ +Demultiplex fasta or fastq file with given barcode list. Barcode is trimmed from beginning of sequence. +List may contain only barcodes (output filename is barcode) or barcode \\t name (output filename is name). +Allows multiple barcodes to write to one file and mutliple files to get seqs from one barcode. +""" +BCRE = re.compile(r'^[ATGCatgc-]+$') +BCLEN = 0 + +def seq_iter(file_hdl, stype): + if stype == 'fastq': + return FastqGeneralIterator(file_hdl) + else: + return SeqIO.parse(file_hdl, stype) + +def split_rec(rec, stype): + if stype == 'fastq': + return rec[0].split()[0], rec[1].upper(), rec[2] + else: + return rec.id, str(rec.seq).upper(), None + +def write_rec(out_hdl, head, seq, qual): + if qual is None: + out_hdl.write(">%s\n%s\n" %(head, seq)) + else: + out_hdl.write("@%s\n%s\n+\n%s\n" %(head, seq, qual)) + +def barcode_files(bfile, odir, stype, prefix, rc_bar): + global BCLEN + bar_name = [] # set of (barcode_seq, sample_name) + mapping_index_file = bfile + ".subset" + with open(bfile, 'rU') as x: + blines = x.readlines() + # skip header: Illumina or Qiime style + if blines[0].startswith("#SampleID") or blines[0].startswith("SampleID"): + blines.pop(0) + for i, b in enumerate(blines): + if not b: + continue + bset = b.strip().split("\t") + if not (bset[0] and bset[1]): + continue + # find which column has barcodes + if BCRE.match(bset[1]): + bar_name.append([bset[1], bset[0]]) + elif BCRE.match(bset[0]): + bar_name.append([bset[0], bset[1]]) + else: + sys.stderr.write("[error] row %d missing valid barcode\n"%i) + os._exit(1) + # set files + uniq_fname = {} + barc_fname = defaultdict(list) + barcode2index = defaultdict(list) + + for bn in bar_name: + barc = prefix.upper() + bn[0].upper() + if rc_bar: + rcb = Seq(barc, generic_dna) + barc = str(rcb.reverse_complement()).upper() + name = os.path.join(odir, "%s.%s"%(bn[1], stype)) + clen = len(barc) + if BCLEN == 0: + BCLEN = clen + elif BCLEN != clen: + sys.stderr.write("[error] barcode lengths are not the same\n") + os._exit(1) + barc_fname[barc].append(name) + barcode2index[barc].append(name + ".subset") + barcode2index[barc].append(mapping_index_file) + # barcode2mapping_file[barc] = ... + + uniq_fname[name] = None + uniq_fname[name + ".subset"] = None + uniq_fname[mapping_index_file] = None + return barcode2index , barc_fname, uniq_fname + +def match_barcode(barcodemap, seqbar): + # first exact + if seqbar in barcodemap: + return True, seqbar + else: + # match may be off by 1 bp + for bar in barcodemap : + mismatch = 0 + for i in range(BCLEN): + if seqbar[i] != bar[i]: + mismatch += 1 + if mismatch < 2: + return True, bar + return False, None + +def main(args): + usage = "usage: %prog [options] -b -i -o "+__doc__ + parser = OptionParser(usage) + parser.add_option("-i", "--input", dest="input", default=None, help="Input sequence file.") + parser.add_option("-o", "--output", dest="output", default=".", help="Output dir (default cwd), filenames will be 'barcode.type' or 'name.type'") + parser.add_option("-f", "--format", dest="format", default='fasta', help="File format: fasta, fastq [default 'fasta']") + parser.add_option("-b", "--barcode", dest="barcode", default=None, help="File with list of name / barcode pairs") + parser.add_option("-r", "--rc_bar", dest="rc_bar", action="store_true", default=False, help="Barcodes in sequences file are reverse complement of barcodes file [default is same].") + parser.add_option("-p", "--prefix", dest="prefix", default="", help="Optional sequence to prepend to barcodes") + parser.add_option("-v", "--verbose", dest="verbose", action="store_true", default=False, help="Wordy [default off]") + + (opts, args) = parser.parse_args() + if not (opts.input and os.path.isfile(opts.input) and opts.output and os.path.isdir(opts.output)): + parser.error("Missing input and/or output") + if not (opts.barcode and os.path.isfile(opts.barcode)): + parser.error("Missing barcode list") + + # get barcode / filename + barcode2index , barc_fname, uniq_fname = barcode_files(opts.barcode, opts.output, opts.format, opts.prefix, opts.rc_bar) + # open filehandles + for f in uniq_fname : + print(f) + uniq_fname[f] = open(f, 'w') + missing = open(os.path.join(opts.output, 'unmatched.'+opts.format), 'w') + + # parse sequence file + # allows multiple barcodes to write to one file and mutliple files to get seqs from one barcode + bar_count = defaultdict(int) + miss_count = 0 + input_hdl = open(opts.input, 'r') + index_hdl = open(opts.input + ".record.idx", 'w') + record_counter = 0 + + pointer = open(opts.input , "r") + offset = pointer.tell() + + for rec in seq_iter(input_hdl, opts.format): + + record_counter += 1 + head, seq, qual = split_rec(rec, opts.format) + + line_nr = 0 + while line_nr < 10 : + line = pointer.readline() + line_nr += 1 + if (qual + "\n") == line : + # print("Found" , line_nr , pointer.tell() ) + break + else: + pass + print(record_counter , offset , pointer.tell() , str( pointer.tell() - offset ) ) + index_hdl.write( str(offset) + "\t" + str( pointer.tell() - offset ) + "\n" ) + offset = pointer.tell() + + seqbar = seq[:BCLEN] + seq = seq[BCLEN:] + qual = qual[BCLEN:] if qual is not None else None + match, mbar = match_barcode(barc_fname, seqbar) + if match: + bar_count[mbar] += 1 + for f in barc_fname[mbar]: + write_rec(uniq_fname[f], head, seq, qual) + for f in barcode2index[mbar]: + uniq_fname[f].write( "%d\n" %(record_counter) ) + else: + miss_count += 1 + write_rec(missing, head, seq, qual) + + #close filehandles + missing.close() + for k, fhdl in uniq_fname.items(): + fhdl.close() + + if opts.verbose: + print( + "%d sequences split amoung %d barcodes. %d sequences without barcodes"%(sum(bar_count.values()), len(bar_count.keys()), miss_count) + ) + + return 0 + +if __name__ == "__main__": + sys.exit(main(sys.argv)) diff --git a/bin/fastq2byte_idx.py b/bin/fastq2byte_idx.py new file mode 100644 index 0000000..47a4098 --- /dev/null +++ b/bin/fastq2byte_idx.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python + +import os +import re +import sys +import statistics as s +from struct import * +from Bio import SeqIO +from Bio.Seq import Seq +from Bio.Alphabet import generic_dna +from Bio.SeqIO.QualityIO import FastqGeneralIterator +from pprint import pprint + +from optparse import OptionParser + +usage = "usage: %prog [options] -i -o " +parser = OptionParser(usage) +parser.add_option("-i", "--input", dest="input", default=None, help="Input mapping file.") +parser.add_option("-o", "--index", dest="output", default="tmp.idx", help="Output binary index file") +parser.add_option("-r", "--record", dest="record", default="tmp.rec", help="Output record index file") +parser.add_option("--id2rec" , dest="id" , default="tmp.id2rec" , help="Output id file") + + +(opts, args) = parser.parse_args() + +if opts.input and os.path.isfile(opts.input) : + print("# Processing " + opts.input) +else : + print("Missing input file") + sys.exit(1) + +fastq_file_handle = open(opts.input , "r") +pointer = open(opts.input , "r") +index = open(opts.output , "wb") +record = open(opts.record , "w") +id2rec = open(opts.id , "w") + +record_counter = 0 +offset = pointer.tell() + +llist = [] + +# for fastq +for rec in FastqGeneralIterator(fastq_file_handle) : + record_counter += 1 + head, seq, qual = ( rec[0].split()[0], rec[1].upper(), rec[2] ) + + line_nr = 0 + while line_nr < 10 : + line = pointer.readline() + line_nr += 1 + if (qual + "\n") == line : + # print("Found" , line_nr , pointer.tell() ) + break + else: + pass + + + + # Write index + length = pointer.tell() - offset + index.write( pack('