Initial version

MG-RAST · Jul 18, 2019 · 2bda627 · 2bda627
1 parent 963ca6a
commit 2bda627
Show file tree

Hide file tree

Showing 2 changed files with 268 additions and 0 deletions.
diff --git a/bin/demultiplex2index.py b/bin/demultiplex2index.py
@@ -0,0 +1,186 @@
+#!/usr/bin/env python
+
+import os
+import re
+import sys
+from collections import defaultdict
+from optparse import OptionParser
+from Bio import SeqIO
+from Bio.Seq import Seq
+from Bio.Alphabet import generic_dna
+from Bio.SeqIO.QualityIO import FastqGeneralIterator
+from pprint import pprint
+
+__doc__ = """
+Demultiplex fasta or fastq file with given barcode list. Barcode is trimmed from beginning of sequence.
+List may contain only barcodes (output filename is barcode) or barcode \\t name (output filename is name).
+Allows multiple barcodes to write to one file and mutliple files to get seqs from one barcode.
+"""
+BCRE = re.compile(r'^[ATGCatgc-]+$')
+BCLEN = 0
+
+def seq_iter(file_hdl, stype):
+    if stype == 'fastq':
+        return FastqGeneralIterator(file_hdl)
+    else:
+        return SeqIO.parse(file_hdl, stype)
+
+def split_rec(rec, stype):
+    if stype == 'fastq':
+        return rec[0].split()[0], rec[1].upper(), rec[2]
+    else:
+        return rec.id, str(rec.seq).upper(), None
+
+def write_rec(out_hdl, head, seq, qual):
+    if qual is None:
+        out_hdl.write(">%s\n%s\n" %(head, seq))
+    else:
+        out_hdl.write("@%s\n%s\n+\n%s\n" %(head, seq, qual))
+
+def barcode_files(bfile, odir, stype, prefix, rc_bar):
+    global BCLEN
+    bar_name = [] # set of (barcode_seq, sample_name)
+    mapping_index_file = bfile + ".subset"
+    with open(bfile, 'rU') as x:
+        blines = x.readlines()
+    # skip header: Illumina or Qiime style
+    if blines[0].startswith("#SampleID") or blines[0].startswith("SampleID"):
+        blines.pop(0)
+    for i, b in enumerate(blines):
+        if not b:
+            continue
+        bset = b.strip().split("\t")
+        if not (bset[0] and bset[1]):
+            continue
+        # find which column has barcodes
+        if BCRE.match(bset[1]):
+            bar_name.append([bset[1], bset[0]])
+        elif BCRE.match(bset[0]):
+            bar_name.append([bset[0], bset[1]])
+        else:
+            sys.stderr.write("[error] row %d missing valid barcode\n"%i)
+            os._exit(1)
+    # set files
+    uniq_fname = {}
+    barc_fname = defaultdict(list)
+    barcode2index = defaultdict(list)
+
+    for bn in bar_name:
+        barc = prefix.upper() + bn[0].upper()
+        if rc_bar:
+            rcb = Seq(barc, generic_dna)
+            barc = str(rcb.reverse_complement()).upper()
+        name = os.path.join(odir, "%s.%s"%(bn[1], stype))
+        clen = len(barc)
+        if BCLEN == 0:
+            BCLEN = clen
+        elif BCLEN != clen:
+            sys.stderr.write("[error] barcode lengths are not the same\n")
+            os._exit(1)
+        barc_fname[barc].append(name)
+        barcode2index[barc].append(name + ".subset")
+        barcode2index[barc].append(mapping_index_file)
+        #  barcode2mapping_file[barc] = ...
+
+        uniq_fname[name] = None
+        uniq_fname[name + ".subset"] = None
+        uniq_fname[mapping_index_file] = None
+    return barcode2index , barc_fname, uniq_fname
+
+def match_barcode(barcodemap, seqbar):
+    # first exact
+    if seqbar in barcodemap:
+        return True, seqbar
+    else:
+        # match may be off by 1 bp
+        for bar in barcodemap :
+            mismatch = 0
+            for i in range(BCLEN):
+                if seqbar[i] != bar[i]:
+                    mismatch += 1
+            if mismatch < 2:
+                return True, bar
+        return False, None
+
+def main(args):
+    usage  = "usage: %prog [options] -b <barcode list> -i <input sequence file> -o <output dir>"+__doc__
+    parser = OptionParser(usage)
+    parser.add_option("-i", "--input", dest="input", default=None, help="Input sequence file.")
+    parser.add_option("-o", "--output", dest="output", default=".", help="Output dir (default cwd), filenames will be 'barcode.type' or 'name.type'")
+    parser.add_option("-f", "--format", dest="format", default='fasta', help="File format: fasta, fastq [default 'fasta']")
+    parser.add_option("-b", "--barcode", dest="barcode", default=None, help="File with list of name / barcode pairs")
+    parser.add_option("-r", "--rc_bar", dest="rc_bar", action="store_true", default=False, help="Barcodes in sequences file are reverse complement of barcodes file [default is same].")
+    parser.add_option("-p", "--prefix", dest="prefix", default="", help="Optional sequence to prepend to barcodes")
+    parser.add_option("-v", "--verbose", dest="verbose", action="store_true", default=False, help="Wordy [default off]")
+
+    (opts, args) = parser.parse_args()
+    if not (opts.input and os.path.isfile(opts.input) and opts.output and os.path.isdir(opts.output)):
+        parser.error("Missing input and/or output")
+    if not (opts.barcode and os.path.isfile(opts.barcode)):
+        parser.error("Missing barcode list")
+
+    # get barcode / filename
+    barcode2index , barc_fname, uniq_fname = barcode_files(opts.barcode, opts.output, opts.format, opts.prefix, opts.rc_bar)
+    # open filehandles
+    for f in uniq_fname :
+        print(f)
+        uniq_fname[f] = open(f, 'w')
+    missing = open(os.path.join(opts.output, 'unmatched.'+opts.format), 'w')
+
+    # parse sequence file
+    # allows multiple barcodes to write to one file and mutliple files to get seqs from one barcode
+    bar_count  = defaultdict(int)
+    miss_count = 0
+    input_hdl  = open(opts.input, 'r')
+    index_hdl  = open(opts.input + ".record.idx", 'w')
+    record_counter = 0 
+
+    pointer = open(opts.input , "r") 
+    offset = pointer.tell() 
+
+    for rec in seq_iter(input_hdl, opts.format):
+
+        record_counter += 1
+        head, seq, qual = split_rec(rec, opts.format)
+
+        line_nr = 0
+        while line_nr < 10 :
+            line = pointer.readline()
+            line_nr += 1
+            if (qual + "\n") == line :
+                # print("Found" , line_nr , pointer.tell() )
+                break
+            else:
+                pass
+        print(record_counter , offset , pointer.tell() , str( pointer.tell() - offset )  )
+        index_hdl.write( str(offset) + "\t" + str( pointer.tell() - offset ) + "\n" ) 
+        offset = pointer.tell()
+
+        seqbar = seq[:BCLEN]
+        seq  = seq[BCLEN:]
+        qual = qual[BCLEN:] if qual is not None else None
+        match, mbar = match_barcode(barc_fname, seqbar)
+        if match:
+            bar_count[mbar] += 1
+            for f in barc_fname[mbar]:
+                write_rec(uniq_fname[f], head, seq, qual)
+            for f in barcode2index[mbar]:
+                uniq_fname[f].write( "%d\n" %(record_counter)  )
+        else:
+            miss_count += 1
+            write_rec(missing, head, seq, qual)
+
+    #close filehandles
+    missing.close()
+    for k, fhdl in uniq_fname.items():
+        fhdl.close()
+
+    if opts.verbose:
+        print(
+            "%d sequences split amoung %d barcodes. %d sequences without barcodes"%(sum(bar_count.values()), len(bar_count.keys()), miss_count)
+        )
+
+    return 0
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv))
diff --git a/bin/fastq2byte_idx.py b/bin/fastq2byte_idx.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python
+
+import os
+import re
+import sys
+import statistics as s
+from struct import *
+from Bio import SeqIO
+from Bio.Seq import Seq
+from Bio.Alphabet import generic_dna
+from Bio.SeqIO.QualityIO import FastqGeneralIterator
+from pprint import pprint
+
+from optparse import OptionParser
+
+usage  = "usage: %prog [options] -i <input file> -o <output file>"
+parser = OptionParser(usage)
+parser.add_option("-i", "--input", dest="input", default=None, help="Input mapping file.")
+parser.add_option("-o", "--index", dest="output", default="tmp.idx", help="Output binary index file")
+parser.add_option("-r", "--record", dest="record", default="tmp.rec", help="Output record index file")
+parser.add_option("--id2rec" , dest="id" , default="tmp.id2rec" , help="Output id file")
+
+
+(opts, args) = parser.parse_args()
+
+if opts.input and os.path.isfile(opts.input) :
+    print("# Processing " + opts.input)
+else :
+    print("Missing input file")
+    sys.exit(1)
+
+fastq_file_handle   = open(opts.input , "r")
+pointer             = open(opts.input , "r") 
+index               = open(opts.output , "wb")
+record              = open(opts.record , "w")
+id2rec              = open(opts.id , "w")
+
+record_counter = 0
+offset = pointer.tell() 
+
+llist = []
+
+# for fastq
+for rec in FastqGeneralIterator(fastq_file_handle) :
+    record_counter += 1
+    head, seq, qual = ( rec[0].split()[0], rec[1].upper(), rec[2] )
+
+    line_nr = 0
+    while line_nr < 10 :
+            line = pointer.readline()
+            line_nr += 1
+            if (qual + "\n") == line :
+                # print("Found" , line_nr , pointer.tell() )
+                break
+            else:
+                pass
+
+
+
+    # Write index    
+    length = pointer.tell() - offset    
+    index.write( pack('<QQ' , int(offset) , int(length) ) ) 
+    record.write( "\t".join( [ str(record_counter) , str(offset) , str( pointer.tell() - offset )] ) + "\n"  )
+    id2rec.write( "\t".join( [ head , str(record_counter) , str(offset) , str( pointer.tell() - offset )] ) + "\n" ) 
+
+    # keep for stats
+    llist.append( length )
+
+    # New offset 
+    offset = pointer.tell()
+
+
+record.close()
+index.close()
+
+mean = s.mean(llist)
+
+print( "Records:\t" + str( len(llist)))
+print( "Mean:\t" + str( mean ) ) 
+print( "Median:\t" + str( s.median(llist) ) )
+print( "Variance:\t" + str ( s.pvariance( llist, mu=mean) ) )
+print( "Stdev:\t" + str( s.pstdev(llist , mu=mean ) ) )