diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml new file mode 100644 index 0000000..a51ff46 --- /dev/null +++ b/.github/workflows/python-package.yml @@ -0,0 +1,39 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: Build python pkg + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.9", "3.10", "3.11"] + + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install flake8 + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Build and Test CLI + run: | + pip install . + blsl help diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index 634faad..5a6b1d7 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -11,7 +11,7 @@ name: Upload Python Package on: push: tags: - - * + - '*' workflow_dispatch: permissions: diff --git a/README.md b/README.md index 4ad8b3c..e694605 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,8 @@ # blindschleiche -Misc sequence tools in python. These tools are small things which I have at some point needed to write because I couldn't find a solution I liked. This is in no way a comprehensive toolkit. It is a companion to [seqhax](https://github.com/kdm9/seqhax), execpt that seqhax is written in C/C++ and generally contains tools to handle very large datasets where performance is somewhat important. This is all in python for ease of development, and so typically these tools perform less data- or compute-intensive tasks. +A collection of bioinformatics / sequence utilities needed for my research, and hopefully useful for yours. + +[![DOI](https://zenodo.org/badge/509693094.svg)](https://zenodo.org/doi/10.5281/zenodo.10049825) ## Install @@ -15,19 +17,30 @@ pip install blindschleiche ``` USAGE: blsl [options...] - Where is one of: - genigvjs: Generate a simple IGV.js visualisation of some bioinf files. - telogrep: Search contigs for known telomere repeats - n50: Calculate N50 and total length of a set of contigs + deepclust2fa: Split a .faa by the clusters diamond deepclust finds + ebiosra2rl2s: INTERNAL: MPI Tübingen tool. Make a runlib-to-sample map table from ebio sra files + equalbestblast: Output only the best blast hits. + esearchandfetch: Use the Entrez API to search for and download something. 
A CLI companion to the NCBI search box falen: Tabulate the lengths of sequences in a FASTA file - mask2bed: The inverse of bedtools maskfasta: softmasked fasta -> unmasked fasta + mask.bed - liftoff-gff3: Obtain an actually-useful GFF3 from Liftoff by fixing basic GFF3 format errors - pansn-rename: Add, remove, or modify PanSN-style prefixes to contig/chromosome names in references + farename: Rename sequences in a fasta file sequentially + galhist: Make a summary histogram of git-annex-list output + genigvjs: Generate a simple IGV.js visualisation of some bioinf files. + gffcat: Concatenate GFF3 files, respecting header lines and FASTA sections + gg2k: Summarise a table with GreenGenes-style lineages into a kraken-style report. ildemux: Demultiplex modern illumina reads from read headers. ilsample: Sample a fraction of read pairs from an interleaved fastq file + liftoff-gff3: Obtain an actually-useful GFF3 from Liftoff by fixing basic GFF3 format errors + mask2bed: The inverse of bedtools maskfasta: softmasked fasta -> unmasked fasta + mask.bed + n50: Calculate N50 and total length of a set of contigs + nstitch: Combine R1 + R2 into single sequences, with an N in the middle + pairslash: Add an old-style /1 /2 pair indicator to paired-end fastq files + pansn-rename: Add, remove, or modify PanSN-style prefixes to contig/chromosome names in references regionbed: Make a bed/region file of genome windows + shannon-entropy: Calculate Shannon's entropy (in bits) at each column of one or more alignments + tabcat: Concatenate table (c/tsv) files, adding the filename as a column + telogrep: Search contigs for known telomere repeats uniref-acc2taxid: Make a ncbi-style acc2taxid.map file for a uniref fasta help: Print this help message @@ -35,9 +48,9 @@ Where is one of: Use blsl subtool --help to get help about a specific tool ``` -## Why Blindschleiche +## Why the name Blindschleiche? 1) [They're awesome animals](https://www.google.com/search?q=blindschleiche&tbm=isch) 2) Their English name is Slow Worm, which is appropriate for this set of low-performance tools in Python. -3) All tools implemented in python must be named with a snake pun, and they're kinda a snake (not really, they're legless lizards) +3) All tools implemented in Python must be named with a snake pun, and they're kinda a snake (not really, they're legless lizards) diff --git a/blsl/__init__.py b/blsl/__init__.py index 6602adc..6fbd7ae 100644 --- a/blsl/__init__.py +++ b/blsl/__init__.py @@ -4,9 +4,9 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
-from sys import argv, exit +from sys import argv, exit, stderr -__version__ = "0.1.10" +__version__ = "0.1.11" cmds = {} @@ -43,6 +43,48 @@ from .uniref_accessionmap import main as uniref_main cmds["uniref-acc2taxid"] = uniref_main +from .nstitch import nstitch_main +cmds["nstitch"] = nstitch_main + +from .gg2k import gg2k_main +cmds["gg2k"] = gg2k_main + +from .equalbestblast import equalbestblast_main +cmds["equalbestblast"] = equalbestblast_main + +from .tabcat import tabcat_main +cmds["tabcat"] = tabcat_main + +from .esearchandfetch import main as esf_main +cmds["esearchandfetch"] = esf_main + +from .deepclust2fa import deepclust2fa_main +cmds["deepclust2fa"] = deepclust2fa_main + +from .farename import farename_main +cmds["farename"] = farename_main + +from .ebiosra2rl2s import main as rl2s_main +cmds["ebiosra2rl2s"] = rl2s_main + +from .galhist import main as galhist_main +cmds["galhist"] = galhist_main + +from .gffcat import gffcat_main +cmds["gffcat"] = gffcat_main + + +from .pairslash import main as pairslash_main +cmds["pairslash"] = pairslash_main + +try: + from .vcfstats import main as vcfstats_main + cmds["vcfstats"] = vcfstats_main +except ImportError as exc: + print(str(exc), "-- disabling vcfstats command", file=stderr) + +from .shannon import main as shannon_main +cmds["shannon-entropy"] = shannon_main def mainhelp(argv=None): """Print this help message""" diff --git a/blsl/_utils.py b/blsl/_utils.py new file mode 100644 index 0000000..3e6a8d4 --- /dev/null +++ b/blsl/_utils.py @@ -0,0 +1,30 @@ +import gzip + +def rc(seq): + d = {"A": "T", "G":"C", "C":"G", "T":"A"} + return "".join([d.get(x, "N") for x in reversed(list(seq.upper()))]) + +def fqpair(stream): + fqp = list() + for line in stream: + fqp.append(line.rstrip("\n")) + if len(fqp) == 8: + yield fqp + fqp = list() + if len(fqp) == 8: + yield fqp + else: + assert len(fqp) == 0 + +def fqparse(stream): + fqp = list() + for line in stream: + fqp.append(line.rstrip("\n")) + if len(fqp) == 4: + yield fqp + fqp = list() + if len(fqp) == 4: + yield fqp + else: + assert len(fqp) == 0 + diff --git a/blsl/deepclust2fa.py b/blsl/deepclust2fa.py new file mode 100644 index 0000000..6e9f82d --- /dev/null +++ b/blsl/deepclust2fa.py @@ -0,0 +1,95 @@ +# Copyright (c) 2023 Dr. K. D. Murray/Gekkonid Consulting +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +import csv +import sys +import argparse +from collections import defaultdict +from pathlib import Path +import gzip + +from Bio.SeqIO import parse +from tqdm import tqdm + + +class tsv(csv.unix_dialect): + delimiter = "\t" + quotechar = '"' + doublequote = False + escapechar = "\\" + skipinitialspace = True + + +def deepclust2fa_main(argv=None): + """Split a .faa by the clusters diamond deepclust finds""" + ap = argparse.ArgumentParser() + ap.add_argument("-z", "--min-cluster-size", type=int, default=1, + help="Minimum cluster size") + ap.add_argument("-b", "--buffer", action="store_true", + help="Buffer sequences rather than writing (more RAM, more speed)") + ap.add_argument("-c", "--clusters", required=True, + help="Diamond deepclust result") + ap.add_argument("-f", "--faa", required=True, + help="Fasta sequences (that you gave to diamond deepclust --db)") + ap.add_argument("-o", "--outdir", required=True, type=Path, + help="Output directory (files named by their centroids)") + args = ap.parse_args(argv) + + seq2cent = {} + cent2seq = defaultdict(list) + with open(args.clusters) as fh: + for rec in csv.reader(fh, dialect=tsv): + if rec[0] == "centroid" and rec[1] == "member": + continue + c, s = rec[0:2] + seq2cent[s] = c + cent2seq[c].append(s) + cent2seq_new = {} + seq2cent_new = {} + for cent, seqs in cent2seq.items(): + if len(seqs) >= args.min_cluster_size: + cent2seq_new[cent] = seqs + for seq in seqs: + seq2cent_new[seq] = cent + cent2seq = cent2seq_new + seq2cent = seq2cent_new + print(f"Read clusters, {len(cent2seq)} clusters with size >= {args.min_cluster_size}", file=sys.stderr) + + for cent in cent2seq: + outf = args.outdir/ (cent+".fa") + with open(outf, "w") as fh: + pass + + if args.faa.endswith(".gz"): + fh = gzip.open(args.faa, "rt") + else: + fh = open(args.faa) + + buffers = defaultdict(list) + for seq in tqdm(parse(fh, "fasta")): + centroid = seq2cent.get(seq.id) + if centroid is None: + continue + if args.buffer: + buffers[centroid].append((seq.id, seq.seq)) + else: + outf = args.outdir/ (centroid+".fa") + with open(outf, "a") as ofh: + print(f">{seq.id}", file=ofh) + print(seq.seq, file=ofh) + fh.close() + + for centroid, seqs in buffers.items(): + outf = args.outdir/ (centroid+".fa") + with open(outf, "w") as ofh: + for seq in seqs: + print(f">{seq[0]}", file=ofh) + print(seq[1], file=ofh) + + +if __name__ == "__main__": + deepclust2fa_main() + diff --git a/blsl/ebiosra2rl2s.py b/blsl/ebiosra2rl2s.py new file mode 100644 index 0000000..1eab030 --- /dev/null +++ b/blsl/ebiosra2rl2s.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 +import re +import json +import select +from sys import stdin, stdout, stderr, exit +import sys + +def main(argv=None): + """INTERNAL: MPI Tübingen tool. Make a runlib-to-sample map table from ebio sra files""" + samples = {} + if not select.select([stdin,],[],[],2.0)[0]: + print("This is an internal MPI Tübingen tool to make a table that maps runs of a given library to sample names, e.g. for Acanthophis.", file=stderr) + print(f"USAGE: find /ebio/abt6_sra/... 
-type f -name '*.fq.gz' | ebiosra2rl2s > run-lib2sample.tsv", file=stderr) + exit(1) + for line in stdin: + m = re.match(r"(.*/illumina_.+_flowcell.+_SampleId(\w+)_(RunId\d+_LaneId\d+)/.+_(R\d)_\d+.fastq.gz)", line.strip()) + if m is not None: + path, sample, run, read = m.groups() + if sample not in samples: + samples[sample] = {run: {read: path}} + elif run not in samples[sample]: + samples[sample][run] = {read: path} + else: + samples[sample][run][read] = path + #json.dump(samples, stdout, indent=2) + print("library", "run", "read1_uri", "read2_uri", file=stdout, sep='\t') + for sample, dat in samples.items(): + for run, reads in dat.items(): + print(sample, run, reads["R1"], reads["R2"], file=stdout, sep='\t') + + +if __name__ == "__main__": + main() diff --git a/blsl/equalbestblast.py b/blsl/equalbestblast.py new file mode 100644 index 0000000..2fa9fa8 --- /dev/null +++ b/blsl/equalbestblast.py @@ -0,0 +1,62 @@ +# Copyright (c) 2023 Dr. K. D. Murray/Gekkonid Consulting +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +import csv +import sys +import math +import argparse + +class blast6(csv.unix_dialect): + delimiter = "\t" + quotechar = '"' + doublequote = False + escapechar = "\\" + skipinitialspace = True + +def output_best(recs, fields): + nbegin = len(recs) + mineval =None + maxalnlen=None + if nbegin < 1: + return + mineval = min(float(x["evalue"]) for x in recs) + recs = [ + rec for rec in recs if float(rec["evalue"]) <= mineval + ] + maxalnlen = max(float(x["length"]) for x in recs) + recs = [ + rec for rec in recs if float(rec["length"]) >= maxalnlen + ] + nend=len(recs) + for rec in recs: + print(*[rec[x] for x in fields], sep="\t") + + +def equalbestblast_main(argv=None): + """Output only the best blast hits.""" + ap = argparse.ArgumentParser() + ap.add_argument("-f", "--outfmt", default="qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore", + help="Blast outfmt headers (default is the default for blast -outfmt 6)") + ap.add_argument("table", help="Table of blast hits") + args = ap.parse_args(argv) + + fields = args.outfmt.split(" ") + query = None + hits = [] + with open(args.table) as fh: + for rec in csv.DictReader(fh, dialect=blast6, fieldnames=fields): + if query != rec["qseqid"]: + if query is not None: + output_best(hits, fields) + hits = [] + query = rec["qseqid"] + hits.append(rec) + if query is not None: + output_best(hits, fields) + +if __name__ == "__main__": + equalbestblast_main() + diff --git a/blsl/esearchandfetch.py b/blsl/esearchandfetch.py new file mode 100644 index 0000000..c4adf18 --- /dev/null +++ b/blsl/esearchandfetch.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 +from Bio import Entrez +from tqdm import tqdm + +import argparse +from time import sleep +from sys import stdout, stdin, stderr, exit +from pathlib import Path +import io + +def esearch(*args, chunks=10000, **kwargs): + kwargs["retmax"] = chunks + ids = [] + n = None + i = 0 + with tqdm(desc="esearch IDs") as progress: + while n is None or i < n: + this = Entrez.esearch(*args, **kwargs, retstart=i) + for x in range(3): + try: + sleep(0.5) + this = Entrez.read(this) + break + except RuntimeError: + sleep(0.5) + pass + else: + this = Entrez.read(this) + nret = len(this["IdList"]) + ids.extend(this["IdList"]) + i += nret + n = int(this["Count"]) + progress.total = n + progress.update(nret) + return ids + + 
+def efetch(db="nucleotide", rettype="genbank", retmode="text", ids=None, chunks=2000): + with tqdm(desc=f"efetch {rettype}", total=len(ids)) as progress: + for start in range(0, len(ids), chunks): + stop = min(start + chunks, len(ids)) + idstr = ",".join(ids[start:stop]) + res = Entrez.efetch(db=db, retmode=retmode, rettype=rettype, id=idstr) + if start != 0 and rettype=="runinfo": + # Skip header on non-first one + hdr = next(res) + for line in res: + if isinstance(line, bytes): + line=line.decode('utf-8') + yield line + progress.update(stop-start) + + +def main(argv=None): + """Use the Entrez API to search for and download something. A CLI companion to the NCBI search box""" + ap = argparse.ArgumentParser() + ap.add_argument("-e", "--email", type=str, + help="Email for Entrez API") + ap.add_argument("-c", "--chunksize", type=int, default=1000, + help="Number of records to download per chunk") + ap.add_argument("-o", "--out", type=argparse.FileType('w'), default="-", + help="Genbank output") + ap.add_argument("-d", "--db", required=True, + help="db api param") + ap.add_argument("-r", "--rettype", required=True, + help="rettype api param") + ap.add_argument("-m", "--retmode", default="text", + help="API return mode") + ap.add_argument("-i", "--id-file", type=Path, + help="List of IDs (give with --term to save a list of IDs to this file, or without to use this list of IDs)") + ap.add_argument("-t", "--term", type=str) + args = ap.parse_args(argv) + + Entrez.email = args.email + + if args.term: + ids = esearch(term=args.term, db=args.db) + if args.id_file: + with args.id_file.open("w") as fh: + for id in ids: + print(id, file=fh) + elif args.id_file: + ids = [] + with args.id_file.open("r") as fh: + for line in fh: + ids.append(line.rstrip()) + else: + print("ERROR: must give either --term to search for, or a list of ids in --id-file", file=stderr) + exit(1) + + for line in efetch(ids=ids, db=args.db, rettype=args.rettype, retmode=args.retmode, chunks=args.chunksize): + args.out.write(line) + + +if __name__ == "__main__": + main() diff --git a/blsl/farename.py b/blsl/farename.py new file mode 100644 index 0000000..3c03bfb --- /dev/null +++ b/blsl/farename.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python3 +from sys import argv +import argparse + +from Bio import SeqIO + +def farename_main(argv=None): + """Rename sequences in a fasta file sequentially""" + ap = argparse.ArgumentParser() + ap.add_argument("-p", "--prefix", type=str, default="", + help="Prefix for each sequence ID (e.g. SEQ_ -> SEQ_1, SEQ_2 etc)") + ap.add_argument("fastas", help="FASTA file of contigs", nargs="+") + args = ap.parse_args(argv) + + for infile in args.fastas: + i = 0 + with open(infile) as fh: + for seq in SeqIO.parse(fh, 'fasta'): + i += 1 + print(f">{args.prefix}{i} {seq.id} {seq.description}") + print(seq.seq) + + +if __name__ == "__main__": + farename_main() diff --git a/blsl/galhist.py b/blsl/galhist.py new file mode 100644 index 0000000..8bc39e0 --- /dev/null +++ b/blsl/galhist.py @@ -0,0 +1,43 @@ +#!/usr/bin/python3 +from shutil import get_terminal_size +from collections import Counter +from sys import stdin, stdout, stderr, exit +from math import ceil, log10 +import select + +def main(argv=None): + """Make a summary histogram of git-annex-list output""" + if not select.select([stdin], [], [], 1)[0]: + print("USAGE: git annex list ... | blsl galhist", file=stderr) + print("This tool makes a histogram of file occurrence patterns to summarise the output of git annex list. 
Just pipe git annex list in on stdin.", file=stderr) + exit(1) + + twidth = 80 + try: + if stdout.isatty(): + twidth = get_terminal_size()[0] + except: + pass + + header = [] + counts = Counter() + for line in stdin: + if not header: + header.append(line.rstrip()) + elif line.startswith("|"): + header.append(line.rstrip()) + else: + key, file = line.split(" ", maxsplit=1) + counts[key]+=1 + + for l in header: + print(l) + _, mcv = counts.most_common(1)[0] + nw = int(ceil(log10(mcv))) + remaining = twidth - len(header) - 1 - nw - 2 + + for k, v in counts.most_common(): + vbarw = min(round(v/mcv*remaining), remaining) + vbar = ("#" * vbarw).ljust(remaining, " ") + npad = str(v).rjust(nw, " ") + print(f"{k} {vbar} ({npad})") diff --git a/blsl/genigvjs.py b/blsl/genigvjs.py index 883deb8..544727d 100644 --- a/blsl/genigvjs.py +++ b/blsl/genigvjs.py @@ -75,6 +75,8 @@ def genigvjs_main(argv=None): help="Webpage title") ap.add_argument("--reference", "-r", required=True, help="reference file. must be *.fa or *.fasta, with associated *.fai, and can also have *.gff as annotation.") + ap.add_argument("--locus", "-l", + help="Initial region of interest.") ap.add_argument("--outdir", "-o", required=True, help="Output directory. Data files will be softlinked there, and 'index.html' will be generated.") ap.add_argument("tracks", help="Any files to be used as tracks. Can be gff/bed/vcf/bcf/bam/cram. Must be indexed", nargs="+") @@ -93,6 +95,8 @@ def genigvjs_main(argv=None): "reference": {}, "tracks": [], } + if args.locus: + data["locus"] = args.locus ref = Path(args.reference) refbase = ref.stem reffai = Path(str(ref) + ".fai") @@ -116,7 +120,10 @@ def genigvjs_main(argv=None): for i, track in enumerate(args.tracks): track = Path(track) base = track.stem - format = track.suffix.lstrip(".") + dots = track.name.split('.') + format = dots[-1] + if format == "gz": + format = dots[-2] index = None if format == "bam": index = Path(str(track) + ".bai") @@ -126,7 +133,7 @@ def genigvjs_main(argv=None): index = None trackdat = { "name": base, - "format": track.suffix.lstrip("."), + "format": format, "autoHeight": True, "minHeight": 50, "maxHeight": 500, @@ -146,7 +153,7 @@ def genigvjs_main(argv=None): with open(outdir / "index.html", "w") as fh: html = template \ .replace("__title__", args.title) \ - .replace("__data__", json.dumps(data)) + .replace("__data__", json.dumps(data, indent=4)) fh.write(html) if __name__ == "__main__": diff --git a/blsl/gffcat.py b/blsl/gffcat.py new file mode 100755 index 0000000..6232554 --- /dev/null +++ b/blsl/gffcat.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python3 +import argparse +from sys import stderr, stdout, stdin + +KNOWN_HEADERS = { + "##gff-version", + "##feature-ontology", +} + + +def gffcat_main(argv=None): + """Concatenate GFF3 files, respecting header lines and FASTA sections""" + ap = argparse.ArgumentParser("gffcat") + ap.add_argument("-o", "--output", type=argparse.FileType("wt"), default=stdout, + help="Output gff file") + ap.add_argument("inputs", nargs="+") + args = ap.parse_args(argv) + + fastalines = [] + bodylines = [] + headerlines = {} + + for file in args.inputs: + with open(file) as fh: + in_fasta = False + for line in fh: + line = line.rstrip() + if not line: + continue + ll = line.lower().split()[0] + if ll in KNOWN_HEADERS: + if ll in headerlines: + if headerlines[ll] != line: + print("WARN: header line with different value in different files. 
Only using value from first file", file=stderr) + else: + headerlines[ll] = line + elif ll.startswith("##fasta"): + in_fasta = True + else: + if in_fasta: + fastalines.append(line) + else: + bodylines.append(line) + + for hdr in KNOWN_HEADERS: + if hdr in headerlines: + print(headerlines[hdr], file=args.output) + + for line in bodylines: + print(line, file=args.output) + + if fastalines: + print("##fasta", file=args.output) + + for line in fastalines: + print(line, file=args.output) + + print(f"DONE! {len(headerlines)} headers, {len(bodylines)} body lines, {len(fastalines)} FASTA lines", file=stderr) + + +if __name__ == "__main__": + gffcat_main() diff --git a/blsl/gffparse.py b/blsl/gffparse.py new file mode 100644 index 0000000..d237da6 --- /dev/null +++ b/blsl/gffparse.py @@ -0,0 +1,262 @@ +#!/usr/bin/env python3 +""" +A simple parser for the GFF3 format. + +Test with transcripts.gff3 from +http://www.broadinstitute.org/annotation/gebo/help/gff3.html. + +Format specification source: +http://www.sequenceontology.org/gff3.shtml + +Version 1.1: Python3 ready +Version 2.0: @kdm9's version. Add code for comparison & normalisationG +""" +from collections import namedtuple, Counter +import gzip +import urllib.request, urllib.parse, urllib.error +from sys import stderr, stdout + +from tqdm.auto import tqdm + +__author__ = "Uli Köhler" +__license__ = "Apache License v2.0" +__version__ = "1.1" + +#Initialized GeneInfo named tuple. Note: namedtuple is immutable +gffInfoFields = ["seqid", "source", "type", "start", "end", "score", "strand", "phase", "attributes"] +GFFRecord = namedtuple("GFFRecord", gffInfoFields) + +ontological_order = [ + "chromosome", "gene", "pseudogene", "pseudogenic_region", "mRNA", "transcript", "pseudogeneic_transcript", + "exon", "pseudogenic_exon", "five_prime_UTR", "three_prime_UTR", "CDS", "pseudogenic_CDS", +] + +def parseGFFAttributes(attributeString): + """Parse the GFF3 attribute column and return a dict"""# + if attributeString == ".": return {} + ret = {} + for attribute in attributeString.split(";"): + if "=" in attribute: + key, value = attribute.split("=") + else: + key = attribute + value = "True" + if not key and not value: + continue + ret[urllib.parse.unquote(key)] = urllib.parse.unquote(value) + return ret + +def parseGFF3(filename, return_as=dict): + """ + A minimalistic GFF3 format parser. + Yields objects that contain info about a single GFF3 feature. + + Supports transparent gzip decompression. + """ + #Parse with transparent decompression + openFunc = gzip.open if str(filename).endswith(".gz") else open + with openFunc(filename) as infile: + for line in infile: + #if line.startswith("###"): + # ### Yield last gene if we move that here + # pass + if line.startswith("#") or line.strip() == "": + continue + parts = line.strip().split("\t") + #If this fails, the file format is not standard-compatible + assert len(parts) == len(gffInfoFields) + #Normalize data + normalizedInfo = { + "seqid": None if parts[0] == "." else urllib.parse.unquote(parts[0]), + "source": None if parts[1] == "." else urllib.parse.unquote(parts[1]), + "type": None if parts[2] == "." else urllib.parse.unquote(parts[2]), + "start": None if parts[3] == "." else int(parts[3]), + "end": None if parts[4] == "." else int(parts[4]), + "score": None if parts[5] == "." else float(parts[5]), + "strand": None if parts[6] == "." else urllib.parse.unquote(parts[6]), + "phase": None if parts[7] == "." 
else urllib.parse.unquote(parts[7]), + "attributes": parseGFFAttributes(parts[8]) + } + #Alternatively, you can emit the dictionary here, if you need mutability: + # yield normalizedInfo + yield return_as(**normalizedInfo) + + +def gff_heirarchy(filename, progress=None): + last = {"l1": None, "l2": None, "l3": None} + level_canonicaliser = { + "transcript": "mRNA", + } + levels = { + "repeat_region": "l1", + "pseudogene": "l1", + "pseuodgenic_region": "l1", + "transposable_element_gene": "l1", + "gene": "l1", + "tRNA": "l2", + "tmRNA": "l2", + "mRNA": "l2", + "exon": "l3", + "CDS": "l3", + "five_prime_UTR": "l3", + "three_prime_UTR": "l3", + } + ignore = { + "source", + } + records = {} + l2l1 = {} + i = 0 + recordsrc = parseGFF3(filename, return_as=dict) + if progress is not None: + if progress == True: + desc=None + else: + desc=progress + recordsrc = tqdm(recordsrc, desc=desc) + for record in recordsrc: + typ = record["type"] + typ = level_canonicaliser.get(typ, typ) + record["type"] = typ + lvl = None + if "RNA" in typ.upper(): + lvl = "l2" + lvl = levels.get(typ, lvl) + if lvl is None: + if typ not in ignore: + print(f"WARNING: {typ} is not a nice feature, skipping", file=stderr) + continue + id = record["attributes"]["ID"] + if lvl == "l1": + record["children"] = {} + records[id] = record + i = 0 + elif lvl == "l2": + parent = record["attributes"]["Parent"] + l2l1[id] = parent + record["children"] = {} + try: + records[parent]["children"][id] = record + except KeyError: + print(f"L2 entry {id} parent {parent} not in records? {record}") + else: + try: + parent = record["attributes"]["Parent"] + top = l2l1[parent] + if id in records[top]["children"][parent]["children"]: + i += 1 + id = f"{id}_{i}" + record["attributes"]["ID"] = id + records[top]["children"][parent]["children"][id] = record + except KeyError: + print(f"L3 entry {id} parent not in records? {record}") + return records + + +def attr2line(attrs): + from urllib.parse import quote as Q + return ";".join("=".join((Q(k), Q(v))) for k, v in attrs.items()) + + +def reformat_names(gene, geneid=None, changenames=True): + def prefix_name(entry, sid): + name = entry["attributes"].get("Name") + if name: + name = f"{sid}_{name}" + else: + name = sid + entry["attributes"]["Name"] = name + if not geneid: + geneid = gene["attributes"]["ID"] + gene["attributes"]["ID"] = geneid + subids = Counter() + for child in gene.get("children", {}).values(): + ct = child["type"] + subids[ct] += 1 + ci = subids[ct] + subid = f"{geneid}_{ct}{ci:02d}" + child["attributes"]["ID"] = subid + child["attributes"]["Parent"] = geneid + if changenames: + prefix_name(child, subid) + #print("E", child["attributes"]["Name"]) + gcids = Counter() + for gchild in child.get("children", {}).values(): + gt = gchild["type"] + gcids[gt] += 1 + gi = gcids[gt] + gcid = f"{geneid}_{gt}{gi:02d}" + #print("F", gt, gcid) + gchild["attributes"]["ID"] = gcid + gchild["attributes"]["Parent"] = subid + #print("G", gcid, gchild["attributes"].get("Name")) + if changenames: + prefix_name(gchild, gcid) + #print("H", gcid, gchild["attributes"]["Name"]) + +def write_gene(gene, geneid=None, file=None, changenames=False): + def write_line(entry): + x = [entry[field] if entry[field] is not None else "." 
for i, field in enumerate(gffInfoFields)] + x[-1] = attr2line(x[-1]) + print(*x, sep="\t", file=file) + #print(gene, geneid) + #print("## gff-version 3", file=file) + if geneid or changenames: + reformat_names(gene, geneid, changenames) + write_line(gene) + for child in gene.get("children", {}).values(): + write_line(child) + for gchild in child.get("children", {}).values(): + write_line(gchild) + print("###", file=file) + + +def get_all_features(annot, ftype): + feat = set() + if annot["type"] == ftype: + feat.add((annot["start"], annot["end"], annot["strand"], annot["type"])) + for child in annot.get("children", {}).values(): + feat.update(get_all_features(child, ftype)) + return feat + + +def features_equal(a, b, ftype): + a = set(get_all_features(a, ftype)) + b = set(get_all_features(b, ftype)) + return a == b + + +def feature_distance(a, b, ftype): + def overlap(a, b): + return a[0] <= b[0] <= a[1] or a[0] <= b[1] <= a[1] + a = list(get_all_features(a, ftype)) + b = list(get_all_features(b, ftype)) + ovl = list() + noovl = list() + b_with_ovl = set() + for ai in range(len(a)): + for bi in range(len(b)): + if overlap(a[ai], b[bi]): + ovl.append((a[ai], b[bi])) + b_with_ovl.add(b[bi]) + break + else: + # a does not overlap with an b + noovl.append(a[ai]) + for bi in range(len(b)): + if b[bi] not in b_with_ovl: + noovl.append(b[bi]) + dist = 0 + for A, B in ovl: + dist += abs(A[1] - B[1]) + abs(A[0] - B[0]) + for x in noovl: + dist += x[1] - x[0] + return dist + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("file", help="The GFF3 input file (.gz allowed)") + args = parser.parse_args() + import json + json.dump(gff_heirarchy(args.file), stdout, indent=4) diff --git a/blsl/gg2k.py b/blsl/gg2k.py new file mode 100644 index 0000000..373c73e --- /dev/null +++ b/blsl/gg2k.py @@ -0,0 +1,108 @@ +# Copyright (c) 2023 Dr. K. D. Murray/Gekkonid Consulting +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +import csv +import sys +import math +import argparse +from collections import defaultdict + +def recursive_defaultdict(): + x = defaultdict(recursive_defaultdict) + x["__count__"] = 0 + return x + +def recursive_taxtable(lineage, level=0): + yield (level, lineage["__name__"], lineage["__count__"], lineage["__line__"]) + for k, v in reversed(sorted(filter(lambda x: x[0] not in ["__count__", "__name__", "__line__"], + lineage.items()), + key=lambda x: x[1]["__count__"])): + yield from recursive_taxtable(v, level=level+1) + +def insert_lineage(tree, lin, max_level=None, n=1): + X = tree + X["__count__"] += n + X["__name__"] = "root" + X["__line__"] = "NA" + for i, t in enumerate(lin): + X = X[t] + X["__count__"] += n + X["__name__"] = t + X["__line__"] = ";".join(lin) + if max_level is not None and i >= max_level: + break + +def gg2k_main(argv=None): + """Summarise a table with GreenGenes-style lineages into a kraken-style report.""" + ap = argparse.ArgumentParser() + ap.add_argument("-l", "--lca-column", type=lambda x: int(x) - 1, + help="Take the lowest common ancestor per value in this column") + ap.add_argument("-f", "--column", required=True, type=lambda x: int(x) - 1, + help="Which column contains lineages?") + ap.add_argument("-d", "--column-delim", default="\t", + help="Column delimiter of the input table") + ap.add_argument("-D", "--lineage-delim", default=";", + help="Delimiter between nodes in a taxon lineage") + ap.add_argument("-L", "--max-level", default=None, type=lambda x: int(x) - 1, + help="Maximum level to descend to.") + ap.add_argument("-V", "--vsearch-size", default=None, type=lambda x: int(x) - 1, + help="Parse vsearch size=NNNN headers from this column, use NNNN as a count rather than # hits/reads.") + ap.add_argument("table", help="Table of taxon hits") + args = ap.parse_args(argv) + + + class OurDialect(csv.unix_dialect): + delimiter = args.column_delim + quotechar = '"' + doublequote = True + skipinitialspace = True + + lineage = recursive_defaultdict() + with open(args.table) as fh: + current_id = None + lca = [] + for rec in csv.reader(fh, dialect=OurDialect): + linstr = rec[args.column] + lin = linstr.split(args.lineage_delim) + n = 1 + if args.lca_column is not None: + if current_id != rec[args.lca_column]: + if args.vsearch_size is not None and current_id is not None: + hdr = current_id + fields = dict(x.split("=") for x in hdr.split(";") if "=" in x) + n = int(fields["size"]) + insert_lineage(lineage, lca, args.max_level, n=n) + lca = lin + current_id = rec[args.lca_column] + for i, tax in enumerate(lin): + if i >= len(lca) or lca[i] != tax: + break + lca = lca[:i+1] + else: + if args.vsearch_size is not None: + hdr = rec[args.vsearch_size] + fields = dict(x.split("=") for x in hdr.split(";") if "=" in x) + n = int(fields["size"]) + insert_lineage(lineage, lin, args.max_level, n=n) + if lca: + if args.vsearch_size is not None: + hdr = current_id + fields = dict(x.split("=") for x in hdr.split(";") if "=" in x) + n = int(fields["size"]) + insert_lineage(lineage, lca, args.max_level, n=n) + + n = int(math.ceil(math.log10(lineage["__count__"]+1))) + taxtbl = list(recursive_taxtable(lineage)) + max_name = max(len(t[1]) for t in taxtbl) + max_level = max(t[0] for t in taxtbl) + for level, name, count, linstr in taxtbl: + linstr = linstr.split(args.lineage_delim, level) + linstr = args.lineage_delim.join(linstr[:-1]) + print(name.ljust(max_name + (max_level-level)*2).rjust(max_name + max_level * 2), str(count).rjust(n), level, linstr, sep="|") + +if __name__ == 
"__main__": + gg2k_main() + diff --git a/blsl/ildemux.py b/blsl/ildemux.py index 89ff2fe..c072280 100644 --- a/blsl/ildemux.py +++ b/blsl/ildemux.py @@ -1,9 +1,12 @@ from io import BytesIO from pathlib import Path +import sys from sys import stdin from tqdm import tqdm from collections import Counter +from ._utils import fqpair, rc + def hamming_cloud(seq, distance=1): """Generate DNA seqs whose Hamming distance from seq is <= distance @@ -88,16 +91,6 @@ def flush(self): def __del__(self): self.flush() -def fqpair(stream): - fqp = list() - for line in stream: - fqp.append(line) - if len(fqp) == 8: - yield fqp - fqp = list() - if len(fqp) == 8: - yield fqp - def gethdrdat(hdr): spot, idx = hdr.rstrip("\n").split(" ") idxpair = idx.split(":")[3] @@ -111,10 +104,6 @@ def fqp2idx(fqp): assert(ip1 == ip2) return ip1 -def rc(seq): - d = {"A": "T", "G":"C", "C":"G", "T":"A"} - return "".join([d.get(x, "N") for x in reversed(list(seq.upper()))]) - def make_sample_map(tablefile, outdir, fileext=".fq", distance=1): from csv import DictReader from itertools import product @@ -134,7 +123,7 @@ def make_sample_map(tablefile, outdir, fileext=".fq", distance=1): return smap -def main(): +def main(argv=None): """Demultiplex modern illumina reads from read headers. Supports *ONLY* out-of-read indicies (in @header lines e.g. 1:N:0:ACGT+TGCA), and only reads with i7 + i5. @@ -145,7 +134,7 @@ def main(): ap.add_argument("-o", "--outdir", required=True, type=Path, help="Output directory (must exist)") ap.add_argument("-c", "--justcount", action="store_true", - help="Compress outputs with gzip.") + help="Write nothing, only tablulate (largely useless)") ap.add_argument("-z", "--zip", nargs="?", default=None, const=6, type=int, help="Compress outputs with gzip.") ap.add_argument("-j", "--threads", default=8, @@ -154,7 +143,7 @@ def main(): help="Number tolerable mismatches (hamming distance).") ap.add_argument("-k", "--keyfile", help="Mapping of i7/i5 -> sample as tsv (with cols i7, i5, sample)") - args=ap.parse_args() + args=ap.parse_args(argv) ByteBucket.threads = args.threads if args.zip is not None: diff --git a/blsl/ilsample.py b/blsl/ilsample.py index fb2a36b..3ebfbb8 100644 --- a/blsl/ilsample.py +++ b/blsl/ilsample.py @@ -4,6 +4,7 @@ from tqdm import tqdm from collections import Counter import random +import gzip def fqpair(stream): fqp = list() diff --git a/blsl/mask2bed.py b/blsl/mask2bed.py index 42bbfca..e430032 100644 --- a/blsl/mask2bed.py +++ b/blsl/mask2bed.py @@ -8,6 +8,7 @@ from Bio import SeqIO from shlex import quote import argparse +from tqdm import tqdm full_help = """ @@ -29,7 +30,7 @@ def mask2bed_main(argv=None): args = ap.parse_args(argv) seqs = SeqIO.parse(args.fasta, "fasta") - for seq in seqs: + for seq in tqdm(seqs): last_start = 0 last_masked = None for i, base in enumerate(str(seq.seq)): diff --git a/blsl/nstitch.py b/blsl/nstitch.py new file mode 100644 index 0000000..dddc02a --- /dev/null +++ b/blsl/nstitch.py @@ -0,0 +1,52 @@ +# Copyright (c) 2023 Dr. K. D. Murray/Gekkonid Consulting +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +import gzip +import sys +import argparse +from ._utils import fqparse, rc + +def nstitch_main(argv=None): + """Combine R1 + R2 into single sequences, with an N in the middle""" + ap = argparse.ArgumentParser() + ap.add_argument("-s", "--seq", default="N", + help="Sequence used to glue together R1 + R2 of each pair") + ap.add_argument("-q", "--qual", default="!", + help="Quality score for glue sequences") + ap.add_argument("-I", "--input-interleaved", action="store_true", + help="Only read R1, which contains interleaved sequences") + ap.add_argument("r1", help="R1 fastq.gz") + ap.add_argument("r2", help="R2 fastq.gz", nargs="?") + args = ap.parse_args(argv) + + if args.input_interleaved: + r1 = gzip.open(args.r1, "rt") + seqs = zip(fqparse(r1), fqparse(r1)) + else: + r1 = gzip.open(args.r1, "rt") + r2 = gzip.open(args.r2, "rt") + seqs = zip(fqparse(r1), fqparse(r2)) + for s1, s2 in seqs: + n1, n2 = s1[0].lstrip("@"), s2[0].lstrip("@") + i1 = n1.split(" ", 1)[0] + i2 = n2.split(" ", 1)[0] + if i1 != i2: + print(f"ERROR: read IDs don't match: {i1} {i2}", file=sys.stderr) + sys.exit(1) + newseq = f"{s1[1]}{args.seq}{rc(s2[1])}" + newqual = f"{s1[3]}{args.qual*len(args.seq)}{s2[3][::-1]}" + assert len(s1[1]) == len(s1[3]) + assert len(s2[1]) == len(s2[3]) + assert len(newseq) == len(newqual) + print(f"@{n1} {n2}") + print(newseq) + print("+") + print(newqual) + r1.close() + if not args.input_interleaved: r2.close() + +if __name__ == "__main__": + nstitch_main() diff --git a/blsl/pairslash.py b/blsl/pairslash.py new file mode 100644 index 0000000..20ab2d1 --- /dev/null +++ b/blsl/pairslash.py @@ -0,0 +1,42 @@ +import gzip + +from ._utils import fqparse + +def seqnum(n, c): + if n.endswith("/1"): + return 1 + if n.endswith("/2"): + return 2 + if c: + if c[0].startswith("1:"): + return 1 + if c[0].startswith("2:"): + return 2 + raise ValueError(f"Unknown pair from fastq header: '{n} {''.join(c)}'") + +def main(argv=None): + """Add an old-style /1 /2 pair indicator to paired-end fastq files""" + + import argparse + ap = argparse.ArgumentParser("blsl pairslash") + ap.add_argument("readfile", nargs='?', default="/dev/fd/0") + args=ap.parse_args(argv) + + + if args.readfile.endswith(".gz"): + reads = gzip.open(args.readfile, "rt") + else: + reads = open(args.readfile, "r") + + for s in fqparse(reads): + n, *c = s[0].split(" ", 1) + if n.endswith("/1") or n.endswith("/2"): + print(s[0]) + else: + i = seqnum(n, c) + print(f"{n}/{i} {''.join(c)}") + print(s[1], s[2], s[3], sep="\n") + +if __name__ == "__main__": + main() + diff --git a/blsl/pansn_rename.py b/blsl/pansn_rename.py index 5b43e16..2378e53 100644 --- a/blsl/pansn_rename.py +++ b/blsl/pansn_rename.py @@ -34,10 +34,11 @@ def make_replacer(mode, reffa=None, org_name=None, delim="~", outdelim="~", refd return fromto -def do_tsv(infh, outfh, cols, replacer, coldelim="\t"): +def do_tsv(infh, outfh, cols, replacer, coldelim="\t", skipcomment=False): for line in tqdm(infh): if line.startswith("#"): - outfh.write(line) + if not skipcomment: + outfh.write(line) continue fields = line.rstrip("\r\n").split(coldelim) for col in cols: diff --git a/blsl/regionbed.py b/blsl/regionbed.py index 8dfdf46..7ad8d95 100644 --- a/blsl/regionbed.py +++ b/blsl/regionbed.py @@ -20,7 +20,7 @@ def make_regions(fas, window=1e6): def main(argv): """Make a bed/region file of genome windows""" ap = argparse.ArgumentParser() - ap.add_argument("--windowsize", "-w", default=100000, + ap.add_argument("--windowsize", "-w", default=100000, type=int, help="Window size of each non-overlapping region") 
ap.add_argument("--bed", "-b", type=argparse.FileType("w"), help="Output bed file") @@ -29,7 +29,7 @@ def main(argv): ap.add_argument("fasta", help="Fasta reference (must have fai)") args = ap.parse_args(argv) - for chrm, start, stop, leng in make_regions(args.fasta): + for chrm, start, stop, leng in make_regions(args.fasta, window=args.windowsize): region = f"{chrm}:{start+1}-{stop}" if args.bed is not None: print(chrm, start, stop, region, file=args.bed, sep="\t") diff --git a/blsl/shannon.py b/blsl/shannon.py new file mode 100644 index 0000000..8947442 --- /dev/null +++ b/blsl/shannon.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +from Bio import AlignIO +from tqdm import tqdm + +import argparse +from sys import stdin, stdout, stderr +from math import log +from statistics import mean +from collections import Counter + +class FreqCounter(Counter): + def freqs(self): + N = sum(self.values()) + return {b:v/N for b, v in self.most_common()} + +def shannon_entropy(alncol): + alncol = [b for b in alncol if b not in ("-", "*", "X")] + aafreq = FreqCounter(alncol) + entropy = -0 + for aa, freq in aafreq.freqs().items(): + entropy += freq*(log(freq,2)) + return -entropy + +def shanent_aln(alignment, alnformat="fasta"): + aln = AlignIO.read(alignment, alnformat) + if len(aln) < 15: + print(f"WARN: {alignment} has <15 seqs, treat SE values with a grain of salt", file=stderr) + se = [] + for i in range(aln.get_alignment_length()): + se.append(shannon_entropy(aln[:,i])) + return aln, se + +def main(argv=None): + """Calculate Shannon's entropy (in bits) at each column of one or more alignments""" + ap = argparse.ArgumentParser() + ap.add_argument("-o", "--out-tsv", default=stdout, type=argparse.FileType("wt"), + help="Output TSV file") + ap.add_argument('alignments', nargs="+", + help='One or more MSAs, in fasta format') + args = ap.parse_args() + + print("alnfile", "nseqs", "column", "shannon_entropy", sep="\t", file=args.out_tsv) + for alnfile in tqdm(args.alignments, unit=" Alignment"): + aln, se = shanent_aln(alnfile) + for i, s in enumerate(se): + print(alnfile, len(aln), i+1, s, sep="\t", file=args.out_tsv) + mean(se) + print(alnfile, len(aln), "*", mean(se), sep="\t", file=args.out_tsv) + +if __name__ == '__main__': + main() diff --git a/blsl/tabcat.py b/blsl/tabcat.py new file mode 100644 index 0000000..ed0ee14 --- /dev/null +++ b/blsl/tabcat.py @@ -0,0 +1,55 @@ +# Copyright (c) 2023 Dr. K. D. Murray/Gekkonid Consulting +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +import csv +import sys +import math +import argparse + + +def tabcat_main(argv=None): + """Concatenate table (c/tsv) files, adding the filename as a column""" + ap = argparse.ArgumentParser() + ap.add_argument("-d", "--delim", default=",", + help="Column delimiter.") + ap.add_argument("-o", "--output", default="-", type=argparse.FileType("wt"), + help="Output file.") + ap.add_argument("-N", "--add-name", action="store_true", + help="Add a column with the file name") + ap.add_argument("-f", "--fofn", action="store_true", + help="Each input is a file of filenames") + ap.add_argument("xsvs", help="Input {C,T}SVs to concatenate", nargs="+") + args = ap.parse_args(argv) + + class xsv(csv.unix_dialect): + delimiter = args.delim + quotechar = '"' + doublequote = True + skipinitialspace = True + quoting=csv.QUOTE_MINIMAL + + if args.fofn: + files = [] + for fofn in args.xsvs: + with open(fofn) as fh: + for file in fh: + files.append(file.rstrip()) + else: + files = args.xsvs + outcsv = None + for file in files: + with open(file) as fh: + for rec in csv.DictReader(fh, dialect=xsv): + rec["tabcat_filename"] = file + if outcsv is None: + outcsv = csv.DictWriter(args.output, dialect=xsv, fieldnames=[k for k in rec]) + outcsv.writeheader() + outcsv.writerow(rec) + +if __name__ == "__main__": + tabcat_main() + + diff --git a/blsl/vcfstats.py b/blsl/vcfstats.py new file mode 100755 index 0000000..86ce112 --- /dev/null +++ b/blsl/vcfstats.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 +from tqdm import tqdm +from cyvcf2 import VCF +import pandas as pd + +import sys +from sys import stdin, stdout, stderr +from subprocess import Popen, PIPE +import argparse +import math +from concurrent.futures import ProcessPoolExecutor, as_completed + +def parallel_regions(vcf, cores=1): + V = VCF(vcf) + # 10 chunks per chrom per core + chunks = len(V.seqlens)*10*cores + for cname, clen in zip(V.seqnames, V.seqlens): + chunk = int(math.ceil(clen/chunks)) + for start in range(0, clen, chunk): + s = start+1 + e = start+chunk+1 + yield f"{cname}:{s}-{e}" + +def variant2dict(v): + for i, alt in enumerate(v.ALT): + dat = {"CHROM": v.CHROM, "POS": v.POS, "REF": v.REF, "ALT": alt, "QUAL": v.QUAL} + dat["call_rate"] = v.call_rate + for key, val in v.INFO: + if isinstance(val, str) and "," in val: + val = val.split(',') + if isinstance(val, tuple) or isinstance(val, list): + val = val[i] + dat[f"INFO_{key}"] = val + yield dat + +def bcftools_info_with_tags(vbcf): + res = [] + cmd=f"bcftools +fill-tags {vbcf} -Ou -- -d -t all,F_MISSING" + with Popen(cmd, shell=True, stdout=PIPE) as proc: + for v in tqdm(VCF(proc.stdout)): + for r in variant2dict(v): + res.append(r) + return res + +def one_chunk_stats(vcf, chunk, fill=True): + cmd=f"bcftools view -r {chunk} {vcf} -Ou" + if fill: + cmd = f"{cmd} | bcftools +fill-tags - -Ou -- -d -t all,F_MISSING" + res = [] + with Popen(cmd, shell=True, stdout=PIPE) as proc: + for v in VCF(proc.stdout): + for r in variant2dict(v): + res.append(r) + return res + +def chunkwise_bcfools_stats(args): + res = [] + with ProcessPoolExecutor(args.threads) as exc: + jobs = [] + for region in parallel_regions(args.vcf): + jobs.append(exc.submit(one_chunk_stats, args.vcf, region, fill=args.fill_tags_first)) + for job in tqdm(as_completed(jobs), total=len(jobs), unit="chunk"): + res.extend(job.result()) + return res + +def filter_fields(dat, fields=None): + if fields is None: + return dat + return {k: dat[k] for k in fields if k in dat} + +def main(argv=None): + """Use bcftools to calculate various 
statistics, outputing an R-ready table""" + ap = argparse.ArgumentParser() + ap.add_argument("-o", "--output", default=stdout, type=argparse.FileType("wt"), + help="Output TSV file") + ap.add_argument("-f", "--fields", nargs="+", + help="Use only these fields") + ap.add_argument("-F", "--fill-tags-first", action="store_true", + help="Use bcftools +fill-tags to pre-fill more fields (forces no parallelism)") + ap.add_argument("-t", "--threads", default=1, type=int, + help="Number of parallel threads") + + ap.add_argument("vcf") + args = ap.parse_args(argv) + + #if args.fill_tags_first: + # variants = bcftools_info_with_tags(args.vcf) + #else: + variants = chunkwise_bcfools_stats(args) + if args.fields: + variants = [filter_fields(x, args.fields) for x in variants] + + df = pd.DataFrame(variants) + df.to_csv(args.output, sep="\t", index=False) + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index a281f00..b219d00 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,14 +4,19 @@ description = "Miscelanelous python-based bioinformatics utils" authors = [ { name = "Dr. K. D. Murray", email = "info@gekkonid.com" } ] -requires_python = ">=3.8" +requires-python = ">=3.8" dependencies = [ + "tqdm", "biopython", + "cyvcf2", + "pandas", ] dynamic = [ "version", ] - +[project.readme] +file = "README.md" +content-type = "text/markdown" [project.scripts] blsl = "blsl:main"
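
For context on how all the new `cmds[...] = ...` registrations above are reached at runtime: `pyproject.toml` points the `blsl` console script at `blsl:main`, which dispatches on the first CLI argument using that `cmds` dict. The `main()` function itself is not part of this patch, so the sketch below is only an illustration of the registry-dispatch pattern, under the assumption that each subtool's `*_main(argv)` accepts the remaining arguments and runs its own argparse; names and messages here are illustrative, not the repository's actual implementation.

```python
# Illustrative sketch only: blsl's real main() is not shown in this diff.
# Assumes the cmds registry pattern used in blsl/__init__.py, where each
# subtool registers "name" -> entry function.
from sys import argv, exit, stderr

cmds = {}  # populated by imports like: cmds["nstitch"] = nstitch_main


def mainhelp(args=None):
    """Print this help message"""
    print("USAGE: blsl <subtool> [options...]\n\nWhere <subtool> is one of:", file=stderr)
    for tool, entry in sorted(cmds.items()):
        # first line of each subtool's docstring doubles as its one-line help
        doc = (entry.__doc__ or "").strip().splitlines()
        print(f"  {tool}: {doc[0] if doc else ''}", file=stderr)


def main():
    # no subtool, or an explicit help request: print the overview and stop
    if len(argv) < 2 or argv[1] in {"-h", "--help", "help"}:
        mainhelp()
        exit(0)
    entry = cmds.get(argv[1])
    if entry is None:
        print(f"ERROR: unknown subtool {argv[1]!r}", file=stderr)
        mainhelp()
        exit(1)
    # hand the remaining argv to the subtool's own argparse-based entry point
    exit(entry(argv[2:]))
```

This pattern is also why the `try/except ImportError` guard around the `vcfstats` import above works: a subtool whose optional dependencies (cyvcf2, pandas) are missing simply never lands in `cmds`, and the rest of the CLI keeps functioning.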