From 3ef6341eb8ca15692f51275016c227ede84f5ed4 Mon Sep 17 00:00:00 2001 From: "Dr. K.D. Murray" Date: Wed, 19 Apr 2023 06:24:03 +0200 Subject: [PATCH 01/28] fix gh action --- .github/workflows/python-publish.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index 634faad..5a6b1d7 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -11,7 +11,7 @@ name: Upload Python Package on: push: tags: - - * + - '*' workflow_dispatch: permissions: From 8ade0b1bffcac3e7961c53bcf49d950e36d8d62a Mon Sep 17 00:00:00 2001 From: "Dr. K.D. Murray" Date: Wed, 19 Apr 2023 06:29:15 +0200 Subject: [PATCH 02/28] Fixes to pyproject --- pyproject.toml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a281f00..dfbcac1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,14 +4,16 @@ description = "Miscelanelous python-based bioinformatics utils" authors = [ { name = "Dr. K. D. 
Murray", email = "info@gekkonid.com" } ] -requires_python = ">=3.8" +requires-python = ">=3.8" dependencies = [ "biopython", ] dynamic = [ "version", ] - +[project.readme] +file = "README.md" +content-type = "text/markdown" [project.scripts] blsl = "blsl:main" From 22b5bd0307e697e05c8daca884fe9f667d5f4c5d Mon Sep 17 00:00:00 2001 From: Kevin Murray Date: Wed, 3 May 2023 09:57:32 +0200 Subject: [PATCH 03/28] add gg2k and nstitch --- blsl/__init__.py | 5 ++++ blsl/_utils.py | 32 +++++++++++++++++++++ blsl/gg2k.py | 73 ++++++++++++++++++++++++++++++++++++++++++++++++ blsl/ildemux.py | 16 ++--------- blsl/nstitch.py | 52 ++++++++++++++++++++++++++++++++++ 5 files changed, 164 insertions(+), 14 deletions(-) create mode 100644 blsl/_utils.py create mode 100644 blsl/gg2k.py create mode 100644 blsl/nstitch.py diff --git a/blsl/__init__.py b/blsl/__init__.py index 6602adc..892a088 100644 --- a/blsl/__init__.py +++ b/blsl/__init__.py @@ -43,6 +43,11 @@ from .uniref_accessionmap import main as uniref_main cmds["uniref-acc2taxid"] = uniref_main +from .nstitch import nstitch_main +cmds["nstitch"] = nstitch_main + +from .gg2k import gg2k_main +cmds["gg2k"] = gg2k_main def mainhelp(argv=None): """Print this help message""" diff --git a/blsl/_utils.py b/blsl/_utils.py new file mode 100644 index 0000000..33b4bad --- /dev/null +++ b/blsl/_utils.py @@ -0,0 +1,32 @@ + +def rc(seq): + d = {"A": "T", "G":"C", "C":"G", "T":"A"} + return "".join([d.get(x, "N") for x in reversed(list(seq.upper()))]) + +def fqpair(stream): + fqp = list() + for line in stream: + fqp.append(line.rstrip("\n")) + if len(fqp) == 8: + yield fqp + fqp = list() + if len(fqp) == 8: + yield fqp + else: + assert len(fqp) == 0 + +def fqparse(stream): + fqp = list() + for line in stream: + fqp.append(line.rstrip("\n")) + if len(fqp) == 4: + yield fqp + fqp = list() + if len(fqp) == 4: + yield fqp + else: + assert len(fqp) == 0 + + + + diff --git a/blsl/gg2k.py b/blsl/gg2k.py new file mode 100644 index 0000000..f40305a 
--- /dev/null +++ b/blsl/gg2k.py @@ -0,0 +1,73 @@ +# Copyright (c) 2023 Dr. K. D. Murray/Gekkonid Consulting +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +import csv +import sys +import math +import argparse +from collections import defaultdict + +def recursive_defaultdict(): + x = defaultdict(recursive_defaultdict) + x["__count__"] = 0 + return x + +def recursive_taxtable(lineage, level=0): + yield (level, lineage["__name__"], lineage["__count__"], lineage["__line__"]) + for k, v in reversed(sorted(filter(lambda x: x[0] not in ["__count__", "__name__", "__line__"], + lineage.items()), + key=lambda x: x[1]["__count__"])): + yield from recursive_taxtable(v, level=level+1) + + +def gg2k_main(argv=None): + """Summarise a table with GreenGenes-style lineages into a kraken-style report.""" + ap = argparse.ArgumentParser() + ap.add_argument("-f", "--column", required=True, type=lambda x: int(x) - 1, + help="Which column contains lineages?") + ap.add_argument("-d", "--column-delim", default="\t", + help="Quality score for glue sequences") + ap.add_argument("-D", "--lineage-delim", default=";", + help="Delimiter between nodes in a taxon lineage") + ap.add_argument("-L", "--max-level", default=None, type=lambda x: int(x) - 1, + help="Maximum level to descend to.") + ap.add_argument("table", help="Table of taxon hits") + args = ap.parse_args(argv) + + + class OurDialect(csv.unix_dialect): + delimiter = args.column_delim + quotechar = '"' + doublequote = True + skipinitialspace = True + + + lineage = recursive_defaultdict() + with open(args.table) as fh: + for rec in csv.reader(fh, dialect=OurDialect): + linstr = rec[args.column] + lin = linstr.split(args.lineage_delim) + X = lineage + X["__count__"] += 1 + X["__name__"] = "root" + X["__line__"] = "NA" + for i, t in enumerate(lin): + X = X[t] + X["__count__"] += 1 + 
X["__name__"] = t + X["__line__"] = linstr + if args.max_level is not None and i >= args.max_level: + break + + n = int(math.ceil(math.log10(lineage["__count__"]))) + for level, name, count, linstr in recursive_taxtable(lineage): + linstr = linstr.split(args.lineage_delim, level) + linstr = args.lineage_delim.join(linstr[:-1]) + print(str(count).rjust(n + level * 2), name.ljust(15), level, linstr, sep="|") + +if __name__ == "__main__": + gg2k_main() + diff --git a/blsl/ildemux.py b/blsl/ildemux.py index 89ff2fe..aac5c87 100644 --- a/blsl/ildemux.py +++ b/blsl/ildemux.py @@ -4,6 +4,8 @@ from tqdm import tqdm from collections import Counter +from ._utils import fqpair, rc + def hamming_cloud(seq, distance=1): """Generate DNA seqs whose Hamming distance from seq is <= distance @@ -88,16 +90,6 @@ def flush(self): def __del__(self): self.flush() -def fqpair(stream): - fqp = list() - for line in stream: - fqp.append(line) - if len(fqp) == 8: - yield fqp - fqp = list() - if len(fqp) == 8: - yield fqp - def gethdrdat(hdr): spot, idx = hdr.rstrip("\n").split(" ") idxpair = idx.split(":")[3] @@ -111,10 +103,6 @@ def fqp2idx(fqp): assert(ip1 == ip2) return ip1 -def rc(seq): - d = {"A": "T", "G":"C", "C":"G", "T":"A"} - return "".join([d.get(x, "N") for x in reversed(list(seq.upper()))]) - def make_sample_map(tablefile, outdir, fileext=".fq", distance=1): from csv import DictReader from itertools import product diff --git a/blsl/nstitch.py b/blsl/nstitch.py new file mode 100644 index 0000000..02556a6 --- /dev/null +++ b/blsl/nstitch.py @@ -0,0 +1,52 @@ +# Copyright (c) 2023 Dr. K. D. Murray/Gekkonid Consulting +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +import gzip +import sys +import argparse +from ._utils import fqparse, rc + +def nstitch_main(argv=None): + """Combine R1 + R2 into single sequences, with an N in the middle""" + ap = argparse.ArgumentParser() + ap.add_argument("-s", "--seq", default="N", + help="Sequence used to glue together R1 + R2 of each pair") + ap.add_argument("-q", "--qual", default="!", + help="Quality score for glue sequences") + ap.add_argument("-I", "--input-interleaved", action="store_true", + help="Only read R1 which contains interleaved sequences") + ap.add_argument("r1", help="R1 fastq.gz") + ap.add_argument("r2", help="R2 fastq.gz", nargs="?") + args = ap.parse_args(argv) + + if args.input_interleaved: + r1 = gzip.open(args.r1, "rt") + seqs = zip(fqparse(r1), fqparse(r1)) + else: + r1 = gzip.open(args.r1, "rt") + r2 = gzip.open(args.r2, "rt") + seqs = zip(fqparse(r1), fqparse(r2)) + for s1, s2 in seqs: + n1, n2 = s1[0].lstrip("@"), s2[0].lstrip("@") + i1 = n1.split(" ", 1)[0] + i2 = n2.split(" ", 1)[0] + if i1 != i2: + print(f"ERROR: read IDs don't match: {i1} {i2}", file=sys.stderr) + sys.exit(1) + newseq = f"{s1[1]}{args.seq}{rc(s2[1])}" + newqual = f"{s1[3]}{args.qual*len(args.seq)}{s2[3]}" + assert len(s1[1]) == len(s1[3]) + assert len(s2[1]) == len(s2[3]) + assert len(newseq) == len(newqual) + print(f"@{n1} {n2}") + print(newseq) + print("+") + print(newqual) + r1.close() + r2.close() + +if __name__ == "__main__": + nstich_main() From 4fffc2ace222fc9dc5394ca9b3a3a0242d9afd7f Mon Sep 17 00:00:00 2001 From: Kevin Murray Date: Sat, 6 May 2023 10:24:02 +0200 Subject: [PATCH 04/28] Add equalbestblast and tabcat; add lca to gg2k --- blsl/__init__.py | 7 +++++ blsl/equalbestblast.py | 64 ++++++++++++++++++++++++++++++++++++++++++ blsl/gg2k.py | 63 ++++++++++++++++++++++++++++++++--------- blsl/tabcat.py | 45 +++++++++++++++++++++++++++++ 4 files changed, 165 insertions(+), 14 deletions(-) create mode 100644 blsl/equalbestblast.py create mode 100644 blsl/tabcat.py diff --git 
a/blsl/__init__.py b/blsl/__init__.py index 892a088..4fda850 100644 --- a/blsl/__init__.py +++ b/blsl/__init__.py @@ -49,6 +49,13 @@ from .gg2k import gg2k_main cmds["gg2k"] = gg2k_main +from .equalbestblast import equalbestblast_main +cmds["equalbestblast"] = equalbestblast_main + +from .tabcat import tabcat_main +cmds["tabcat"] = tabcat_main + + def mainhelp(argv=None): """Print this help message""" print("USAGE: blsl [options...]\n\n") diff --git a/blsl/equalbestblast.py b/blsl/equalbestblast.py new file mode 100644 index 0000000..31d7340 --- /dev/null +++ b/blsl/equalbestblast.py @@ -0,0 +1,64 @@ +# Copyright (c) 2023 Dr. K. D. Murray/Gekkonid Consulting +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +import csv +import sys +import math +import argparse + +class blast6(csv.unix_dialect): + delimiter = "\t" + quotechar = '"' + doublequote = False + escapechar = "\\" + skipinitialspace = True + +def output_best(recs, fields): + nbegin = len(recs) + mineval =None + maxalnlen=None + if nbegin < 1: + return + mineval = min(float(x["evalue"]) for x in recs) + recs = [ + rec for rec in recs if float(rec["evalue"]) <= mineval + ] + maxalnlen = max(float(x["length"]) for x in recs) + recs = [ + rec for rec in recs if float(rec["length"]) >= maxalnlen + ] + nend=len(recs) + print(recs[0]["qseqid"], nbegin, nend, mineval, maxalnlen, file=sys.stderr) + for rec in recs: + print(*[rec[x] for x in fields], sep="\t") + + +def equalbestblast_main(argv=None): + """Output only the best blast hits.""" + ap = argparse.ArgumentParser() + ap.add_argument("-f", "--outfmt", default="qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore", + help="Blast outfmt headers (default is the default for blast -outfmt 6)") + ap.add_argument("table", help="Table of blast hits") + args = ap.parse_args(argv) + + 
+ fields = args.outfmt.split(" ") + query = None + hits = [] + with open(args.table) as fh: + for rec in csv.DictReader(fh, dialect=blast6, fieldnames=fields): + if query != rec["qseqid"]: + if query is not None: + output_best(hits, fields) + hits = [] + query = rec["qseqid"] + hits.append(rec) + if query is not None: + output_best(hits, fields) + +if __name__ == "__main__": + equalbestblast_main() + diff --git a/blsl/gg2k.py b/blsl/gg2k.py index f40305a..188f0fa 100644 --- a/blsl/gg2k.py +++ b/blsl/gg2k.py @@ -22,10 +22,24 @@ def recursive_taxtable(lineage, level=0): key=lambda x: x[1]["__count__"])): yield from recursive_taxtable(v, level=level+1) +def insert_lineage(tree, lin, max_level=None, n=1): + X = tree + X["__count__"] += n + X["__name__"] = "root" + X["__line__"] = "NA" + for i, t in enumerate(lin): + X = X[t] + X["__count__"] += n + X["__name__"] = t + X["__line__"] = ";".join(lin) + if max_level is not None and i >= max_level: + break def gg2k_main(argv=None): """Summarise a table with GreenGenes-style lineages into a kraken-style report.""" ap = argparse.ArgumentParser() + ap.add_argument("-l", "--lca-column", type=lambda x: int(x) - 1, + help="Take the lowest common ancestor per value in this column") ap.add_argument("-f", "--column", required=True, type=lambda x: int(x) - 1, help="Which column contains lineages?") ap.add_argument("-d", "--column-delim", default="\t", @@ -34,6 +48,8 @@ def gg2k_main(argv=None): help="Delimiter between nodes in a taxon lineage") ap.add_argument("-L", "--max-level", default=None, type=lambda x: int(x) - 1, help="Maximum level to descend to.") + ap.add_argument("-V", "--vsearch-size", default=None, type=lambda x: int(x) - 1, + help="Parse vsearch size=NNNN headers from this column, use NNNN as a count rather than # hits/reads.") ap.add_argument("table", help="Table of taxon hits") args = ap.parse_args(argv) @@ -44,29 +60,48 @@ class OurDialect(csv.unix_dialect): doublequote = True skipinitialspace = True - lineage = 
recursive_defaultdict() with open(args.table) as fh: + current_id = None + lca = [] for rec in csv.reader(fh, dialect=OurDialect): linstr = rec[args.column] lin = linstr.split(args.lineage_delim) - X = lineage - X["__count__"] += 1 - X["__name__"] = "root" - X["__line__"] = "NA" - for i, t in enumerate(lin): - X = X[t] - X["__count__"] += 1 - X["__name__"] = t - X["__line__"] = linstr - if args.max_level is not None and i >= args.max_level: - break + n = 1 + if args.lca_column is not None: + if current_id != rec[args.lca_column]: + if args.vsearch_size is not None and current_id is not None: + hdr = current_id + fields = dict(x.split("=") for x in hdr.split(";") if "=" in x) + n = int(fields["size"]) + insert_lineage(lineage, lca, args.max_level, n=n) + lca = lin + current_id = rec[args.lca_column] + for i, tax in enumerate(lin): + if i >= len(lca) or lca[i] != tax: + break + lca = lca[:i+1] + else: + if args.vsearch_size is not None: + hdr = rec[args.vsearch_size] + fields = dict(x.split("=") for x in hdr.split(";") if "=" in x) + n = int(fields["size"]) + insert_lineage(lineage, lin, args.max_level, n=n) + if lca: + if args.vsearch_size is not None: + hdr = current_id + fields = dict(x.split("=") for x in hdr.split(";") if "=" in x) + n = int(fields["size"]) + insert_lineage(lineage, lca, args.max_level, n=n) n = int(math.ceil(math.log10(lineage["__count__"]))) - for level, name, count, linstr in recursive_taxtable(lineage): + taxtbl = list(recursive_taxtable(lineage)) + max_name = max(len(t[1]) for t in taxtbl) + max_level = max(t[0] for t in taxtbl) + for level, name, count, linstr in taxtbl: linstr = linstr.split(args.lineage_delim, level) linstr = args.lineage_delim.join(linstr[:-1]) - print(str(count).rjust(n + level * 2), name.ljust(15), level, linstr, sep="|") + print(name.ljust(max_name + (max_level-level)*2).rjust(max_name + max_level * 2), str(count).rjust(n), level, linstr, sep="|") if __name__ == "__main__": gg2k_main() diff --git a/blsl/tabcat.py 
b/blsl/tabcat.py new file mode 100644 index 0000000..52ec6aa --- /dev/null +++ b/blsl/tabcat.py @@ -0,0 +1,45 @@ +# Copyright (c) 2023 Dr. K. D. Murray/Gekkonid Consulting +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +import csv +import sys +import math +import argparse + + +def tabcat_main(argv=None): + """Output only the best blast hits.""" + ap = argparse.ArgumentParser() + ap.add_argument("-d", "--delim", default=",", + help="Column delimiter.") + ap.add_argument("-o", "--output", default="-", type=argparse.FileType("wt"), + help="Column delimiter.") + ap.add_argument("-N", "--add-name", action="store_true", + help="Add a column with the file name") + ap.add_argument("xsvs", help="Input {C,T}SVs to concatenate", nargs="+") + args = ap.parse_args(argv) + + class xsv(csv.unix_dialect): + delimiter = args.delim + quotechar = '"' + doublequote = True + skipinitialspace = True + quoting=csv.QUOTE_MINIMAL + + outcsv = None + for file in args.xsvs: + with open(file) as fh: + for rec in csv.DictReader(fh, dialect=xsv): + rec["filename"] = file + if outcsv is None: + outcsv = csv.DictWriter(args.output, dialect=xsv, fieldnames=[k for k in rec]) + outcsv.writeheader() + outcsv.writerow(rec) + +if __name__ == "__main__": + tabcat_main() + + From f7817f2a795262c4036a8211170b11237f6cd0d5 Mon Sep 17 00:00:00 2001 From: Kevin Murray Date: Tue, 9 May 2023 17:09:42 +0200 Subject: [PATCH 05/28] Minor fixes to gg2k & equalbestblast --- blsl/__init__.py | 2 +- blsl/equalbestblast.py | 2 -- blsl/gg2k.py | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/blsl/__init__.py b/blsl/__init__.py index 4fda850..a9133fc 100644 --- a/blsl/__init__.py +++ b/blsl/__init__.py @@ -6,7 +6,7 @@ from sys import argv, exit -__version__ = "0.1.10" +__version__ = "0.1.11" cmds = {} diff --git a/blsl/equalbestblast.py 
b/blsl/equalbestblast.py index 31d7340..2fa9fa8 100644 --- a/blsl/equalbestblast.py +++ b/blsl/equalbestblast.py @@ -31,7 +31,6 @@ def output_best(recs, fields): rec for rec in recs if float(rec["length"]) >= maxalnlen ] nend=len(recs) - print(recs[0]["qseqid"], nbegin, nend, mineval, maxalnlen, file=sys.stderr) for rec in recs: print(*[rec[x] for x in fields], sep="\t") @@ -44,7 +43,6 @@ def equalbestblast_main(argv=None): ap.add_argument("table", help="Table of blast hits") args = ap.parse_args(argv) - fields = args.outfmt.split(" ") query = None hits = [] diff --git a/blsl/gg2k.py b/blsl/gg2k.py index 188f0fa..373c73e 100644 --- a/blsl/gg2k.py +++ b/blsl/gg2k.py @@ -94,7 +94,7 @@ class OurDialect(csv.unix_dialect): n = int(fields["size"]) insert_lineage(lineage, lca, args.max_level, n=n) - n = int(math.ceil(math.log10(lineage["__count__"]))) + n = int(math.ceil(math.log10(lineage["__count__"]+1))) taxtbl = list(recursive_taxtable(lineage)) max_name = max(len(t[1]) for t in taxtbl) max_level = max(t[0] for t in taxtbl) From b8f852609df62ec24c875dfb216f35134ec957e6 Mon Sep 17 00:00:00 2001 From: "Dr. K.D. 
Murray" Date: Fri, 12 May 2023 16:04:55 +0200 Subject: [PATCH 06/28] add esf --- blsl/__init__.py | 3 ++ blsl/esearchandfetch.py | 95 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) create mode 100644 blsl/esearchandfetch.py diff --git a/blsl/__init__.py b/blsl/__init__.py index a9133fc..8047fbc 100644 --- a/blsl/__init__.py +++ b/blsl/__init__.py @@ -55,6 +55,9 @@ from .tabcat import tabcat_main cmds["tabcat"] = tabcat_main +from .esearchandfetch import main as esf_main +cmds["esearchandfetch"] = esf_main + def mainhelp(argv=None): """Print this help message""" diff --git a/blsl/esearchandfetch.py b/blsl/esearchandfetch.py new file mode 100644 index 0000000..78f1db6 --- /dev/null +++ b/blsl/esearchandfetch.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python3 +from Bio import Entrez +from tqdm import tqdm + +import argparse +from time import sleep +from sys import stdout, stdin, stderr, exit +from pathlib import Path +import io + +def esearch(*args, chunks=10000, **kwargs): + kwargs["retmax"] = chunks + ids = [] + n = None + i = 0 + with tqdm(desc="esearch IDs") as progress: + while n is None or i < n: + this = Entrez.esearch(*args, **kwargs, retstart=i) + for x in range(3): + try: + sleep(0.5) + this = Entrez.read(this) + break + except RuntimeError: + sleep(0.5) + pass + else: + this = Entrez.read(this) + nret = len(this["IdList"]) + ids.extend(this["IdList"]) + i += nret + n = int(this["Count"]) + progress.total = n + progress.update(nret) + return ids + + +def efetch(db="nucleotide", rettype="genbank", retmode="text", ids=None, chunks=2000): + with tqdm(desc=f"efetch {rettype}", total=len(ids)) as progress: + for start in range(0, len(ids), chunks): + stop = min(start + chunks, len(ids)) + idstr = ",".join(ids[start:stop]) + res = Entrez.efetch(db=db, retmode=retmode, rettype=rettype, id=idstr) + if start != 0 and rettype=="runinfo": + # Skip header on non-first one + hdr = next(res) + for line in res: + if isinstance(line, bytes): + 
line=line.decode('utf-8') + yield line + progress.update(stop-start) + + +def main(argv=None): + ap = argparse.ArgumentParser() + ap.add_argument("-e", "--email", type=str, + help="Email for Entrez API") + ap.add_argument("-c", "--chunksize", type=int, default=1000, + help="Number of records to download per chunk") + ap.add_argument("-o", "--out", type=argparse.FileType('w'), default="-", + help="Genbank output") + ap.add_argument("-d", "--db", required=True, + help="db api param") + ap.add_argument("-r", "--rettype", required=True, + help="rettype api param") + ap.add_argument("-m", "--retmode", default="text", + help="API return mode") + ap.add_argument("-i", "--id-file", type=Path, + help="List of IDs (give with --term to save a list of IDs to this file, or without to use this list of IDs)") + ap.add_argument("-t", "--term", type=str) + args = ap.parse_args(argv) + + Entrez.email = args.email + + if args.term: + ids = esearch(term=args.term, db=args.db) + if args.id_file: + with args.id_file.open("w") as fh: + for id in ids: + print(id, file=fh) + elif args.id_file: + ids = [] + with args.id_file.open("r") as fh: + for line in fh: + ids.append(line.rstrip()) + else: + print("ERROR: must give either --term to search for, or a list of ids in --id-file", file=stderr) + exit(1) + + for line in efetch(ids=ids, db=args.db, rettype=args.rettype, retmode=args.retmode, chunks=1000): + args.out.write(line) + + +if __name__ == "__main__": + main() From ae5705f7543ded14559b0882c5ebb28f4b06c826 Mon Sep 17 00:00:00 2001 From: Kevin Murray Date: Fri, 12 May 2023 16:06:56 +0200 Subject: [PATCH 07/28] add deepclust2fa --- blsl/__init__.py | 3 ++ blsl/deepclust2fa.py | 95 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) create mode 100644 blsl/deepclust2fa.py diff --git a/blsl/__init__.py b/blsl/__init__.py index 8047fbc..eab787c 100644 --- a/blsl/__init__.py +++ b/blsl/__init__.py @@ -58,6 +58,9 @@ from .esearchandfetch import main as esf_main 
cmds["esearchandfetch"] = esf_main +from .deepclust2fa import deepclust2fa_main +cmds["deepclust2fa"] = deepclust2fa_main + def mainhelp(argv=None): """Print this help message""" diff --git a/blsl/deepclust2fa.py b/blsl/deepclust2fa.py new file mode 100644 index 0000000..6e9f82d --- /dev/null +++ b/blsl/deepclust2fa.py @@ -0,0 +1,95 @@ +# Copyright (c) 2023 Dr. K. D. Murray/Gekkonid Consulting +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +import csv +import sys +import argparse +from collections import defaultdict +from pathlib import Path +import gzip + +from Bio.SeqIO import parse +from tqdm import tqdm + + +class tsv(csv.unix_dialect): + delimiter = "\t" + quotechar = '"' + doublequote = False + escapechar = "\\" + skipinitialspace = True + + +def deepclust2fa_main(argv=None): + """Split a .faa by the clusters diamond deepclust finds""" + ap = argparse.ArgumentParser() + ap.add_argument("-z", "--min-cluster-size", type=int, default=1, + help="Minimum cluster size") + ap.add_argument("-b", "--buffer", action="store_true", + help="Buffer sequences rather than writing (more ram, more speed)") + ap.add_argument("-c", "--clusters", required=True, + help="Diamond deepclust result") + ap.add_argument("-f", "--faa", required=True, + help="Fasta sequences (that you gave to diamond deepclust --db)") + ap.add_argument("-o", "--outdir", required=True, type=Path, + help="Output directory (files named by their centroids)") + args = ap.parse_args(argv) + + seq2cent = {} + cent2seq = defaultdict(list) + with open(args.clusters) as fh: + for rec in csv.reader(fh, dialect=tsv): + if rec[0] == "centroid" and rec[1] == "member": + continue + c, s = rec[0:2] + seq2cent[s] = c + cent2seq[c].append(s) + cent2seq_new = {} + seq2cent_new = {} + for cent, seqs in cent2seq.items(): + if len(seqs) >= args.min_cluster_size: + 
cent2seq_new[cent] = seqs + for seq in seqs: + seq2cent_new[seq] = cent + cent2seq = cent2seq_new + seq2cent = seq2cent_new + print(f"Read clusters, {len(cent2seq)} clusters with size >= {args.min_cluster_size}", file=sys.stderr) + + for cent in cent2seq: + outf = args.outdir/ (c+".fa") + with open(outf, "w") as fh: + pass + + if args.faa.endswith(".gz"): + fh = gzip.open(args.faa, "rt") + else: + fh = open(args.faa) + + buffers = defaultdict(list) + for seq in tqdm(parse(fh, "fasta")): + centroid = seq2cent.get(seq.id) + if centroid is None: + continue + if args.buffer: + buffers[centroid].append((seq.id, seq.seq)) + else: + outf = args.outdir/ (centroid+".fa") + with open(outf, "a") as ofh: + print(f">{seq.id}", file=ofh) + print(seq.seq, file=ofh) + fh.close() + + for centroid, seqs in buffers.items(): + outf = args.outdir/ (centroid+".fa") + with open(outf, "w") as ofh: + for seq in seqs: + print(f">{seq[0]}", file=ofh) + print(seq[1], file=ofh) + + +if __name__ == "__main__": + deepclust2fa_main() + From e72457fd9d6c9cfe651fa5b40d26e92b4c6ddbf8 Mon Sep 17 00:00:00 2001 From: Kevin Murray Date: Wed, 21 Jun 2023 11:53:08 +0200 Subject: [PATCH 08/28] Add farename, fixes to genigvjs & esearchandfetch --- blsl/__init__.py | 3 +++ blsl/esearchandfetch.py | 1 + blsl/farename.py | 25 +++++++++++++++++++++++++ blsl/genigvjs.py | 9 ++++++--- 4 files changed, 35 insertions(+), 3 deletions(-) create mode 100644 blsl/farename.py diff --git a/blsl/__init__.py b/blsl/__init__.py index eab787c..0a5e335 100644 --- a/blsl/__init__.py +++ b/blsl/__init__.py @@ -61,6 +61,9 @@ from .deepclust2fa import deepclust2fa_main cmds["deepclust2fa"] = deepclust2fa_main +from .farename import farename_main +cmds["farename"] = farename_main + def mainhelp(argv=None): """Print this help message""" diff --git a/blsl/esearchandfetch.py b/blsl/esearchandfetch.py index 78f1db6..c4adf18 100644 --- a/blsl/esearchandfetch.py +++ b/blsl/esearchandfetch.py @@ -52,6 +52,7 @@ def efetch(db="nucleotide", 
rettype="genbank", retmode="text", ids=None, chunks= def main(argv=None): + """Use the Entrez API to search for and download something. A CLI companion to the NCBI search box""" ap = argparse.ArgumentParser() ap.add_argument("-e", "--email", type=str, help="Email for Entrez API") diff --git a/blsl/farename.py b/blsl/farename.py new file mode 100644 index 0000000..3c03bfb --- /dev/null +++ b/blsl/farename.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python3 +from sys import argv +import argparse + +from Bio import SeqIO + +def farename_main(argv): + """Rename sequences in a fasta file sequentially""" + ap = argparse.ArgumentParser() + ap.add_argument("-p", "--prefix", type=str, default="", + help="Prefix for each sequence ID (e.g. SEQ_ -> SEQ_1, SEQ_2 etc)") + ap.add_argument("fastas", help="FASTA file of contigs", nargs="+") + args = ap.parse_args(argv) + + for infile in args.fastas: + i = 0 + with open(infile) as fh: + for seq in SeqIO.parse(fh, 'fasta'): + i += 1 + print(f">{args.prefix}{i} {seq.id} {seq.description}") + print(seq.seq) + + +if __name__ == "__main__": + farename_main() diff --git a/blsl/genigvjs.py b/blsl/genigvjs.py index 883deb8..0c1dd03 100644 --- a/blsl/genigvjs.py +++ b/blsl/genigvjs.py @@ -116,7 +116,10 @@ def genigvjs_main(argv=None): for i, track in enumerate(args.tracks): track = Path(track) base = track.stem - format = track.suffix.lstrip(".") + dots = track.name.split('.') + format = dots[-1] + if format == "gz": + format = dots[-2] index = None if format == "bam": index = Path(str(track) + ".bai") @@ -126,7 +129,7 @@ def genigvjs_main(argv=None): index = None trackdat = { "name": base, - "format": track.suffix.lstrip("."), + "format": format, "autoHeight": True, "minHeight": 50, "maxHeight": 500, @@ -146,7 +149,7 @@ def genigvjs_main(argv=None): with open(outdir / "index.html", "w") as fh: html = template \ .replace("__title__", args.title) \ - .replace("__data__", json.dumps(data)) + .replace("__data__", json.dumps(data, indent=4)) 
fh.write(html) if __name__ == "__main__": From 3e6474cbd5daefe891c9b20b229a7c2d7b0709de Mon Sep 17 00:00:00 2001 From: Kevin Murray Date: Wed, 21 Jun 2023 12:15:34 +0200 Subject: [PATCH 09/28] add ebio2rl2s, galhist, fix usages for some tools & readme --- README.md | 11 ++++++++++- blsl/__init__.py | 6 ++++++ blsl/ebiosra2rl2s.py | 33 +++++++++++++++++++++++++++++++++ blsl/galhist.py | 43 +++++++++++++++++++++++++++++++++++++++++++ blsl/tabcat.py | 4 ++-- 5 files changed, 94 insertions(+), 3 deletions(-) create mode 100755 blsl/ebiosra2rl2s.py create mode 100755 blsl/galhist.py diff --git a/README.md b/README.md index 4ad8b3c..8d60355 100644 --- a/README.md +++ b/README.md @@ -18,17 +18,26 @@ USAGE: blsl [options...] Where is one of: - genigvjs: Generate a simple IGV.js visualisation of some bioinf files. telogrep: Search contigs for known telomere repeats n50: Calculate N50 and total length of a set of contigs falen: Tabulate the lengths of sequences in a FASTA file mask2bed: The inverse of bedtools maskfasta: softmasked fasta -> unmasked fasta + mask.bed + genigvjs: Generate a simple IGV.js visualisation of some bioinf files. liftoff-gff3: Obtain an actually-useful GFF3 from Liftoff by fixing basic GFF3 format errors pansn-rename: Add, remove, or modify PanSN-style prefixes to contig/chromosome names in references ildemux: Demultiplex modern illumina reads from read headers. ilsample: Sample a fraction of read pairs from an interleaved fastq file regionbed: Make a bed/region file of genome windows uniref-acc2taxid: Make a ncbi-style acc2taxid.map file for a uniref fasta + nstitch: Combine R1 + R2 into single sequences, with an N in the middle + gg2k: Summarise a table with GreenGenes-style lineages into a kraken-style report. + equalbestblast: Output only the best blast hits. + tabcat: Concatenate table (c/tsv) files, adding the filename as a column + esearchandfetch: Use the Entrez API to search for and download something. 
A CLI companion to the NCBI search box + deepclust2fa: Split a .faa by the clusters diamond deepclust finds + farename: Rename sequences in a fasta file sequentially + ebiosra2rl2s: INTERNAL: MPI Tübingen tool. Make a runlib-to-sample map table from ebio sra files + galhist: Make a summary histogram of git-annex-list output help: Print this help message diff --git a/blsl/__init__.py b/blsl/__init__.py index 0a5e335..f69ba85 100644 --- a/blsl/__init__.py +++ b/blsl/__init__.py @@ -64,6 +64,12 @@ from .farename import farename_main cmds["farename"] = farename_main +from .ebiosra2rl2s import main as rl2s_main +cmds["ebiosra2rl2s"] = rl2s_main + +from .galhist import main as galhist_main +cmds["galhist"] = galhist_main + def mainhelp(argv=None): """Print this help message""" diff --git a/blsl/ebiosra2rl2s.py b/blsl/ebiosra2rl2s.py new file mode 100755 index 0000000..1eab030 --- /dev/null +++ b/blsl/ebiosra2rl2s.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 +import re +import json +import select +from sys import stdin, stdout, stderr, exit +import sys + +def main(argv=None): + """INTERNAL: MPI Tübingen tool. Make a runlib-to-sample map table from ebio sra files""" + samples = {} + if not select.select([stdin,],[],[],2.0)[0]: + print("This is an internal MPI Tübingen tool to make a table that mapps runs of a given library to sample names, e.g. for Acanthophis.", file=stderr) + print(f"USAGE: find /ebio/abt6_sra/... 
-type f -name '*.fq.gz' | ebiosra2rl2s > run-lib2sample.tsv", file=stderr) + exit(1) + for line in stdin: + m = re.match(r"(.*/illumina_.+_flowcell.+_SampleId(\w+)_(RunId\d+_LaneId\d+)/.+_(R\d)_\d+.fastq.gz)", line.strip()) + if m is not None: + path, sample, run, read = m.groups() + if sample not in samples: + samples[sample] = {run: {read: path}} + elif run not in samples[sample]: + samples[sample][run] = {read: path} + else: + samples[sample][run][read] = path + #json.dump(samples, stdout, indent=2) + print("library", "run", "read1_uri", "read2_uri", file=stdout, sep='\t') + for sample, dat in samples.items(): + for run, reads in dat.items(): + print(sample, run, reads["R1"], reads["R2"], file=stdout, sep='\t') + + +if __name__ == "__main__": + main() diff --git a/blsl/galhist.py b/blsl/galhist.py new file mode 100755 index 0000000..8bc39e0 --- /dev/null +++ b/blsl/galhist.py @@ -0,0 +1,43 @@ +#!/usr/bin/python3 +from shutil import get_terminal_size +from collections import Counter +from sys import stdin, stdout, stderr, exit +from math import ceil, log10 +import select + +def main(argv=None): + """Make a summary histogram of git-annex-list output""" + if not select.select([stdin], [], [], 1)[0]: + print("USAGE: git annex list ... | blsl galhist", file=stderr) + print("This tool makes a histogram of file occurence patterns to summarise the output of git annex list. 
Just pipe git anex list in on stdin.", file=stderr) + exit(1) + + twidth = 80 + try: + if stdout.isatty(): + twidth = get_terminal_size()[0] + except: + pass + + header = [] + counts = Counter() + for line in stdin: + if not header: + header.append(line.rstrip()) + elif line.startswith("|"): + header.append(line.rstrip()) + else: + key, file = line.split(" ", maxsplit=1) + counts[key]+=1 + + for l in header: + print(l) + _, mcv = counts.most_common(1)[0] + nw = int(ceil(log10(mcv))) + remaining = twidth - len(header) - 1 - nw - 2 + + for k, v in counts.most_common(): + vbarw = min(round(v/mcv*remaining), remaining) + vbar = ("#" * vbarw).ljust(remaining, " ") + npad = str(v).rjust(nw, " ") + print(f"{k} {vbar} ({npad})") diff --git a/blsl/tabcat.py b/blsl/tabcat.py index 52ec6aa..8757fcf 100644 --- a/blsl/tabcat.py +++ b/blsl/tabcat.py @@ -11,7 +11,7 @@ def tabcat_main(argv=None): - """Output only the best blast hits.""" + """Concatenate table (c/tsv) files, adding the filename as a column""" ap = argparse.ArgumentParser() ap.add_argument("-d", "--delim", default=",", help="Column delimiter.") @@ -33,7 +33,7 @@ class xsv(csv.unix_dialect): for file in args.xsvs: with open(file) as fh: for rec in csv.DictReader(fh, dialect=xsv): - rec["filename"] = file + rec["tabcat_filename"] = file if outcsv is None: outcsv = csv.DictWriter(args.output, dialect=xsv, fieldnames=[k for k in rec]) outcsv.writeheader() From c802dab8af4ef3e150dcb463f83413b625f682e4 Mon Sep 17 00:00:00 2001 From: Kevin Murray Date: Fri, 23 Jun 2023 11:22:52 +0200 Subject: [PATCH 10/28] fix permissions --- blsl/ebiosra2rl2s.py | 0 blsl/galhist.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 blsl/ebiosra2rl2s.py mode change 100755 => 100644 blsl/galhist.py diff --git a/blsl/ebiosra2rl2s.py b/blsl/ebiosra2rl2s.py old mode 100755 new mode 100644 diff --git a/blsl/galhist.py b/blsl/galhist.py old mode 100755 new mode 100644 From da189bc2f0b3a2e3ba76d06ad54467787a86b3b2 
Mon Sep 17 00:00:00 2001 From: Kevin Murray Date: Sat, 15 Jul 2023 15:57:21 +0200 Subject: [PATCH 11/28] add GFFcat --- blsl/__init__.py | 2 ++ blsl/gffcat.py | 62 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) create mode 100755 blsl/gffcat.py diff --git a/blsl/__init__.py b/blsl/__init__.py index f69ba85..734d242 100644 --- a/blsl/__init__.py +++ b/blsl/__init__.py @@ -70,6 +70,8 @@ from .galhist import main as galhist_main cmds["galhist"] = galhist_main +from .gffcat import gffcat_main +cmds["gffcat"] = gffcat_main def mainhelp(argv=None): """Print this help message""" diff --git a/blsl/gffcat.py b/blsl/gffcat.py new file mode 100755 index 0000000..6232554 --- /dev/null +++ b/blsl/gffcat.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python3 +import argparse +from sys import stderr, stdout, stdin + +KNOWN_HEADERS = { + "##gff-version", + "##feature-ontology", +} + + +def gffcat_main(argv=None): + """Concatenate GFF3 files, resepcting header lines and FASTA sections""" + ap = argparse.ArgumentParser("gffcat") + ap.add_argument("-o", "--output", type=argparse.FileType("wt"), default=stdout, + help="Output gff file") + ap.add_argument("inputs", nargs="+") + args = ap.parse_args(argv) + + fastalines = [] + bodylines = [] + headerlines = {} + + for file in args.inputs: + with open(file) as fh: + in_fasta = False + for line in fh: + line = line.rstrip() + if not line: + continue + ll = line.lower().split()[0] + if ll in KNOWN_HEADERS: + if ll in headerlines: + if headerlines[ll] != line: + print("WARN: header line with different value in differnt files. 
Only using value from first file", file=stderr) + else: + headerlines[ll] = line + elif ll.startswith("##fasta"): + in_fasta = True + else: + if in_fasta: + fastalines.append(line) + else: + bodylines.append(line) + + for hdr in KNOWN_HEADERS: + if hdr in headerlines: + print(headerlines[hdr], file=args.output) + + for line in bodylines: + print(line, file=args.output) + + if fastalines: + print("##fasta", file=args.output) + + for line in fastalines: + print(line, file=args.output) + + print(f"DONE! {len(headerlines)} headers, {len(bodylines)} body lines, {len(fastalines)} FASTA lines", file=stderr) + + +if __name__ == "__main__": + gffcat_main() From 7f260d2e0f17e34b406fcb036f5eca99e3218d5d Mon Sep 17 00:00:00 2001 From: Kevin Murray Date: Mon, 24 Jul 2023 13:41:39 +0200 Subject: [PATCH 12/28] Add gffparse library --- blsl/gffparse.py | 249 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 249 insertions(+) create mode 100644 blsl/gffparse.py diff --git a/blsl/gffparse.py b/blsl/gffparse.py new file mode 100644 index 0000000..2226995 --- /dev/null +++ b/blsl/gffparse.py @@ -0,0 +1,249 @@ +#!/usr/bin/env python3 +""" +A simple parser for the GFF3 format. + +Test with transcripts.gff3 from +http://www.broadinstitute.org/annotation/gebo/help/gff3.html. + +Format specification source: +http://www.sequenceontology.org/gff3.shtml + +Version 1.1: Python3 ready +Version 2.0: @kdm9's version. Add code for comparison & normalisationG +""" +from collections import namedtuple, Counter +import gzip +import urllib.request, urllib.parse, urllib.error +from sys import stderr, stdout + +from tqdm.auto import tqdm + +__author__ = "Uli Köhler" +__license__ = "Apache License v2.0" +__version__ = "1.1" + +#Initialized GeneInfo named tuple. 
Note: namedtuple is immutable +gffInfoFields = ["seqid", "source", "type", "start", "end", "score", "strand", "phase", "attributes"] +GFFRecord = namedtuple("GFFRecord", gffInfoFields) + +ontological_order = [ + "chromosome", "gene", "pseudogene", "pseudogenic_region", "mRNA", "transcript", "pseudogeneic_transcript", + "exon", "pseudogenic_exon", "five_prime_UTR", "three_prime_UTR", "CDS", "pseudogenic_CDS", +] + +def parseGFFAttributes(attributeString): + """Parse the GFF3 attribute column and return a dict"""# + if attributeString == ".": return {} + ret = {} + for attribute in attributeString.split(";"): + if "=" in attribute: + key, value = attribute.split("=") + else: + key = attribute + value = "True" + if not key and not value: + continue + ret[urllib.parse.unquote(key)] = urllib.parse.unquote(value) + return ret + +def parseGFF3(filename, return_as=dict): + """ + A minimalistic GFF3 format parser. + Yields objects that contain info about a single GFF3 feature. + + Supports transparent gzip decompression. + """ + #Parse with transparent decompression + openFunc = gzip.open if filename.endswith(".gz") else open + with openFunc(filename) as infile: + for line in infile: + #if line.startswith("###"): + # ### Yield last gene if we move that here + # pass + if line.startswith("#") or line.strip() == "": + continue + parts = line.strip().split("\t") + #If this fails, the file format is not standard-compatible + assert len(parts) == len(gffInfoFields) + #Normalize data + normalizedInfo = { + "seqid": None if parts[0] == "." else urllib.parse.unquote(parts[0]), + "source": None if parts[1] == "." else urllib.parse.unquote(parts[1]), + "type": None if parts[2] == "." else urllib.parse.unquote(parts[2]), + "start": None if parts[3] == "." else int(parts[3]), + "end": None if parts[4] == "." else int(parts[4]), + "score": None if parts[5] == "." else float(parts[5]), + "strand": None if parts[6] == "." 
else urllib.parse.unquote(parts[6]), + "phase": None if parts[7] == "." else urllib.parse.unquote(parts[7]), + "attributes": parseGFFAttributes(parts[8]) + } + #Alternatively, you can emit the dictionary here, if you need mutability: + # yield normalizedInfo + yield return_as(**normalizedInfo) + + +def gff_heirarchy(filename, progress=None): + last = {"l1": None, "l2": None, "l3": None} + level_canonicaliser = { + "transcript": "mRNA", + } + levels = { + "gene": "l1", + "mRNA": "l2", + "exon": "l3", + "CDS": "l3", + "five_prime_UTR": "l3", + "three_prime_UTR": "l3", + } + records = {} + l2l1 = {} + i = 0 + recordsrc = parseGFF3(filename, return_as=dict) + if progress is not None: + if progress == True: + desc=None + else: + desc=progress + recordsrc = tqdm(recordsrc, desc=desc) + for record in recordsrc: + typ = record["type"] + typ = level_canonicaliser.get(typ, typ) + record["type"] = typ + lvl = levels.get(typ) + if lvl is None: + #print(f"WARNING: {typ} is not a nice feature, skipping", file=stderr) + continue + id = record["attributes"]["ID"] + if lvl == "l1": + record["children"] = {} + records[id] = record + i = 0 + elif lvl == "l2": + parent = record["attributes"]["Parent"] + l2l1[id] = parent + record["children"] = {} + try: + records[parent]["children"][id] = record + except KeyError: + print(f"L2 entry {id} parent {parent} not in records? {record}") + else: + parent = record["attributes"]["Parent"] + try: + top = l2l1[parent] + except KeyError: + print(f"L3 entry {id} parent {parent} not in records? 
{record}") + if id in records[top]["children"][parent]["children"]: + i += 1 + id = f"{id}_{i}" + record["attributes"]["ID"] = id + records[top]["children"][parent]["children"][id] = record + return records + + +def attr2line(attrs): + from urllib.parse import quote as Q + return ";".join("=".join((Q(k), Q(v))) for k, v in attrs.items()) + + +def reformat_names(gene, geneid=None, changenames=True): + def prefix_name(entry, sid): + name = entry["attributes"].get("Name") + if name: + name = f"{sid}_{name}" + else: + name = sid + entry["attributes"]["Name"] = name + if not geneid: + geneid = gene["attributes"]["ID"] + gene["attributes"]["ID"] = geneid + subids = Counter() + for child in gene.get("children", {}).values(): + ct = child["type"] + subids[ct] += 1 + ci = subids[ct] + subid = f"{geneid}_{ct}{ci:02d}" + child["attributes"]["ID"] = subid + child["attributes"]["Parent"] = geneid + if changenames: + prefix_name(child, subid) + #print("E", child["attributes"]["Name"]) + gcids = Counter() + for gchild in child.get("children", {}).values(): + gt = gchild["type"] + gcids[gt] += 1 + gi = gcids[gt] + gcid = f"{subid}_{gt}{gi:02d}" + #print("F", gt, gcid) + gchild["attributes"]["ID"] = gcid + gchild["attributes"]["Parent"] = subid + #print("G", gcid, gchild["attributes"].get("Name")) + if changenames: + prefix_name(gchild, gcid) + #print("H", gcid, gchild["attributes"]["Name"]) + +def write_gene(gene, geneid=None, file=None, changenames=False): + def write_line(entry): + x = [entry[field] if entry[field] is not None else "." 
for i, field in enumerate(gffInfoFields)] + x[-1] = attr2line(x[-1]) + print(*x, sep="\t", file=file) + #print(gene, geneid) + #print("## gff-version 3", file=file) + if geneid or changenames: + reformat_names(gene, geneid, changenames) + write_line(gene) + for child in gene.get("children", {}).values(): + write_line(child) + for gchild in child.get("children", {}).values(): + write_line(gchild) + print("###", file=file) + + +def get_all_features(annot, ftype): + feat = set() + if annot["type"] == ftype: + feat.add((annot["start"], annot["end"], annot["strand"], annot["type"])) + for child in annot.get("children", {}).values(): + feat.update(get_all_features(child, ftype)) + return feat + + +def features_equal(a, b, ftype): + a = set(get_all_features(a, ftype)) + b = set(get_all_features(b, ftype)) + return a == b + + +def feature_distance(a, b, ftype): + def overlap(a, b): + return a[0] <= b[0] <= a[1] or a[0] <= b[1] <= a[1] + a = list(get_all_features(a, ftype)) + b = list(get_all_features(b, ftype)) + ovl = list() + noovl = list() + b_with_ovl = set() + for ai in range(len(a)): + for bi in range(len(b)): + if overlap(a[ai], b[bi]): + ovl.append((a[ai], b[bi])) + b_with_ovl.add(b[bi]) + break + else: + # a does not overlap with an b + noovl.append(a[ai]) + for bi in range(len(b)): + if b[bi] not in b_with_ovl: + noovl.append(b[bi]) + dist = 0 + for A, B in ovl: + dist += abs(A[1] - B[1]) + abs(A[0] - B[0]) + for x in noovl: + dist += x[1] - x[0] + return dist + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("file", help="The GFF3 input file (.gz allowed)") + args = parser.parse_args() + import json + json.dump(gff_heirarchy(args.file), stdout, indent=4) From 3ad0de5f90e5a8bbed1e260a5c85f9baaa94899c Mon Sep 17 00:00:00 2001 From: Kevin Murray Date: Mon, 24 Jul 2023 13:42:13 +0200 Subject: [PATCH 13/28] tabcat: allow fofn input --- blsl/tabcat.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 
deletion(-) diff --git a/blsl/tabcat.py b/blsl/tabcat.py index 8757fcf..ed0ee14 100644 --- a/blsl/tabcat.py +++ b/blsl/tabcat.py @@ -19,6 +19,8 @@ def tabcat_main(argv=None): help="Column delimiter.") ap.add_argument("-N", "--add-name", action="store_true", help="Add a column with the file name") + ap.add_argument("-f", "--fofn", action="store_true", + help="Each input is a file of filenames") ap.add_argument("xsvs", help="Input {C,T}SVs to concatenate", nargs="+") args = ap.parse_args(argv) @@ -29,8 +31,16 @@ class xsv(csv.unix_dialect): skipinitialspace = True quoting=csv.QUOTE_MINIMAL + if args.fofn: + files = [] + for fofn in args.xsvs: + with open(fofn) as fh: + for file in fh: + files.append(file.rstrip()) + else: + files = args.xsvs outcsv = None - for file in args.xsvs: + for file in files: with open(file) as fh: for rec in csv.DictReader(fh, dialect=xsv): rec["tabcat_filename"] = file From 0c5abacb9c46a697dadfde499231ed3eb1b0b195 Mon Sep 17 00:00:00 2001 From: "Dr. K.D. Murray" Date: Mon, 24 Jul 2023 15:28:43 +0200 Subject: [PATCH 14/28] mask2bed: add progress logging --- blsl/mask2bed.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/blsl/mask2bed.py b/blsl/mask2bed.py index 42bbfca..e430032 100644 --- a/blsl/mask2bed.py +++ b/blsl/mask2bed.py @@ -8,6 +8,7 @@ from Bio import SeqIO from shlex import quote import argparse +from tqdm import tqdm full_help = """ @@ -29,7 +30,7 @@ def mask2bed_main(argv=None): args = ap.parse_args(argv) seqs = SeqIO.parse(args.fasta, "fasta") - for seq in seqs: + for seq in tqdm(seqs): last_start = 0 last_masked = None for i, base in enumerate(str(seq.seq)): From a8810bb14785ae7838286ad05cee2cc4045fa184 Mon Sep 17 00:00:00 2001 From: Kevin Murray Date: Tue, 29 Aug 2023 08:56:50 +0200 Subject: [PATCH 15/28] ildemux: fix usage/args bug --- blsl/_utils.py | 4 +--- blsl/ildemux.py | 6 +++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/blsl/_utils.py b/blsl/_utils.py index 33b4bad..3e6a8d4 
100644 --- a/blsl/_utils.py +++ b/blsl/_utils.py @@ -1,3 +1,4 @@ +import gzip def rc(seq): d = {"A": "T", "G":"C", "C":"G", "T":"A"} @@ -27,6 +28,3 @@ def fqparse(stream): else: assert len(fqp) == 0 - - - diff --git a/blsl/ildemux.py b/blsl/ildemux.py index aac5c87..684541d 100644 --- a/blsl/ildemux.py +++ b/blsl/ildemux.py @@ -122,7 +122,7 @@ def make_sample_map(tablefile, outdir, fileext=".fq", distance=1): return smap -def main(): +def main(argv=None): """Demultiplex modern illumina reads from read headers. Supports *ONLY* out-of-read indicies (in @header lines e.g. 1:N:0:ACGT+TGCA), and only reads with i7 + i5. @@ -133,7 +133,7 @@ def main(): ap.add_argument("-o", "--outdir", required=True, type=Path, help="Output directory (must exist)") ap.add_argument("-c", "--justcount", action="store_true", - help="Compress outputs with gzip.") + help="Write nothing, only tablulate (largely useless)") ap.add_argument("-z", "--zip", nargs="?", default=None, const=6, type=int, help="Compress outputs with gzip.") ap.add_argument("-j", "--threads", default=8, @@ -142,7 +142,7 @@ def main(): help="Number tolerable mismatches (hamming distance).") ap.add_argument("-k", "--keyfile", help="Mapping of i7/i5 -> sample as tsv (with cols i7, i5, sample)") - args=ap.parse_args() + args=ap.parse_args(argv) ByteBucket.threads = args.threads if args.zip is not None: From 6501a18d5d18b31b7b5f64cf18932fe844a74e5e Mon Sep 17 00:00:00 2001 From: Kevin Murray Date: Tue, 29 Aug 2023 08:59:17 +0200 Subject: [PATCH 16/28] add pairslash command to add /1 /2 to read names --- blsl/__init__.py | 5 +++++ blsl/pairslash.py | 42 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) create mode 100644 blsl/pairslash.py diff --git a/blsl/__init__.py b/blsl/__init__.py index 734d242..f392bfa 100644 --- a/blsl/__init__.py +++ b/blsl/__init__.py @@ -73,6 +73,11 @@ from .gffcat import gffcat_main cmds["gffcat"] = gffcat_main + +from .pairslash import main as pairslash_main 
+cmds["pairslash"] = pairslash_main + + def mainhelp(argv=None): """Print this help message""" print("USAGE: blsl [options...]\n\n") diff --git a/blsl/pairslash.py b/blsl/pairslash.py new file mode 100644 index 0000000..20ab2d1 --- /dev/null +++ b/blsl/pairslash.py @@ -0,0 +1,42 @@ +import gzip + +from ._utils import fqparse + +def seqnum(n, c): + if n.endswith("/1"): + return 1 + if n.endswith("/2"): + return 2 + if c: + if c[0].startswith("1:"): + return 1 + if c[0].startswith("2:"): + return 2 + raise ValueError(f"Unknown pair from fastq header: '{n} {''.join(c)}'") + +def main(argv=None): + """Add an old-style /1 /2 pair indicator to paired-end fastq files""" + + import argparse + ap = argparse.ArgumentParser("blsl pairslash") + ap.add_argument("readfile", nargs='?', default="/dev/fd/0") + args=ap.parse_args(argv) + + + if args.readfile.endswith(".gz"): + reads = gzip.open(args.readfile, "rt") + else: + reads = open(args.readfile, "r") + + for s in fqparse(reads): + n, *c = s[0].split(" ", 1) + if n.endswith("/1") or n.endswith("/2"): + print(s[0]) + else: + i = seqnum(n, c) + print(f"{n}/{i} {''.join(c)}") + print(s[1], s[2], s[3], sep="\n") + +if __name__ == "__main__": + main() + From 214d3609ebddf7f5017be39d2107ed41b8547e32 Mon Sep 17 00:00:00 2001 From: Kevin Murray Date: Tue, 29 Aug 2023 08:59:58 +0200 Subject: [PATCH 17/28] genigvjs: add --locus to set inital browser locus --- blsl/genigvjs.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/blsl/genigvjs.py b/blsl/genigvjs.py index 0c1dd03..544727d 100644 --- a/blsl/genigvjs.py +++ b/blsl/genigvjs.py @@ -75,6 +75,8 @@ def genigvjs_main(argv=None): help="Webpage title") ap.add_argument("--reference", "-r", required=True, help="reference file. must be *.fa or *.fasta, with associated *.fai, and can also have *.gff as annotation.") + ap.add_argument("--locus", "-l", + help="Initial region of interest.") ap.add_argument("--outdir", "-o", required=True, help="Output directory. 
Data files will be softlinked there, and 'index.html' will be generated.") ap.add_argument("tracks", help="Any files to be used as tracks. Can be gff/bed/vcf/bcf/bam/cram. Must be indexed", nargs="+") @@ -93,6 +95,8 @@ def genigvjs_main(argv=None): "reference": {}, "tracks": [], } + if args.locus: + data["locus"] = args.locus ref = Path(args.reference) refbase = ref.stem reffai = Path(str(ref) + ".fai") From 0f6adebcb4845135db48a330161257a0e10de3fd Mon Sep 17 00:00:00 2001 From: Kevin Murray Date: Tue, 29 Aug 2023 09:00:19 +0200 Subject: [PATCH 18/28] minor fixes to gff parsing code --- blsl/gffparse.py | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/blsl/gffparse.py b/blsl/gffparse.py index 2226995..d237da6 100644 --- a/blsl/gffparse.py +++ b/blsl/gffparse.py @@ -54,7 +54,7 @@ def parseGFF3(filename, return_as=dict): Supports transparent gzip decompression. """ #Parse with transparent decompression - openFunc = gzip.open if filename.endswith(".gz") else open + openFunc = gzip.open if str(filename).endswith(".gz") else open with openFunc(filename) as infile: for line in infile: #if line.startswith("###"): @@ -88,13 +88,22 @@ def gff_heirarchy(filename, progress=None): "transcript": "mRNA", } levels = { + "repeat_region": "l1", + "pseudogene": "l1", + "pseuodgenic_region": "l1", + "transposable_element_gene": "l1", "gene": "l1", + "tRNA": "l2", + "tmRNA": "l2", "mRNA": "l2", "exon": "l3", "CDS": "l3", "five_prime_UTR": "l3", "three_prime_UTR": "l3", } + ignore = { + "source", + } records = {} l2l1 = {} i = 0 @@ -109,9 +118,13 @@ def gff_heirarchy(filename, progress=None): typ = record["type"] typ = level_canonicaliser.get(typ, typ) record["type"] = typ - lvl = levels.get(typ) + lvl = None + if "RNA" in typ.upper(): + lvl = "l2" + lvl = levels.get(typ, lvl) if lvl is None: - #print(f"WARNING: {typ} is not a nice feature, skipping", file=stderr) + if typ not in ignore: + print(f"WARNING: {typ} is not a nice 
feature, skipping", file=stderr) continue id = record["attributes"]["ID"] if lvl == "l1": @@ -127,16 +140,16 @@ def gff_heirarchy(filename, progress=None): except KeyError: print(f"L2 entry {id} parent {parent} not in records? {record}") else: - parent = record["attributes"]["Parent"] try: + parent = record["attributes"]["Parent"] top = l2l1[parent] + if id in records[top]["children"][parent]["children"]: + i += 1 + id = f"{id}_{i}" + record["attributes"]["ID"] = id + records[top]["children"][parent]["children"][id] = record except KeyError: - print(f"L3 entry {id} parent {parent} not in records? {record}") - if id in records[top]["children"][parent]["children"]: - i += 1 - id = f"{id}_{i}" - record["attributes"]["ID"] = id - records[top]["children"][parent]["children"][id] = record + print(f"L3 entry {id} parent not in records? {record}") return records @@ -172,7 +185,7 @@ def prefix_name(entry, sid): gt = gchild["type"] gcids[gt] += 1 gi = gcids[gt] - gcid = f"{subid}_{gt}{gi:02d}" + gcid = f"{geneid}_{gt}{gi:02d}" #print("F", gt, gcid) gchild["attributes"]["ID"] = gcid gchild["attributes"]["Parent"] = subid From f1bf3c620d9f3319c0feef2377dbb0ae34c1ffca Mon Sep 17 00:00:00 2001 From: Kevin Murray Date: Thu, 12 Oct 2023 07:17:16 +0200 Subject: [PATCH 19/28] add vcfstats command --- blsl/__init__.py | 6 ++++++ pyproject.toml | 2 ++ 2 files changed, 8 insertions(+) diff --git a/blsl/__init__.py b/blsl/__init__.py index f392bfa..6599044 100644 --- a/blsl/__init__.py +++ b/blsl/__init__.py @@ -77,6 +77,12 @@ from .pairslash import main as pairslash_main cmds["pairslash"] = pairslash_main +try: + from .vcfstats import main as vcfstats_main + cmds["vcfstats"] = vcfstats_main +except ImportError as exc: + print(str(exc), "-- disabling vcfstats command", file=stderr) + def mainhelp(argv=None): """Print this help message""" diff --git a/pyproject.toml b/pyproject.toml index dfbcac1..c030f5f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,9 @@ authors = [ ] 
requires-python = ">=3.8" dependencies = [ + "tqdm", "biopython", + "cyvcf2", ] dynamic = [ "version", From 4c5fe9759b09af304eee1b3c7f68b139888b329e Mon Sep 17 00:00:00 2001 From: Kevin Murray Date: Thu, 12 Oct 2023 07:17:50 +0200 Subject: [PATCH 20/28] regionbed & pansn_rename: add missing args --- blsl/pansn_rename.py | 5 +++-- blsl/regionbed.py | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/blsl/pansn_rename.py b/blsl/pansn_rename.py index 5b43e16..2378e53 100644 --- a/blsl/pansn_rename.py +++ b/blsl/pansn_rename.py @@ -34,10 +34,11 @@ def make_replacer(mode, reffa=None, org_name=None, delim="~", outdelim="~", refd return fromto -def do_tsv(infh, outfh, cols, replacer, coldelim="\t"): +def do_tsv(infh, outfh, cols, replacer, coldelim="\t", skipcomment=False): for line in tqdm(infh): if line.startswith("#"): - outfh.write(line) + if not skipcomment: + outfh.write(line) continue fields = line.rstrip("\r\n").split(coldelim) for col in cols: diff --git a/blsl/regionbed.py b/blsl/regionbed.py index 8dfdf46..7ad8d95 100644 --- a/blsl/regionbed.py +++ b/blsl/regionbed.py @@ -20,7 +20,7 @@ def make_regions(fas, window=1e6): def main(argv): """Make a bed/region file of genome windows""" ap = argparse.ArgumentParser() - ap.add_argument("--windowsize", "-w", default=100000, + ap.add_argument("--windowsize", "-w", default=100000, type=int, help="Window size of each non-overlapping region") ap.add_argument("--bed", "-b", type=argparse.FileType("w"), help="Output bed file") @@ -29,7 +29,7 @@ def main(argv): ap.add_argument("fasta", help="Fasta reference (must have fai)") args = ap.parse_args(argv) - for chrm, start, stop, leng in make_regions(args.fasta): + for chrm, start, stop, leng in make_regions(args.fasta, window=args.windowsize): region = f"{chrm}:{start+1}-{stop}" if args.bed is not None: print(chrm, start, stop, region, file=args.bed, sep="\t") From 8de7de45876b0f2bbf8dcf754f74d10f39d50de6 Mon Sep 17 00:00:00 2001 From: Kevin Murray Date: Thu, 
12 Oct 2023 07:18:48 +0200 Subject: [PATCH 21/28] add vcfstats source file --- blsl/vcfstats.py | 96 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100755 blsl/vcfstats.py diff --git a/blsl/vcfstats.py b/blsl/vcfstats.py new file mode 100755 index 0000000..bd00b79 --- /dev/null +++ b/blsl/vcfstats.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 +from tqdm import tqdm +from cyvcf2 import VCF +import pandas as pd + +from sys import stdin, stdout, stderr +from subprocess import Popen, PIPE +import argparse +import math +from concurrent.futures import ProcessPoolExecutor, as_completed + +def parallel_regions(vcf, cores=1): + V = VCF(vcf) + # 10 chunks per chrom per core + chunks = len(V.seqlens)*10*cores + for cname, clen in zip(V.seqnames, V.seqlens): + chunk = int(math.ceil(clen/chunks)) + for start in range(0, clen, chunk): + s = start+1 + e = start+chunk+1 + yield f"{cname}:{s}-{e}" + +def variant2dict(v): + for i, alt in enumerate(v.ALT): + dat = {"CHROM": v.CHROM, "POS": v.POS, "REF": v.REF, "ALT": alt, "QUAL": v.QUAL} + dat["call_rate"] = v.call_rate + for key, val in v.INFO: + if isinstance(val, str) and "," in val: + val = val.split(',') + if isinstance(val, tuple) or isinstance(val, list): + val = val[i] + dat[f"INFO_{key}"] = val + yield dat + +def bcftools_info_with_tags(vbcf): + res = [] + cmd=f"bcftools +fill-tags {vbcf} -Ou -- -d -t all,F_MISSING" + with Popen(cmd, shell=True, stdout=PIPE) as proc: + for v in tqdm(VCF(proc.stdout)): + for r in variant2dict(v): + res.append(r) + return res + +def one_chunk_stats(vcf, chunk, fill=True): + cmd=f"bcftools view -r {chunk} {vcf} -Ou" + if fill: + cmd = f"{cmd} | bcftools +fill-tags - -Ou -- -d -t all,F_MISSING" + res = [] + with Popen(cmd, shell=True, stdout=PIPE) as proc: + for v in VCF(proc.stdout): + for r in variant2dict(v): + res.append(r) + return res + +def chunkwise_bcfools_stats(args): + res = [] + with ProcessPoolExecutor(args.threads) as exc: + jobs = [] + 
for region in parallel_regions(args.vcf): + jobs.append(exc.submit(one_chunk_stats, args.vcf, region, fill=args.fill_tags_first)) + for job in tqdm(as_completed(jobs), total=len(jobs), unit="chunk"): + res.extend(job.result()) + return res + +def filter_fields(dat, fields=None): + if fields is None: + return dat + return {k: dat[k] for k in fields if k in dat} + +def main(argv=None): + """Use bcftools to calculate various statistics, outputing an R-ready table""" + ap = argparse.ArgumentParser() + ap.add_argument("-o", "--output", default=stdout, type=argparse.FileType("wt"), + help="Output TSV file") + ap.add_argument("-f", "--fields", nargs="+", + help="Use only these fields") + ap.add_argument("-F", "--fill-tags-first", action="store_true", + help="Use bcftools +fill-tags to pre-fill more fields (forces no parallelism)") + ap.add_argument("-t", "--threads", default=1, type=int, + help="Number of parallel threads") + + ap.add_argument("vcf") + args = ap.parse_args(argv) + + #if args.fill_tags_first: + # variants = bcftools_info_with_tags(args.vcf) + #else: + variants = chunkwise_bcfools_stats(args) + if args.fields: + variants = [filter_fields(x, args.fields) for x in variants] + + df = pd.DataFrame(variants) + df.to_csv(args.output, sep="\t", index=False) + +if __name__ == "__main__": + main() From c4f50969019cc960225506cec25080a058c7a8a0 Mon Sep 17 00:00:00 2001 From: "Dr. K. D. 
Murray" <1560490+kdm9@users.noreply.github.com> Date: Thu, 12 Oct 2023 07:21:30 +0200 Subject: [PATCH 22/28] gh action to build pkg --- .github/workflows/python-package.yml | 39 ++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 .github/workflows/python-package.yml diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml new file mode 100644 index 0000000..9c80961 --- /dev/null +++ b/.github/workflows/python-package.yml @@ -0,0 +1,39 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: Build python pkg + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.9", "3.10", "3.11"] + + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install flake8 pytest + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test CLI + run: | + blsl --help From 7d8ce9f8d1080d196f1c0b07b96387ff793f497c Mon Sep 17 00:00:00 2001 From: Kevin Murray Date: Thu, 12 Oct 2023 07:24:10 +0200 Subject: [PATCH 23/28] fix vcfstats --- .github/workflows/python-package.yml | 4 ++-- blsl/__init__.py | 2 +- blsl/vcfstats.py | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 9c80961..9722b26 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -28,12 +28,12 @@ jobs: run: | python -m pip install --upgrade pip python -m pip install flake8 pytest - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - - name: Test CLI + - name: Build and Test CLI run: | + pip install . blsl --help diff --git a/blsl/__init__.py b/blsl/__init__.py index 6599044..c333c09 100644 --- a/blsl/__init__.py +++ b/blsl/__init__.py @@ -4,7 +4,7 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
-from sys import argv, exit +from sys import argv, exit, stderr __version__ = "0.1.11" diff --git a/blsl/vcfstats.py b/blsl/vcfstats.py index bd00b79..86ce112 100755 --- a/blsl/vcfstats.py +++ b/blsl/vcfstats.py @@ -3,6 +3,7 @@ from cyvcf2 import VCF import pandas as pd +import sys from sys import stdin, stdout, stderr from subprocess import Popen, PIPE import argparse From e5b0754d04b60c1c6cbaaa326906f1a82b49c103 Mon Sep 17 00:00:00 2001 From: Kevin Murray Date: Thu, 12 Oct 2023 07:25:21 +0200 Subject: [PATCH 24/28] fixes from pylint --- blsl/ildemux.py | 1 + blsl/nstitch.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/blsl/ildemux.py b/blsl/ildemux.py index 684541d..c072280 100644 --- a/blsl/ildemux.py +++ b/blsl/ildemux.py @@ -1,5 +1,6 @@ from io import BytesIO from pathlib import Path +import sys from sys import stdin from tqdm import tqdm from collections import Counter diff --git a/blsl/nstitch.py b/blsl/nstitch.py index 02556a6..dddc02a 100644 --- a/blsl/nstitch.py +++ b/blsl/nstitch.py @@ -49,4 +49,4 @@ def nstitch_main(argv=None): r2.close() if __name__ == "__main__": - nstich_main() + nstitch_main() From fe4d144d360f42c58fff2cc0ef9f9e47a95adf15 Mon Sep 17 00:00:00 2001 From: Kevin Murray Date: Thu, 12 Oct 2023 07:26:16 +0200 Subject: [PATCH 25/28] more pylint bugs --- blsl/ilsample.py | 1 + 1 file changed, 1 insertion(+) diff --git a/blsl/ilsample.py b/blsl/ilsample.py index fb2a36b..3ebfbb8 100644 --- a/blsl/ilsample.py +++ b/blsl/ilsample.py @@ -4,6 +4,7 @@ from tqdm import tqdm from collections import Counter import random +import gzip def fqpair(stream): fqp = list() From 8d00ec872c8523668d158ed61aa1b7856e811f37 Mon Sep 17 00:00:00 2001 From: Kevin Murray Date: Thu, 12 Oct 2023 07:28:37 +0200 Subject: [PATCH 26/28] fix gh actions --- .github/workflows/python-package.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 
9722b26..a51ff46 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -27,7 +27,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install flake8 pytest + python -m pip install flake8 - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names @@ -36,4 +36,4 @@ jobs: - name: Build and Test CLI run: | pip install . - blsl --help + blsl help From 4e2db17ee310fb01d18944a0b717fcf65739f861 Mon Sep 17 00:00:00 2001 From: Kevin Murray Date: Thu, 12 Oct 2023 07:30:13 +0200 Subject: [PATCH 27/28] install pandas by default --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index c030f5f..b219d00 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,7 @@ dependencies = [ "tqdm", "biopython", "cyvcf2", + "pandas", ] dynamic = [ "version", From 6dd9d5fe5a5b62e6d0df6cd7b045fc4b67370b93 Mon Sep 17 00:00:00 2001 From: "Dr. K.D. Murray" Date: Sat, 28 Oct 2023 12:06:10 +0200 Subject: [PATCH 28/28] add shannon entropy, redo readme --- README.md | 40 ++++++++++++++++++++----------------- blsl/__init__.py | 2 ++ blsl/shannon.py | 51 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 75 insertions(+), 18 deletions(-) create mode 100644 blsl/shannon.py diff --git a/README.md b/README.md index 8d60355..e694605 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,8 @@ # blindschleiche -Misc sequence tools in python. These tools are small things which I have at some point needed to write because I couldn't find a solution I liked. This is in no way a comprehensive toolkit. It is a companion to [seqhax](https://github.com/kdm9/seqhax), execpt that seqhax is written in C/C++ and generally contains tools to handle very large datasets where performance is somewhat important. This is all in python for ease of development, and so typically these tools perform less data- or compute-intensive tasks. 
+A collection of bioinformatics / sequence utilities needed for my research, and hopefully useful for yours. + +[![DOI](https://zenodo.org/badge/509693094.svg)](https://zenodo.org/doi/10.5281/zenodo.10049825) ## Install @@ -15,38 +17,40 @@ pip install blindschleiche ``` USAGE: blsl [options...] - Where is one of: - telogrep: Search contigs for known telomere repeats - n50: Calculate N50 and total length of a set of contigs + deepclust2fa: Split a .faa by the clusters diamond deepclust finds + ebiosra2rl2s: INTERNAL: MPI Tübingen tool. Make a runlib-to-sample map table from ebio sra files + equalbestblast: Output only the best blast hits. + esearchandfetch: Use the Entrez API to search for and download something. A CLI companion to the NCBI search box falen: Tabulate the lengths of sequences in a FASTA file - mask2bed: The inverse of bedtools maskfasta: softmasked fasta -> unmasked fasta + mask.bed + farename: Rename sequences in a fasta file sequentially + galhist: Make a summary histogram of git-annex-list output genigvjs: Generate a simple IGV.js visualisation of some bioinf files. - liftoff-gff3: Obtain an actually-useful GFF3 from Liftoff by fixing basic GFF3 format errors - pansn-rename: Add, remove, or modify PanSN-style prefixes to contig/chromosome names in references + gffcat: Concatenate GFF3 files, respecting header lines and FASTA sections + gg2k: Summarise a table with GreenGenes-style lineages into a kraken-style report. ildemux: Demultiplex modern illumina reads from read headers.
ilsample: Sample a fraction of read pairs from an interleaved fastq file - regionbed: Make a bed/region file of genome windows - uniref-acc2taxid: Make a ncbi-style acc2taxid.map file for a uniref fasta + liftoff-gff3: Obtain an actually-useful GFF3 from Liftoff by fixing basic GFF3 format errors + mask2bed: The inverse of bedtools maskfasta: softmasked fasta -> unmasked fasta + mask.bed + n50: Calculate N50 and total length of a set of contigs nstitch: Combine R1 + R2 into single sequences, with an N in the middle - gg2k: Summarise a table with GreenGenes-style lineages into a kraken-style report. - equalbestblast: Output only the best blast hits. + pairslash: Add an old-style /1 /2 pair indicator to paired-end fastq files + pansn-rename: Add, remove, or modify PanSN-style prefixes to contig/chromosome names in references + regionbed: Make a bed/region file of genome windows + shannon-entropy: Calculate Shannon's entropy (in bits) at each column of one or more alignments tabcat: Concatenate table (c/tsv) files, adding the filename as a column - esearchandfetch: Use the Entrez API to search for and download something. A CLI companion to the NCBI search box - deepclust2fa: Split a .faa by the clusters diamond deepclust finds - farename: Rename sequences in a fasta file sequentially - ebiosra2rl2s: INTERNAL: MPI Tübingen tool. Make a runlib-to-sample map table from ebio sra files - galhist: Make a summary histogram of git-annex-list output + telogrep: Search contigs for known telomere repeats + uniref-acc2taxid: Make a ncbi-style acc2taxid.map file for a uniref fasta help: Print this help message Use blsl subtool --help to get help about a specific tool ``` -## Why Blindschleiche +## Why the name Blindschleiche? 1) [They're awesome animals](https://www.google.com/search?q=blindschleiche&tbm=isch) 2) Their English name is Slow Worm, which is appropriate for this set of low-performance tools in Python. 
-3) All tools implemented in python must be named with a snake pun, and they're kinda a snake (not really, they're legless lizards) +3) All tools implemented in Python must be named with a snake pun, and they're kinda a snake (not really, they're legless lizards) diff --git a/blsl/__init__.py b/blsl/__init__.py index c333c09..6fbd7ae 100644 --- a/blsl/__init__.py +++ b/blsl/__init__.py @@ -83,6 +83,8 @@ except ImportError as exc: print(str(exc), "-- disabling vcfstats command", file=stderr) +from .shannon import main as shannon_main +cmds["shannon-entropy"] = shannon_main def mainhelp(argv=None): """Print this help message""" diff --git a/blsl/shannon.py b/blsl/shannon.py new file mode 100644 index 0000000..8947442 --- /dev/null +++ b/blsl/shannon.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +from Bio import AlignIO +from tqdm import tqdm + +import argparse +from sys import stdin, stdout, stderr +from math import log +from statistics import mean +from collections import Counter + +class FreqCounter(Counter): + def freqs(self): + N = sum(self.values()) + return {b:v/N for b, v in self.most_common()} + +def shannon_entropy(alncol): + alncol = [b for b in alncol if b not in ("-", "*", "X")] + aafreq = FreqCounter(alncol) + entropy = -0 + for aa, freq in aafreq.freqs().items(): + entropy += freq*(log(freq,2)) + return -entropy + +def shanent_aln(alignment, alnformat="fasta"): + aln = AlignIO.read(alignment, alnformat) + if len(aln) < 15: + print(f"WARN: {alignment} has <15 seqs, treat SE values with a grain of salt", file=stderr) + se = [] + for i in range(aln.get_alignment_length()): + se.append(shannon_entropy(aln[:,i])) + return aln, se + +def main(argv=None): + """Calculate Shannon's entropy (in bits) at each column of one or more alignments""" + ap = argparse.ArgumentParser() + ap.add_argument("-o", "--out-tsv", default=stdout, type=argparse.FileType("wt"), + help="Output TSV file") + ap.add_argument('alignments', nargs="+", + help='One or more MSAs, in fasta 
format') + args = ap.parse_args() + + print("alnfile", "nseqs", "column", "shannon_entropy", sep="\t", file=args.out_tsv) + for alnfile in tqdm(args.alignments, unit=" Alignment"): + aln, se = shanent_aln(alnfile) + for i, s in enumerate(se): + print(alnfile, len(aln), i+1, s, sep="\t", file=args.out_tsv) + mean(se) + print(alnfile, len(aln), "*", mean(se), sep="\t", file=args.out_tsv) + +if __name__ == '__main__': + main()