-
Notifications
You must be signed in to change notification settings - Fork 0
/
mask2bed.py
51 lines (43 loc) · 1.93 KB
/
mask2bed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# Copyright (c) 2022 Dr. K. D. Murray/Gekkonid Consulting <spam@gekkonid.com>
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
from Bio import SeqIO
from shlex import quote
import argparse
from tqdm import tqdm
full_help = """
Repeats or other problematic sequences are often masked from a reference
genome. Ideally, this is done by converting each base to lower-case ("soft
masking"), though is sometimes done by replacing bases with N ("hard masking").
This tool reads a genome file, and for each sequence, outputs a zero-based bed
file of the masked regions. By default this means lower-case regions, although
if -N/--hardmasked is given, stretches of N are considered masked too.
"""
def mask2bed_main(argv=None):
"""The inverse of bedtools maskfasta: softmasked fasta -> unmasked fasta + mask.bed"""
ap = argparse.ArgumentParser(epilog=full_help)
ap.add_argument("-N", "--hardmasked", action="store_true",
help="count N characters as a masked base (i.e. FASTA is hard-masked)")
ap.add_argument("fasta", help="FASTA file of contigs")
args = ap.parse_args(argv)
seqs = SeqIO.parse(args.fasta, "fasta")
for seq in tqdm(seqs):
last_start = 0
last_masked = None
for i, base in enumerate(str(seq.seq)):
base_masked = base.islower() or (args.hardmasked and base.lower() == "n")
if last_masked is None:
last_masked = base_masked
continue
if last_masked != base_masked:
if last_masked:
print(seq.name, last_start, i, sep="\t")
else:
last_start = i
last_masked = base_masked
if last_masked:
print(seq.name, last_start, i+1, sep="\t")
if __name__ == "__main__":
mask2bed_main()