-
Notifications
You must be signed in to change notification settings - Fork 0
/
esearchandfetch.py
96 lines (85 loc) · 3.3 KB
/
esearchandfetch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/bin/env python3
from Bio import Entrez
from tqdm import tqdm
import argparse
from time import sleep
from sys import stdout, stdin, stderr, exit
from pathlib import Path
import io
def esearch(*args, chunks=10000, **kwargs):
    """Collect every ID matching an Entrez search, paging through the results.

    Positional and keyword arguments are forwarded to ``Entrez.esearch``.
    ``retmax`` is overwritten with *chunks* and ``retstart`` is managed by
    this function, so callers should not pass ``retstart`` themselves.

    Args:
        *args: Forwarded unchanged to ``Entrez.esearch``.
        chunks: Number of IDs requested per page (sent as ``retmax``).
        **kwargs: Forwarded to ``Entrez.esearch`` (e.g. ``db``, ``term``).

    Returns:
        A list with the IDs of every page, in the order they were returned.

    Raises:
        RuntimeError: If a page still cannot be read after the last attempt.
    """
    # Same number of read attempts as before (three guarded tries plus one
    # final unguarded one), but every attempt now issues a fresh request:
    # re-reading an already consumed handle can never succeed.
    max_attempts = 4
    kwargs["retmax"] = chunks
    ids = []
    total = None  # reported "Count"; unknown until the first page arrives
    offset = 0
    with tqdm(desc="esearch IDs") as progress:
        while total is None or offset < total:
            for attempt in range(1, max_attempts + 1):
                # Keep the half-second pause that preceded each read.
                sleep(0.5)
                handle = Entrez.esearch(*args, **kwargs, retstart=offset)
                try:
                    page = Entrez.read(handle)
                    break
                except RuntimeError:
                    if attempt == max_attempts:
                        raise
                    sleep(0.5)
                finally:
                    handle.close()

            page_ids = page["IdList"]
            total = int(page["Count"])
            progress.total = total

            if not page_ids:
                # An empty page means no further progress is possible; without
                # this guard the loop would spin forever whenever the server
                # hands back fewer IDs than it reports in "Count".
                if offset < total:
                    print(
                        f"WARNING: esearch returned only {offset} of {total} IDs",
                        file=stderr,
                    )
                break

            ids.extend(page_ids)
            offset += len(page_ids)
            progress.update(len(page_ids))
    return ids
def efetch(db="nucleotide", rettype="genbank", retmode="text", ids=None, chunks=2000):
    """Yield the lines of the Entrez records for *ids*, fetched in batches.

    Args:
        db: Passed as ``db`` to ``Entrez.efetch``.
        rettype: Passed as ``rettype``; also used to label the progress bar.
        retmode: Passed as ``retmode``.
        ids: Sequence of ID strings to fetch. ``None`` or an empty sequence
            yields nothing and performs no request.
        chunks: Maximum number of IDs sent in a single request.

    Yields:
        Every line of every response as ``str``; ``bytes`` lines are decoded
        as UTF-8. When ``rettype == "runinfo"`` the first line of each batch
        after the first is dropped so the header is emitted only once.
    """
    if not ids:
        # Nothing to fetch. This also covers the ids=None default, which
        # previously failed on len(None).
        return

    with tqdm(desc=f"efetch {rettype}", total=len(ids)) as progress:
        for start in range(0, len(ids), chunks):
            batch = ids[start:start + chunks]
            handle = Entrez.efetch(
                db=db, retmode=retmode, rettype=rettype, id=",".join(batch)
            )
            try:
                if start != 0 and rettype == "runinfo":
                    # Skip the repeated header; the default keeps an empty
                    # response from raising StopIteration inside a generator.
                    next(handle, None)
                for line in handle:
                    if isinstance(line, bytes):
                        line = line.decode("utf-8")
                    yield line
            finally:
                # Release the connection even if the consumer stops early.
                handle.close()
            progress.update(len(batch))
def main(argv=None):
    """Use the Entrez API to search for and download something.

    A CLI companion to the NCBI search box. With ``--term`` the search is run
    and, if ``--id-file`` is also given, the resulting IDs are saved to that
    file. With only ``--id-file`` the IDs are read from the file instead.
    The fetched records are written to ``--out``.

    Args:
        argv: Argument list to parse; ``None`` lets argparse use ``sys.argv``.

    Exits with status 1 when neither ``--term`` nor ``--id-file`` is given,
    and with an argparse usage error when ``--chunksize`` is not positive.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("-e", "--email", type=str,
                    help="Email for Entrez API")
    ap.add_argument("-c", "--chunksize", type=int, default=1000,
                    help="Number of records to download per chunk")
    ap.add_argument("-o", "--out", type=argparse.FileType('w'), default="-",
                    help="Genbank output")
    ap.add_argument("-d", "--db", required=True,
                    help="db api param")
    ap.add_argument("-r", "--rettype", required=True,
                    help="rettype api param")
    ap.add_argument("-m", "--retmode", default="text",
                    help="API return mode")
    ap.add_argument("-i", "--id-file", type=Path,
                    help="List of IDs (give with --term to save a list of IDs to this file, or without to use this list of IDs)")
    ap.add_argument("-t", "--term", type=str)
    args = ap.parse_args(argv)

    # Validate everything up front, before any Entrez state is touched.
    if args.chunksize < 1:
        ap.error("--chunksize must be a positive integer")
    if not args.term and not args.id_file:
        print("ERROR: must give either --term to search for, or a list of ids in --id-file", file=stderr)
        exit(1)

    Entrez.email = args.email

    if args.term:
        ids = esearch(term=args.term, db=args.db)
        if args.id_file:
            with args.id_file.open("w") as fh:
                fh.writelines(f"{entry}\n" for entry in ids)
    else:
        with args.id_file.open("r") as fh:
            # Drop blank lines so they are not sent to the API as empty IDs.
            ids = [entry for entry in (line.strip() for line in fh) if entry]

    # Honour --chunksize; it was parsed before but never passed on.
    for line in efetch(ids=ids, db=args.db, rettype=args.rettype,
                       retmode=args.retmode, chunks=args.chunksize):
        args.out.write(line)
# Script entry point: run the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()