Skip to content

Commit

Permalink
fix index_allc_file
Browse files (browse the repository at this point in the history)
  • Loading branch information
lhqing committed Mar 24, 2019
1 parent cc031a7 commit 4f634ee
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 26 deletions.
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@

.idea/libraries/R_User_Library.xml
.idea/methylpy.iml
.idea/misc.xml
.idea/modules.xml
.idea/vcs.xml
.idea/workspace.xml
64 changes: 38 additions & 26 deletions methylpy/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -672,7 +672,7 @@ def index_allc_file(allc_file,reindex=False):
if (not reindex) and os.path.exists(index_file):
# check index file completeness
eof_count = 0
with open(index_file,'r') as f:
with open(index_file,'r') as f:
for line in f:
if line == '#eof\n':
eof_count += 1
Expand All @@ -681,34 +681,46 @@ def index_allc_file(allc_file,reindex=False):
pass
else:
return 0
g = open(index_file,'w')
f = open_allc_file(allc_file)
cur_chrom = ""

if allc_file.endswith('gz'): # works for .gz, .bgz
f = subprocess.Popen(['zcat', allc_file],
stdout=subprocess.PIPE,
encoding='utf8').stdout
else:
f = open(allc_file)

index_lines = []
cur_chrom = "TOTALLY_NOT_A_CHROM"
cur_start = cur_chrom + '\t'
cur_pointer = 0
# check header
line = f.readline()
try:
fields = line.split("\t")
int(fields[1])
int(fields[4])
int(fields[5])
# no header, continue to start from the beginning of allc file
f.seek(0)
cur_pointer = 0
except:
# find header, skip it
cur_pointer = f.tell()
# find chrom pointer
while True:
line = f.readline()
if not line: break
fields = line.split("\t")
if fields[0] != cur_chrom:
g.write(fields[0]+"\t"+str(cur_pointer)+"\n")
first_line = True
for line in f:
if first_line:
# all new allc files don't have header,
# some old files' first line is header
fields = line.split("\t")
first_line = False
try:
int(fields[1])
int(fields[4])
int(fields[5])
# no header, continue to start from the beginning of allc file
except ValueError:
# find header, skip it
cur_pointer += len(line)
continue
if not line.startswith(cur_start):
fields = line.split("\t")
index_lines.append(fields[0] + "\t" + str(cur_pointer) + "\n")
cur_chrom = fields[0]
cur_pointer = f.tell()
g.write("#eof\n")
cur_start = cur_chrom + '\t'
cur_pointer += len(line)
# backward compatibility
index_lines.append("#eof\n")
f.close()
g.close()
with open(index_file, 'w') as idx:
idx.writelines(index_lines)
return 0

def read_allc_index(allc_file):
Expand Down

0 comments on commit 4f634ee

Please sign in to comment.