diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f1b1e44 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ + +.idea/libraries/R_User_Library.xml +.idea/methylpy.iml +.idea/misc.xml +.idea/modules.xml +.idea/vcs.xml +.idea/workspace.xml diff --git a/methylpy/utilities.py b/methylpy/utilities.py index 7f1d5bb..e640089 100644 --- a/methylpy/utilities.py +++ b/methylpy/utilities.py @@ -672,7 +672,7 @@ def index_allc_file(allc_file,reindex=False): if (not reindex) and os.path.exists(index_file): # check index file completeness eof_count = 0 - with open(index_file,'r') as f: + with open(index_file,'r') as f: for line in f: if line == '#eof\n': eof_count += 1 @@ -681,34 +681,46 @@ def index_allc_file(allc_file,reindex=False): pass else: return 0 - g = open(index_file,'w') - f = open_allc_file(allc_file) - cur_chrom = "" + + if allc_file.endswith('gz'): # works for .gz, .bgz + f = subprocess.Popen(['zcat', allc_file], + stdout=subprocess.PIPE, + encoding='utf8').stdout + else: + f = open(allc_file) + + index_lines = [] + cur_chrom = "TOTALLY_NOT_A_CHROM" + cur_start = cur_chrom + '\t' + cur_pointer = 0 # check header - line = f.readline() - try: - fields = line.split("\t") - int(fields[1]) - int(fields[4]) - int(fields[5]) - # no header, continue to start from the beginning of allc file - f.seek(0) - cur_pointer = 0 - except: - # find header, skip it - cur_pointer = f.tell() - # find chrom pointer - while True: - line = f.readline() - if not line: break - fields = line.split("\t") - if fields[0] != cur_chrom: - g.write(fields[0]+"\t"+str(cur_pointer)+"\n") + first_line = True + for line in f: + if first_line: + # all new allc files don't have header, + # some old files' first line is header + fields = line.split("\t") + first_line = False + try: + int(fields[1]) + int(fields[4]) + int(fields[5]) + # no header, continue to start from the beginning of allc file + except ValueError: + # find header, skip it + cur_pointer += len(line) + continue + if not line.startswith(cur_start): + fields = line.split("\t") + index_lines.append(fields[0] + "\t" + str(cur_pointer) + "\n") cur_chrom = fields[0] - cur_pointer = f.tell() - g.write("#eof\n") + cur_start = cur_chrom + '\t' + cur_pointer += len(line) + # backward compatibility + index_lines.append("#eof\n") f.close() - g.close() + with open(index_file, 'w') as idx: + idx.writelines(index_lines) return 0 def read_allc_index(allc_file):