Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix globbing bug in splitp #221

Merged
merged 4 commits into from
Jul 24, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 22 additions & 10 deletions ldscore/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import numpy as np
import pandas as pd
import os
import glob


def series_eq(x, y):
Expand All @@ -20,12 +21,21 @@ def read_csv(fh, **kwargs):
return pd.read_csv(fh, delim_whitespace=True, na_values='.', **kwargs)


def sub_chr(s, chr):
def sub_chr(s, chrom):
'''Substitute chr for @, else append chr to the end of str.'''
if '@' not in s:
s += '@'

return s.replace('@', str(chr))
return s.replace('@', str(chrom))


def get_present_chrs(fh, num):
    '''Return the chromosome numbers in [1, num) that have at least one file on disk.

    For each candidate chromosome, `fh` is expanded with sub_chr and the result
    is globbed with a trailing '.*', so this assumes every per-chromosome file
    name is the expanded base followed by a dot and some suffix.

    Note that `num` is an exclusive upper bound (callers in this module pass
    num + 1). The expanded base is handed to glob unescaped, so any glob
    metacharacters in `fh` are interpreted as patterns.
    '''
    return [chrom for chrom in xrange(1, num)
            if glob.glob(sub_chr(fh, chrom) + '.*')]


def which_compression(fh):
Expand Down Expand Up @@ -133,9 +143,10 @@ def ldscore(fh, num=None):
'''Parse .l2.ldscore files, split across num chromosomes. See docs/file_formats_ld.txt.'''
suffix = '.l2.ldscore'
if num is not None: # num files, e.g., one per chromosome
first_fh = sub_chr(fh, 1) + suffix
chrs = get_present_chrs(fh, num+1)
first_fh = sub_chr(fh, chrs[0]) + suffix
s, compression = which_compression(first_fh)
chr_ld = [l2_parser(sub_chr(fh, i) + suffix + s, compression) for i in xrange(1, num + 1)]
chr_ld = [l2_parser(sub_chr(fh, i) + suffix + s, compression) for i in chrs]
x = pd.concat(chr_ld) # automatically sorted by chromosome
else: # just one file
s, compression = which_compression(fh + suffix)
Expand All @@ -154,7 +165,7 @@ def M(fh, num=None, N=2, common=False):
suffix += '_5_50'

if num is not None:
x = np.sum([parsefunc(sub_chr(fh, i) + suffix) for i in xrange(1, num + 1)], axis=0)
x = np.sum([parsefunc(sub_chr(fh, i) + suffix) for i in get_present_chrs(fh, num+1)], axis=0)
else:
x = parsefunc(fh + suffix)

Expand All @@ -176,8 +187,9 @@ def annot(fh_list, num=None, frqfile=None):
annot_suffix = ['.annot' for fh in fh_list]
annot_compression = []
if num is not None: # 22 files, one for each chromosome
chrs = get_present_chrs(fh, num+1)
for i, fh in enumerate(fh_list):
first_fh = sub_chr(fh, 1) + annot_suffix[i]
first_fh = sub_chr(fh, chrs[0]) + annot_suffix[i]
annot_s, annot_comp_single = which_compression(first_fh)
annot_suffix[i] += annot_s
annot_compression.append(annot_comp_single)
Expand All @@ -190,13 +202,13 @@ def annot(fh_list, num=None, frqfile=None):

y = []
M_tot = 0
for chr in xrange(1, num + 1):
for chrom in chrs:
if frqfile is not None:
df_annot_chr_list = [annot_parser(sub_chr(fh, chr) + annot_suffix[i], annot_compression[i],
sub_chr(frqfile, chr) + frq_suffix, frq_compression)
df_annot_chr_list = [annot_parser(sub_chr(fh, chrom) + annot_suffix[i], annot_compression[i],
sub_chr(frqfile, chrom) + frq_suffix, frq_compression)
for i, fh in enumerate(fh_list)]
else:
df_annot_chr_list = [annot_parser(sub_chr(fh, chr) + annot_suffix[i], annot_compression[i])
df_annot_chr_list = [annot_parser(sub_chr(fh, chrom) + annot_suffix[i], annot_compression[i])
for i, fh in enumerate(fh_list)]

annot_matrix_chr_list = [np.matrix(df_annot_chr) for df_annot_chr in df_annot_chr_list]
Expand Down
11 changes: 4 additions & 7 deletions ldscore/sumstats.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
This module deals with getting all the data needed for LD Score regression from files
into memory and checking that the input makes sense. There is no math here. LD Score
regression is implemented in the regressions module.

'''
from __future__ import division
import numpy as np
Expand Down Expand Up @@ -51,10 +50,8 @@

def _splitp(fstr):
flist = fstr.split(',')
paths = []
for x in [os.path.expanduser(os.path.expandvars(x)) for x in flist]:
paths.extend(glob.glob(x))
return paths
flist = [os.path.expanduser(os.path.expandvars(x)) for x in flist]
return flist


def _select_and_log(x, ii, log, msg):
Expand Down Expand Up @@ -147,11 +144,11 @@ def _read_chr_split_files(chr_arg, not_chr_arg, log, noun, parsefunc, **kwargs):
'''Read files split across 22 chromosomes (annot, ref_ld, w_ld).'''
try:
if not_chr_arg:
log.log('Reading {N} from {F} ...'.format(F=not_chr_arg, N=noun))
log.log('Reading {N} from {F} ... ({p})'.format(N=noun, F=not_chr_arg, p=parsefunc.__name__))
out = parsefunc(_splitp(not_chr_arg), **kwargs)
elif chr_arg:
f = ps.sub_chr(chr_arg, '[1-22]')
log.log('Reading {N} from {F} ...'.format(F=f, N=noun))
log.log('Reading {N} from {F} ... ({p})'.format(N=noun, F=f, p=parsefunc.__name__))
out = parsefunc(_splitp(chr_arg), _N_CHR, **kwargs)
except ValueError as e:
log.log('Error parsing {N}.'.format(N=noun))
Expand Down