bulik · tpoterba · Jul 24, 2020 · Jun 19, 2020 · Jun 19, 2020 · Jun 19, 2020
diff --git a/ldscore/parse.py b/ldscore/parse.py
@@ -9,6 +9,7 @@
 import numpy as np
 import pandas as pd
 import os
+import glob
 
 
 def series_eq(x, y):
@@ -20,12 +21,21 @@ def read_csv(fh, **kwargs):
     return pd.read_csv(fh, delim_whitespace=True, na_values='.', **kwargs)
 
 
-def sub_chr(s, chr):
-    '''Substitute chr for @, else append chr to the end of str.'''
+def sub_chr(s, chrom):
+    '''Replace every @ in s with str(chrom); if s has no @, append str(chrom) to the end of s.'''
     if '@' not in s:
         s += '@'
 
-    return s.replace('@', str(chr))
+    return s.replace('@', str(chrom))
+
+
+def get_present_chrs(fh, num):
+    '''Return the chromosome numbers in xrange(1, num) that have files on disk.
+
+    A chromosome is kept when glob finds at least one path matching
+    sub_chr(fh, chrom) followed by a dot and any suffix. num is exclusive.
+    '''
+    return [c for c in xrange(1, num) if glob.glob(sub_chr(fh, c) + '.*')]
 
 
 def which_compression(fh):
@@ -133,9 +143,10 @@ def ldscore(fh, num=None):
     '''Parse .l2.ldscore files, split across num chromosomes. See docs/file_formats_ld.txt.'''
     suffix = '.l2.ldscore'
     if num is not None:  # num files, e.g., one per chromosome
-        first_fh = sub_chr(fh, 1) + suffix
+        chrs = get_present_chrs(fh, num+1)
+        first_fh = sub_chr(fh, chrs[0]) + suffix
         s, compression = which_compression(first_fh)
-        chr_ld = [l2_parser(sub_chr(fh, i) + suffix + s, compression) for i in xrange(1, num + 1)]
+        chr_ld = [l2_parser(sub_chr(fh, i) + suffix + s, compression) for i in chrs]
         x = pd.concat(chr_ld)  # automatically sorted by chromosome
     else:  # just one file
         s, compression = which_compression(fh + suffix)
@@ -154,7 +165,7 @@ def M(fh, num=None, N=2, common=False):
         suffix += '_5_50'
 
     if num is not None:
-        x = np.sum([parsefunc(sub_chr(fh, i) + suffix) for i in xrange(1, num + 1)], axis=0)
+        x = np.sum([parsefunc(sub_chr(fh, i) + suffix) for i in get_present_chrs(fh, num+1)], axis=0)
     else:
         x = parsefunc(fh + suffix)
 
@@ -176,8 +187,9 @@ def annot(fh_list, num=None, frqfile=None):
     annot_suffix = ['.annot' for fh in fh_list]
     annot_compression = []
     if num is not None:  # 22 files, one for each chromosome
+        chrs = get_present_chrs(fh, num+1)
         for i, fh in enumerate(fh_list):
-            first_fh = sub_chr(fh, 1) + annot_suffix[i]
+            first_fh = sub_chr(fh, chrs[0]) + annot_suffix[i]
             annot_s, annot_comp_single = which_compression(first_fh)
             annot_suffix[i] += annot_s
             annot_compression.append(annot_comp_single)
@@ -190,13 +202,13 @@ def annot(fh_list, num=None, frqfile=None):
 
         y = []
         M_tot = 0
-        for chr in xrange(1, num + 1):
+        for chrom in chrs:
             if frqfile is not None:
-                df_annot_chr_list = [annot_parser(sub_chr(fh, chr) + annot_suffix[i], annot_compression[i],
-                                                  sub_chr(frqfile, chr) + frq_suffix, frq_compression)
+                df_annot_chr_list = [annot_parser(sub_chr(fh, chrom) + annot_suffix[i], annot_compression[i],
+                                                  sub_chr(frqfile, chrom) + frq_suffix, frq_compression)
                                      for i, fh in enumerate(fh_list)]
             else:
-                df_annot_chr_list = [annot_parser(sub_chr(fh, chr) + annot_suffix[i], annot_compression[i])
+                df_annot_chr_list = [annot_parser(sub_chr(fh, chrom) + annot_suffix[i], annot_compression[i])
                                      for i, fh in enumerate(fh_list)]
 
             annot_matrix_chr_list = [np.matrix(df_annot_chr) for df_annot_chr in df_annot_chr_list]

diff --git a/ldscore/sumstats.py b/ldscore/sumstats.py
@@ -4,7 +4,6 @@
 This module deals with getting all the data needed for LD Score regression from files
 into memory and checking that the input makes sense. There is no math here. LD Score
 regression is implemented in the regressions module.
-
 '''
 from __future__ import division
 import numpy as np
@@ -51,10 +50,8 @@
 
 def _splitp(fstr):
     flist = fstr.split(',')
-    paths = []
-    for x in [os.path.expanduser(os.path.expandvars(x)) for x in flist]:
-      paths.extend(glob.glob(x))
-    return paths
+    # Expand $VARS and ~ in each comma-separated path; wildcards are left as-is.
+    return [os.path.expanduser(os.path.expandvars(p)) for p in flist]
 
 
 def _select_and_log(x, ii, log, msg):
@@ -147,11 +144,11 @@ def _read_chr_split_files(chr_arg, not_chr_arg, log, noun, parsefunc, **kwargs):
     '''Read files split across 22 chromosomes (annot, ref_ld, w_ld).'''
     try:
         if not_chr_arg:
-            log.log('Reading {N} from {F} ...'.format(F=not_chr_arg, N=noun))
+            log.log('Reading {N} from {F} ... ({p})'.format(N=noun, F=not_chr_arg, p=parsefunc.__name__))
             out = parsefunc(_splitp(not_chr_arg), **kwargs)
         elif chr_arg:
             f = ps.sub_chr(chr_arg, '[1-22]')
-            log.log('Reading {N} from {F} ...'.format(F=f, N=noun))
+            log.log('Reading {N} from {F} ... ({p})'.format(N=noun, F=f, p=parsefunc.__name__))
             out = parsefunc(_splitp(chr_arg), _N_CHR, **kwargs)
     except ValueError as e:
         log.log('Error parsing {N}.'.format(N=noun))