-
Notifications
You must be signed in to change notification settings - Fork 1
/
bridgePRS
executable file
·372 lines (252 loc) · 22.1 KB
/
bridgePRS
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
#!/usr/bin/env python3
########################################################################################################################################################################
import sys, os, shutil, argparse, platform, multiprocessing
import argparse as ap
from src.Python.Util.Bridge_IO.BridgeHelp import BridgeHelp
#from src.Python.Util.BridgeParse import BridgeParse
from collections import Counter
from collections import defaultdict as dd
# Shared BridgeHelp instance; MPS.error() calls its evaluate_error() before reporting a message.
bridgeHelp = BridgeHelp()
# Two-column %-format for help text: first value left-justified in 30 chars, second right-justified in 29.
HELP_FORMAT="%-30s %29s"
# optimize easyrun web
# restart
class MPS(argparse.ArgumentParser):
    """Argument parser for the bridgePRS command line.

    Extends ``argparse.ArgumentParser`` with bridgePRS-specific error
    reporting, population config-file parsing (``config_key``) and
    ``type=`` validators that resolve files, directories and file prefixes
    against the search roots held in ``self.paths`` (see ``set_paths``).
    """

    # Exit status shared by every fatal error path (same value error() uses).
    _ERROR_STATUS = 2

    # module name -> (description, notes); rendered through HELP_FORMAT.
    _MODULE_HELP = {
        'pipeline': ('Main Program: BridgePRS', 'Stage1,2,Combine,Analyze'),
        'analyze': ('Analyze/Combine Results:', '*Plotting Requires Matplotlib'),
        'prs-single': ('Subprogram: Stage1', 'Ridge Regression'),
        'build-model': ('Subprogram: Base Model', 'Necessary for (port/bridge)'),
        'prs-port': ('Subprogram: Optional Stage', 'Port snp-weights btwn pops'),
        'prs-prior': ('Subprogram: Stage 2', 'Bridge snp-priors btwn pops'),
        'easyrun': ('Run bridgePRS Pipeline:', 'Full Program'),
        'check': ('Bridge Axillary Commands', 'None'),
        'tools': ('Axillary Programs: Toolbox', 'reformat files,check data,etc'),
    }

    # sub-command name -> description; the notes column is always empty.
    _COMMAND_HELP = {
        'run': 'Run The Following Module Commands:',
        'go': 'Run BridgePRS Sequentially: prs-single, build-model,prs-port,prs-prior,analyze',
        'clump': 'Load Sumstats Data, Produce SNP Clumps',
        'beta': 'Evaluate SNP Clumps, Produce SNP Weights',
        'predict': 'Apply SNP Weights, Produce Polygenic Predictions',
        'quantify': 'Test Predictions, Produce Quantifiable Results',
        'optimize': 'Optimize SNP Weights,Produce Generalizable Predictions',
        'prior': 'Test Predictions,Produce SNP Prior Proability Matrix',
        'result': 'Analyze PRS Results',
        'combine': 'Combine And Analyze PRS Results',
        'requirements': 'Check System Requirements',
        'pop': 'Check Population Data',
        'pops': 'Check Two Population Datasets',
        'data': 'Check Input Data',
        'reformat-sumstats': 'Validate And Reformat A Sumstats File',
    }

    def set_paths(self):
        """Initialise the search roots used by the ``valid_*`` validators.

        The roots are the empty prefix (a path is tried exactly as given)
        and the directory part of the absolute path of ``sys.argv[0]``.
        Returns ``self`` so the call can be chained onto the constructor.
        """
        script_dir = os.path.abspath(sys.argv[0]).rpartition('/')[0]
        self.paths = ['', script_dir + '/']
        return self

    # ------------------------------------------------------------------ #
    # Error reporting
    # ------------------------------------------------------------------ #
    def error(self, detail=None, notes=None, info=None):
        """Report a usage error, print help and exit with status 2.

        Overrides ``ArgumentParser.error`` (argparse passes the message as
        the single positional argument ``detail``).

        detail: message text; written only when it is not None and
                ``bridgeHelp.evaluate_error`` returns a true value.
        notes:  optional list; ``['config', key, value, 'INVALID-...']``
                produces a "Does Not Exist" message, anything else is printed.
        info:   optional list of extra lines written after ``detail``.
        """
        notes = [] if notes is None else notes
        info = [] if info is None else info
        # evaluate_error is always called first, exactly as before, in case
        # it has side effects (its implementation is not visible here).
        if bridgeHelp.evaluate_error(detail, sys.argv) and detail is not None:
            sys.stderr.write('\n' + detail + '\n')
        for extra in info:
            sys.stderr.write(extra + '\n\n')
        if notes:
            if notes[0] == 'config' and notes[-1].split('-')[0] == 'INVALID':
                sys.stderr.write('\nError - Supplied "' + notes[1] + '" Does Not Exist: ' + notes[2] + '\n')
            else:
                print(notes)
        sys.stderr.write('\n')
        self.print_help()
        sys.exit(self._ERROR_STATUS)

    def fail(self, MSG, ETYPE='BridgeInputError:'):
        """Write an input error, print help and exit with a non-zero status.

        MSG is either a single string or a list/tuple of lines; extra lines
        are indented by the width of ETYPE.
        """
        # Same width the original join-based expression produced.
        pad = ' ' * max(len(ETYPE), 1)
        if isinstance(MSG, (list, tuple)):
            sys.stderr.write('\n' + ETYPE + ' ' + MSG[0] + '\n')
            for line in MSG[1:]:
                sys.stderr.write(pad + line + '\n')
        else:
            sys.stderr.write('\n' + ETYPE + ' ' + MSG)
        sys.stderr.write('\n')
        self.print_help()
        # Previously exited with status 0, hiding the failure from callers.
        sys.exit(self._ERROR_STATUS)

    def pop_error(self, msg='Invalid Input'):
        """Write a population-data error (no help text) and exit non-zero."""
        sys.stderr.write(' BridgePopError: ' + msg + '\n')
        sys.stderr.write('\n')
        sys.exit(self._ERROR_STATUS)

    def config_error(self, s1=None, s2=None):
        """Report a fatal problem in the config file last passed to config_key.

        s1 and s2 are optional detail lines. Prints help and exits non-zero.
        """
        sys.stderr.write('InvalidConfigFile: ' + self.Fn + '\n')
        for detail in (s1, s2):
            if detail is not None:
                sys.stderr.write(' ' + detail + '\n')
        sys.stderr.write('\n')
        self.print_help()
        sys.exit(self._ERROR_STATUS)

    def config_warning(self, s1='', s2=None):
        """Write a non-fatal warning about the current config file to stderr."""
        sys.stderr.write('ConfigFileWarning: ' + self.Fn + ' ' + s1 + '\n')
        if s2 is not None:
            sys.stderr.write(' ' + s2 + '\n')

    # ------------------------------------------------------------------ #
    # Config-file parsing
    # ------------------------------------------------------------------ #
    @staticmethod
    def _read_config_pairs(fp):
        """Return the (key, value) pairs of every ``KEY=VALUE`` line in fp.

        Text after ``#`` is ignored, lines that do not contain exactly one
        ``=`` are skipped, and whitespace around key and value is removed.
        """
        pairs = []
        with open(fp, "r") as handle:
            for line in handle:
                fields = line.split('#')[0].split('=')
                if len(fields) == 2:
                    pairs.append((fields[0].strip(), fields[1].strip()))
        return pairs

    def _resolve_sumstats_fields(self, K):
        """Pop the sumstats column names out of K; return them comma-joined.

        Names come from ``sumstats_fields`` (five comma-separated values)
        and/or individual ``ssf-<name>`` keys, which override by position.
        Exits through config_error/error when a name is missing.
        """
        ssf_names = ['snpid', 'ref', 'alt', 'p', 'beta']
        ssf_vals = ['_NA_'] * len(ssf_names)
        ssf_indivs = [k for k in K if k.split('-')[0] == 'ssf']
        if 'sumstats_fields' in K:
            ssf_vals = K.pop('sumstats_fields').split(',')
            if len(ssf_vals) != 5:
                self.config_error('Invalid sumstats_fields entry. Five comma separated values required', 'Example: SUMSTATS_FIELDS=ID,REF,ALT,PV,BETA')
        for k in ssf_indivs:
            suffix = k.split('-')[-1]
            if suffix in ssf_names:
                ssf_vals[ssf_names.index(suffix)] = K.pop(k)
        if '_NA_' in ssf_vals:
            F_NEED = [ssf_names[i].upper() for i, v in enumerate(ssf_vals) if v == '_NA_']
            self.error('ConfigFileError: Missing The Following Sumstats Field(s): ' + ", ".join(F_NEED) + '\n Please add missing field name(s) individually [eg: SSF-' + F_NEED[0] + '=<FIELD_NAME>],\n Or as five ordered and comma separated values [eg: SUMSTATS_FIELDS=ID,REF,A1,P,BETA]')
        return ','.join(ssf_vals)

    def config_key(self, F):
        """Parse one population config file into a settings dict.

        Used as an argparse ``type=`` callable. The file's own directory is
        appended to ``self.paths`` so its relative entries can be resolved.
        Keys are case-insensitive; keys ending in ``_file``, ``_path`` or
        ``_prefix`` are validated against the filesystem. ``genotype_prefix``
        and ``phenotype_file`` fall back to the previously parsed config
        when absent. Fatal problems exit via error()/config_error().

        Returns the dict, which is also stored as ``self.lastConfig``.
        Raises ValueError (via valid_file) when F itself cannot be found.
        """
        # A previous config exists exactly when an earlier call completed.
        has_previous = hasattr(self, 'lastConfig')
        self.Fn = F.split('/')[-1]
        C, K = {'config_name': F, 'MISSING': dd(bool)}, {}
        fp = self.valid_file(F)
        self.paths.append(fp.rpartition('/')[0] + '/')
        pairs = self._read_config_pairs(fp)

        # Keys are lowercased below, so repeats are detected case-insensitively.
        seen = set()
        for key, _ in pairs:
            if key.lower() in seen:
                self.config_error('Repeated Option: ' + key)
            seen.add(key.lower())

        validators = {'file': self.valid_file, 'path': self.valid_path, 'prefix': self.valid_pref}
        for key, value in pairs:
            name = key.lower()
            check = validators.get(name.split('_')[-1])
            if check is None:
                K[name] = value
                continue
            resolved = check(value, CONFIG=True)
            if resolved.split('-')[0] == 'INVALID':
                self.error(notes=['config', name, value, resolved])
            else:
                K[name] = resolved

        # A single sumstats file is stored as a prefix with a sentinel suffix.
        if 'sumstats_file' in K:
            if 'sumstats_prefix' in K or 'sumstats_suffix' in K:
                self.config_error('Incompatible Arguments', 'sumstats_file not compatible with sumstats_prefix/sumstats_suffix')
            else:
                K['sumstats_prefix'], K['sumstats_suffix'] = K.pop('sumstats_file'), '__FILE__'

        for k in ['pop', 'ldpop', 'ld_path', 'sumstats_prefix']:
            if k not in K:
                self.config_error('Is missing a required argument', 'please add to config file: ' + k.upper() + '=<' + k.lower() + '>')
            else:
                C[k] = K.pop(k)

        for k in ['sumstats_suffix', 'sumstats_size', 'validation_file']:
            if k in K:
                C[k] = K.pop(k)
                continue
            C[k], C['MISSING'][k] = None, True
            if k == 'sumstats_suffix':
                self.config_warning('is missing sumstats_suffix, will attempt to infer from filesystem')
            elif k == 'sumstats_size':
                self.config_warning('is missing sumstats_size, required for stage-2')

        for k in ['genotype_prefix', 'phenotype_file']:
            if k in K:
                C[k] = K.pop(k)
            elif has_previous:
                C[k] = self.lastConfig[k]
            else:
                self.config_error('Is missing a required leading argument', 'please add to config file: ' + k.upper() + '=<' + k.lower() + '>')

        C['sumstats_fields'] = self._resolve_sumstats_fields(K)
        for k in ['snp_file', 'thinned_snp_file', 'max_clump_size', 'covariates', 'clump_value']:
            C[k] = K.pop(k) if k in K else None
        # An individual SSF value containing a comma would break the 5-field layout.
        if len(C['sumstats_fields'].split(',')) != 5:
            self.error('ConfigFileError: Invalid Entry For sumstats_fields, five comma separated values are required\n For snp_id, ref_allele, alt_allele, pvalue, and effect_size\n Example: ID,REF,A1,P,BETA')
        # Anything still left in K was not recognised.
        for k in K:
            self.error('Unrecognized Field in Config File: ' + k + '\n Hint: Remove or Comment Out Line: ' + k.upper() + '=' + K[k])
        self.lastConfig = C
        return C

    # ------------------------------------------------------------------ #
    # argparse ``type=`` validators
    # ------------------------------------------------------------------ #
    def dir_path(self, path):
        """Create directory ``path`` (with parents) if needed and return it."""
        os.makedirs(path, exist_ok=True)
        return path

    def valid_pref(self, f, CONFIG=False):
        """Resolve a file prefix to ``<absolute dir>/<prefix name>``.

        For each search root, the prefix's directory part is looked up and
        accepted when it holds at least one entry starting with the prefix
        name. When nothing matches, returns ``'INVALID-PREFIX'`` if CONFIG
        is true, otherwise raises ValueError.
        """
        fp, _, fn = f.rpartition('/')
        for root in self.paths:
            folder = (root + fp) or './'
            if not os.path.isdir(folder):
                continue
            if any(entry.startswith(fn) for entry in os.listdir(folder)):
                return os.path.abspath(folder) + '/' + fn
        if CONFIG:
            return 'INVALID-PREFIX'
        raise ValueError('no file found with prefix: ' + f)

    def valid_path(self, f, CONFIG=False):
        """Resolve directory ``f`` against the search roots; return its absolute path.

        When not found, returns ``'INVALID-PATH'`` if CONFIG is true,
        otherwise raises ValueError.
        """
        for root in self.paths:
            if os.path.isdir(root + f):
                return os.path.abspath(root + f)
        if CONFIG:
            return 'INVALID-PATH'
        raise ValueError('directory not found: ' + f)

    def valid_file(self, f, CONFIG=False):
        """Resolve file ``f`` against the search roots; return its absolute path.

        When not found, returns ``'INVALID-FILE'`` if CONFIG is true,
        otherwise raises ValueError.
        """
        for root in self.paths:
            if os.path.isfile(root + f):
                return os.path.abspath(root + f)
        if CONFIG:
            return 'INVALID-FILE'
        raise ValueError('file not found: ' + f)

    def find_platform(self, p='NA'):
        """Map a platform name to ``'mac'`` or ``'linux'``.

        With the default ``'NA'`` the host is probed through the
        ``platform`` module. Only the first three letters are compared;
        an unrecognised value is returned unchanged.
        """
        if p == 'NA':
            tags = [platform.system().upper()[0:3], platform.platform().upper()[0:3]]
            linux_tags = ('LIN', 'SER')
        else:
            tags = [p.upper()[0:3]]
            linux_tags = ('LIN', 'UNI', 'SER')  # 'UNI' is accepted only for explicit input
        if any(tag in ('MAC', 'DAR') for tag in tags):
            return 'mac'
        if any(tag in linux_tags for tag in tags):
            return 'linux'
        return p

    def force_caps(self, x, y=None):
        """Return ``x`` upper-cased (``y`` is accepted but unused)."""
        return x.upper()

    # ------------------------------------------------------------------ #
    # Help-text helpers
    # ------------------------------------------------------------------ #
    def help_str(self, module):
        """Return the formatted help row for a top-level module, or None if unknown."""
        columns = self._MODULE_HELP.get(module)
        if columns is None:
            return None
        return (HELP_FORMAT % columns)

    def sub_help(self, cmd, module=None):
        """Return the formatted help row for sub-command ``cmd``.

        Unknown commands get the placeholder row ``('INFO', 'INFO')``;
        ``module`` is accepted but unused.
        """
        if cmd in self._COMMAND_HELP:
            HS = (self._COMMAND_HELP[cmd], '')
        else:
            HS = ('INFO', 'INFO')
        return (HELP_FORMAT % HS)
###############################################################################################################################################
if __name__ == '__main__':
    # Command-line definition for bridgePRS.  Parent parsers (Global, Pipeline,
    # OnePop, Pop, Analyze, Tools) hold shared option sets that are attached to
    # each sub-command through ``parents=[...]``; the finished root parser is
    # handed to BridgePRS together with sys.argv.
    mps = MPS(formatter_class=lambda prog: ap.RawTextHelpFormatter(prog, max_help_position=100)).set_paths()
    subs = mps.add_subparsers(help=(HELP_FORMAT % ('Description', 'Notes')), dest='module', title='Modules')
    subs.required = True
    sub_help = (str(HELP_FORMAT % ('Description', '')))
    Global, Pipeline, OnePop, Pop = MPS(add_help=False), MPS(add_help=False), MPS(add_help=False), MPS(add_help=False)

    # Two-population options (pipeline go/check).
    twopop = Pipeline.add_argument_group('Required')
    twopop.add_argument('--config_files', dest='config', type=mps.config_key, nargs=2, required=True, metavar='', help='Two Pop Config Files Required: Target and Base')
    twopop.add_argument('-o', '--out', dest='outpath', type=str, action='store', metavar='', help='Output Path For Pipeline Run')
    twopop.add_argument('--phenotype', dest='phenotype', type=str, default=None, metavar='', help='Phenotype Name (Must Match Phenotype File(s))')
    twopop.add_argument('--fst', dest='fst', type=float, metavar='', default=0.1, help='Fst Value Between Target and Base')

    # Single-population options (prs-single, build-model, prs-port, prs-prior).
    onepop = OnePop.add_argument_group('Required')
    onepop.add_argument('--config_file', dest='config', type=mps.config_key, nargs=1, required=True, help='One Population Config File Is Required')
    onepop.add_argument('-o', '--out', dest='outpath', type=str, action='store', metavar='', help='Output Path For Pipeline Run')
    onerun = OnePop.add_argument_group('Generated Or Subprogram Specific')
    onerun.add_argument('--phenotype', dest='phenotype', type=str, default=None, metavar='', help='Phenotype Name')
    onerun.add_argument('--fst', dest='fst', type=float, metavar='', default=0.1, help='Fst Value Between Target and Base')
    onerun.add_argument('--model_file', dest='model_file', type=mps.valid_file, action='store', metavar='', default=None, help='large population model file')
    onerun.add_argument('--clump_prefix', dest='clump_prefix', type=mps.valid_pref, action='store', metavar='', default=None, help='prefix for files generated by clump step')
    onerun.add_argument('--beta_prefix', dest='beta_prefix', type=mps.valid_pref, action='store', metavar='', default=None, help='prefix for files generated by beta step')
    onerun.add_argument('--predict_prefix', dest='predict_prefix', type=mps.valid_pref, action='store', metavar='', default=None, help='prefix for files generated by predict step')

    # Options shared by every sub-command.
    system, toggle = Global.add_argument_group('System'), Global.add_argument_group('Toggles')
    system.add_argument('--platform', dest='platform', action='store', metavar='', type=mps.find_platform, default=mps.find_platform("NA"), help='Force platform (Linux or MacOS)')
    system.add_argument('--cores', dest='cores', action='store', type=int, default=0, metavar='', help='By default bridgePRS parralelized across (n-1) cores.')
    system.add_argument('--total_cores', dest='total_cores', action='store', type=int, metavar='', default=multiprocessing.cpu_count(), help=argparse.SUPPRESS)
    system.add_argument('--rPath', dest='rpath', action='store', type=mps.valid_path, default='src/Rscripts', help=argparse.SUPPRESS)
    system.add_argument('--plinkPath', dest='plinkpath', action='store', type=mps.valid_path, default='src/Python/Xtra', help=argparse.SUPPRESS)
    system.add_argument('--debug_level', dest='debug_level', type=int, metavar='', default=1, help=argparse.SUPPRESS)
    for t in ['verbose', 'restart', 'clean', 'noplots']:
        toggle.add_argument('--' + t, dest=t, action='store_true', default=False, help='Toggle ' + t.capitalize() + ' Mode On')

    # Per-population data given directly on the command line (tools only).
    pop = Pop.add_argument_group('Population Args')
    pop.add_argument('--pop', '--pops', dest='pop', type=mps.force_caps, nargs='+', metavar='', default=[], help='Pop Name(s)')
    pop.add_argument('--ldpop', dest='ldpop', type=str, nargs='+', metavar='', default=[], help=argparse.SUPPRESS)
    pop.add_argument('--ld_path', dest='ld_path', type=mps.valid_path, nargs='+', metavar='', default=[], help='Path to LD-Reference Panel')
    pop.add_argument('--sumstats_prefix', '--sumstats_file', dest='sumstats_prefix', type=mps.valid_pref, nargs='+', metavar='', default=[], help='path to sumstats prefix')
    pop.add_argument('--sumstats_suffix', dest='sumstats_suffix', type=str, nargs='+', metavar='', default=[], help='sumstats suffix')
    pop.add_argument('--sumstats_size', dest='sumstats_size', type=str, nargs='+', metavar='', default=[], help='sumstats size')
    pop.add_argument('--sumstats_fields', dest='sumstats_fields', type=str, nargs='+', metavar='', default=['ID,REF,A1,P,BETA'], help='Sumstats Fields')
    pop.add_argument('--genotype_prefix', dest='genotype_prefix', type=mps.valid_pref, nargs='+', metavar='', default=[], help='path to genotype prefix')
    pop.add_argument('--phenotype_file', dest='phenotype_file', type=mps.valid_file, nargs='+', metavar='', default=[], help='phenotype file')
    # NOTE(review): unlike its neighbours this option has no nargs but still defaults to [] - confirm intent.
    pop.add_argument('--validation_file', dest='validation_file', type=mps.valid_file, metavar='', default=[], help='phenotype validation file')
    pop.add_argument('--snp_file', dest='snp_file', type=mps.valid_file, nargs='+', metavar='', default=[], help='snp qc file')
    pop.add_argument('--thinned_snp_file', dest='thinned_snp_file', type=mps.valid_file, nargs='+', metavar='', default=[], help='path to thinned snp qc file')
    pop.add_argument('--max_clump_size', dest='max_clump_size', type=int, nargs='+', metavar='', default=[], help='Max Size for Clumping')
    pop.add_argument('--covariates', dest='covariates', type=str, nargs='+', metavar='', default=[], help='Phenotype File Field(s): covariates')
    pop.add_argument('--clump_value', dest='clump_value', type=float, nargs='+', metavar='', default=[], help='Clump Pvalue')
    pop.add_argument('--phenotype', dest='phenotype', type=str, default=None, metavar='', help='Phenotype Name')

    # Option: PIPELINE #
    pipeline = subs.add_parser('pipeline', help=mps.help_str('pipeline')).add_subparsers(help=(sub_help), dest='cmd', title="Commands")
    pipeline.required = True
    go = pipeline.add_parser('go', parents=[Pipeline, Global], help=mps.sub_help('go'), formatter_class=mps.formatter_class)
    go.add_argument('--port', dest='port', action='store_true', default=False, help=ap.SUPPRESS)
    check = pipeline.add_parser('check', parents=[Pipeline, Global], help=mps.sub_help('check'), formatter_class=mps.formatter_class)

    # Option: single-population sub-programs, each with its own command list #
    stage_names = ['prs-single', 'build-model', 'prs-port', 'prs-prior']
    stage_parsers = [subs.add_parser(x, help=mps.help_str(x)).add_subparsers(help=sub_help, dest='cmd', title='Commands') for x in stage_names]
    prs_single, build, prs_port, prs_bridge = stage_parsers
    for sub_parser, sn in zip(stage_parsers, stage_names):
        sub_parser.required = True
        if sn in ['prs-single', 'prs-prior']:
            cmds = ['run', 'clump', 'beta', 'predict', 'quantify']
        elif sn == 'prs-port':
            cmds = ['run', 'predict', 'quantify']
        else:
            cmds = ['run', 'clump', 'beta', 'predict', 'prior']
        for c in cmds:
            sub_parser.add_parser(c, parents=[OnePop, Global], help=mps.sub_help(c), formatter_class=mps.formatter_class)

    # Option: ANALYZE #
    Analyze = MPS(add_help=False)
    Analyze.add_argument('--result_files', nargs='+', dest='result_files', type=str, required=True, metavar='', help='One or More PRS Result Files')
    Analyze.add_argument('-o', '--out', dest='outpath', type=str, action='store', metavar='', help='Output Path For Analysis')
    analyze = subs.add_parser('analyze', help=mps.help_str('analyze')).add_subparsers(help=(sub_help), dest='cmd', title="Commands")
    for p in ['result', 'combine']:
        analyze.add_parser(p, parents=[Global, Analyze], help=mps.sub_help(p), formatter_class=mps.formatter_class)

    # Option: TOOLS #
    Tools = MPS(add_help=False)
    # Two separate flags: the former single string '-o, --out' never registered '--out'.
    Tools.add_argument('-o', '--out', dest='outpath', type=str, action='store', metavar='', help='Output Path For Toolbox Result')
    tools = subs.add_parser('tools', help=mps.help_str('tools')).add_subparsers(help=(sub_help), dest='cmd', title="Commands")
    tools.required = True
    # (sub-command name, key passed to MPS.sub_help for its help row)
    tool_commands = [
        ('reformat-sumstats', 'reformat-sumstats'),
        ('check-requirements', 'requirements'),
        ('check-pop', 'pop'),
        ('check-pops', 'pops'),
        ('check-data', 'data'),
    ]
    for tool_name, help_key in tool_commands:
        tools.add_parser(tool_name, parents=[Global, Tools, Pop], help=mps.sub_help(help_key), formatter_class=mps.formatter_class)

    ####################################################################################################################
    # Imported here, after the parser has been built, as in the original script.
    from src.Python.BridgePRS import BridgePRS
    bridgePRS = BridgePRS(mps, sys.argv)