Commit a2d0431

close #47, close #50, close #14, close #32

gurdeep330 committed Jun 6, 2023
1 parent 888b9f3 commit a2d0431
Showing 110 changed files with 189,868 additions and 488 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -127,3 +127,4 @@ dmypy.json

# Pyre type checker
.pyre/
webApp/static/predictor/output/*
14 changes: 14 additions & 0 deletions DB/hh
@@ -0,0 +1,14 @@
# --- full sequence ---- --- best 1 domain ---- --- domain number estimation ----
# target name accession query name accession E-value score bias E-value score bias exp reg clu ov env dom rep inc description of target
#------------------- ---------- -------------------- ---------- --------- ------ ----- --------- ------ ----- --- --- --- --- --- --- --- --- ---------------------
sp|Q7Z7A4|PXK_HUMAN - PK_Tyr_Ser-Thr PF07714.20 8.4e-11 27.7 0.0 1.8e-08 20.1 0.0 2.3 2 0 0 2 2 2 2 PX domain-containing protein kinase-like protein OS=Homo sapiens OX=9606 GN=PXK PE=1 SV=1
#
# Program: hmmsearch
# Version: 3.1b2 (February 2015)
# Pipeline mode: SEARCH
# Query file: ../pfam/PK_Tyr_Ser-Thr.hmm
# Target file: ../KA/UniProtFasta2/Q7Z7A4.fasta.gz
# Option settings: hmmsearch --tblout hh ../pfam/PK_Tyr_Ser-Thr.hmm ../KA/UniProtFasta2/Q7Z7A4.fasta.gz
# Current dir: /home/gurdeep/projects/kinaseResistance/DB
# Date: Mon Jun 5 18:26:54 2023
# [ok]
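The DB/hh file above is standard HMMER3 hmmsearch --tblout output: '#'-prefixed comment lines plus one whitespace-delimited row per target, where only the trailing description field may contain spaces. A minimal parsing sketch (a hypothetical helper, not part of this commit) that recovers the full-sequence E-value and score:

    def parse_tblout(path):
        """Parse HMMER3 --tblout rows into dicts keyed by the main score columns."""
        hits = []
        with open(path) as fh:
            for line in fh:
                if line.startswith('#') or not line.strip():
                    continue
                # 18 fixed columns, then a free-text description
                fields = line.split(None, 18)
                hits.append({
                    'target': fields[0],
                    'query': fields[2],
                    'full_evalue': float(fields[4]),
                    'full_score': float(fields[5]),
                    'description': fields[18].rstrip() if len(fields) > 18 else '',
                })
        return hits

    # e.g. parse_tblout('DB/hh')[0] -> target 'sp|Q7Z7A4|PXK_HUMAN', E-value 8.4e-11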
314 changes: 302 additions & 12 deletions DB/make_db.py

Large diffs are not rendered by default.

Binary file added KA/UniProtFasta/P0C263.txt.gz
Binary file not shown.
Binary file added KA/UniProtFasta/Q5MAI5.txt.gz
Binary file not shown.
Binary file added KA/UniProtFasta/Q86YV6.txt.gz
Binary file not shown.
Binary file added KA/UniProtFasta/Q96LW2.txt.gz
Binary file not shown.
Binary file added KA/UniProtFasta2/P0C263.txt.gz
Binary file not shown.
Binary file added KA/UniProtFasta2/Q5MAI5.txt.gz
Binary file not shown.
Binary file added KA/UniProtFasta2/Q86YV6.txt.gz
Binary file not shown.
Binary file added KA/UniProtFasta2/Q96LW2.txt.gz
Binary file not shown.
135 changes: 108 additions & 27 deletions ML/ML.py
@@ -26,19 +26,52 @@
from sklearn import tree
import xgboost as xgb
import pickle
import argparse

# Best hyper-parameters per task, in the order
# (max_depth, min_samples_split, min_samples_leaf, n_estimators):
# 5 3 3 100 for AD
# 12 3 3 100 for RN
# 5 3 3 100 for LvNA
# 5 3 3 100 for AvNL
# 5 4 4 100 for AIvNLD
# 5 5 5 50 for LDvNAI
# 5 3 5 50 for AIvLD
# 5 3 4 100 for AvL
# 5 4 4 100 for RvN

RANDOM_STATE = 0

RANDOM_STATE = 1
ALGO = 'RF'  # 'LR', 'XGB' or 'RF'
N_SPLITS = 10
N_REPEATS = 10
N_JOBS = -1

AA = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

def main(max_depth, min_samples_split, min_samples_leaf, n_estimators):
def makeSets(positives, negatives):
dic = {}
for set_type, set_type_name in [[positives, 'positives'], [negatives,'negatives']]:
if set_type_name not in dic:
dic[set_type_name] = []
for char in set_type:
if char == 'A':
dic[set_type_name].append('activating')
elif char == 'I':
dic[set_type_name].append('increase')
elif char == 'L':
dic[set_type_name].append('loss')
elif char == 'D':
dic[set_type_name].append('decrease')
elif char == 'R':
dic[set_type_name].append('resistance')
elif char == 'N':
dic[set_type_name].append('neutral')
else:
print('Error: invalid set type', char)
sys.exit(1)
return dic
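# For reference: with the task name 'AIvLD', makeSets('AI', 'LD') returns
# {'positives': ['activating', 'increase'], 'negatives': ['loss', 'decrease']}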


def main(max_depth, min_samples_split, min_samples_leaf, n_estimators,\
positives, negatives,
scaler_filename=None, model_filename=None):
df = pd.read_csv('trainDataFromHitsSplitTrimmedAln.tsv.gz', sep = '\t')
df['Dataset'] = df['Dataset'].replace(to_replace='train', value=0.025, regex=True)
df['Dataset'] = df['Dataset'].replace(to_replace='test', value=0.3, regex=True)
@@ -63,62 +63,96 @@ def main(max_depth, min_samples_split, min_samples_leaf, n_estimators):
'hmmSS',
# 'ChargesWT',
# 'ChargesMUT',
# 'ChargesDiff',
'ChargesDiff',
# 'A_known',
# 'D_known',
# 'R_known',
# 'Phosphomimic',
# 'hmmScoreWT',
# 'hmmScoreMUT',
# 'hmmScoreDiff'
'hmmScoreDiff'
]
'''
for aa in AA:
if aa not in ['S', 'T', 'Y']:
# if aa not in ['S', 'T', 'Y']:
columns_to_exclude.append(aa+'_WT')
if aa not in ['D', 'E']:
# if aa not in ['D', 'E']:
columns_to_exclude.append(aa+'_MUT')
'''

############
pfam_ptm_cols = ['ac_pfam', 'me_pfam', 'gl_pfam', 'm1_pfam', 'm2_pfam', 'm3_pfam', 'sm_pfam', 'ub_pfam']
for i in range(-5,6):
if i in [-2, -1, 0, 1, 2]: continue
if i in [-1, 0, 1]: continue
for col in pfam_ptm_cols:
columns_to_exclude.append(col.split('_')[0]+'_'+str(i)+'_'+col.split('_')[1])

pfam_ptm_cols = ['p_pfam']
for i in range(-5,6):
if i in [-2, -1, 0, 1, 2]: continue
if i in [-1, 0, 1]: continue
for col in pfam_ptm_cols:
columns_to_exclude.append(col.split('_')[0]+'_'+str(i)+'_'+col.split('_')[1])
############

ptm_cols = ['ac', 'me', 'gl', 'm1', 'm2', 'm3', 'sm', 'ub']
for i in range(-5,6):
if i in [-2, -1, 0, 1, 2]: continue
if i in [-1, 0, 1]: continue
for col in ptm_cols:
columns_to_exclude.append(col.split('_')[0]+'_'+str(i))

ptm_cols = ['p']
for i in range(-5,6):
if i in [-2, -1, 0, 1, 2]: continue
if i in [-1, 0, 1]: continue
for col in ptm_cols:
columns_to_exclude.append(col.split('_')[0]+'_'+str(i))

############

adr_cols = ['A', 'D', 'R']
for i in range(-5, 6):
if i in [-2, -1, 1, 2]: continue
if i in [-1, 1]: continue
for col in adr_cols:
columns_to_exclude.append(col+'_'+str(i))

############

adr_cols = ['A_pfam', 'D_pfam', 'R_pfam']
for i in range(-5, 6):
if i in [-2, -1, 1, 2]: continue
if i in [-1, 0, 1]: continue
for col in adr_cols:
columns_to_exclude.append(col.split('_')[0]+'_'+str(i)+'_'+col.split('_')[1])
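# Net effect of the loops above: PTM window features at positions -1, 0, +1
# (e.g. 'ac_-1', 'p_0_pfam') are not excluded; A/D/R features keep -1 and +1;
# A/D/R pfam features keep -1, 0, +1; every other window column
# (e.g. 'ac_-3_pfam', 'p_4', 'A_-5') is added to columns_to_exclude.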

@@ -132,7 +165,7 @@ def main(max_depth, min_samples_split, min_samples_leaf, n_estimators):
print ('columns to consider', df.columns.to_numpy())
columns_to_consider = '\n'.join(df.columns.to_numpy())
# print (columns_to_consider)
# open('columns_to_consider.txt', 'w').write(columns_to_consider)
open('columns_to_consider.txt', 'w').write(columns_to_consider)

feature_names = df.columns.to_numpy()
feature_names = feature_names[3:-1]
@@ -145,14 +178,17 @@ def main(max_depth, min_samples_split, min_samples_leaf, n_estimators):
X_test = []
y_test = []
test_names = []
dic = makeSets(positives, negatives)
for row in df.to_numpy():
# print (row)
if row[-1] in ['activating', 'increase']:
# if row[-1] in ['activating', 'increase']:
if row[-1] in dic['positives']:
y.append(1)
y_names.append(row[-1])
X.append(row[3:-1])
train_names.append('/'.join(row[:3]))
elif row[-1] in ['neutral', 'loss', 'decrease']:
# elif row[-1] in ['loss', 'decrease']:
elif row[-1] in dic['negatives']:
y.append(0)
y_names.append(row[-1])
X.append(row[3:-1])
@@ -180,7 +216,15 @@ def main(max_depth, min_samples_split, min_samples_leaf, n_estimators):
scaler.fit(X)
X = scaler.transform(X)
X_test = scaler.transform(X_test)
for name, row in zip(test_names, X_test):
if 'A84T' in name:
print (row)
print (len(row))
break
# sys.exit()
# pickle.dump(scaler, open('finalized_scaler_RN.pkl', 'wb'))
if scaler_filename is not None:
pickle.dump(scaler, open('scaler_'+scaler_filename+'.pkl', 'wb'))

y = np.array(y)

@@ -266,7 +310,8 @@ def main(max_depth, min_samples_split, min_samples_leaf, n_estimators):
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf,
'max_features': ['sqrt', 'log2'],
# 'max_features': ['sqrt', 'log2'],
'max_features': ['log2'],
'n_estimators': n_estimators
}
# parameters = {
@@ -302,6 +347,10 @@ def main(max_depth, min_samples_split, min_samples_leaf, n_estimators):
## Best model hyper-parameters
print ('Best model found during the CV')
print (model.best_params_)
print (model.predict_proba(X))
for y_pred, y_true in zip(model.predict_proba(X), y):
open('ai_ld_roc.txt', 'a').write(str(y_pred[1]) + '\t' + str(y_true) + '\n')
# sys.exit()
'''
tprs = []
aucs = []
@@ -471,6 +520,7 @@ def main(max_depth, min_samples_split, min_samples_leaf, n_estimators):
class_names = y_names,
filled=True)
# plt.show()
'''


print (''.join(['#' for i in range(1,25)]))
@@ -484,7 +534,9 @@ def main(max_depth, min_samples_split, min_samples_leaf, n_estimators):
data.append(row)

df_feature_importances = pd.DataFrame(data, columns=['Feature', 'Importance'])
df_feature_importances.sort_values(by=['Importance'], ascending=False)
df_feature_importances = df_feature_importances.sort_values(by=['Importance'], ascending=False)
print (df_feature_importances)
'''
sns.set(font_scale = 0.6)
sns.barplot(data=df_feature_importances, color="grey", x="Importance", y="Feature")
plt.grid(True, lw=0.1)
Expand All @@ -494,13 +546,14 @@ def main(max_depth, min_samples_split, min_samples_leaf, n_estimators):


# filename = 'finalized_model_RN.sav'
# pickle.dump(clf, open(filename, 'wb'))
if model_filename is not None:
pickle.dump(clf, open('model_'+model_filename+'.sav', 'wb'))

test_types = ['activatingresistance','resistance', 'A', 'TBD', 'Inconclusive']
test_types = ['activatingresistance', 'increaseresistance','resistance', 'A', 'TBD', 'Inconclusive']
for test_type in test_types:
print (''.join(['#' for i in range(1,25)]))
if test_type in ['resistance', 'activatingresistance']:
# if test_type in ['activatingresistance']:
if test_type in ['activatingresistance', 'increaseresistance']:
# if test_type in ['activatingresistance', 'increaseresistance', 'resistance']:
X_sub_test = []; y_sub_test = []
for test_name, p, q in zip(test_names, X_test, y_test):
if q != test_type: continue
@@ -521,13 +574,41 @@ def main(max_depth, min_samples_split, min_samples_leaf, n_estimators):
X_sub_test = []
X_sub_test.append(p)
X_sub_test = np.array(X_sub_test)
if 'A84' in test_name:
print (X_sub_test)
y_pred = round((clf.predict_proba(X_sub_test)[0])[1], 3)
print (test_name, y_pred, q)

if __name__ == '__main__':
'''max_depth = int(sys.argv[1])
min_samples_split = int(sys.argv[2])
min_samples_leaf = int(sys.argv[3])
n_estimators = int(sys.argv[4])'''
# set arguments
parser = argparse.ArgumentParser(description='Training for Activark', epilog='End of help.')
parser.add_argument('max_depth', help='maximum depth of each tree')
parser.add_argument('min_samples_split', help='minimum number of samples required to split an internal node')
parser.add_argument('min_samples_leaf', help='minimum number of samples required at a leaf node')
parser.add_argument('n_estimators', help='number of trees in the forest')
parser.add_argument('name', help='AIvLD or AIvNLD or LDvNAI or RvN')
parser.add_argument('--s', help='filename of the scaler to be saved')
parser.add_argument('--m', help='filename of the model to be saved')
args = parser.parse_args()

# convert the positional arguments to the types main() expects
max_depth = int(args.max_depth)
min_samples_split = int(args.min_samples_split)
min_samples_leaf = int(args.min_samples_leaf)
n_estimators = int(args.n_estimators)
positives = args.name.split('v')[0]
negatives = args.name.split('v')[1]

if args.s: scaler_filename = args.s
else: scaler_filename = None

if args.m: model_filename = args.m
else: model_filename = None
# print ('hello')
main([max_depth],[min_samples_split],[min_samples_leaf], [n_estimators],\
positives=positives, negatives=negatives,
scaler_filename=scaler_filename, model_filename=model_filename)
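With the per-task hyper-parameters listed at the top of the file, a training run for the AIvLD task would look like this (invocation inferred from the argparse definition above; run from the ML/ directory, since the training TSV is read via a relative path):

    python ML.py 5 3 5 50 AIvLD --s AIvLD --m AIvLD

This trains activating/increase as positives against loss/decrease as negatives and pickles the fitted scaler and model to scaler_AIvLD.pkl and model_AIvLD.sav.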
Binary file modified ML/__pycache__/cls.cpython-310.pyc
Binary file not shown.
Binary file modified ML/__pycache__/fetchData.cpython-310.pyc
Binary file not shown.
Binary file modified ML/__pycache__/prepareTestData.cpython-310.pyc
Binary file not shown.