update DB

russelllab · May 26, 2023 · 73d6377 · 73d6377
1 parent 744d0f0
commit 73d6377
Show file tree

Hide file tree

Showing 24 changed files with 27,316 additions and 21,329 deletions.
diff --git a/DB/make_db.py b/DB/make_db.py
@@ -256,10 +256,10 @@ def create_mutations_table(mycursor)->None:
         # print (gene, acc, mutation, pfamPos)
         # if pfamPos is None: continue
         mut_type = line.split('\t')[5].lstrip().rstrip()
-        if mut_type not in ['increase', 'activation', 'activating', 'decrease', 'loss']:
+        if mut_type not in ['increase', 'activating', 'decrease', 'loss']:
             print (mutation, gene, 'is unknown', mut_type)
             continue
-        mut_type = 'A' if mut_type in ['increase', 'activation', 'activating'] else 'D'
+        # mut_type = 'A' if mut_type in ['increase', 'activation', 'activating'] else 'D'
         # print (acc, kinases[acc].gene, wtAA, position, mutAA)
         # print (mutation, mut_type, gene)
         # mutation = wtAA + wtPos + mutAA
@@ -287,7 +287,7 @@ def create_mutations_table(mycursor)->None:
         wtPos = mutation[1:-1]
         pfamPos = find_pfampos(mycursor, acc, wtPos)
         # mutation = wtAA + wtPos + mutAA
-        mut_type = 'R'
+        mut_type = 'resistance'
         source = 'COSMIC'
         info = '-'
         # if wtPos not in seq2pfam[acc]:
@@ -314,7 +314,7 @@ def create_mutations_table(mycursor)->None:
         if wtPos.isdigit() == False: continue
         # print (acc, wtPos)
         pfamPos = find_pfampos(mycursor, acc, wtPos)
-        mut_type = 'N'
+        mut_type = 'neutral'
         source = 'gnomAD'
         info = 'AC/AN:'+str(line.split('\t')[7]) + '; Hom/AC:'+str(line.split('\t')[8].rstrip())
         # if acc not in seq2pfam:
@@ -539,17 +539,17 @@ def create_kinases_table(mycursor)->None:
 
     # Create tables
     # print ('Creating HMM table')
-    # create_hmm_table(mycursor)
+    create_hmm_table(mycursor)
     # print ('Creating kinases table')
-    # create_kinases_table(mycursor)
+    create_kinases_table(mycursor)
     # print ('Creating mutation table')
     create_mutations_table(mycursor)
     # print ('Creating homology table')
     # create_homology_table(mycursor)
     # print ('Creating PTM table')
-    # create_ptm_table(mycursor)
+    create_ptm_table(mycursor)
     # print ('Creating alignment table')
-    # create_alignment_table(mycursor)
+    create_alignment_table(mycursor)
     mydb.commit()
 
     # Use mysqldump to create backup file

diff --git a/DB/schemaspy-6.2.2.jar b/DB/schemaspy-6.2.2.jar
diff --git a/DB/tmp_dataframe.csv b/DB/tmp_dataframe.csv
diff --git a/ML/ML.py b/ML/ML.py
@@ -30,7 +30,7 @@
 # 5 3 3 100 for AD
 # 12 3 3 100 for RN
 
-RANDOM_STATE = 1
+RANDOM_STATE = 0
 ALGO = 'RF' #LR, XGB, RF
 N_SPLITS = 10
 N_REPEATS = 10
@@ -146,12 +146,13 @@ def main(max_depth, min_samples_split, min_samples_leaf, n_estimators):
     y_test = []
     test_names = []
     for row in df.to_numpy():
-        if row[-1] in ['A']:
+        # print (row)
+        if row[-1] in ['activating', 'increase']:
             y.append(1)
             y_names.append(row[-1])
             X.append(row[3:-1])
             train_names.append('/'.join(row[:3]))
-        elif row[-1] in ['D']:
+        elif row[-1] in ['neutral', 'loss', 'decrease']:
             y.append(0)
             y_names.append(row[-1])
             X.append(row[3:-1])
@@ -285,7 +286,7 @@ def main(max_depth, min_samples_split, min_samples_leaf, n_estimators):
         rf = RandomForestClassifier(random_state=RANDOM_STATE, class_weight="balanced", n_jobs=N_JOBS)
         model = GridSearchCV(rf, parameters, cv=rskf, scoring='roc_auc', n_jobs=N_JOBS)
         model.fit(X, y)
-        print (model.cv_results_['mean_test_score'])
+        print ('mean_test_score', model.cv_results_['mean_test_score'])
         # print (model.cv_results_['mean_train_score'])
         clf = RandomForestClassifier(
                 n_estimators=model.best_params_['n_estimators'],
@@ -491,10 +492,11 @@ def main(max_depth, min_samples_split, min_samples_leaf, n_estimators):
     # filename = 'finalized_model_RN.sav'
     # pickle.dump(clf, open(filename, 'wb'))
 
-    test_types = ['AR', 'Activating', 'TBD', 'Inconclusive']
+    test_types = ['activatingresistance','resistance', 'A', 'TBD', 'Inconclusive']
     for test_type in test_types:
         print (''.join(['#' for i in range(1,25)]))
-        if test_type in ['AR', 'R']:
+        if test_type in ['resistance', 'activatingresistance']:
+        # if test_type in ['activatingresistance']:
             X_sub_test = []; y_sub_test = []
             for test_name, p, q in zip(test_names, X_test, y_test):
                 if q != test_type: continue

diff --git a/ML/__pycache__/cls.cpython-310.pyc b/ML/__pycache__/cls.cpython-310.pyc
diff --git a/ML/__pycache__/fetchData.cpython-310.pyc b/ML/__pycache__/fetchData.cpython-310.pyc
diff --git a/ML/out.txt b/ML/out.txt
diff --git a/ML/prepareTrainData.py b/ML/prepareTrainData.py
@@ -116,7 +116,7 @@ def fetchStrucFeat(acc, domainNum):
         kinases[acc].mutations[mutation].positionHmm = seq2pfam[acc][position]
     # pkinase_act_deact_res[mut_type].append(kinases[acc].mutations[mutation].positionHmm)
 
-pkinase_act_deact_res = {'A': [], 'D': [], 'R': [], 'N': []}
+# pkinase_act_deact_res = {'A': [], 'D': [], 'R': [], 'N': []}
 '''Fetch act/deact mutation data'''
 mycursor.execute("select acc, gene, mutation, wtaa, mutaa, wtpos, mut_type from mutations")
 for row in mycursor.fetchall():
@@ -241,13 +241,13 @@ def fetchStrucFeat(acc, domainNum):
         trainMat += '\t'.join([str(item) for item in adr_row]) + '\t'
         trainMat += mut_types + '\n'
 
-        if mut_types == 'A':
+        if mut_types in ['activating', 'increase']:
             mut_types_colors.append('green')
-        elif mut_types == 'D':
+        elif mut_types in ['loss', 'decrease']:
             mut_types_colors.append('red')
-        elif mut_types == 'N':
+        elif mut_types == 'neutral':
             mut_types_colors.append('cyan')
-        elif mut_types == 'R':
+        elif mut_types == 'resistance':
             mut_types_colors.append('blue')
         else:
             mut_types_colors.append('violet')

diff --git a/ML/test_mutations.txt b/ML/test_mutations.txt
@@ -1,10 +1,10 @@
-Q02750 MAP2K1 Q56P Activating
+Q02750 MAP2K1 Q56P A
 P11309 PIM1 T23I Inconclusive
-P11309 PIM1 S97N Activating
+P11309 PIM1 S97N A
 P11309 PIM1 Q127E Inconclusive
 O96017 CHEK2 K373E Inconclusive
-O96017 CHEK2 T68A Neutral
-P46734 MAP2K3 A84T Activating
+O96017 CHEK2 T68A N
+P46734 MAP2K3 A84T A
 P46734 MAP2K3 R94L Inconclusive
 P46734 MAP2K3 R96W Inconclusive
 P46734 MAP2K3 L215W Inconclusive

diff --git a/ML/trainDataFromHitsSplitTrimmedAln.tsv.gz b/ML/trainDataFromHitsSplitTrimmedAln.tsv.gz
diff --git a/data/humanKinasesHits.hmmsearch b/data/humanKinasesHits.hmmsearch
@@ -526,5 +526,5 @@ sp|Q7Z695|ADCK2_HUMAN -            626 Pkinase              PF00069.28   264
 # Target file:     humanKinases.fasta
 # Option settings: hmmsearch --domtblout humanKinasesHits.hmmsearch --noali --domE 1.0 ../pfam/Pkinase.hmm humanKinases.fasta 
 # Current dir:     /home/gurdeep/projects/kinaseResistance/data
-# Date:            Wed May 24 13:53:08 2023
+# Date:            Fri May 26 17:33:42 2023
 # [ok]
diff --git a/data/humanKinasesHitsSplitHmmsearchTrimmed.txt.gz b/data/humanKinasesHitsSplitHmmsearchTrimmed.txt.gz
diff --git a/data/humanKinasesHitsSplitHmmsearchTrimmedMappings.tsv.gz b/data/humanKinasesHitsSplitHmmsearchTrimmedMappings.tsv.gz