Skip to content

Commit

Permalink
update DB
Browse files (browse the repository at this point in the history)
  • Loading branch information
gurdeep330 committed May 26, 2023
1 parent 744d0f0 commit 73d6377
Show file tree
Hide file tree
Showing 24 changed files with 27,316 additions and 21,329 deletions.
16 changes: 8 additions & 8 deletions DB/make_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,10 +256,10 @@ def create_mutations_table(mycursor)->None:
# print (gene, acc, mutation, pfamPos)
# if pfamPos is None: continue
mut_type = line.split('\t')[5].lstrip().rstrip()
if mut_type not in ['increase', 'activation', 'activating', 'decrease', 'loss']:
if mut_type not in ['increase', 'activating', 'decrease', 'loss']:
print (mutation, gene, 'is unknown', mut_type)
continue
mut_type = 'A' if mut_type in ['increase', 'activation', 'activating'] else 'D'
# mut_type = 'A' if mut_type in ['increase', 'activation', 'activating'] else 'D'
# print (acc, kinases[acc].gene, wtAA, position, mutAA)
# print (mutation, mut_type, gene)
# mutation = wtAA + wtPos + mutAA
Expand Down Expand Up @@ -287,7 +287,7 @@ def create_mutations_table(mycursor)->None:
wtPos = mutation[1:-1]
pfamPos = find_pfampos(mycursor, acc, wtPos)
# mutation = wtAA + wtPos + mutAA
mut_type = 'R'
mut_type = 'resistance'
source = 'COSMIC'
info = '-'
# if wtPos not in seq2pfam[acc]:
Expand All @@ -314,7 +314,7 @@ def create_mutations_table(mycursor)->None:
if wtPos.isdigit() == False: continue
# print (acc, wtPos)
pfamPos = find_pfampos(mycursor, acc, wtPos)
mut_type = 'N'
mut_type = 'neutral'
source = 'gnomAD'
info = 'AC/AN:'+str(line.split('\t')[7]) + '; Hom/AC:'+str(line.split('\t')[8].rstrip())
# if acc not in seq2pfam:
Expand Down Expand Up @@ -539,17 +539,17 @@ def create_kinases_table(mycursor)->None:

# Create tables
# print ('Creating HMM table')
# create_hmm_table(mycursor)
create_hmm_table(mycursor)
# print ('Creating kinases table')
# create_kinases_table(mycursor)
create_kinases_table(mycursor)
# print ('Creating mutation table')
create_mutations_table(mycursor)
# print ('Creating homology table')
# create_homology_table(mycursor)
# print ('Creating PTM table')
# create_ptm_table(mycursor)
create_ptm_table(mycursor)
# print ('Creating alignment table')
# create_alignment_table(mycursor)
create_alignment_table(mycursor)
mydb.commit()

# Use mysqldump to create backup file
Expand Down
Binary file added DB/schemaspy-6.2.2.jar
Binary file not shown.
843 changes: 843 additions & 0 deletions DB/tmp_dataframe.csv

Large diffs are not rendered by default.

14 changes: 8 additions & 6 deletions ML/ML.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
# 5 3 3 100 for AD
# 12 3 3 100 for RN

RANDOM_STATE = 1
RANDOM_STATE = 0
ALGO = 'RF' #LR, XGB, RF
N_SPLITS = 10
N_REPEATS = 10
Expand Down Expand Up @@ -146,12 +146,13 @@ def main(max_depth, min_samples_split, min_samples_leaf, n_estimators):
y_test = []
test_names = []
for row in df.to_numpy():
if row[-1] in ['A']:
# print (row)
if row[-1] in ['activating', 'increase']:
y.append(1)
y_names.append(row[-1])
X.append(row[3:-1])
train_names.append('/'.join(row[:3]))
elif row[-1] in ['D']:
elif row[-1] in ['neutral', 'loss', 'decrease']:
y.append(0)
y_names.append(row[-1])
X.append(row[3:-1])
Expand Down Expand Up @@ -285,7 +286,7 @@ def main(max_depth, min_samples_split, min_samples_leaf, n_estimators):
rf = RandomForestClassifier(random_state=RANDOM_STATE, class_weight="balanced", n_jobs=N_JOBS)
model = GridSearchCV(rf, parameters, cv=rskf, scoring='roc_auc', n_jobs=N_JOBS)
model.fit(X, y)
print (model.cv_results_['mean_test_score'])
print ('mean_test_score', model.cv_results_['mean_test_score'])
# print (model.cv_results_['mean_train_score'])
clf = RandomForestClassifier(
n_estimators=model.best_params_['n_estimators'],
Expand Down Expand Up @@ -491,10 +492,11 @@ def main(max_depth, min_samples_split, min_samples_leaf, n_estimators):
# filename = 'finalized_model_RN.sav'
# pickle.dump(clf, open(filename, 'wb'))

test_types = ['AR', 'Activating', 'TBD', 'Inconclusive']
test_types = ['activatingresistance','resistance', 'A', 'TBD', 'Inconclusive']
for test_type in test_types:
print (''.join(['#' for i in range(1,25)]))
if test_type in ['AR', 'R']:
if test_type in ['resistance', 'activatingresistance']:
# if test_type in ['activatingresistance']:
X_sub_test = []; y_sub_test = []
for test_name, p, q in zip(test_names, X_test, y_test):
if q != test_type: continue
Expand Down
Binary file modified ML/__pycache__/cls.cpython-310.pyc
Binary file not shown.
Binary file modified ML/__pycache__/fetchData.cpython-310.pyc
Binary file not shown.
34,960 changes: 24,246 additions & 10,714 deletions ML/out.txt

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions ML/prepareTrainData.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ def fetchStrucFeat(acc, domainNum):
kinases[acc].mutations[mutation].positionHmm = seq2pfam[acc][position]
# pkinase_act_deact_res[mut_type].append(kinases[acc].mutations[mutation].positionHmm)

pkinase_act_deact_res = {'A': [], 'D': [], 'R': [], 'N': []}
# pkinase_act_deact_res = {'A': [], 'D': [], 'R': [], 'N': []}
'''Fetch act/deact mutation data'''
mycursor.execute("select acc, gene, mutation, wtaa, mutaa, wtpos, mut_type from mutations")
for row in mycursor.fetchall():
Expand Down Expand Up @@ -241,13 +241,13 @@ def fetchStrucFeat(acc, domainNum):
trainMat += '\t'.join([str(item) for item in adr_row]) + '\t'
trainMat += mut_types + '\n'

if mut_types == 'A':
if mut_types in ['activating', 'increase']:
mut_types_colors.append('green')
elif mut_types == 'D':
elif mut_types in ['loss', 'decrease']:
mut_types_colors.append('red')
elif mut_types == 'N':
elif mut_types == 'neutral':
mut_types_colors.append('cyan')
elif mut_types == 'R':
elif mut_types == 'resistance':
mut_types_colors.append('blue')
else:
mut_types_colors.append('violet')
Expand Down
8 changes: 4 additions & 4 deletions ML/test_mutations.txt
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
Q02750 MAP2K1 Q56P Activating
Q02750 MAP2K1 Q56P A
P11309 PIM1 T23I Inconclusive
P11309 PIM1 S97N Activating
P11309 PIM1 S97N A
P11309 PIM1 Q127E Inconclusive
O96017 CHEK2 K373E Inconclusive
O96017 CHEK2 T68A Neutral
P46734 MAP2K3 A84T Activating
O96017 CHEK2 T68A N
P46734 MAP2K3 A84T A
P46734 MAP2K3 R94L Inconclusive
P46734 MAP2K3 R96W Inconclusive
P46734 MAP2K3 L215W Inconclusive
Expand Down
Binary file modified ML/trainDataFromHitsSplitTrimmedAln.tsv.gz
Binary file not shown.
2 changes: 1 addition & 1 deletion data/humanKinasesHits.hmmsearch
Original file line number Diff line number Diff line change
Expand Up @@ -526,5 +526,5 @@ sp|Q7Z695|ADCK2_HUMAN - 626 Pkinase PF00069.28 264
# Target file: humanKinases.fasta
# Option settings: hmmsearch --domtblout humanKinasesHits.hmmsearch --noali --domE 1.0 ../pfam/Pkinase.hmm humanKinases.fasta
# Current dir: /home/gurdeep/projects/kinaseResistance/data
# Date: Wed May 24 13:53:08 2023
# Date: Fri May 26 17:33:42 2023
# [ok]
Binary file modified data/humanKinasesHitsSplitHmmsearchTrimmed.txt.gz
Binary file not shown.
Binary file modified data/humanKinasesHitsSplitHmmsearchTrimmedMappings.tsv.gz
Binary file not shown.
Loading

0 comments on commit 73d6377

Please sign in to comment.