Commit a2d0431

close #47, close #50, close #14, close #32

gurdeep330 committed Jun 6, 2023
1 parent 888b9f3 commit a2d0431
Showing 110 changed files with 189,868 additions and 488 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -127,3 +127,4 @@ dmypy.json

# Pyre type checker
.pyre/
webApp/static/predictor/output/*
14 changes: 14 additions & 0 deletions DB/hh
@@ -0,0 +1,14 @@
# --- full sequence ---- --- best 1 domain ---- --- domain number estimation ----
# target name accession query name accession E-value score bias E-value score bias exp reg clu ov env dom rep inc description of target
#------------------- ---------- -------------------- ---------- --------- ------ ----- --------- ------ ----- --- --- --- --- --- --- --- --- ---------------------
sp|Q7Z7A4|PXK_HUMAN - PK_Tyr_Ser-Thr PF07714.20 8.4e-11 27.7 0.0 1.8e-08 20.1 0.0 2.3 2 0 0 2 2 2 2 PX domain-containing protein kinase-like protein OS=Homo sapiens OX=9606 GN=PXK PE=1 SV=1
#
# Program: hmmsearch
# Version: 3.1b2 (February 2015)
# Pipeline mode: SEARCH
# Query file: ../pfam/PK_Tyr_Ser-Thr.hmm
# Target file: ../KA/UniProtFasta2/Q7Z7A4.fasta.gz
# Option settings: hmmsearch --tblout hh ../pfam/PK_Tyr_Ser-Thr.hmm ../KA/UniProtFasta2/Q7Z7A4.fasta.gz
# Current dir: /home/gurdeep/projects/kinaseResistance/DB
# Date: Mon Jun 5 18:26:54 2023
# [ok]
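The DB/hh file above is standard HMMER3 hmmsearch --tblout output: '#'-prefixed comment lines plus one whitespace-delimited row per target, where only the trailing description field may contain spaces. A minimal parsing sketch (a hypothetical helper, not part of this commit) that recovers the full-sequence E-value and score:

    def parse_tblout(path):
        """Parse HMMER3 --tblout rows into dicts keyed by the main score columns."""
        hits = []
        with open(path) as fh:
            for line in fh:
                if line.startswith('#') or not line.strip():
                    continue
                # 18 fixed columns, then a free-text description
                fields = line.split(None, 18)
                hits.append({
                    'target': fields[0],
                    'query': fields[2],
                    'full_evalue': float(fields[4]),
                    'full_score': float(fields[5]),
                    'description': fields[18].rstrip() if len(fields) > 18 else '',
                })
        return hits

    # e.g. parse_tblout('DB/hh')[0] -> target 'sp|Q7Z7A4|PXK_HUMAN', E-value 8.4e-11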
314 changes: 302 additions & 12 deletions DB/make_db.py

Large diffs are not rendered by default.

Binary file added KA/UniProtFasta/P0C263.txt.gz
Binary file not shown.
Binary file added KA/UniProtFasta/Q5MAI5.txt.gz
Binary file not shown.
Binary file added KA/UniProtFasta/Q86YV6.txt.gz
Binary file not shown.
Binary file added KA/UniProtFasta/Q96LW2.txt.gz
Binary file not shown.
Binary file added KA/UniProtFasta2/P0C263.txt.gz
Binary file not shown.
Binary file added KA/UniProtFasta2/Q5MAI5.txt.gz
Binary file not shown.
Binary file added KA/UniProtFasta2/Q86YV6.txt.gz
Binary file not shown.
Binary file added KA/UniProtFasta2/Q96LW2.txt.gz
Binary file not shown.
135 changes: 108 additions & 27 deletions ML/ML.py
@@ -26,19 +26,52 @@
from sklearn import tree
import xgboost as xgb
import pickle
import argparse

# Best hyper-parameters per task, in the order
# (max_depth, min_samples_split, min_samples_leaf, n_estimators):
# 5 3 3 100 for AD
# 12 3 3 100 for RN
# 5 3 3 100 for LvNA
# 5 3 3 100 for AvNL
# 5 4 4 100 for AIvNLD
# 5 5 5 50 for LDvNAI
# 5 3 5 50 for AIvLD
# 5 3 4 100 for AvL
# 5 4 4 100 for RvN

RANDOM_STATE = 0

RANDOM_STATE = 1
ALGO = 'RF'  # 'LR', 'XGB' or 'RF'
N_SPLITS = 10
N_REPEATS = 10
N_JOBS = -1

AA = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

def main(max_depth, min_samples_split, min_samples_leaf, n_estimators):
def makeSets(positives, negatives):
dic = {}
for set_type, set_type_name in [[positives, 'positives'], [negatives,'negatives']]:
if set_type_name not in dic:
dic[set_type_name] = []
for char in set_type:
if char == 'A':
dic[set_type_name].append('activating')
elif char == 'I':
dic[set_type_name].append('increase')
elif char == 'L':
dic[set_type_name].append('loss')
elif char == 'D':
dic[set_type_name].append('decrease')
elif char == 'R':
dic[set_type_name].append('resistance')
elif char == 'N':
dic[set_type_name].append('neutral')
else:
print('Error: invalid set type', char)
sys.exit(1)
return dic
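# For reference: with the task name 'AIvLD', makeSets('AI', 'LD') returns
# {'positives': ['activating', 'increase'], 'negatives': ['loss', 'decrease']}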


def main(max_depth, min_samples_split, min_samples_leaf, n_estimators,\
positives, negatives,
scaler_filename=None, model_filename=None):
df = pd.read_csv('trainDataFromHitsSplitTrimmedAln.tsv.gz', sep = '\t')
df['Dataset'] = df['Dataset'].replace(to_replace='train', value=0.025, regex=True)
df['Dataset'] = df['Dataset'].replace(to_replace='test', value=0.3, regex=True)
@@ -63,62 +63,96 @@ def main(max_depth, min_samples_split, min_samples_leaf, n_estimators):
'hmmSS',
# 'ChargesWT',
# 'ChargesMUT',
# 'ChargesDiff',
'ChargesDiff',
# 'A_known',
# 'D_known',
# 'R_known',
# 'Phosphomimic',
# 'hmmScoreWT',
# 'hmmScoreMUT',
# 'hmmScoreDiff'
'hmmScoreDiff'
]
'''
for aa in AA:
if aa not in ['S', 'T', 'Y']:
# if aa not in ['S', 'T', 'Y']:
columns_to_exclude.append(aa+'_WT')
if aa not in ['D', 'E']:
# if aa not in ['D', 'E']:
columns_to_exclude.append(aa+'_MUT')
'''

############
pfam_ptm_cols = ['ac_pfam', 'me_pfam', 'gl_pfam', 'm1_pfam', 'm2_pfam', 'm3_pfam', 'sm_pfam', 'ub_pfam']
for i in range(-5,6):
if i in [-2, -1, 0, 1, 2]: continue
if i in [-1, 0, 1]: continue
for col in pfam_ptm_cols:
columns_to_exclude.append(col.split('_')[0]+'_'+str(i)+'_'+col.split('_')[1])

pfam_ptm_cols = ['p_pfam']
for i in range(-5,6):
if i in [-2, -1, 0, 1, 2]: continue
if i in [-1, 0, 1]: continue
for col in pfam_ptm_cols:
columns_to_exclude.append(col.split('_')[0]+'_'+str(i)+'_'+col.split('_')[1])
############

ptm_cols = ['ac', 'me', 'gl', 'm1', 'm2', 'm3', 'sm', 'ub']
for i in range(-5,6):
if i in [-2, -1, 0, 1, 2]: continue
if i in [-1, 0, 1]: continue
for col in ptm_cols:
columns_to_exclude.append(col.split('_')[0]+'_'+str(i))

ptm_cols = ['p']
for i in range(-5,6):
if i in [-2, -1, 0, 1, 2]: continue
if i in [-1, 0, 1]: continue
for col in ptm_cols:
columns_to_exclude.append(col.split('_')[0]+'_'+str(i))

############

adr_cols = ['A', 'D', 'R']
for i in range(-5, 6):
if i in [-2, -1, 1, 2]: continue
if i in [-1, 1]: continue
for col in adr_cols:
columns_to_exclude.append(col+'_'+str(i))

############

adr_cols = ['A_pfam', 'D_pfam', 'R_pfam']
for i in range(-5, 6):
if i in [-2, -1, 1, 2]: continue
if i in [-1, 0, 1]: continue
for col in adr_cols:
columns_to_exclude.append(col.split('_')[0]+'_'+str(i)+'_'+col.split('_')[1])
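# Net effect of the loops above: PTM window features at positions -1, 0, +1
# (e.g. 'ac_-1', 'p_0_pfam') are not excluded; A/D/R features keep -1 and +1;
# A/D/R pfam features keep -1, 0, +1; every other window column
# (e.g. 'ac_-3_pfam', 'p_4', 'A_-5') is added to columns_to_exclude.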

@@ -132,7 +165,7 @@ def main(max_depth, min_samples_split, min_samples_leaf, n_estimators):
print ('columns to consider', df.columns.to_numpy())
columns_to_consider = '\n'.join(df.columns.to_numpy())
# print (columns_to_consider)
# open('columns_to_consider.txt', 'w').write(columns_to_consider)
open('columns_to_consider.txt', 'w').write(columns_to_consider)

feature_names = df.columns.to_numpy()
feature_names = feature_names[3:-1]
@@ -145,14 +178,17 @@ def main(max_depth, min_samples_split, min_samples_leaf, n_estimators):
X_test = []
y_test = []
test_names = []
dic = makeSets(positives, negatives)
for row in df.to_numpy():
# print (row)
if row[-1] in ['activating', 'increase']:
# if row[-1] in ['activating', 'increase']:
if row[-1] in dic['positives']:
y.append(1)
y_names.append(row[-1])
X.append(row[3:-1])
train_names.append('/'.join(row[:3]))
elif row[-1] in ['neutral', 'loss', 'decrease']:
# elif row[-1] in ['loss', 'decrease']:
elif row[-1] in dic['negatives']:
y.append(0)
y_names.append(row[-1])
X.append(row[3:-1])
@@ -180,7 +216,15 @@ def main(max_depth, min_samples_split, min_samples_leaf, n_estimators):
scaler.fit(X)
X = scaler.transform(X)
X_test = scaler.transform(X_test)
for name, row in zip(test_names, X_test):
if 'A84T' in name:
print (row)
print (len(row))
break
# sys.exit()
# pickle.dump(scaler, open('finalized_scaler_RN.pkl', 'wb'))
if scaler_filename is not None:
pickle.dump(scaler, open('scaler_'+scaler_filename+'.pkl', 'wb'))

y = np.array(y)

@@ -266,7 +310,8 @@ def main(max_depth, min_samples_split, min_samples_leaf, n_estimators):
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf,
'max_features': ['sqrt', 'log2'],
# 'max_features': ['sqrt', 'log2'],
'max_features': ['log2'],
'n_estimators': n_estimators
}
# parameters = {
@@ -302,6 +347,10 @@ def main(max_depth, min_samples_split, min_samples_leaf, n_estimators):
## Best model hyper-parameters
print ('Best model found during the CV')
print (model.best_params_)
print (model.predict_proba(X))
for y_pred, y_true in zip(model.predict_proba(X), y):
open('ai_ld_roc.txt', 'a').write(str(y_pred[1]) + '\t' + str(y_true) + '\n')
# sys.exit()
'''
tprs = []
aucs = []
@@ -471,6 +520,7 @@ def main(max_depth, min_samples_split, min_samples_leaf, n_estimators):
class_names = y_names,
filled=True)
# plt.show()
'''


print (''.join(['#' for i in range(1,25)]))
@@ -484,7 +534,9 @@ def main(max_depth, min_samples_split, min_samples_leaf, n_estimators):
data.append(row)

df_feature_importances = pd.DataFrame(data, columns=['Feature', 'Importance'])
df_feature_importances.sort_values(by=['Importance'], ascending=False)
df_feature_importances = df_feature_importances.sort_values(by=['Importance'], ascending=False)
print (df_feature_importances)
'''
sns.set(font_scale = 0.6)
sns.barplot(data=df_feature_importances, color="grey", x="Importance", y="Feature")
plt.grid(True, lw=0.1)
Expand All @@ -494,13 +546,14 @@ def main(max_depth, min_samples_split, min_samples_leaf, n_estimators):


# filename = 'finalized_model_RN.sav'
# pickle.dump(clf, open(filename, 'wb'))
if model_filename is not None:
pickle.dump(clf, open('model_'+model_filename+'.sav', 'wb'))

test_types = ['activatingresistance','resistance', 'A', 'TBD', 'Inconclusive']
test_types = ['activatingresistance', 'increaseresistance','resistance', 'A', 'TBD', 'Inconclusive']
for test_type in test_types:
print (''.join(['#' for i in range(1,25)]))
if test_type in ['resistance', 'activatingresistance']:
# if test_type in ['activatingresistance']:
if test_type in ['activatingresistance', 'increaseresistance']:
# if test_type in ['activatingresistance', 'increaseresistance', 'resistance']:
X_sub_test = []; y_sub_test = []
for test_name, p, q in zip(test_names, X_test, y_test):
if q != test_type: continue
@@ -521,13 +574,41 @@ def main(max_depth, min_samples_split, min_samples_leaf, n_estimators):
X_sub_test = []
X_sub_test.append(p)
X_sub_test = np.array(X_sub_test)
if 'A84' in test_name:
print (X_sub_test)
y_pred = round((clf.predict_proba(X_sub_test)[0])[1], 3)
print (test_name, y_pred, q)

if __name__ == '__main__':
'''max_depth = int(sys.argv[1])
min_samples_split = int(sys.argv[2])
min_samples_leaf = int(sys.argv[3])
n_estimators = int(sys.argv[4])'''
# set arguments
parser = argparse.ArgumentParser(description='Training for Activark', epilog='End of help.')
parser.add_argument('max_depth', help='maximum depth of each tree')
parser.add_argument('min_samples_split', help='minimum number of samples required to split an internal node')
parser.add_argument('min_samples_leaf', help='minimum number of samples required at a leaf node')
parser.add_argument('n_estimators', help='number of trees in the forest')
parser.add_argument('name', help='AIvLD or AIvNLD or LDvNAI or RvN')
parser.add_argument('--s', help='filename of the scaler to be saved')
parser.add_argument('--m', help='filename of the model to be saved')
args = parser.parse_args()

# convert the positional arguments to the types main() expects
max_depth = int(args.max_depth)
min_samples_split = int(args.min_samples_split)
min_samples_leaf = int(args.min_samples_leaf)
n_estimators = int(args.n_estimators)
positives = args.name.split('v')[0]
negatives = args.name.split('v')[1]

if args.s: scaler_filename = args.s
else: scaler_filename = None

if args.m: model_filename = args.m
else: model_filename = None
# print ('hello')
main([max_depth],[min_samples_split],[min_samples_leaf], [n_estimators],\
positives=positives, negatives=negatives,
scaler_filename=scaler_filename, model_filename=model_filename)
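With the per-task hyper-parameters listed at the top of the file, a training run for the AIvLD task would look like this (invocation inferred from the argparse definition above; run from the ML/ directory, since the training TSV is read via a relative path):

    python ML.py 5 3 5 50 AIvLD --s AIvLD --m AIvLD

This trains activating/increase as positives against loss/decrease as negatives and pickles the fitted scaler and model to scaler_AIvLD.pkl and model_AIvLD.sav.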
Binary file modified ML/__pycache__/cls.cpython-310.pyc
Binary file not shown.
Binary file modified ML/__pycache__/fetchData.cpython-310.pyc
Binary file not shown.
Binary file modified ML/__pycache__/prepareTestData.cpython-310.pyc
Binary file not shown.