Skip to content

Commit

Permalink
fix mutation description
Browse files Browse the repository at this point in the history
  • Loading branch information
gurdeep330 committed Jun 7, 2023
1 parent 3be12a0 commit 09f1fd2
Show file tree
Hide file tree
Showing 4 changed files with 88 additions and 40 deletions.
61 changes: 34 additions & 27 deletions DB/make_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,13 +236,14 @@ def create_mutations_table(mycursor)->None:
mut_type VARCHAR(10), \
acc VARCHAR(10) REFERENCES kinases(acc) DEFERRABLE, \
gene VARCHAR(10), \
info TEXT, source VARCHAR(200) \
info TEXT, pubmed TEXT, source VARCHAR(200) \
)")
# UNIQUE(mutation, wtAA, wtPos, mutAA, mut_type, acc, gene, info, source)\
# for line in open('../AK_mut_w_sc_feb2023/act_deact_v2.tsv', 'r'):
# for line in open('../data/mutations/act_deact_mutlist_may2023.tsv', 'r'):
for line in gzip.open('../data/mutations/ad_mutations.tsv.gz', 'rt'):
if line.split()[0] == 'UniProtAcc': continue
# print (line)
# gene = line.split('\t')[0]
acc = line.split('\t')[0]
gene = line.split('\t')[1]
Expand All @@ -255,7 +256,7 @@ def create_mutations_table(mycursor)->None:
acc, gene, uniprot_id, protein_name = fetchData.getAccGene(mycursor, acc)
# print (gene, acc, mutation, pfamPos)
# if pfamPos is None: continue
mut_type = line.split('\t')[5].lstrip().rstrip()
mut_type = line.split('\t')[6].lstrip().rstrip()
if mut_type not in ['increase', 'activating', 'decrease', 'loss']:
print (mutation, gene, 'is unknown', mut_type)
continue
Expand All @@ -267,11 +268,13 @@ def create_mutations_table(mycursor)->None:
# info = 'info'
# source = line.split('\t')[-1]
source = 'UniProt'
pubmed = line.split('\t')[5]
# print (mutation, wtAA, wtPos, mutAA, mut_type, acc, gene, info, source)
mycursor.execute("INSERT INTO mutations (mutation, wtAA, wtPos, mutAA, \
pfamPos, mut_type, acc, gene, info, source) \
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)", \
(mutation, wtAA, wtPos, mutAA, pfamPos, mut_type, acc, gene, info, source))
pfamPos, mut_type, acc, gene, info, pubmed, source) \
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)", \
(mutation, wtAA, wtPos, mutAA, pfamPos, mut_type,\
acc, gene, info, pubmed, source))

'''Fetch resistant mutation data'''
# for line in open('../AK_mut_w_sc_feb2023/res_mut_v3_only_subs_KD_neighb.tsv', 'r'):
Expand All @@ -290,13 +293,15 @@ def create_mutations_table(mycursor)->None:
mut_type = 'resistance'
source = 'COSMIC'
info = '-'
pubmed = '-'
# if wtPos not in seq2pfam[acc]:
# print (f'{uniprot_position} seems to be outside the domain in {acc} and reported {mut_type}')
# continue
mycursor.execute("INSERT INTO mutations (mutation, wtAA, wtPos, mutAA, \
pfamPos, mut_type, acc, gene, info, source) \
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)", \
(mutation, wtAA, wtPos, mutAA, pfamPos, mut_type, acc, gene, info, source))
pfamPos, mut_type, acc, gene, info, pubmed, source) \
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)", \
(mutation, wtAA, wtPos, mutAA, pfamPos, mut_type,\
acc, gene, info, pubmed, source))

'''Fetch neutral mutation data'''
# for line in open('../AK_mut_w_sc_feb2023/nat_mut_tidy_v2_march2023.tsv', 'r'):
Expand All @@ -317,16 +322,18 @@ def create_mutations_table(mycursor)->None:
mut_type = 'neutral'
source = 'gnomAD'
info = 'AC/AN:'+str(line.split('\t')[7]) + '; Hom/AC:'+str(line.split('\t')[8].rstrip())
pubmed = '-'
# if acc not in seq2pfam:
# continue
# if uniprot_position not in seq2pfam[acc]:
# print (f'{uniprot_position} seems to be outside the domain and reported {mut_type}')
# print (seq2pfam[acc])
# continue
mycursor.execute("INSERT INTO mutations (mutation, wtAA, wtPos, mutAA, \
pfamPos, mut_type, acc, gene, info, source) \
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)", \
(mutation, wtAA, wtPos, mutAA, pfamPos, mut_type, acc, gene, info, source))
pfamPos, mut_type, acc, gene, info, pubmed, source) \
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)", \
(mutation, wtAA, wtPos, mutAA, pfamPos, mut_type, acc,\
gene, info, pubmed, source))

def createDicForDSSP(dic, position, mutation, value):
if position not in dic: dic[position] = {}
Expand Down Expand Up @@ -822,24 +829,24 @@ def create_kinases_table(mycursor)->None:
'''

# Create tables
print ('Creating HMM table')
create_hmm_table(mycursor)
print ('Creating kinases table')
create_kinases_table(mycursor)
# print ('Creating HMM table')
# create_hmm_table(mycursor)
# print ('Creating kinases table')
# create_kinases_table(mycursor)
print ('Creating mutation table')
create_mutations_table(mycursor)
print ('Creating homology table')
create_homology_table(mycursor)
print ('Creating IUPRED table')
create_iupred_table(mycursor)
print ('Creating Mechismo table')
create_mechismo_table(mycursor)
print ('Creating DSSP tables')
create_dssp_tables(mycursor)
print ('Creating PTM table')
create_ptm_table(mycursor)
print ('Creating alignment table')
create_alignment_table(mycursor)
# print ('Creating homology table')
# create_homology_table(mycursor)
# print ('Creating IUPRED table')
# create_iupred_table(mycursor)
# print ('Creating Mechismo table')
# create_mechismo_table(mycursor)
# print ('Creating DSSP tables')
# create_dssp_tables(mycursor)
# print ('Creating PTM table')
# create_ptm_table(mycursor)
# print ('Creating alignment table')
# create_alignment_table(mycursor)
mydb.commit()

# Use mysqldump to create backup file
Expand Down
Binary file modified data/mutations/ad_mutations.tsv.gz
Binary file not shown.
56 changes: 47 additions & 9 deletions data/mutations/integerate_ad.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
#!/usr/bin/env python3

import os, sys, gzip
import os, sys, gzip, re

class Mutation:
def __init__(self, mutation, mutation_type, gene, info, outcome):
def __init__(self, mutation, mutation_type, gene, info, pubmedIDs, outcome):
self.acc = mutation.split('/')[0]
self.mut = mutation.split('/')[1]
# self.mutation = mutation
self.mutation_type = mutation_type
self.gene = gene
self.info = info
self.pubmedIDs = pubmedIDs
self.outcome = outcome

def show(self):
Expand All @@ -21,34 +22,69 @@ def show(self):
# self.mutation,
self.mutation_type,
self.info,
self.pubmedIDs,
self.outcome
])

def extractInformation(text):
    '''
    Extract the PubMed IDs cited in a free-text description.

    Every occurrence of "PubMed:" followed by optional whitespace and a run
    of digits is collected in order of appearance; duplicates are kept.

    Args:
        text: description string to scan.

    Returns:
        The matched IDs joined by commas (e.g. '123,456'), or an empty
        string when the text contains no PubMed reference.
    '''
    # With a single capture group, findall() already returns the list of digit
    # strings, so it can be joined directly. The joined value consists only of
    # digits and commas, so there is never a trailing newline to strip.
    return ','.join(re.findall(r"PubMed:\s*(\d+)", text))

# Build a per-mutation lookup of description text and PubMed IDs from the
# tab-separated file '../tt3'.
info_dic = {}
for line in open('../tt3', 'r'):
# Skip empty lines, lines starting with 'UniProt' (presumably the header row)
# and lines starting with 'Error:'.
if line.startswith('\n'): continue
if line.startswith('UniProt'): continue
if line.startswith('Error:'): continue
# print(line)
# 0-based tab-separated fields: 3 is used as the mutation key, 5 and 6 are
# joined with a space into the description, 8 holds the PubMed IDs.
mutation = line.split('\t')[3]
info = line.split('\t')[5]+ ' ' + line.split('\t')[6]
pubmedIDs = line.split('\t')[8].rstrip()
# NOTE(review): indentation is not visible in this view. Presumably the three
# assignments below all sit inside the `if`, so the first row seen for a
# mutation wins - confirm against the real file.
if mutation not in info_dic:
info_dic[mutation] = {}
info_dic[mutation]['info'] = info
info_dic[mutation]['pubmedIDs'] = pubmedIDs

dic = {}
for line in open('ana_set.tsv', 'r'):
if line.startswith('UniProtAccMutation'): continue
mutation= line.split('\t')[0]
mutation_type = line.split('\t')[5]
gene = line.split('\t')[3]
info = line.split('\t')[9]
# info = line.split('\t')[9]
if mutation in info_dic:
info = info_dic[mutation]['info']
pubmedIDs = info_dic[mutation]['pubmedIDs']
else:
description = line.split('\t')[9].rstrip()
info = description.split('"";/evidence')[0]
pubmedIDs = extractInformation(description)
outcome1 = line.split('\t')[10]
outcome2 = line.split('\t')[13]
if outcome2 == 'exclude': continue
outcome = outcome2 if outcome2 != '' else outcome1
# print (line)
if mutation not in dic:
dic[mutation] = Mutation(mutation, mutation_type, gene, info, outcome)
dic[mutation] = Mutation(mutation, mutation_type, gene, info, pubmedIDs, outcome)
dic[mutation].show()

for line in open('missing_cases_annotated.tsv', 'r'):
if line.startswith('UniProtAcc'): continue
mutation= line.split('\t')[3]
gene = line.split('\t')[1]
info = line.split('\t')[6] + ' '+ line.split('\t')[7]
info = line.split('\t')[5] + ' '+ line.split('\t')[6]
pubmedIDs = line.split('\t')[8]
mutation_type = line.split('\t')[5]
outcome = line.split('\t')[9]
if outcome == '': continue
if mutation not in dic:
dic[mutation] = Mutation(mutation, mutation_type, gene, info, outcome)
dic[mutation] = Mutation(mutation, mutation_type, gene, info, pubmedIDs, outcome)
dic[mutation].show()

for line in gzip.open('final_mined_RR_checked_checked-again.txt.gz', 'rt'):
Expand All @@ -60,15 +96,17 @@ def show(self):
if mut[0].isalpha() == False: continue
if mut[-1].isalpha() == False: continue
gene = line.split('\t')[1]
info = line.split('\t')[3]+'; '+line.split('\t')[2]
pubmedIDs = line.split('\t')[3]
mutation_type = 'VARIANT'
info = mutation_type + ' '+line.split('\t')[2]
outcome = 'activating'
if mutation not in dic:
dic[mutation] = Mutation(mutation, mutation_type, gene, info, outcome)
dic[mutation] = Mutation(mutation, mutation_type, gene, info, pubmedIDs, outcome)
# dic[mutation].show()

text = 'UniProtAcc\tGene\tMutation\tMutationType\tInfo\tOutcome\n'
text = 'UniProtAcc\tGene\tMutation\tMutationType\tInfo\tPubMedID\tOutcome\n'
for mutation in dic:
# print (mutation)
text += dic[mutation].show() + '\n'
# print (dic[mutation].show())

Expand Down
11 changes: 7 additions & 4 deletions webApp/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ def makeText(acc, gene, mutation, interested_kinase_pfampos, mycursor):
alnpos, pfampos = mycursor.fetchone()
# print(pfampos)
if pfampos == '-' or alnpos == '-': continue
mycursor.execute("SELECT mutation, wtaa, wtpos, mut_type, acc, gene, info FROM mutations \
mycursor.execute("SELECT mutation, wtaa, wtpos, mut_type, acc, gene, info, pubmed FROM mutations \
WHERE pfampos = %s", (str(pfampos), ))
hits = mycursor.fetchall()
for entry in hits:
Expand All @@ -208,6 +208,7 @@ def makeText(acc, gene, mutation, interested_kinase_pfampos, mycursor):
# if ref_acc == acc: continue
ref_gene = entry[5]
info = entry[6]
pubmedIDs = entry[7]
text += "<b>" + ref_gene+'/'+str(ref_mutation) + "</b>" + ' is a known '+dic_mutations[mut_type]+' mutation.'
row = []
row.append(ref_gene)
Expand All @@ -219,10 +220,12 @@ def makeText(acc, gene, mutation, interested_kinase_pfampos, mycursor):
row.append(str(pfampos) + makeWindowText(pfampos, interested_kinase_pfampos))
row.append(str(alnpos) + makeWindowText(alnpos, interested_kinase_alnpos))
row.append(dic_mutations[mut_type])
row.append(info.split('"""')[0] if '"' in info else '-')
if mut_type != 'R':
# row.append(info.split('"""')[0] if '"' in info else '-')
row.append(info)
if mut_type != 'resistance':
text += ' <u>Description</u>: ' + info.split('"""')[0]
pubmed_ids = extract_pubmed_ids(info.replace('"', '')) # remove double quotes
# pubmed_ids = extract_pubmed_ids(info.replace('"', '')) # remove double quotes
pubmed_ids = pubmedIDs.split(',')
pubmed_ids_text = '+or+'.join(pubmed_ids)
# for pubmed_id in pubmed_ids:
# pubmed_ids_text.append('<a href=\"https://pubmed.ncbi.nlm.nih.gov/' + str(pubmed_id) + '\" target=\"_blank\">' + str(pubmed_id) + '<i class="bi bi-box-arrow-in-up-right"></i></a>')
Expand Down

0 comments on commit 09f1fd2

Please sign in to comment.