-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
6 changed files
with
803 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
data/*.txt | ||
data/*.tsv |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,353 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# AddDB Updater\n", | ||
"\n", | ||
"This is a Jupyter notebook to build a python script that can parse and create a fullly annotated tsv file to replace gene_xref for ANNOVAR. This script could be useful to update these database and be added to the Achabilarity container.\n", | ||
"\n", | ||
"# Import library" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 82, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import pandas as pd" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# Parser function\n", | ||
"\n", | ||
"These functions will parse them for merging.\n", | ||
"\n", | ||
"## HGNC" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 83, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def hgnc(file):\n", | ||
" hgnc = pd.read_csv(file, sep='\\t')\n", | ||
" hgnc.rename(columns={'Approved symbol': '#Gene_name'}, inplace=True)\n", | ||
" return(hgnc)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 84, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
" #Gene_name Approved name\n", | ||
"0 A1BG alpha-1-B glycoprotein\n", | ||
"1 A1BG-AS1 A1BG antisense RNA 1\n", | ||
"2 A1CF APOBEC1 complementation factor\n", | ||
"3 A2M alpha-2-macroglobulin\n", | ||
"4 A2M-AS1 A2M antisense RNA 1\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"hgnc_list = hgnc('~/Kevin/AddDB_updater/data/hgnc.tsv')\n", | ||
"print(hgnc_list.head())" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## GnomAD constraint score" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 85, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def gnomad_score(file):\n", | ||
" gnomad = pd.read_csv(file, sep='\\t')\n", | ||
" gnomad.rename(columns={'gene': '#Gene_name'}, inplace=True)\n", | ||
" gnomad_select = gnomad[['#Gene_name', 'oe_lof_upper_rank',\n", | ||
" 'oe_lof_upper_bin','oe_lof','oe_lof_lower','oe_lof_upper','oe_mis','oe_mis_lower', 'oe_mis_upper','oe_syn','oe_syn_lower', 'oe_syn_upper','constraint_flag']]\n", | ||
" return(gnomad_select)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 86, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
" #Gene_name oe_lof_upper_rank oe_lof_upper_bin oe_lof oe_lof_lower \\\n", | ||
"0 MED13 0.0 0.0 0.000000 0.000 \n", | ||
"1 NIPBL 1.0 0.0 0.006653 0.001 \n", | ||
"2 SMC3 2.0 0.0 0.000000 0.000 \n", | ||
"3 CNOT1 3.0 0.0 0.007998 0.002 \n", | ||
"4 RLF 4.0 0.0 0.000000 0.000 \n", | ||
"\n", | ||
" oe_lof_upper oe_mis oe_mis_lower oe_mis_upper oe_syn oe_syn_lower \\\n", | ||
"0 0.030 0.77921 0.736 0.824 1.0890 1.005 \n", | ||
"1 0.032 0.58688 0.554 0.621 1.0020 0.930 \n", | ||
"2 0.037 0.28251 0.249 0.320 1.0578 0.946 \n", | ||
"3 0.038 0.43290 0.403 0.464 1.0306 0.955 \n", | ||
"4 0.040 0.68766 0.645 0.733 1.0153 0.930 \n", | ||
"\n", | ||
" oe_syn_upper constraint_flag \n", | ||
"0 1.180 NaN \n", | ||
"1 1.079 NaN \n", | ||
"2 1.184 NaN \n", | ||
"3 1.112 NaN \n", | ||
"4 1.108 NaN \n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"gnomad_score_list = gnomad_score('~/Kevin/AddDB_updater/data/gnomad.v2.1.1.lof_metrics.by_gene.txt')\n", | ||
"print(gnomad_score_list.head())" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## OMIM genemap2" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 87, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def omim(file):\n", | ||
" omim = pd.read_csv(file, sep='\\t',skiprows=3)\n", | ||
" omim_select = omim[['Approved Symbol','Phenotypes']]\n", | ||
" omim_select.rename(columns={'Approved Symbol': '#Gene_name'}, inplace=True)\n", | ||
" omim_select = omim_select.dropna(subset=['#Gene_name']) \n", | ||
" return(omim_select)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 89, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
" #Gene_name Phenotypes\n", | ||
"5 CMM {Melanoma, cutaneous malignant, 1}, 155600 (2)...\n", | ||
"6 CCV Cataract 8, multiple types, 115665 (2), Autoso...\n", | ||
"8 DYX8 {Dyslexia, susceptibility to, 8}, 608995 (2), ...\n", | ||
"10 IBD7 {Inflammatory bowel disease 7}, 605225 (2)\n", | ||
"12 MYP14 Myopia 14, 610320 (2)\n", | ||
"13 PSORS7 {Psoriasis susceptibility 7}, 605606 (2)\n", | ||
"14 PTPRZ2 NaN\n", | ||
"15 SAI1 NaN\n", | ||
"16 SAMD11 NaN\n", | ||
"17 NOC2L NaN\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"omim_list = omim('~/Kevin/AddDB_updater/data/genemap2.txt')\n", | ||
"print(omim_list.head(n=10))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## UniProt database" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 80, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def uniprot(file):\n", | ||
" uniprot = pd.read_csv(file, sep='\\t')\n", | ||
" uniprot_select = uniprot.iloc[:,3:]\n", | ||
" uniprot_select.rename(columns={'Gene names (primary )': '#Gene_name'}, inplace=True)\n", | ||
" return(uniprot_select)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 81, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
" #Gene_name Function [CC] \\\n", | ||
"0 TRBV11-2 FUNCTION: V region of the variable domain of T... \n", | ||
"1 TEX13A NaN \n", | ||
"2 LARS2 NaN \n", | ||
"3 TXNDC11 FUNCTION: May act as a redox regulator involve... \n", | ||
"4 TXK FUNCTION: Non-receptor tyrosine kinase that pl... \n", | ||
"\n", | ||
" Tissue specificity \\\n", | ||
"0 NaN \n", | ||
"1 TISSUE SPECIFICITY: Testis specific. \n", | ||
"2 TISSUE SPECIFICITY: Ubiquitously expressed, bu... \n", | ||
"3 TISSUE SPECIFICITY: Widely expressed at low le... \n", | ||
"4 TISSUE SPECIFICITY: Expressed in T-cells and s... \n", | ||
"\n", | ||
" Involvement in disease \n", | ||
"0 NaN \n", | ||
"1 NaN \n", | ||
"2 DISEASE: Perrault syndrome 4 (PRLTS4) [MIM:615... \n", | ||
"3 NaN \n", | ||
"4 NaN \n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"uniprot_list = uniprot('~/Kevin/AddDB_updater/data/uniprot.tsv')\n", | ||
"print(uniprot_list.head())" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# Merge into one file\n", | ||
"\n", | ||
"This function will merge all databases into HGNC." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 98, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def merge_db(hgnc_file,omim_file,gnomad_score_file,uniprot_file):\n", | ||
" hgnc_list = hgnc(hgnc_file)\n", | ||
" omim_list = omim(omim_file).reset_index(drop=True)\n", | ||
" gnomad_score_list = gnomad_score(gnomad_score_file)\n", | ||
" uniprot_list = uniprot(uniprot_file)\n", | ||
" gene_fullxref = hgnc_list.merge(omim_list,on='#Gene_name').merge(gnomad_score_list,on='#Gene_name').merge(uniprot_list,on='#Gene_name')\n", | ||
" gene_fullxref = gene_fullxref.fillna('')\n", | ||
" return gene_fullxref" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 100, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
" #Gene_name Approved name \\\n", | ||
"0 A1BG alpha-1-B glycoprotein \n", | ||
"1 A1CF APOBEC1 complementation factor \n", | ||
"2 A2M alpha-2-macroglobulin \n", | ||
"3 A2ML1 alpha-2-macroglobulin like 1 \n", | ||
"4 A4GALT alpha 1,4-galactosyltransferase (P blood group) \n", | ||
"\n", | ||
" Phenotypes oe_lof_upper_rank \\\n", | ||
"0 13015 \n", | ||
"1 9254 \n", | ||
"2 Alpha-2-macroglobulin deficiency, 614036 (1), ... 5366 \n", | ||
"3 {Otitis media, susceptibility to}, 166760 (3),... 10116 \n", | ||
"4 [Blood group, P1Pk system, P(2) phenotype], 11... 16517 \n", | ||
"\n", | ||
" oe_lof_upper_bin oe_lof oe_lof_lower oe_lof_upper oe_mis oe_mis_lower \\\n", | ||
"0 6 0.78457 0.524 1.208 1.0141 0.922 \n", | ||
"1 4 0.60537 0.425 0.88 0.84521 0.765 \n", | ||
"2 2 0.40526 0.305 0.544 0.81065 0.758 \n", | ||
"3 5 0.77171 0.629 0.952 0.96729 0.911 \n", | ||
"4 8 0.94609 0.553 1.654 1.0345 0.930 \n", | ||
"\n", | ||
" oe_mis_upper oe_syn oe_syn_lower oe_syn_upper constraint_flag \\\n", | ||
"0 1.116 1.0299 0.897 1.184 \n", | ||
"1 0.934 1.0355 0.895 1.201 \n", | ||
"2 0.866 0.87995 0.796 0.973 \n", | ||
"3 1.027 1.0048 0.916 1.103 \n", | ||
"4 1.151 1.1197 0.967 1.300 \n", | ||
"\n", | ||
" Function [CC] \\\n", | ||
"0 \n", | ||
"1 FUNCTION: Essential component of the apolipopr... \n", | ||
"2 FUNCTION: Is able to inhibit all four classes ... \n", | ||
"3 FUNCTION: Is able to inhibit all four classes ... \n", | ||
"4 FUNCTION: Necessary for the biosynthesis of th... \n", | ||
"\n", | ||
" Tissue specificity \\\n", | ||
"0 TISSUE SPECIFICITY: Plasma. \n", | ||
"1 TISSUE SPECIFICITY: Widely expressed with high... \n", | ||
"2 TISSUE SPECIFICITY: Secreted in plasma. {ECO:0... \n", | ||
"3 TISSUE SPECIFICITY: In the epidermis, expresse... \n", | ||
"4 TISSUE SPECIFICITY: Ubiquitous. Highly express... \n", | ||
"\n", | ||
" Involvement in disease \n", | ||
"0 \n", | ||
"1 \n", | ||
"2 \n", | ||
"3 DISEASE: Otitis media (OM) [MIM:166760]: An in... \n", | ||
"4 \n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"gene_fullxref_list = merge_db(hgnc_file='~/Kevin/AddDB_updater/data/hgnc.tsv',omim_file='~/Kevin/AddDB_updater/data/genemap2.txt', gnomad_score_file='~/Kevin/AddDB_updater/data/gnomad.v2.1.1.lof_metrics.by_gene.txt', uniprot_file='~/Kevin/AddDB_updater/data/uniprot.tsv')\n", | ||
"gene_fullxref_list.to_csv('~/Kevin/AddDB_updater/data/test.txt',sep='\\t',index=False)\n", | ||
"print(gene_fullxref_list.head())" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.7.3" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Oops, something went wrong.