Skip to content

Commit

Permalink
updater.sh with python script v1.0
Browse files Browse the repository at this point in the history
  • Loading branch information
kyauy committed May 16, 2019
1 parent 9567594 commit 5c518ea
Show file tree
Hide file tree
Showing 6 changed files with 803 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
data/*.txt
data/*.tsv
353 changes: 353 additions & 0 deletions .ipynb_checkpoints/AddDB_updater-checkpoint.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,353 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# AddDB Updater\n",
"\n",
"This is a Jupyter notebook to build a Python script that can parse the source databases and create a fully annotated TSV file to replace gene_xref for ANNOVAR. This script could be useful to update these databases and could be added to the Achabilarity container.\n",
"\n",
"# Import library"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Parser function\n",
"\n",
"These functions parse each source database into a DataFrame keyed by `#Gene_name`, ready for merging.\n",
"\n",
"## HGNC"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [],
"source": [
"def hgnc(file):\n",
"    \"\"\"Load the HGNC table and expose the gene symbol as '#Gene_name'.\n",
"\n",
"    file: tab-separated HGNC export, passed straight to pd.read_csv.\n",
"    Returns the whole table with 'Approved symbol' renamed to '#Gene_name'.\n",
"    \"\"\"\n",
"    hgnc_table = pd.read_csv(file, sep='\\t')\n",
"    return hgnc_table.rename(columns={'Approved symbol': '#Gene_name'})"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" #Gene_name Approved name\n",
"0 A1BG alpha-1-B glycoprotein\n",
"1 A1BG-AS1 A1BG antisense RNA 1\n",
"2 A1CF APOBEC1 complementation factor\n",
"3 A2M alpha-2-macroglobulin\n",
"4 A2M-AS1 A2M antisense RNA 1\n"
]
}
],
"source": [
"# Path is relative to the notebook's directory (repository root) instead of\n",
"# a user-specific home directory, so the cell runs on any checkout.\n",
"hgnc_list = hgnc('data/hgnc.tsv')\n",
"print(hgnc_list.head())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## GnomAD constraint score"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {},
"outputs": [],
"source": [
"def gnomad_score(file):\n",
"    \"\"\"Load the gnomAD per-gene constraint table, keeping the o/e metrics.\n",
"\n",
"    file: tab-separated gnomAD lof_metrics file, passed to pd.read_csv.\n",
"    Returns the gene column (renamed from 'gene' to '#Gene_name') followed\n",
"    by the observed/expected columns listed in kept_columns, in that order.\n",
"    \"\"\"\n",
"    kept_columns = [\n",
"        '#Gene_name', 'oe_lof_upper_rank', 'oe_lof_upper_bin',\n",
"        'oe_lof', 'oe_lof_lower', 'oe_lof_upper',\n",
"        'oe_mis', 'oe_mis_lower', 'oe_mis_upper',\n",
"        'oe_syn', 'oe_syn_lower', 'oe_syn_upper',\n",
"        'constraint_flag',\n",
"    ]\n",
"    constraint_table = pd.read_csv(file, sep='\\t').rename(columns={'gene': '#Gene_name'})\n",
"    return constraint_table[kept_columns]"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" #Gene_name oe_lof_upper_rank oe_lof_upper_bin oe_lof oe_lof_lower \\\n",
"0 MED13 0.0 0.0 0.000000 0.000 \n",
"1 NIPBL 1.0 0.0 0.006653 0.001 \n",
"2 SMC3 2.0 0.0 0.000000 0.000 \n",
"3 CNOT1 3.0 0.0 0.007998 0.002 \n",
"4 RLF 4.0 0.0 0.000000 0.000 \n",
"\n",
" oe_lof_upper oe_mis oe_mis_lower oe_mis_upper oe_syn oe_syn_lower \\\n",
"0 0.030 0.77921 0.736 0.824 1.0890 1.005 \n",
"1 0.032 0.58688 0.554 0.621 1.0020 0.930 \n",
"2 0.037 0.28251 0.249 0.320 1.0578 0.946 \n",
"3 0.038 0.43290 0.403 0.464 1.0306 0.955 \n",
"4 0.040 0.68766 0.645 0.733 1.0153 0.930 \n",
"\n",
" oe_syn_upper constraint_flag \n",
"0 1.180 NaN \n",
"1 1.079 NaN \n",
"2 1.184 NaN \n",
"3 1.112 NaN \n",
"4 1.108 NaN \n"
]
}
],
"source": [
"# Path is relative to the notebook's directory (repository root) instead of\n",
"# a user-specific home directory, so the cell runs on any checkout.\n",
"gnomad_score_list = gnomad_score('data/gnomad.v2.1.1.lof_metrics.by_gene.txt')\n",
"print(gnomad_score_list.head())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## OMIM genemap2"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [],
"source": [
"def omim(file):\n",
"    \"\"\"Extract the gene symbol and phenotype columns from OMIM genemap2.\n",
"\n",
"    file: tab-separated genemap2 file, passed to pd.read_csv; the first\n",
"        3 lines are skipped so that the 4th line is read as the header.\n",
"    Returns a new DataFrame with the columns '#Gene_name' (renamed from\n",
"    'Approved Symbol') and 'Phenotypes', without the rows that have no\n",
"    approved symbol. The original row index is kept.\n",
"    \"\"\"\n",
"    genemap = pd.read_csv(file, sep='\\t', skiprows=3)\n",
"    # Rename on a new frame rather than in place on the column slice: the\n",
"    # in-place form is what triggers pandas' SettingWithCopyWarning.\n",
"    gene_phenotypes = genemap[['Approved Symbol', 'Phenotypes']].rename(\n",
"        columns={'Approved Symbol': '#Gene_name'})\n",
"    return gene_phenotypes.dropna(subset=['#Gene_name'])"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" #Gene_name Phenotypes\n",
"5 CMM {Melanoma, cutaneous malignant, 1}, 155600 (2)...\n",
"6 CCV Cataract 8, multiple types, 115665 (2), Autoso...\n",
"8 DYX8 {Dyslexia, susceptibility to, 8}, 608995 (2), ...\n",
"10 IBD7 {Inflammatory bowel disease 7}, 605225 (2)\n",
"12 MYP14 Myopia 14, 610320 (2)\n",
"13 PSORS7 {Psoriasis susceptibility 7}, 605606 (2)\n",
"14 PTPRZ2 NaN\n",
"15 SAI1 NaN\n",
"16 SAMD11 NaN\n",
"17 NOC2L NaN\n"
]
}
],
"source": [
"# Path is relative to the notebook's directory (repository root) instead of\n",
"# a user-specific home directory, so the cell runs on any checkout.\n",
"omim_list = omim('data/genemap2.txt')\n",
"print(omim_list.head(n=10))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## UniProt database"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [],
"source": [
"def uniprot(file):\n",
"    \"\"\"Load the UniProt export and keep its annotation columns.\n",
"\n",
"    file: tab-separated UniProt export, passed to pd.read_csv.\n",
"    Returns a new DataFrame made of every column from the 4th onward\n",
"    (selection is positional), with 'Gene names (primary )' renamed to\n",
"    '#Gene_name'.\n",
"    \"\"\"\n",
"    uniprot_table = pd.read_csv(file, sep='\\t')\n",
"    # rename() returns a new frame, so the positional slice is never\n",
"    # modified in place (which is what raises SettingWithCopyWarning).\n",
"    return uniprot_table.iloc[:, 3:].rename(\n",
"        columns={'Gene names (primary )': '#Gene_name'})"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" #Gene_name Function [CC] \\\n",
"0 TRBV11-2 FUNCTION: V region of the variable domain of T... \n",
"1 TEX13A NaN \n",
"2 LARS2 NaN \n",
"3 TXNDC11 FUNCTION: May act as a redox regulator involve... \n",
"4 TXK FUNCTION: Non-receptor tyrosine kinase that pl... \n",
"\n",
" Tissue specificity \\\n",
"0 NaN \n",
"1 TISSUE SPECIFICITY: Testis specific. \n",
"2 TISSUE SPECIFICITY: Ubiquitously expressed, bu... \n",
"3 TISSUE SPECIFICITY: Widely expressed at low le... \n",
"4 TISSUE SPECIFICITY: Expressed in T-cells and s... \n",
"\n",
" Involvement in disease \n",
"0 NaN \n",
"1 NaN \n",
"2 DISEASE: Perrault syndrome 4 (PRLTS4) [MIM:615... \n",
"3 NaN \n",
"4 NaN \n"
]
}
],
"source": [
"# Path is relative to the notebook's directory (repository root) instead of\n",
"# a user-specific home directory, so the cell runs on any checkout.\n",
"uniprot_list = uniprot('data/uniprot.tsv')\n",
"print(uniprot_list.head())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Merge into one file\n",
"\n",
"This function merges the OMIM, gnomAD and UniProt tables into the HGNC table, joining on `#Gene_name`."
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {},
"outputs": [],
"source": [
"def merge_db(hgnc_file,omim_file,gnomad_score_file,uniprot_file,how='inner'):\n",
"    \"\"\"Build the full gene cross-reference table from the four sources.\n",
"\n",
"    hgnc_file, omim_file, gnomad_score_file, uniprot_file: input files,\n",
"        each handed to the parser function of the same name.\n",
"    how: join type forwarded to every DataFrame.merge call. The default\n",
"        'inner' keeps only genes present in all four sources (the original\n",
"        behaviour); pass 'left' to keep every HGNC gene.\n",
"    Returns the merged DataFrame with missing values replaced by ''.\n",
"    \"\"\"\n",
"    gene_fullxref = hgnc(hgnc_file)\n",
"    annotation_tables = (\n",
"        omim(omim_file).reset_index(drop=True),\n",
"        gnomad_score(gnomad_score_file),\n",
"        uniprot(uniprot_file),\n",
"    )\n",
"    # Fold each annotation table onto the HGNC table, one join per source.\n",
"    for annotation in annotation_tables:\n",
"        gene_fullxref = gene_fullxref.merge(annotation, on='#Gene_name', how=how)\n",
"    return gene_fullxref.fillna('')"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" #Gene_name Approved name \\\n",
"0 A1BG alpha-1-B glycoprotein \n",
"1 A1CF APOBEC1 complementation factor \n",
"2 A2M alpha-2-macroglobulin \n",
"3 A2ML1 alpha-2-macroglobulin like 1 \n",
"4 A4GALT alpha 1,4-galactosyltransferase (P blood group) \n",
"\n",
" Phenotypes oe_lof_upper_rank \\\n",
"0 13015 \n",
"1 9254 \n",
"2 Alpha-2-macroglobulin deficiency, 614036 (1), ... 5366 \n",
"3 {Otitis media, susceptibility to}, 166760 (3),... 10116 \n",
"4 [Blood group, P1Pk system, P(2) phenotype], 11... 16517 \n",
"\n",
" oe_lof_upper_bin oe_lof oe_lof_lower oe_lof_upper oe_mis oe_mis_lower \\\n",
"0 6 0.78457 0.524 1.208 1.0141 0.922 \n",
"1 4 0.60537 0.425 0.88 0.84521 0.765 \n",
"2 2 0.40526 0.305 0.544 0.81065 0.758 \n",
"3 5 0.77171 0.629 0.952 0.96729 0.911 \n",
"4 8 0.94609 0.553 1.654 1.0345 0.930 \n",
"\n",
" oe_mis_upper oe_syn oe_syn_lower oe_syn_upper constraint_flag \\\n",
"0 1.116 1.0299 0.897 1.184 \n",
"1 0.934 1.0355 0.895 1.201 \n",
"2 0.866 0.87995 0.796 0.973 \n",
"3 1.027 1.0048 0.916 1.103 \n",
"4 1.151 1.1197 0.967 1.300 \n",
"\n",
" Function [CC] \\\n",
"0 \n",
"1 FUNCTION: Essential component of the apolipopr... \n",
"2 FUNCTION: Is able to inhibit all four classes ... \n",
"3 FUNCTION: Is able to inhibit all four classes ... \n",
"4 FUNCTION: Necessary for the biosynthesis of th... \n",
"\n",
" Tissue specificity \\\n",
"0 TISSUE SPECIFICITY: Plasma. \n",
"1 TISSUE SPECIFICITY: Widely expressed with high... \n",
"2 TISSUE SPECIFICITY: Secreted in plasma. {ECO:0... \n",
"3 TISSUE SPECIFICITY: In the epidermis, expresse... \n",
"4 TISSUE SPECIFICITY: Ubiquitous. Highly express... \n",
"\n",
" Involvement in disease \n",
"0 \n",
"1 \n",
"2 \n",
"3 DISEASE: Otitis media (OM) [MIM:166760]: An in... \n",
"4 \n"
]
}
],
"source": [
"# Paths are relative to the notebook's directory (repository root) instead\n",
"# of a user-specific home directory, so the cell runs on any checkout.\n",
"gene_fullxref_list = merge_db(\n",
"    hgnc_file='data/hgnc.tsv',\n",
"    omim_file='data/genemap2.txt',\n",
"    gnomad_score_file='data/gnomad.v2.1.1.lof_metrics.by_gene.txt',\n",
"    uniprot_file='data/uniprot.tsv',\n",
")\n",
"gene_fullxref_list.to_csv('data/test.txt', sep='\\t', index=False)\n",
"print(gene_fullxref_list.head())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit 5c518ea

Please sign in to comment.