updater.sh with python script v1.0

mobidic · May 16, 2019 · 5c518ea · 5c518ea
1 parent 9567594
commit 5c518ea
Show file tree

Hide file tree

Showing 6 changed files with 803 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+data/*.txt
+data/*.tsv
diff --git a/.ipynb_checkpoints/AddDB_updater-checkpoint.ipynb b/.ipynb_checkpoints/AddDB_updater-checkpoint.ipynb
@@ -0,0 +1,353 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# AddDB Updater\n",
+    "\n",
+    "This is a Jupyter notebook to build a python script that can parse and create a fullly annotated tsv file to replace gene_xref for ANNOVAR. This script could be useful to update these database and be added to the Achabilarity container.\n",
+    "\n",
+    "# Import library"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 82,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Parser function\n",
+    "\n",
+    "These functions will parse them for merging.\n",
+    "\n",
+    "## HGNC"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 83,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def hgnc(file):\n",
+    "    hgnc = pd.read_csv(file, sep='\\t')\n",
+    "    hgnc.rename(columns={'Approved symbol': '#Gene_name'}, inplace=True)\n",
+    "    return(hgnc)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 84,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  #Gene_name                   Approved name\n",
+      "0       A1BG          alpha-1-B glycoprotein\n",
+      "1   A1BG-AS1            A1BG antisense RNA 1\n",
+      "2       A1CF  APOBEC1 complementation factor\n",
+      "3        A2M           alpha-2-macroglobulin\n",
+      "4    A2M-AS1             A2M antisense RNA 1\n"
+     ]
+    }
+   ],
+   "source": [
+    "hgnc_list = hgnc('~/Kevin/AddDB_updater/data/hgnc.tsv')\n",
+    "print(hgnc_list.head())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## GnomAD constraint score"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 85,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def gnomad_score(file):\n",
+    "    gnomad = pd.read_csv(file, sep='\\t')\n",
+    "    gnomad.rename(columns={'gene': '#Gene_name'}, inplace=True)\n",
+    "    gnomad_select = gnomad[['#Gene_name', 'oe_lof_upper_rank',\n",
+    "       'oe_lof_upper_bin','oe_lof','oe_lof_lower','oe_lof_upper','oe_mis','oe_mis_lower', 'oe_mis_upper','oe_syn','oe_syn_lower', 'oe_syn_upper','constraint_flag']]\n",
+    "    return(gnomad_select)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 86,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  #Gene_name  oe_lof_upper_rank  oe_lof_upper_bin    oe_lof  oe_lof_lower  \\\n",
+      "0      MED13                0.0               0.0  0.000000         0.000   \n",
+      "1      NIPBL                1.0               0.0  0.006653         0.001   \n",
+      "2       SMC3                2.0               0.0  0.000000         0.000   \n",
+      "3      CNOT1                3.0               0.0  0.007998         0.002   \n",
+      "4        RLF                4.0               0.0  0.000000         0.000   \n",
+      "\n",
+      "   oe_lof_upper   oe_mis  oe_mis_lower  oe_mis_upper  oe_syn  oe_syn_lower  \\\n",
+      "0         0.030  0.77921         0.736         0.824  1.0890         1.005   \n",
+      "1         0.032  0.58688         0.554         0.621  1.0020         0.930   \n",
+      "2         0.037  0.28251         0.249         0.320  1.0578         0.946   \n",
+      "3         0.038  0.43290         0.403         0.464  1.0306         0.955   \n",
+      "4         0.040  0.68766         0.645         0.733  1.0153         0.930   \n",
+      "\n",
+      "   oe_syn_upper constraint_flag  \n",
+      "0         1.180             NaN  \n",
+      "1         1.079             NaN  \n",
+      "2         1.184             NaN  \n",
+      "3         1.112             NaN  \n",
+      "4         1.108             NaN  \n"
+     ]
+    }
+   ],
+   "source": [
+    "gnomad_score_list =  gnomad_score('~/Kevin/AddDB_updater/data/gnomad.v2.1.1.lof_metrics.by_gene.txt')\n",
+    "print(gnomad_score_list.head())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## OMIM genemap2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 87,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def omim(file):\n",
+    "    omim = pd.read_csv(file, sep='\\t',skiprows=3)\n",
+    "    omim_select = omim[['Approved Symbol','Phenotypes']]\n",
+    "    omim_select.rename(columns={'Approved Symbol': '#Gene_name'}, inplace=True)\n",
+    "    omim_select = omim_select.dropna(subset=['#Gene_name']) \n",
+    "    return(omim_select)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 89,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "   #Gene_name                                         Phenotypes\n",
+      "5         CMM  {Melanoma, cutaneous malignant, 1}, 155600 (2)...\n",
+      "6         CCV  Cataract 8, multiple types, 115665 (2), Autoso...\n",
+      "8        DYX8  {Dyslexia, susceptibility to, 8}, 608995 (2), ...\n",
+      "10       IBD7         {Inflammatory bowel disease 7}, 605225 (2)\n",
+      "12      MYP14                              Myopia 14, 610320 (2)\n",
+      "13     PSORS7           {Psoriasis susceptibility 7}, 605606 (2)\n",
+      "14     PTPRZ2                                                NaN\n",
+      "15       SAI1                                                NaN\n",
+      "16     SAMD11                                                NaN\n",
+      "17      NOC2L                                                NaN\n"
+     ]
+    }
+   ],
+   "source": [
+    "omim_list = omim('~/Kevin/AddDB_updater/data/genemap2.txt')\n",
+    "print(omim_list.head(n=10))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## UniProt database"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 80,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def uniprot(file):\n",
+    "    uniprot = pd.read_csv(file, sep='\\t')\n",
+    "    uniprot_select = uniprot.iloc[:,3:]\n",
+    "    uniprot_select.rename(columns={'Gene names  (primary )': '#Gene_name'}, inplace=True)\n",
+    "    return(uniprot_select)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 81,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  #Gene_name                                      Function [CC]  \\\n",
+      "0   TRBV11-2  FUNCTION: V region of the variable domain of T...   \n",
+      "1     TEX13A                                                NaN   \n",
+      "2      LARS2                                                NaN   \n",
+      "3    TXNDC11  FUNCTION: May act as a redox regulator involve...   \n",
+      "4        TXK  FUNCTION: Non-receptor tyrosine kinase that pl...   \n",
+      "\n",
+      "                                  Tissue specificity  \\\n",
+      "0                                                NaN   \n",
+      "1               TISSUE SPECIFICITY: Testis specific.   \n",
+      "2  TISSUE SPECIFICITY: Ubiquitously expressed, bu...   \n",
+      "3  TISSUE SPECIFICITY: Widely expressed at low le...   \n",
+      "4  TISSUE SPECIFICITY: Expressed in T-cells and s...   \n",
+      "\n",
+      "                              Involvement in disease  \n",
+      "0                                                NaN  \n",
+      "1                                                NaN  \n",
+      "2  DISEASE: Perrault syndrome 4 (PRLTS4) [MIM:615...  \n",
+      "3                                                NaN  \n",
+      "4                                                NaN  \n"
+     ]
+    }
+   ],
+   "source": [
+    "uniprot_list = uniprot('~/Kevin/AddDB_updater/data/uniprot.tsv')\n",
+    "print(uniprot_list.head())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Merge into one file\n",
+    "\n",
+    "This function will merge all databases into HGNC."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 98,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def merge_db(hgnc_file,omim_file,gnomad_score_file,uniprot_file):\n",
+    "    hgnc_list = hgnc(hgnc_file)\n",
+    "    omim_list = omim(omim_file).reset_index(drop=True)\n",
+    "    gnomad_score_list = gnomad_score(gnomad_score_file)\n",
+    "    uniprot_list = uniprot(uniprot_file)\n",
+    "    gene_fullxref = hgnc_list.merge(omim_list,on='#Gene_name').merge(gnomad_score_list,on='#Gene_name').merge(uniprot_list,on='#Gene_name')\n",
+    "    gene_fullxref = gene_fullxref.fillna('')\n",
+    "    return gene_fullxref"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 100,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  #Gene_name                                    Approved name  \\\n",
+      "0       A1BG                           alpha-1-B glycoprotein   \n",
+      "1       A1CF                   APOBEC1 complementation factor   \n",
+      "2        A2M                            alpha-2-macroglobulin   \n",
+      "3      A2ML1                     alpha-2-macroglobulin like 1   \n",
+      "4     A4GALT  alpha 1,4-galactosyltransferase (P blood group)   \n",
+      "\n",
+      "                                          Phenotypes oe_lof_upper_rank  \\\n",
+      "0                                                                13015   \n",
+      "1                                                                 9254   \n",
+      "2  Alpha-2-macroglobulin deficiency, 614036 (1), ...              5366   \n",
+      "3  {Otitis media, susceptibility to}, 166760 (3),...             10116   \n",
+      "4  [Blood group, P1Pk system, P(2) phenotype], 11...             16517   \n",
+      "\n",
+      "  oe_lof_upper_bin   oe_lof oe_lof_lower oe_lof_upper   oe_mis  oe_mis_lower  \\\n",
+      "0                6  0.78457        0.524        1.208   1.0141         0.922   \n",
+      "1                4  0.60537        0.425         0.88  0.84521         0.765   \n",
+      "2                2  0.40526        0.305        0.544  0.81065         0.758   \n",
+      "3                5  0.77171        0.629        0.952  0.96729         0.911   \n",
+      "4                8  0.94609        0.553        1.654   1.0345         0.930   \n",
+      "\n",
+      "   oe_mis_upper   oe_syn  oe_syn_lower  oe_syn_upper constraint_flag  \\\n",
+      "0         1.116   1.0299         0.897         1.184                   \n",
+      "1         0.934   1.0355         0.895         1.201                   \n",
+      "2         0.866  0.87995         0.796         0.973                   \n",
+      "3         1.027   1.0048         0.916         1.103                   \n",
+      "4         1.151   1.1197         0.967         1.300                   \n",
+      "\n",
+      "                                       Function [CC]  \\\n",
+      "0                                                      \n",
+      "1  FUNCTION: Essential component of the apolipopr...   \n",
+      "2  FUNCTION: Is able to inhibit all four classes ...   \n",
+      "3  FUNCTION: Is able to inhibit all four classes ...   \n",
+      "4  FUNCTION: Necessary for the biosynthesis of th...   \n",
+      "\n",
+      "                                  Tissue specificity  \\\n",
+      "0                        TISSUE SPECIFICITY: Plasma.   \n",
+      "1  TISSUE SPECIFICITY: Widely expressed with high...   \n",
+      "2  TISSUE SPECIFICITY: Secreted in plasma. {ECO:0...   \n",
+      "3  TISSUE SPECIFICITY: In the epidermis, expresse...   \n",
+      "4  TISSUE SPECIFICITY: Ubiquitous. Highly express...   \n",
+      "\n",
+      "                              Involvement in disease  \n",
+      "0                                                     \n",
+      "1                                                     \n",
+      "2                                                     \n",
+      "3  DISEASE: Otitis media (OM) [MIM:166760]: An in...  \n",
+      "4                                                     \n"
+     ]
+    }
+   ],
+   "source": [
+    "gene_fullxref_list = merge_db(hgnc_file='~/Kevin/AddDB_updater/data/hgnc.tsv',omim_file='~/Kevin/AddDB_updater/data/genemap2.txt', gnomad_score_file='~/Kevin/AddDB_updater/data/gnomad.v2.1.1.lof_metrics.by_gene.txt', uniprot_file='~/Kevin/AddDB_updater/data/uniprot.tsv')\n",
+    "gene_fullxref_list.to_csv('~/Kevin/AddDB_updater/data/test.txt',sep='\\t',index=False)\n",
+    "print(gene_fullxref_list.head())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}