replace character not compatible with regex

mobidic · May 29, 2019 · 0c99538 · 0c99538
1 parent f6fa0b3
commit 0c99538
Show file tree

Hide file tree

Showing 3 changed files with 193 additions and 5 deletions.
diff --git a/.ipynb_checkpoints/AddDB_updater-checkpoint.ipynb b/.ipynb_checkpoints/AddDB_updater-checkpoint.ipynb
@@ -13,12 +13,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
     "import pandas as pd\n",
-    "import numpy as np"
+    "import numpy as np\n",
+    "import subprocess"
    ]
   },
   {
@@ -488,6 +489,13 @@
     "    return gene_fullxref"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "code",
    "execution_count": 45,
@@ -604,6 +612,91 @@
     "print(gene_fullxref_list[gene_fullxref_list.duplicated() == True])"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Replace characters that are not permitted in regular expressions (used in ANNOVAR)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "gene_fullxref_list = pd.read_csv('~/Kevin/AddDB_updater/data/test2.txt',sep='\\t')\n",
+    "#COMMAND = \" sed 's/+/plus/g' | awk 'BEGIN{FS=OFS=\"\\t\"} {for (i=6;i<=7;i++) gsub(/-/,\"_\",$i)}1' |  awk 'BEGIN{FS=OFS=\"\\t\"} {gsub(/-/,\"_\",$(NF-1))}1'\" \n",
+    "#test = subprocess.call(['sed', 's/+/plus/g', test])\n",
+    "gene_fullxref_list = gene_fullxref_list.replace(r'\\+','plus',regex=True)  # raw string: a bare backslash-plus is an invalid escape in a normal literal\n",
+    "gene_fullxref_list.loc[:,['Approved name','Phenotypes','Function [CC]','Involvement in disease','Tissue specificity']] = gene_fullxref_list.loc[:,['Approved name','Phenotypes','Function [CC]','Involvement in disease','Tissue specificity']].replace('-','_',regex=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  #Gene_name                                    Approved name  \\\n",
+      "0       A1BG                           alpha_1_B glycoprotein   \n",
+      "1       A1CF                   APOBEC1 complementation factor   \n",
+      "2        A2M                            alpha_2_macroglobulin   \n",
+      "3      A2ML1                     alpha_2_macroglobulin like 1   \n",
+      "4     A4GALT  alpha 1,4_galactosyltransferase (P blood group)   \n",
+      "\n",
+      "                                          Phenotypes  oe_lof_upper_rank  \\\n",
+      "0                                                NaN            13015.0   \n",
+      "1                                                NaN             9254.0   \n",
+      "2  Alpha_2_macroglobulin deficiency, 614036 (1), ...             5366.0   \n",
+      "3  {Otitis media, susceptibility to}, 166760 (3),...            10116.0   \n",
+      "4  [Blood group, P1Pk system, P(2) phenotype], 11...            16517.0   \n",
+      "\n",
+      "   oe_lof_upper_bin   oe_lof  oe_lof_lower  oe_lof_upper   oe_mis  \\\n",
+      "0               6.0  0.78457         0.524         1.208  1.01410   \n",
+      "1               4.0  0.60537         0.425         0.880  0.84521   \n",
+      "2               2.0  0.40526         0.305         0.544  0.81065   \n",
+      "3               5.0  0.77171         0.629         0.952  0.96729   \n",
+      "4               8.0  0.94609         0.553         1.654  1.03450   \n",
+      "\n",
+      "   oe_mis_lower  oe_mis_upper   oe_syn  oe_syn_lower  oe_syn_upper  \\\n",
+      "0         0.922         1.116  1.02990         0.897         1.184   \n",
+      "1         0.765         0.934  1.03550         0.895         1.201   \n",
+      "2         0.758         0.866  0.87995         0.796         0.973   \n",
+      "3         0.911         1.027  1.00480         0.916         1.103   \n",
+      "4         0.930         1.151  1.11970         0.967         1.300   \n",
+      "\n",
+      "                                       Function [CC]  \\\n",
+      "0                                                NaN   \n",
+      "1  FUNCTION: Essential component of the apolipopr...   \n",
+      "2  FUNCTION: Is able to inhibit all four classes ...   \n",
+      "3  FUNCTION: Is able to inhibit all four classes ...   \n",
+      "4  FUNCTION: Necessary for the biosynthesis of th...   \n",
+      "\n",
+      "                              Involvement in disease  \\\n",
+      "0                                                NaN   \n",
+      "1                                                NaN   \n",
+      "2                                                NaN   \n",
+      "3  DISEASE: Otitis media (OM) [MIM:166760]: An in...   \n",
+      "4                                                NaN   \n",
+      "\n",
+      "                                  Tissue specificity  \n",
+      "0                        TISSUE SPECIFICITY: Plasma.  \n",
+      "1  TISSUE SPECIFICITY: Widely expressed with high...  \n",
+      "2  TISSUE SPECIFICITY: Secreted in plasma. {ECO:0...  \n",
+      "3  TISSUE SPECIFICITY: In the epidermis, expresse...  \n",
+      "4  TISSUE SPECIFICITY: Ubiquitous. Highly express...  \n"
+     ]
+    }
+   ],
+   "source": [
+    "print(gene_fullxref_list.head())\n",
+    "gene_fullxref_list.to_csv('~/Kevin/AddDB_updater/data/aaa.txt',sep='\\t',index=False)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,

diff --git a/AddDB_updater.ipynb b/AddDB_updater.ipynb
@@ -13,12 +13,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
     "import pandas as pd\n",
-    "import numpy as np"
+    "import numpy as np\n",
+    "import subprocess"
    ]
   },
   {
@@ -488,6 +489,13 @@
     "    return gene_fullxref"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "code",
    "execution_count": 45,
@@ -604,6 +612,91 @@
     "print(gene_fullxref_list[gene_fullxref_list.duplicated() == True])"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Replace characters that are not permitted in regular expressions (used in ANNOVAR)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "gene_fullxref_list = pd.read_csv('~/Kevin/AddDB_updater/data/test2.txt',sep='\\t')\n",
+    "#COMMAND = \" sed 's/+/plus/g' | awk 'BEGIN{FS=OFS=\"\\t\"} {for (i=6;i<=7;i++) gsub(/-/,\"_\",$i)}1' |  awk 'BEGIN{FS=OFS=\"\\t\"} {gsub(/-/,\"_\",$(NF-1))}1'\" \n",
+    "#test = subprocess.call(['sed', 's/+/plus/g', test])\n",
+    "gene_fullxref_list = gene_fullxref_list.replace(r'\\+','plus',regex=True)  # raw string: a bare backslash-plus is an invalid escape in a normal literal\n",
+    "gene_fullxref_list.loc[:,['Approved name','Phenotypes','Function [CC]','Involvement in disease','Tissue specificity']] = gene_fullxref_list.loc[:,['Approved name','Phenotypes','Function [CC]','Involvement in disease','Tissue specificity']].replace('-','_',regex=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  #Gene_name                                    Approved name  \\\n",
+      "0       A1BG                           alpha_1_B glycoprotein   \n",
+      "1       A1CF                   APOBEC1 complementation factor   \n",
+      "2        A2M                            alpha_2_macroglobulin   \n",
+      "3      A2ML1                     alpha_2_macroglobulin like 1   \n",
+      "4     A4GALT  alpha 1,4_galactosyltransferase (P blood group)   \n",
+      "\n",
+      "                                          Phenotypes  oe_lof_upper_rank  \\\n",
+      "0                                                NaN            13015.0   \n",
+      "1                                                NaN             9254.0   \n",
+      "2  Alpha_2_macroglobulin deficiency, 614036 (1), ...             5366.0   \n",
+      "3  {Otitis media, susceptibility to}, 166760 (3),...            10116.0   \n",
+      "4  [Blood group, P1Pk system, P(2) phenotype], 11...            16517.0   \n",
+      "\n",
+      "   oe_lof_upper_bin   oe_lof  oe_lof_lower  oe_lof_upper   oe_mis  \\\n",
+      "0               6.0  0.78457         0.524         1.208  1.01410   \n",
+      "1               4.0  0.60537         0.425         0.880  0.84521   \n",
+      "2               2.0  0.40526         0.305         0.544  0.81065   \n",
+      "3               5.0  0.77171         0.629         0.952  0.96729   \n",
+      "4               8.0  0.94609         0.553         1.654  1.03450   \n",
+      "\n",
+      "   oe_mis_lower  oe_mis_upper   oe_syn  oe_syn_lower  oe_syn_upper  \\\n",
+      "0         0.922         1.116  1.02990         0.897         1.184   \n",
+      "1         0.765         0.934  1.03550         0.895         1.201   \n",
+      "2         0.758         0.866  0.87995         0.796         0.973   \n",
+      "3         0.911         1.027  1.00480         0.916         1.103   \n",
+      "4         0.930         1.151  1.11970         0.967         1.300   \n",
+      "\n",
+      "                                       Function [CC]  \\\n",
+      "0                                                NaN   \n",
+      "1  FUNCTION: Essential component of the apolipopr...   \n",
+      "2  FUNCTION: Is able to inhibit all four classes ...   \n",
+      "3  FUNCTION: Is able to inhibit all four classes ...   \n",
+      "4  FUNCTION: Necessary for the biosynthesis of th...   \n",
+      "\n",
+      "                              Involvement in disease  \\\n",
+      "0                                                NaN   \n",
+      "1                                                NaN   \n",
+      "2                                                NaN   \n",
+      "3  DISEASE: Otitis media (OM) [MIM:166760]: An in...   \n",
+      "4                                                NaN   \n",
+      "\n",
+      "                                  Tissue specificity  \n",
+      "0                        TISSUE SPECIFICITY: Plasma.  \n",
+      "1  TISSUE SPECIFICITY: Widely expressed with high...  \n",
+      "2  TISSUE SPECIFICITY: Secreted in plasma. {ECO:0...  \n",
+      "3  TISSUE SPECIFICITY: In the epidermis, expresse...  \n",
+      "4  TISSUE SPECIFICITY: Ubiquitous. Highly express...  \n"
+     ]
+    }
+   ],
+   "source": [
+    "print(gene_fullxref_list.head())\n",
+    "gene_fullxref_list.to_csv('~/Kevin/AddDB_updater/data/aaa.txt',sep='\\t',index=False)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,

diff --git a/merge_db.py b/merge_db.py
@@ -115,8 +115,10 @@ def merge_db(hgnc_file,omim_file,gnomad_score_file,uniprot_file):
 today = datetime.date.today()
 filename = 'data/gene_fullxref_'
 filename += str(today)
-filename += '.txt'
+filename += '_.txt'
 
 gene_fullxref_list = merge_db(hgnc_file,omim_file, gnomad_score_file, uniprot_file)
+gene_fullxref_list = gene_fullxref_list.replace(r'\+','plus',regex=True)  # literal '+' -> 'plus' in every string cell; raw string because a bare backslash-plus is an invalid escape in a normal literal
+gene_fullxref_list.loc[:,['Approved name','Phenotypes','Function [CC]','Involvement in disease','Tissue specificity']] = gene_fullxref_list.loc[:,['Approved name','Phenotypes','Function [CC]','Involvement in disease','Tissue specificity']].replace('-','_',regex=True)  # '-' -> '_' only in the listed text columns
 gene_fullxref_list.to_csv(filename,sep='\t',index=False)