Skip to content

Commit

Permalink
replace character not compatible with regex
Browse files Browse the repository at this point in the history
  • Loading branch information
kyauy committed May 29, 2019
1 parent f6fa0b3 commit 0c99538
Show file tree
Hide file tree
Showing 3 changed files with 193 additions and 5 deletions.
97 changes: 95 additions & 2 deletions .ipynb_checkpoints/AddDB_updater-checkpoint.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,13 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
"import numpy as np\n",
"import subprocess"
]
},
{
Expand Down Expand Up @@ -488,6 +489,13 @@
" return gene_fullxref"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 45,
Expand Down Expand Up @@ -604,6 +612,91 @@
"print(gene_fullxref_list[gene_fullxref_list.duplicated() == True])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Replace characters that are not permitted in regular expressions (used by ANNOVAR): '+' becomes 'plus' in every column, and '-' becomes '_' in the descriptive text columns."
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"gene_fullxref_list = pd.read_csv('~/Kevin/AddDB_updater/data/test2.txt',sep='\\t')\n",
"#COMMAND = \" sed 's/+/plus/g' | awk 'BEGIN{FS=OFS=\"\\t\"} {for (i=6;i<=7;i++) gsub(/-/,\"_\",$i)}1' | awk 'BEGIN{FS=OFS=\"\\t\"} {gsub(/-/,\"_\",$(NF-1))}1'\" \n",
"#test = subprocess.call(['sed', 's/+/plus/g', test])\n",
"gene_fullxref_list = gene_fullxref_list.replace('\\+','plus',regex=True)\n",
"gene_fullxref_list.loc[:,['Approved name','Phenotypes','Function [CC]','Involvement in disease','Tissue specificity']] = gene_fullxref_list.loc[:,['Approved name','Phenotypes','Function [CC]','Involvement in disease','Tissue specificity']].replace('-','_',regex=True)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" #Gene_name Approved name \\\n",
"0 A1BG alpha_1_B glycoprotein \n",
"1 A1CF APOBEC1 complementation factor \n",
"2 A2M alpha_2_macroglobulin \n",
"3 A2ML1 alpha_2_macroglobulin like 1 \n",
"4 A4GALT alpha 1,4_galactosyltransferase (P blood group) \n",
"\n",
" Phenotypes oe_lof_upper_rank \\\n",
"0 NaN 13015.0 \n",
"1 NaN 9254.0 \n",
"2 Alpha_2_macroglobulin deficiency, 614036 (1), ... 5366.0 \n",
"3 {Otitis media, susceptibility to}, 166760 (3),... 10116.0 \n",
"4 [Blood group, P1Pk system, P(2) phenotype], 11... 16517.0 \n",
"\n",
" oe_lof_upper_bin oe_lof oe_lof_lower oe_lof_upper oe_mis \\\n",
"0 6.0 0.78457 0.524 1.208 1.01410 \n",
"1 4.0 0.60537 0.425 0.880 0.84521 \n",
"2 2.0 0.40526 0.305 0.544 0.81065 \n",
"3 5.0 0.77171 0.629 0.952 0.96729 \n",
"4 8.0 0.94609 0.553 1.654 1.03450 \n",
"\n",
" oe_mis_lower oe_mis_upper oe_syn oe_syn_lower oe_syn_upper \\\n",
"0 0.922 1.116 1.02990 0.897 1.184 \n",
"1 0.765 0.934 1.03550 0.895 1.201 \n",
"2 0.758 0.866 0.87995 0.796 0.973 \n",
"3 0.911 1.027 1.00480 0.916 1.103 \n",
"4 0.930 1.151 1.11970 0.967 1.300 \n",
"\n",
" Function [CC] \\\n",
"0 NaN \n",
"1 FUNCTION: Essential component of the apolipopr... \n",
"2 FUNCTION: Is able to inhibit all four classes ... \n",
"3 FUNCTION: Is able to inhibit all four classes ... \n",
"4 FUNCTION: Necessary for the biosynthesis of th... \n",
"\n",
" Involvement in disease \\\n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 DISEASE: Otitis media (OM) [MIM:166760]: An in... \n",
"4 NaN \n",
"\n",
" Tissue specificity \n",
"0 TISSUE SPECIFICITY: Plasma. \n",
"1 TISSUE SPECIFICITY: Widely expressed with high... \n",
"2 TISSUE SPECIFICITY: Secreted in plasma. {ECO:0... \n",
"3 TISSUE SPECIFICITY: In the epidermis, expresse... \n",
"4 TISSUE SPECIFICITY: Ubiquitous. Highly express... \n"
]
}
],
"source": [
"print(gene_fullxref_list.head())\n",
"gene_fullxref_list.to_csv('~/Kevin/AddDB_updater/data/aaa.txt',sep='\\t',index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down
97 changes: 95 additions & 2 deletions AddDB_updater.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,13 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
"import numpy as np\n",
"import subprocess"
]
},
{
Expand Down Expand Up @@ -488,6 +489,13 @@
" return gene_fullxref"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 45,
Expand Down Expand Up @@ -604,6 +612,91 @@
"print(gene_fullxref_list[gene_fullxref_list.duplicated() == True])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Replace characters that are not permitted in regular expressions (used by ANNOVAR): '+' becomes 'plus' in every column, and '-' becomes '_' in the descriptive text columns."
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"gene_fullxref_list = pd.read_csv('~/Kevin/AddDB_updater/data/test2.txt',sep='\\t')\n",
"#COMMAND = \" sed 's/+/plus/g' | awk 'BEGIN{FS=OFS=\"\\t\"} {for (i=6;i<=7;i++) gsub(/-/,\"_\",$i)}1' | awk 'BEGIN{FS=OFS=\"\\t\"} {gsub(/-/,\"_\",$(NF-1))}1'\" \n",
"#test = subprocess.call(['sed', 's/+/plus/g', test])\n",
"gene_fullxref_list = gene_fullxref_list.replace('\\+','plus',regex=True)\n",
"gene_fullxref_list.loc[:,['Approved name','Phenotypes','Function [CC]','Involvement in disease','Tissue specificity']] = gene_fullxref_list.loc[:,['Approved name','Phenotypes','Function [CC]','Involvement in disease','Tissue specificity']].replace('-','_',regex=True)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" #Gene_name Approved name \\\n",
"0 A1BG alpha_1_B glycoprotein \n",
"1 A1CF APOBEC1 complementation factor \n",
"2 A2M alpha_2_macroglobulin \n",
"3 A2ML1 alpha_2_macroglobulin like 1 \n",
"4 A4GALT alpha 1,4_galactosyltransferase (P blood group) \n",
"\n",
" Phenotypes oe_lof_upper_rank \\\n",
"0 NaN 13015.0 \n",
"1 NaN 9254.0 \n",
"2 Alpha_2_macroglobulin deficiency, 614036 (1), ... 5366.0 \n",
"3 {Otitis media, susceptibility to}, 166760 (3),... 10116.0 \n",
"4 [Blood group, P1Pk system, P(2) phenotype], 11... 16517.0 \n",
"\n",
" oe_lof_upper_bin oe_lof oe_lof_lower oe_lof_upper oe_mis \\\n",
"0 6.0 0.78457 0.524 1.208 1.01410 \n",
"1 4.0 0.60537 0.425 0.880 0.84521 \n",
"2 2.0 0.40526 0.305 0.544 0.81065 \n",
"3 5.0 0.77171 0.629 0.952 0.96729 \n",
"4 8.0 0.94609 0.553 1.654 1.03450 \n",
"\n",
" oe_mis_lower oe_mis_upper oe_syn oe_syn_lower oe_syn_upper \\\n",
"0 0.922 1.116 1.02990 0.897 1.184 \n",
"1 0.765 0.934 1.03550 0.895 1.201 \n",
"2 0.758 0.866 0.87995 0.796 0.973 \n",
"3 0.911 1.027 1.00480 0.916 1.103 \n",
"4 0.930 1.151 1.11970 0.967 1.300 \n",
"\n",
" Function [CC] \\\n",
"0 NaN \n",
"1 FUNCTION: Essential component of the apolipopr... \n",
"2 FUNCTION: Is able to inhibit all four classes ... \n",
"3 FUNCTION: Is able to inhibit all four classes ... \n",
"4 FUNCTION: Necessary for the biosynthesis of th... \n",
"\n",
" Involvement in disease \\\n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 DISEASE: Otitis media (OM) [MIM:166760]: An in... \n",
"4 NaN \n",
"\n",
" Tissue specificity \n",
"0 TISSUE SPECIFICITY: Plasma. \n",
"1 TISSUE SPECIFICITY: Widely expressed with high... \n",
"2 TISSUE SPECIFICITY: Secreted in plasma. {ECO:0... \n",
"3 TISSUE SPECIFICITY: In the epidermis, expresse... \n",
"4 TISSUE SPECIFICITY: Ubiquitous. Highly express... \n"
]
}
],
"source": [
"print(gene_fullxref_list.head())\n",
"gene_fullxref_list.to_csv('~/Kevin/AddDB_updater/data/aaa.txt',sep='\\t',index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down
4 changes: 3 additions & 1 deletion merge_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,10 @@ def merge_db(hgnc_file,omim_file,gnomad_score_file,uniprot_file):
today = datetime.date.today()
filename = 'data/gene_fullxref_'
filename += str(today)
filename += '.txt'
filename += '_.txt'

gene_fullxref_list = merge_db(hgnc_file,omim_file, gnomad_score_file, uniprot_file)
gene_fullxref_list = gene_fullxref_list.replace('\+','plus',regex=True)
gene_fullxref_list.loc[:,['Approved name','Phenotypes','Function [CC]','Involvement in disease','Tissue specificity']] = gene_fullxref_list.loc[:,['Approved name','Phenotypes','Function [CC]','Involvement in disease','Tissue specificity']].replace('-','_',regex=True)
gene_fullxref_list.to_csv(filename,sep='\t',index=False)

0 comments on commit 0c99538

Please sign in to comment.