Skip to content

Commit

Permalink
Add outputs of demo to notebook.
Browse files Browse the repository at this point in the history
  • Loading branch information
Adibvafa committed Jul 28, 2024
1 parent 45a23a5 commit 61f76aa
Show file tree
Hide file tree
Showing 2 changed files with 302 additions and 16 deletions.
308 changes: 297 additions & 11 deletions CodonTransformerDemo.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -29,7 +29,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -53,9 +53,98 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "761a2df97a0a4e08b8c008a417d89dd6",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"VBox(children=(HTML(value='<b style=\"font-size:20px;\">Enter Protein Sequence:</b><div style=\"height:18px;\"></d…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "fbbbcfe733e14738a7eb96bd37059922",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HTML(value='\\n <style>\\n .widget-textarea > textarea {\\n font-size: 12px;\\n …"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "115ff897563f4f198dd09b9d134c6615",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"VBox(children=(HTML(value='<b style=\"font-size:20px;\">Select Organism:</b><div style=\"height:10px;\"></div>'), …"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
" <style>\n",
" .widget-dropdown > select {\n",
" font-size: 16px;\n",
" font-weight: normal;\n",
" background-color: #f0f0f0;\n",
" border-radius: 5px;\n",
" padding: 5px;\n",
" }\n",
" .widget-label {\n",
" font-size: 18px;\n",
" font-weight: bold;\n",
" }\n",
" .custom-container {\n",
" display: flex;\n",
" flex-direction: column;\n",
" align-items: flex-start;\n",
" }\n",
" .widget-dropdown option[value^=\"\"] {\n",
" font-family: sans-serif;\n",
" font-weight: bold;\n",
" font-size: 18px;\n",
" padding: 510px;\n",
" }\n",
" .widget-dropdown option[value*=\"Selected Organisms\"],\n",
" .widget-dropdown option[value*=\"All Organisms\"] {\n",
" text-align: center;\n",
" font-family: Arial, sans-serif;\n",
" font-weight: bold;\n",
" font-size: 20px;\n",
" color: #6900A1;\n",
" background-color: #00D8A1;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Sample: MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGG, Homo sapiens\n",
"user = UserContainer()\n",
Expand All @@ -65,9 +154,35 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-----------------------------\n",
"| Organism |\n",
"-----------------------------\n",
"Escherichia coli general\n",
"\n",
"-----------------------------\n",
"| Input Protein |\n",
"-----------------------------\n",
"MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGG\n",
"\n",
"-----------------------------\n",
"| Processed Input |\n",
"-----------------------------\n",
"M_UNK A_UNK L_UNK W_UNK M_UNK R_UNK L_UNK L_UNK P_UNK L_UNK L_UNK A_UNK L_UNK L_UNK A_UNK L_UNK W_UNK G_UNK P_UNK D_UNK P_UNK A_UNK A_UNK A_UNK F_UNK V_UNK N_UNK Q_UNK H_UNK L_UNK C_UNK G_UNK S_UNK H_UNK L_UNK V_UNK E_UNK A_UNK L_UNK Y_UNK L_UNK V_UNK C_UNK G_UNK E_UNK R_UNK G_UNK F_UNK F_UNK Y_UNK T_UNK P_UNK K_UNK T_UNK R_UNK R_UNK E_UNK A_UNK E_UNK D_UNK L_UNK Q_UNK V_UNK G_UNK Q_UNK V_UNK E_UNK L_UNK G_UNK G_UNK __UNK\n",
"\n",
"-----------------------------\n",
"| Predicted DNA |\n",
"-----------------------------\n",
"ATGGCTTTATGGATGCGTCTGCTGCCGCTGCTGGCGCTGCTGGCGCTGTGGGGCCCGGACCCGGCGGCGGCGTTTGTGAATCAGCACCTGTGCGGCAGCCACCTGGTGGAAGCGCTGTATCTGGTGTGCGGTGAGCGCGGCTTCTTCTACACGCCCAAAACCCGCCGCGAAGCGGAAGATCTGCAGGTGGGCCAGGTGGAGCTGGGCGGCTAA\n"
]
}
],
"source": [
"output = predict_dna_sequence(\n",
" protein=user.protein,\n",
Expand Down Expand Up @@ -97,9 +212,91 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>protein_sequence</th>\n",
" <th>organism</th>\n",
" <th>predicted_dna</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>MSEKYIVTWDMLQIHARKLASRLMPSEQWKGIIAVSRGGLVPGALL...</td>\n",
" <td>Escherichia coli general</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>MKNIIRTPETHPLTWRLRDDKQPVWLDEYRSKNGYEGARKALTGLS...</td>\n",
" <td>Escherichia coli general</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>MDALQIAEDTLQTLVPHCPVPSGPRRIFLDANVKESYCPLVPHTMY...</td>\n",
" <td>Homo sapiens</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>MAFANFRRILRLSTFEKRKSREYEHVRRDLDPNEVWEIVGELGDGA...</td>\n",
" <td>Homo sapiens</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>MTEKDAGGFNMSTFMNRKFQEPIQQIKTFSWMGFSWTCRKRRKHYQ...</td>\n",
" <td>Arabidopsis thaliana</td>\n",
" <td>None</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" protein_sequence \\\n",
"0 MSEKYIVTWDMLQIHARKLASRLMPSEQWKGIIAVSRGGLVPGALL... \n",
"1 MKNIIRTPETHPLTWRLRDDKQPVWLDEYRSKNGYEGARKALTGLS... \n",
"2 MDALQIAEDTLQTLVPHCPVPSGPRRIFLDANVKESYCPLVPHTMY... \n",
"3 MAFANFRRILRLSTFEKRKSREYEHVRRDLDPNEVWEIVGELGDGA... \n",
"4 MTEKDAGGFNMSTFMNRKFQEPIQQIKTFSWMGFSWTCRKRRKHYQ... \n",
"\n",
" organism predicted_dna \n",
"0 Escherichia coli general None \n",
"1 Escherichia coli general None \n",
"2 Homo sapiens None \n",
"3 Homo sapiens None \n",
"4 Arabidopsis thaliana None "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Update with the actual path to your dataset\n",
"dataset_path = \"demo/sample_dataset.csv\"\n",
Expand All @@ -112,9 +309,98 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"CodonTransformer Predicting: 100%|██████████| 5/5 [00:00<00:00, 16.58 Sequences/s]\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>protein_sequence</th>\n",
" <th>organism</th>\n",
" <th>predicted_dna</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>MSEKYIVTWDMLQIHARKLASRLMPSEQWKGIIAVSRGGLVPGALL...</td>\n",
" <td>Escherichia coli general</td>\n",
" <td>ATGAGCGAAAAATATATTGTCACCTGGGACATGCTGCAGATCCATG...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>MKNIIRTPETHPLTWRLRDDKQPVWLDEYRSKNGYEGARKALTGLS...</td>\n",
" <td>Escherichia coli general</td>\n",
" <td>ATGAAAAATATTATTAGAACACCTGAAACCCATCCGCTGACCTGGC...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>MDALQIAEDTLQTLVPHCPVPSGPRRIFLDANVKESYCPLVPHTMY...</td>\n",
" <td>Homo sapiens</td>\n",
" <td>ATGGATGCCCTGCAGATTGCTGAGGACACCCTGCAGACCCTGGTGC...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>MAFANFRRILRLSTFEKRKSREYEHVRRDLDPNEVWEIVGELGDGA...</td>\n",
" <td>Homo sapiens</td>\n",
" <td>ATGGCCTTTGCCAACTTCCGGAGAATCCTGCGGCTGTCCACCTTTG...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>MTEKDAGGFNMSTFMNRKFQEPIQQIKTFSWMGFSWTCRKRRKHYQ...</td>\n",
" <td>Arabidopsis thaliana</td>\n",
" <td>ATGACGGAGAAAGATGCTGGAGGTTTTAATATGTCAACTTTCATGA...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" protein_sequence \\\n",
"0 MSEKYIVTWDMLQIHARKLASRLMPSEQWKGIIAVSRGGLVPGALL... \n",
"1 MKNIIRTPETHPLTWRLRDDKQPVWLDEYRSKNGYEGARKALTGLS... \n",
"2 MDALQIAEDTLQTLVPHCPVPSGPRRIFLDANVKESYCPLVPHTMY... \n",
"3 MAFANFRRILRLSTFEKRKSREYEHVRRDLDPNEVWEIVGELGDGA... \n",
"4 MTEKDAGGFNMSTFMNRKFQEPIQQIKTFSWMGFSWTCRKRRKHYQ... \n",
"\n",
" organism predicted_dna \n",
"0 Escherichia coli general ATGAGCGAAAAATATATTGTCACCTGGGACATGCTGCAGATCCATG... \n",
"1 Escherichia coli general ATGAAAAATATTATTAGAACACCTGAAACCCATCCGCTGACCTGGC... \n",
"2 Homo sapiens ATGGATGCCCTGCAGATTGCTGAGGACACCCTGCAGACCCTGGTGC... \n",
"3 Homo sapiens ATGGCCTTTGCCAACTTCCGGAGAATCCTGCGGCTGTCCACCTTTG... \n",
"4 Arabidopsis thaliana ATGACGGAGAAAGATGCTGGAGGTTTTAATATGTCAACTTTCATGA... "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"for index, data in tqdm(\n",
" dataset.iterrows(),\n",
Expand All @@ -130,7 +416,7 @@
" tokenizer_object=tokenizer,\n",
" model_object=model,\n",
" )\n",
" dataset.loc[index, \"predicted_dna\"] = output.predicted_dna\n",
" dataset.loc[index, \"predicted_dna\"] = outputs.predicted_dna\n",
"\n",
"dataset.to_csv(output_path)\n",
"dataset.head()"
Expand Down
Loading

0 comments on commit 61f76aa

Please sign in to comment.