Skip to content

Commit

Permalink
Add compatibility between new output type and demo.
Browse files Browse the repository at this point in the history
Adibvafa committed Jul 28, 2024
1 parent 3526c36 commit 45a23a5
Showing 1 changed file with 12 additions and 298 deletions.
310 changes: 12 additions & 298 deletions CodonTransformerDemo.ipynb
Original file line number Diff line number Diff line change
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -29,7 +29,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -53,98 +53,9 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "e5867c02ed854a2eb07a20cd25d3ecd7",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"VBox(children=(HTML(value='<b style=\"font-size:20px;\">Enter Protein Sequence:</b><div style=\"height:18px;\"></d…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "10342006b7274512be6322777316e93c",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HTML(value='\\n <style>\\n .widget-textarea > textarea {\\n font-size: 12px;\\n …"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "397d496d9b494c80916a3ba933bec09b",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"VBox(children=(HTML(value='<b style=\"font-size:20px;\">Select Organism:</b><div style=\"height:10px;\"></div>'), …"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
" <style>\n",
" .widget-dropdown > select {\n",
" font-size: 16px;\n",
" font-weight: normal;\n",
" background-color: #f0f0f0;\n",
" border-radius: 5px;\n",
" padding: 5px;\n",
" }\n",
" .widget-label {\n",
" font-size: 18px;\n",
" font-weight: bold;\n",
" }\n",
" .custom-container {\n",
" display: flex;\n",
" flex-direction: column;\n",
" align-items: flex-start;\n",
" }\n",
" .widget-dropdown option[value^=\"\"] {\n",
" font-family: sans-serif;\n",
" font-weight: bold;\n",
" font-size: 18px;\n",
" padding: 510px;\n",
" }\n",
" .widget-dropdown option[value*=\"Selected Organisms\"],\n",
" .widget-dropdown option[value*=\"All Organisms\"] {\n",
" text-align: center;\n",
" font-family: Arial, sans-serif;\n",
" font-weight: bold;\n",
" font-size: 20px;\n",
" color: #6900A1;\n",
" background-color: #00D8A1;\n",
" }\n",
" </style>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"outputs": [],
"source": [
"# Sample: MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGG, Homo sapiens\n",
"user = UserContainer()\n",
@@ -154,35 +65,9 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-----------------------------\n",
"| Organism |\n",
"-----------------------------\n",
"Homo sapiens\n",
"\n",
"-----------------------------\n",
"| Input Protein |\n",
"-----------------------------\n",
"MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGG\n",
"\n",
"-----------------------------\n",
"| Processed Input |\n",
"-----------------------------\n",
"M_UNK A_UNK L_UNK W_UNK M_UNK R_UNK L_UNK L_UNK P_UNK L_UNK L_UNK A_UNK L_UNK L_UNK A_UNK L_UNK W_UNK G_UNK P_UNK D_UNK P_UNK A_UNK A_UNK A_UNK F_UNK V_UNK N_UNK Q_UNK H_UNK L_UNK C_UNK G_UNK S_UNK H_UNK L_UNK V_UNK E_UNK A_UNK L_UNK Y_UNK L_UNK V_UNK C_UNK G_UNK E_UNK R_UNK G_UNK F_UNK F_UNK Y_UNK T_UNK P_UNK K_UNK T_UNK R_UNK R_UNK E_UNK A_UNK E_UNK D_UNK L_UNK Q_UNK V_UNK G_UNK Q_UNK V_UNK E_UNK L_UNK G_UNK G_UNK __UNK\n",
"\n",
"-----------------------------\n",
"| Predicted DNA |\n",
"-----------------------------\n",
"ATGGCCCTGTGGATGAGGCTGCTGCCCCTGCTGGCCCTGCTGGCCCTGTGGGGGCCTGACCCAGCTGCCGCCTTTGTGAACCAGCACCTGTGTGGCAGCCACCTGGTGGAGGCCCTGTACCTGGTGTGTGGGGAGAGAGGCTTCTTCTACACACCCAAGACCAGAAGAGAGGCCGAGGACCTGCAGGTGGGCCAGGTGGAGCTGGGAGGCTGA\n"
]
}
],
"outputs": [],
"source": [
"output = predict_dna_sequence(\n",
" protein=user.protein,\n",
@@ -212,91 +97,9 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>protein_sequence</th>\n",
" <th>organism</th>\n",
" <th>predicted_dna</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>MSEKYIVTWDMLQIHARKLASRLMPSEQWKGIIAVSRGGLVPGALL...</td>\n",
" <td>Escherichia coli general</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>MKNIIRTPETHPLTWRLRDDKQPVWLDEYRSKNGYEGARKALTGLS...</td>\n",
" <td>Escherichia coli general</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>MDALQIAEDTLQTLVPHCPVPSGPRRIFLDANVKESYCPLVPHTMY...</td>\n",
" <td>Homo sapiens</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>MAFANFRRILRLSTFEKRKSREYEHVRRDLDPNEVWEIVGELGDGA...</td>\n",
" <td>Homo sapiens</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>MTEKDAGGFNMSTFMNRKFQEPIQQIKTFSWMGFSWTCRKRRKHYQ...</td>\n",
" <td>Arabidopsis thaliana</td>\n",
" <td>None</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" protein_sequence \\\n",
"0 MSEKYIVTWDMLQIHARKLASRLMPSEQWKGIIAVSRGGLVPGALL... \n",
"1 MKNIIRTPETHPLTWRLRDDKQPVWLDEYRSKNGYEGARKALTGLS... \n",
"2 MDALQIAEDTLQTLVPHCPVPSGPRRIFLDANVKESYCPLVPHTMY... \n",
"3 MAFANFRRILRLSTFEKRKSREYEHVRRDLDPNEVWEIVGELGDGA... \n",
"4 MTEKDAGGFNMSTFMNRKFQEPIQQIKTFSWMGFSWTCRKRRKHYQ... \n",
"\n",
" organism predicted_dna \n",
"0 Escherichia coli general None \n",
"1 Escherichia coli general None \n",
"2 Homo sapiens None \n",
"3 Homo sapiens None \n",
"4 Arabidopsis thaliana None "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"# Update with the actual path to your dataset\n",
"dataset_path = \"demo/sample_dataset.csv\"\n",
@@ -309,98 +112,9 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"CodonTransformer Predicting: 100%|██████████| 5/5 [00:00<00:00, 17.00 Sequences/s]\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>protein_sequence</th>\n",
" <th>organism</th>\n",
" <th>predicted_dna</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>MSEKYIVTWDMLQIHARKLASRLMPSEQWKGIIAVSRGGLVPGALL...</td>\n",
" <td>Escherichia coli general</td>\n",
" <td>DNASequencePrediction(organism='Escherichia co...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>MKNIIRTPETHPLTWRLRDDKQPVWLDEYRSKNGYEGARKALTGLS...</td>\n",
" <td>Escherichia coli general</td>\n",
" <td>DNASequencePrediction(organism='Escherichia co...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>MDALQIAEDTLQTLVPHCPVPSGPRRIFLDANVKESYCPLVPHTMY...</td>\n",
" <td>Homo sapiens</td>\n",
" <td>DNASequencePrediction(organism='Homo sapiens',...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>MAFANFRRILRLSTFEKRKSREYEHVRRDLDPNEVWEIVGELGDGA...</td>\n",
" <td>Homo sapiens</td>\n",
" <td>DNASequencePrediction(organism='Homo sapiens',...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>MTEKDAGGFNMSTFMNRKFQEPIQQIKTFSWMGFSWTCRKRRKHYQ...</td>\n",
" <td>Arabidopsis thaliana</td>\n",
" <td>DNASequencePrediction(organism='Arabidopsis th...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" protein_sequence \\\n",
"0 MSEKYIVTWDMLQIHARKLASRLMPSEQWKGIIAVSRGGLVPGALL... \n",
"1 MKNIIRTPETHPLTWRLRDDKQPVWLDEYRSKNGYEGARKALTGLS... \n",
"2 MDALQIAEDTLQTLVPHCPVPSGPRRIFLDANVKESYCPLVPHTMY... \n",
"3 MAFANFRRILRLSTFEKRKSREYEHVRRDLDPNEVWEIVGELGDGA... \n",
"4 MTEKDAGGFNMSTFMNRKFQEPIQQIKTFSWMGFSWTCRKRRKHYQ... \n",
"\n",
" organism predicted_dna \n",
"0 Escherichia coli general DNASequencePrediction(organism='Escherichia co... \n",
"1 Escherichia coli general DNASequencePrediction(organism='Escherichia co... \n",
"2 Homo sapiens DNASequencePrediction(organism='Homo sapiens',... \n",
"3 Homo sapiens DNASequencePrediction(organism='Homo sapiens',... \n",
"4 Arabidopsis thaliana DNASequencePrediction(organism='Arabidopsis th... "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"for index, data in tqdm(\n",
" dataset.iterrows(),\n",
@@ -409,14 +123,14 @@
" total=dataset.shape[0],\n",
"):\n",
"\n",
" predicted_dna = predict_dna_sequence(\n",
" outputs = predict_dna_sequence(\n",
" protein=data[\"protein_sequence\"],\n",
" organism=data[\"organism\"],\n",
" device=DEVICE,\n",
" tokenizer_object=tokenizer,\n",
" model_object=model,\n",
" )\n",
" dataset.loc[index, \"predicted_dna\"] = predicted_dna\n",
" dataset.loc[index, \"predicted_dna\"] = output.predicted_dna\n",
"\n",
"dataset.to_csv(output_path)\n",
"dataset.head()"

0 comments on commit 45a23a5

Please sign in to comment.