Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add beam #47

Merged
merged 14 commits into from
Feb 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified archives/prospr_core.tar.gz
Binary file not shown.
Binary file modified archives/prospr_core.zip
Binary file not shown.
Binary file modified archives/prospr_data.tar.gz
Binary file not shown.
Binary file modified archives/prospr_data.zip
Binary file not shown.
9 changes: 8 additions & 1 deletion prospr/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
from prospr_core import AminoAcid, Protein, depth_first, depth_first_bnb
from prospr_core import (
AminoAcid,
Protein,
depth_first,
depth_first_bnb,
beam_search,
)
from .datasets import load_vanEck250, load_vanEck1000, load_vanEck_hratio
from .visualize import plot_protein

Expand All @@ -10,6 +16,7 @@
"Protein",
"depth_first",
"depth_first_bnb",
"beam_search",
"load_vanEck250",
"load_vanEck1000",
"load_vanEck_hratio",
Expand Down
6 changes: 6 additions & 0 deletions prospr/core/core_module.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ namespace py = pybind11;
#include "src/protein.cpp"
#include "src/depth_first.cpp"
#include "src/depth_first_bnb.cpp"
#include "src/beam_search.cpp"


PYBIND11_MODULE(prospr_core, m) {
Expand Down Expand Up @@ -87,4 +88,9 @@ PYBIND11_MODULE(prospr_core, m) {
m.def("depth_first_bnb", depth_first_bnb,
"Finds the optimal conformation via depth-first branch-and-bound search",
py::arg("protein"), py::arg("prune_func")="");

/* Beam search function definition. */
m.def("beam_search", beam_search,
"Finds the optimal conformation via beam search",
py::arg("protein"), py::arg("beam_width")=-1);
}
27 changes: 27 additions & 0 deletions prospr/core/src/amino_acid.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,17 @@
*/

#include "amino_acid.hpp"
#include <iostream>


/* Copy constructor. */
AminoAcid::AminoAcid(const AminoAcid &other) {
this->type = other.type;
this->index = other.index;
this->prev_move = other.prev_move;
this->next_move = other.next_move;
}

/* Construct a new AminoAcid. */
AminoAcid::AminoAcid(char type, int index, int prev_move, int next_move) {
this->type = type;
Expand All @@ -16,6 +25,16 @@ AminoAcid::AminoAcid(char type, int index, int prev_move, int next_move) {
this->next_move = next_move;
}

/* Overload assignment operator for copy-assignments. */
AminoAcid& AminoAcid::operator=(const AminoAcid& other) {
this->type = other.type;
this->index = other.index;
this->prev_move = other.prev_move;
this->next_move = other.next_move;

return *this;
}

/* Returns the AminoAcid's type. */
char AminoAcid::get_type() {
return type;
Expand Down Expand Up @@ -45,3 +64,11 @@ void AminoAcid::set_prev_move(int move) {
void AminoAcid::set_next_move(int move) {
next_move = move;
}

/* Overload << operator for printing AminoAcids. */
std::ostream &operator<<(std::ostream &os, AminoAcid& amino_acid) {
std::cout << "<AminoAcid '" << amino_acid.get_type() << "' idx=" \
<< amino_acid.get_index() << " moves=[" << amino_acid.get_prev_move() \
<< "," << amino_acid.get_next_move() << "]>" << std::flush;
return os;
}
11 changes: 11 additions & 0 deletions prospr/core/src/amino_acid.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,20 @@
#ifndef AMINO_ACID_H
#define AMINO_ACID_H

#include <string>


class AminoAcid {
public:
/* Copy constructor. */
AminoAcid(const AminoAcid &other);

/* Construct a new AminoAcid. */
AminoAcid(char type, int index, int prev_move=0, int next_move=0);

/* Overload assignment operator for copy-assignments. */
AminoAcid& operator=(const AminoAcid &other);

/* Returns the AminoAcid's type. */
char get_type();

Expand All @@ -39,4 +47,7 @@ class AminoAcid {
int next_move;
};

/* Overload << operator for printing AminoAcids. */
std::ostream &operator<<(std::ostream &os, AminoAcid& amino_acid);

#endif
189 changes: 189 additions & 0 deletions prospr/core/src/beam_search.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
/* File: beam_search.h
* Description: Source file for a beam search function.
* License: This file is licensed under the GNU LGPL V3 license by
* Okke van Eck (2020 - 2023). See the LICENSE file for the
* specifics.
*/

#include "beam_search.hpp"
#include <functional>
#include <iostream>
#include <queue>
#include <vector>
#include <math.h>
#include <numeric>
#include <cstring>


/* Type for storing info on bondable amino acids. */
struct BondInfo {
int max_length;
int no_neighbors;
std::vector<int> max_weights;
size_t num_idxs;
std::vector<int> h_idxs;
std::vector<std::vector<int>> bond_dists;
};

/* Type for ordering proteins in a priority queue. */
struct PrioProtein {
Protein protein;
int score;
};

/* Overload > operator for Conformation in the priority queue.
* Bigger indicates more potential for leading towards the minimum energy
* conformation.
*/
bool operator>(const PrioProtein& lhs,
const PrioProtein& rhs) {
return lhs.score > rhs.score;
}

/* Overload << operator for printing PrioProteins. */
std::ostream &operator<<(std::ostream &os, PrioProtein& prot) {
std::cout << "<" << prot.score << ", [";

for (int i: prot.protein.hash_fold()) {
std::cout << i << " ";
}

std::cout << "]>";
return os;
}

/* Compute how amino acids can possibly form bonds. */
BondInfo _comp_bondable_aminos(Protein* protein) {
/* Fetch protein specific information. */
int max_length = protein->get_sequence().length();
int no_neighbors = (int)pow(2, (protein->get_dim() - 1));
std::vector<int> max_weights = protein->get_max_weights();
std::vector<int> h_idxs = {};
std::vector<std::vector<int>> bond_dists = {};

/* Create vector with distances between aminos that can create bonds. */
std::vector<int> cur_dists = {};

for (int i = 0; i < max_length; i++) {
/* Only include indexes that can create bonds. */
if (max_weights[i] != 0) {
/* Create vector with distances to previous bondable aminos. */
for (int idx : h_idxs) {
if (i - idx >= 3 && (i - idx) % 2 == 1) {
cur_dists.push_back(i - idx);
}
}

/* Add distance vector to set and clear for next iterations. */
bond_dists.push_back(cur_dists);
cur_dists.clear();

/* Add current index for next iterations. */
h_idxs.push_back(i);
}
}

/* Store info on possible bonds in BondInfo struct and return. */
BondInfo binfo = {max_length, no_neighbors, max_weights, h_idxs.size(),
h_idxs, bond_dists};
return binfo;
}

/* Compute heuristic score for protein to use in priority queue. */
int comp_score(Protein* protein, BondInfo* binfo) {
/* Compute to be placed aminos possibly making bonds. */
int future_aminos = 0;
for (auto h_idx : binfo->h_idxs) {
if (h_idx >= protein->get_cur_len()) {
future_aminos++;
}
}

/* Compute branch score with the to be placed amino acids. */
int branch_score = 0;
for (size_t i = binfo->num_idxs - future_aminos; i < binfo->num_idxs; i++) {
/* Check if bondable amino is last of protein. */
if (binfo->h_idxs[i] == binfo->max_length - 1) {
/* The last amino being bondable can create an additional bond. */
branch_score += binfo->max_weights[binfo->h_idxs[i]] *
std::min((size_t)binfo->no_neighbors + 1,
binfo->bond_dists[i].size());
} else {
branch_score += binfo->max_weights[binfo->h_idxs[i]] *
std::min((size_t)binfo->no_neighbors,
binfo->bond_dists[i].size());
}
}

return protein->get_score() + branch_score;
}

/* A beam search function for finding a minimum energy conformation. */
Protein* beam_search(Protein* protein, int beam_width) {
int max_length = protein->get_sequence().length();
int dim = protein->get_dim();

/* The first two amino acids are fixed to prevent y-axis symmetry. */
if (max_length > 1)
protein->place_amino(-1);
if (max_length <= 2)
return protein;

/* Create vector for current proteins, and a priority queue to filter. */
std::vector<PrioProtein> beam = {};
std::priority_queue<PrioProtein,
std::vector<PrioProtein>,
std::greater<PrioProtein>> cur_proteins = {};

/* Create vector with all moves. */
std::vector<int> all_moves(dim * 2 + 1);
std::iota(all_moves.begin(), all_moves.end(), -dim);
all_moves.erase(all_moves.begin() + dim);

/* Compute future bondable connections for heuristic scoring. */
BondInfo binfo = _comp_bondable_aminos(protein);

/* Loop over proteins in beam until proteins are fully folded. */
PrioProtein cur_prioprot = {*protein, comp_score(protein, &binfo)};
beam.push_back(cur_prioprot);
Protein cur_protein, cur_expansion;
int num_elements;

while (beam[0].protein.get_cur_len() != max_length) {
/* Expand all proteins in the beam and add to priority queue. */
for (PrioProtein prio_prot : beam) {
cur_protein = prio_prot.protein;

for (int m : all_moves) {
if (cur_protein.is_valid(m)) {
cur_expansion = cur_protein;
cur_expansion.place_amino(m);
cur_prioprot = {cur_expansion,
comp_score(&cur_expansion, &binfo)};
cur_proteins.push(cur_prioprot);
}
}
}

/* Interpret beam_width of -1 as all elements. */
if (beam_width == -1) {
num_elements = cur_proteins.size();
} else {
num_elements = std::min(cur_proteins.size(), (size_t)beam_width);
}

/* Update beam with highest ranked proteins and clear priority queue. */
beam.clear();
for (int i = 0; i < num_elements; i++) {
beam.push_back(cur_proteins.top());
cur_proteins.pop();
}
cur_proteins = std::priority_queue<PrioProtein,
std::vector<PrioProtein>,
std::greater<PrioProtein>>();
}

/* First protein in priority queue will have highest score. */
*protein = beam[0].protein;
return protein;
}
17 changes: 17 additions & 0 deletions prospr/core/src/beam_search.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
/* File: beam_search.h
* Description: Header file for a beam search function.
* License: This file is licensed under the GNU LGPL V3 license by
* Okke van Eck (2020 - 2023). See the LICENSE file for the
* specifics.
*/

#ifndef BEAM_SEARCH_H
#define BEAM_SEARCH_H

#include "protein.hpp"


/* A beam search function for finding a minimum energy conformation. */
Protein* beam_search(Protein* protein, int beam_width=-1);

#endif
14 changes: 5 additions & 9 deletions prospr/core/src/depth_first_bnb.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,23 +68,19 @@ bool reach_prune(Protein* protein, int move, int best_score,
}
}

/* Check score improvement if branch cannot add any bonds. */
if (future_aminos == 0) {
protein->remove_amino();
return cur_score >= best_score;
}

/* Compute branch score with the to be placed amino acids. */
int branch_score = 0;
for (int i = p_vars->num_idxs - future_aminos; i < p_vars->num_idxs; i++) {
/* Check if bondable amino is last of protein. */
if (p_vars->h_idxs[i] == p_vars->max_length - 1) {
/* The last amino being bondable can create an additional bond. */
branch_score += p_vars->max_weights[p_vars->h_idxs[i]] *
std::min((size_t)p_vars->no_neighbors + 1, p_vars->bond_dists[i].size());
std::min((size_t)p_vars->no_neighbors + 1,
p_vars->bond_dists[i].size());
} else {
branch_score += p_vars->max_weights[p_vars->h_idxs[i]] *
std::min((size_t)p_vars->no_neighbors, p_vars->bond_dists[i].size());
std::min((size_t)p_vars->no_neighbors,
p_vars->bond_dists[i].size());
}
}

Expand Down Expand Up @@ -130,7 +126,7 @@ Protein* depth_first_bnb(Protein* protein, std::string prune_func) {
}
}

/* Add distance vector to set and clear for next iterations. */
/* Add distance vector to set and clear for next iterations. */
p_vars.bond_dists.push_back(cur_dists);
cur_dists.clear();

Expand Down
Loading