Skip to content

Commit

Permalink
import score matrices correctly in qual adj aligner
Browse files Browse the repository at this point in the history
jeizenga committed Apr 8, 2020
1 parent dee75e0 commit d0988f6
Showing 6 changed files with 152 additions and 158 deletions.
162 changes: 106 additions & 56 deletions src/aligner.cpp
Original file line number Diff line number Diff line change
@@ -124,22 +124,49 @@ unordered_set<vg::id_t> GSSWAligner::identify_pinning_points(const HandleGraph&
return return_val;
}

void GSSWAligner::load_scoring_matrix(istream& matrix_stream) {
if(score_matrix) free(score_matrix);
score_matrix = (int8_t*)calloc(25, sizeof(int8_t));
for(size_t i=0; i<25; i++){
if(!matrix_stream.good()){
std::cerr << "error: vg GSSWAligner::load_scoring_matrix requires a 5x5 whitespace separated integer matrix\n";
throw "";
}
int score;
matrix_stream >> score;
if(score > 127 || score < -127){
std::cerr << "error: vg GSSWAligner::load_scoring_matrix requires values in the range [-127,127]\n";
throw "";
int8_t* GSSWAligner::load_4x4_matrix(istream& matrix_stream) {

int8_t* matrix = (int8_t*) malloc(16 * sizeof(int8_t));
for (size_t i = 0; i < 16; i++) {
if (!matrix_stream.good()) {
std::cerr << "error: vg GSSWAligner::load_scoring_matrix requires a 5x5 whitespace separated integer matrix\n";
throw "";
}
score_matrix[i] = score;
int score;
matrix_stream >> score;
if (score > 127 || score < -127) {
std::cerr << "error: vg GSSWAligner::load_scoring_matrix requires values in the range [-127,127]\n";
throw "";
}
matrix[i] = score;
}
return matrix;
}

void Aligner::load_scoring_matrix(istream& matrix_stream) {

// load the 16 values
int8_t* matrix_4x4 = load_4x4_matrix(matrix_stream);

if (score_matrix) {
// get rid of the current internal matrix
free(score_matrix);
}
score_matrix = (int8_t*) malloc(sizeof(int8_t) * 25);

for (size_t i = 0, j = 0; i < 25; ++i) {
if (i % 5 == 4 || i / 5 == 4) {
// adding 0s in the 5th row and column for N-matches
score_matrix[i] = 0;
}
else {
// setting the substitution scores
score_matrix[i] = matrix_4x4[j];
++j;
}
}

free(matrix_4x4);
}

void GSSWAligner::gssw_mapping_to_alignment(gssw_graph* graph,
@@ -352,7 +379,7 @@ string GSSWAligner::graph_cigar(gssw_graph_mapping* gm) const {
return s.str();
}

void GSSWAligner::init_mapping_quality(double gc_content) {
void GSSWAligner::init_mapping_quality() {
log_base = gssw_dna_recover_log_base(match, mismatch, gc_content, 1e-12);
}

@@ -781,8 +808,7 @@ double GSSWAligner::estimate_max_possible_mapping_quality(int length, double min
}

double GSSWAligner::score_to_unnormalized_likelihood_ln(double score) const {
// Log base needs to be set, or this can't work. It's set by default in
// QualAdjAligner but needs to be set up manually in the normal Aligner.
// Log base needs to be set, or this can't work.
assert(log_base != 0);
// Likelihood is proportional to e^(lambda * score), so ln is just the exponent.
return log_base * score;
@@ -909,8 +935,8 @@ Aligner::Aligner(int8_t _match,
int8_t _gap_open,
int8_t _gap_extension,
int8_t _full_length_bonus,
double gc_content)
: Aligner(_match, _mismatch, _gap_open, _gap_extension, _full_length_bonus, gc_content, default_xdrop_max_gap_length)
double _gc_content)
: Aligner(_match, _mismatch, _gap_open, _gap_extension, _full_length_bonus, _gc_content, default_xdrop_max_gap_length)
{
}

@@ -919,7 +945,7 @@ Aligner::Aligner(int8_t _match,
int8_t _gap_open,
int8_t _gap_extension,
int8_t _full_length_bonus,
double gc_content,
double _gc_content,
uint32_t _xdrop_max_gap_length)
: xdrop(_match, _mismatch, _gap_open, _gap_extension, _full_length_bonus, _xdrop_max_gap_length)
{
@@ -928,20 +954,13 @@ Aligner::Aligner(int8_t _match,
gap_open = _gap_open;
gap_extension = _gap_extension;
full_length_bonus = _full_length_bonus;
gc_content = _gc_content;
// these are used when setting up the nodes
nt_table = gssw_create_nt_table();
score_matrix = gssw_create_score_matrix(match, mismatch);
GSSWAligner::init_mapping_quality(gc_content);
// bench_init(bench);
init_mapping_quality();
}

/*
Aligner::~Aligner()
{
// fprintf(stderr, "gssw: time(%lu), count(%lu)\n", bench_get(bench) / 1000, bench_get_count(bench));
}
*/

void Aligner::align_internal(Alignment& alignment, vector<Alignment>* multi_alignments, const HandleGraph& g,
const vector<handle_t>* topological_order, bool pinned, bool pin_left,
int32_t max_alt_alns, bool traceback_aln, bool print_score_matrices) const {
@@ -1432,32 +1451,65 @@ QualAdjAligner::QualAdjAligner(int8_t _match,
int8_t _gap_open,
int8_t _gap_extension,
int8_t _full_length_bonus,
int8_t _max_scaled_score,
uint8_t _max_qual_score,
double gc_content)
double _gc_content)
{

max_qual_score = _max_qual_score;
match = _match;
mismatch = _mismatch;
gap_open = _gap_open;
gap_extension = _gap_extension;
full_length_bonus = _full_length_bonus;
gc_content = _gc_content;

int8_t original_gap_open = gap_open;

// find max score and set it to be the max so that scaling doesn't change score
// TODO: hacky
int8_t max_score = max(match, max(mismatch, max(gap_open, gap_extension)));

nt_table = gssw_create_nt_table();
score_matrix = gssw_dna_scaled_adjusted_qual_matrix(_max_scaled_score, max_qual_score, &gap_open,
score_matrix = gssw_dna_scaled_adjusted_qual_matrix(max_score, 255, &gap_open,
&gap_extension, match, mismatch,
gc_content, 1e-12);
init_mapping_quality();
}

void QualAdjAligner::load_scoring_matrix(istream& matrix_stream) {

if (score_matrix) {
// get rid of the current internal matrix
free(score_matrix);
}

// load the 16 values
int8_t* matrix_4x4 = load_4x4_matrix(matrix_stream);

// TODO: this interface could really be improved in GSSW, oh well though

// convert gc content into base-wise frequencies
double* nt_freqs = (double*) malloc(sizeof(double) * 4);
nt_freqs[0] = 0.5 * (1 - gc_content);
nt_freqs[1] = 0.5 * gc_content;
nt_freqs[2] = 0.5 * gc_content;
nt_freqs[3] = 0.5 * (1 - gc_content);

// hacky way to update the stored match and mismatch scores, but it works
scale_factor = gap_open / original_gap_open;
match *= scale_factor;
mismatch *= scale_factor;
full_length_bonus *= scale_factor;
// find max score and set it to be the max so that scaling doesn't change score
// TODO: hacky
int8_t max_score = max(gap_open, gap_extension);
for (size_t i = 0; i < 16; ++i) {
max_score = max<int8_t>(max_score, abs(matrix_4x4[i]));
}

// find the quality-adjusted, scaled scores
int8_t* scaled_matrix_4x4 = gssw_scaled_adjusted_qual_matrix(max_score, 255, &gap_open, &gap_extension,
matrix_4x4, nt_freqs, 4, 1e-10);
// finally, add in the 0s to the 5-th row and column
score_matrix = gssw_add_ambiguous_char_to_adjusted_matrix(scaled_matrix_4x4, 255, 4);

// also update the mapping quality log base
log_base = gssw_recover_log_base(matrix_4x4, nt_freqs, 4, 1e-10);

GSSWAligner::init_mapping_quality(gc_content);
// free the temporary arrays we allocated
free(matrix_4x4);
free(nt_freqs);
free(scaled_matrix_4x4);
}

void QualAdjAligner::align_internal(Alignment& alignment, vector<Alignment>* multi_alignments, const HandleGraph& g,
@@ -1581,10 +1633,10 @@ void QualAdjAligner::align_internal(Alignment& alignment, vector<Alignment>* mul
}

// did we both 1) do DP (i.e. the graph is non-empty), and 2) find a traceback with positive score?
if (gms ? gms[0]->score > 0 : false) {
if (gms && gms[0]->score > 0) {

if (pin_left) {
// translate graph and mappings into original node space// translate nodes and mappings into original sequence
// translate graph and mappings into original node space
unreverse_graph(graph);
for (int32_t i = 0; i < max_alt_alns; i++) {
unreverse_graph_mapping(gms[i]);
@@ -1909,25 +1961,23 @@ const Aligner* AlignerClient::get_regular_aligner() const {

void AlignerClient::set_alignment_scores(int8_t match, int8_t mismatch, int8_t gap_open, int8_t gap_extend,
int8_t full_length_bonus, uint32_t xdrop_max_gap_length) {

// hacky, find max score so that scaling doesn't change score
int8_t max_score = match;
if (mismatch > max_score) max_score = mismatch;
if (gap_open > max_score) max_score = gap_open;
if (gap_extend > max_score) max_score = gap_extend;


qual_adj_aligner = unique_ptr<QualAdjAligner>(new QualAdjAligner(match, mismatch, gap_open, gap_extend,
full_length_bonus, max_score, 255, gc_content_estimate));
full_length_bonus, gc_content_estimate));
regular_aligner = unique_ptr<Aligner>(new Aligner(match, mismatch, gap_open, gap_extend,
full_length_bonus, gc_content_estimate, xdrop_max_gap_length));

}

void AlignerClient::load_scoring_matrix(std::ifstream& matrix_stream){
void AlignerClient::load_scoring_matrix(std::istream& matrix_stream){
matrix_stream.clear();
matrix_stream.seekg(0);
if(regular_aligner) regular_aligner->load_scoring_matrix(matrix_stream);
if (regular_aligner) {
regular_aligner->load_scoring_matrix(matrix_stream);
}
matrix_stream.clear();
matrix_stream.seekg(0);
if(qual_adj_aligner) qual_adj_aligner->load_scoring_matrix(matrix_stream);
if (qual_adj_aligner) {
qual_adj_aligner->load_scoring_matrix(matrix_stream);
}
}
44 changes: 26 additions & 18 deletions src/aligner.hpp
Original file line number Diff line number Diff line change
@@ -32,8 +32,6 @@ namespace vg {
static const int8_t default_gap_open = 6;
static const int8_t default_gap_extension = 1;
static const int8_t default_full_length_bonus = 5;
static const int8_t default_max_scaled_score = 32;
static const uint8_t default_max_qual_score = 255;
static const double default_gc_content = 0.5;
static const uint32_t default_xdrop_max_gap_length = 40;

@@ -59,9 +57,19 @@ namespace vg {
* The basic GSSW-based core aligner implementation, which can then be quality-adjusted or not.
*/
class GSSWAligner : public BaseAligner {


/// Reads a 4x4 substitution scoring matrix from an input stream (can be an ifstream).
/// Expecting 4 whitespace-separated 8-bit integers per line
virtual void load_scoring_matrix(std::istream& matrix_stream) = 0;

protected:

GSSWAligner() = default;
~GSSWAligner();
virtual ~GSSWAligner();

// allocates an array to hold a 4x4 substitution matrix and returns it
int8_t* load_4x4_matrix(std::istream& matrix_stream);

// for construction
// needed when constructing an alignable graph from the nodes
@@ -107,8 +115,8 @@ namespace vg {
const vector<double>* multiplicities = nullptr) const;
double estimate_next_best_score(int length, double min_diffs) const;

// must be called before querying mapping_quality
void init_mapping_quality(double gc_content);
// for calling in constructors so that mapping quality can be queried
void init_mapping_quality();

// TODO: this algorithm has numerical problems, just removing it for now
//vector<double> all_mapping_qualities_exact(vector<double> scaled_scores);
@@ -265,10 +273,6 @@ namespace vg {
virtual int32_t score_ungapped_alignment(const Alignment& aln,
bool strip_bonuses = false) const;

/// Reads a 5x5 substitution scoring matrix from an input stream (can be an ifstream)
/// expecting 5 whitespace-separated 8-bit integers per line
virtual void load_scoring_matrix(std::istream& matrix_stream);

/// Without necessarily rescoring the entire alignment, return the score
/// of the given alignment with bonuses removed. Assumes that bonuses
/// are actually included in the score.
@@ -287,6 +291,8 @@ namespace vg {
// log of the base of the logarithm underlying the log-odds interpretation of the scores
double log_base = 0.0;

// estimate of GC content
double gc_content = default_gc_content;
};

/**
@@ -322,6 +328,10 @@ namespace vg {
uint32_t _xdrop_max_gap_length);
~Aligner(void) = default;

/// Reads a 4x4 substitution scoring matrix from an input stream (can be an ifstream).
/// Expecting 4 whitespace-separated 8-bit integers per line
void load_scoring_matrix(istream& matrix_stream);

/// Store optimal local alignment against a graph in the Alignment object.
/// Gives the full length bonus separately on each end of the alignment.
/// Assumes that graph is topologically sorted by node index.
@@ -400,12 +410,14 @@ namespace vg {
int8_t _gap_open = default_gap_open,
int8_t _gap_extension = default_gap_extension,
int8_t _full_length_bonus = default_full_length_bonus,
int8_t _max_scaled_score = default_max_scaled_score,
uint8_t _max_qual_score = default_max_qual_score,
double gc_content = default_gc_content);
double _gc_content = default_gc_content);

~QualAdjAligner(void) = default;

/// Reads a 4x4 substitution scoring matrix from an input stream (can be an ifstream).
/// Expecting 4 whitespace-separated 8-bit integers per line
void load_scoring_matrix(istream& matrix_stream);

// base quality adjusted counterparts to functions of same name from Aligner
void align(Alignment& alignment, const HandleGraph& g, bool traceback_aln, bool print_score_matrices) const;
void align(Alignment& alignment, const HandleGraph& g, const vector<handle_t>& topological_order,
@@ -431,8 +443,6 @@ namespace vg {
void align_xdrop(Alignment& alignment, const HandleGraph& g, const vector<MaximalExactMatch>& mems, bool reverse_complemented) const;
void align_xdrop_multi(Alignment& alignment, const HandleGraph& g, const vector<MaximalExactMatch>& mems, bool reverse_complemented, int32_t max_alt_alns) const;
const XdropAligner& get_xdrop() const;

void init_mapping_quality(double gc_content);

int32_t score_exact_match(const Alignment& aln, size_t read_offset, size_t length) const;
int32_t score_exact_match(const string& sequence, const string& base_quality) const;
@@ -442,8 +452,6 @@ namespace vg {
int32_t score_partial_alignment(const Alignment& alignment, const HandleGraph& graph, const Path& path,
string::const_iterator seq_begin) const;

uint8_t max_qual_score;
int8_t scale_factor;

private:

@@ -485,8 +493,8 @@ namespace vg {
void set_alignment_scores(int8_t match, int8_t mismatch, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus,
uint32_t xdrop_max_gap_length = default_xdrop_max_gap_length);

/// Load a scoring amtrix from a file to set scores
void load_scoring_matrix(std::ifstream& matrix_stream);
/// Load a scoring matrix from a file to set scores
void load_scoring_matrix(std::istream& matrix_stream);

bool adjust_alignments_for_base_quality = false; // use base quality adjusted alignments

2 changes: 1 addition & 1 deletion src/subcommand/map_main.cpp
Original file line number Diff line number Diff line change
@@ -59,7 +59,7 @@ void help_map(char** argv) {
<< "scoring:" << endl
<< " -q, --match INT use this match score [1]" << endl
<< " -z, --mismatch INT use this mismatch penalty [4]" << endl
<< " --score-matrix FILE read a 5x5 integer substitution scoring matrix from a file" << endl
<< " --score-matrix FILE read a 4x4 integer substitution scoring matrix from a file" << endl
<< " -o, --gap-open INT use this gap open penalty [6]" << endl
<< " -y, --gap-extend INT use this gap extension penalty [1]" << endl
<< " -L, --full-l-bonus INT the full-length alignment bonus [5]" << endl
2 changes: 1 addition & 1 deletion src/subcommand/mpmap_main.cpp
Original file line number Diff line number Diff line change
@@ -95,7 +95,7 @@ void help_mpmap(char** argv) {
<< " -o, --gap-open INT use this gap open penalty [6]" << endl
<< " -y, --gap-extend INT use this gap extension penalty [1]" << endl
<< " -L, --full-l-bonus INT add this score to alignments that align each end of the read [5]" << endl
<< " -w, --score-matrix FILE read a 5x5 integer substitution scoring matrix from a file (in the order ACGTN)" << endl
<< " -w, --score-matrix FILE read a 4x4 integer substitution scoring matrix from a file (in the order ACGT)" << endl
<< " -m, --remove-bonuses remove full length alignment bonuses in reported scores" << endl
<< "computational parameters:" << endl
<< " -Z, --buffer-size INT buffer this many alignments together (per compute thread) before outputting to stdout [200]" << endl;
Loading
Oops, something went wrong.

0 comments on commit d0988f6

Please sign in to comment.