Skip to content

Commit

Permalink
debug fanout walking, adjust cluster scores for fanotus
Browse files Browse the repository at this point in the history
  • Loading branch information
jeizenga committed Jul 7, 2020
1 parent 271456f commit 709f0ad
Show file tree
Hide file tree
Showing 7 changed files with 93 additions and 35 deletions.
17 changes: 17 additions & 0 deletions src/aligner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1430,6 +1430,11 @@ int32_t Aligner::score_exact_match(string::const_iterator seq_begin, string::con
return score_exact_match(seq_begin, seq_end);
}

int32_t Aligner::score_mismatch(string::const_iterator seq_begin, string::const_iterator seq_end,
string::const_iterator base_qual_begin) const {
return -mismatch * (seq_end - seq_begin);
}

int32_t Aligner::score_mismatch(size_t length) const {
return -match * length;
}
Expand Down Expand Up @@ -2019,6 +2024,18 @@ int32_t QualAdjAligner::score_exact_match(string::const_iterator seq_begin, stri
return score;
}

int32_t QualAdjAligner::score_mismatch(string::const_iterator seq_begin, string::const_iterator seq_end,
string::const_iterator base_qual_begin) const {
int32_t score = 0;
for (auto seq_iter = seq_begin, qual_iter = base_qual_begin; seq_iter != seq_end; seq_iter++) {
// index 5 x 5 score matrices (ACGTN)
// always have match so that row and column index are same and can combine algebraically
score += score_matrix[25 * (*qual_iter) + 1];
qual_iter++;
}
return score;
}

int32_t QualAdjAligner::score_partial_alignment(const Alignment& alignment, const HandleGraph& graph, const Path& path,
string::const_iterator seq_begin) const {

Expand Down
9 changes: 7 additions & 2 deletions src/aligner.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -167,9 +167,10 @@ namespace vg {
/// Qualities may be ignored by some implementations.
virtual int32_t score_exact_match(string::const_iterator seq_begin, string::const_iterator seq_end,
string::const_iterator base_qual_begin) const = 0;

// TODO: Provide a generic way to score a mismatch.

virtual int32_t score_mismatch(string::const_iterator seq_begin, string::const_iterator seq_end,
string::const_iterator base_qual_begin) const = 0;

/// Compute the score of a path against the given range of subsequence with the given qualities.
virtual int32_t score_partial_alignment(const Alignment& alignment, const HandleGraph& graph, const Path& path,
string::const_iterator seq_begin) const = 0;
Expand Down Expand Up @@ -342,6 +343,8 @@ namespace vg {
string::const_iterator base_qual_begin) const;
int32_t score_exact_match(const string& sequence) const;
int32_t score_exact_match(string::const_iterator seq_begin, string::const_iterator seq_end) const;
int32_t score_mismatch(string::const_iterator seq_begin, string::const_iterator seq_end,
string::const_iterator base_qual_begin) const;

/// Score a mismatch given just the length. Only possible since we ignore qualities.
int32_t score_mismatch(size_t length) const;
Expand Down Expand Up @@ -396,6 +399,8 @@ namespace vg {
int32_t score_exact_match(const string& sequence, const string& base_quality) const;
int32_t score_exact_match(string::const_iterator seq_begin, string::const_iterator seq_end,
string::const_iterator base_qual_begin) const;
int32_t score_mismatch(string::const_iterator seq_begin, string::const_iterator seq_end,
string::const_iterator base_qual_begin) const;

int32_t score_partial_alignment(const Alignment& alignment, const HandleGraph& graph, const Path& path,
string::const_iterator seq_begin) const;
Expand Down
59 changes: 40 additions & 19 deletions src/cluster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -470,8 +470,11 @@ void MEMClusterer::deduplicate_cluster_pairs(vector<pair<pair<size_t, size_t>, i
#endif
}

MEMClusterer::HitGraph::HitGraph(const vector<MaximalExactMatch>& mems, const Alignment& alignment, const GSSWAligner* aligner,
size_t min_mem_length, bool track_components) : track_components(track_components), components(0, false) {
MEMClusterer::HitGraph::HitGraph(const vector<MaximalExactMatch>& mems, const Alignment& alignment,
const GSSWAligner* aligner, size_t min_mem_length, bool track_components,
const match_fanouts_t* fanouts) :
track_components(track_components), components(0, false)
{
// there generally will be at least as many nodes as MEMs, so we can speed up the reallocation
nodes.reserve(mems.size());

Expand All @@ -492,7 +495,16 @@ MEMClusterer::HitGraph::HitGraph(const vector<MaximalExactMatch>& mems, const Al

int32_t mem_score = aligner->score_exact_match(mem.begin, mem.end,
alignment.quality().begin() + (mem.begin - alignment.sequence().begin()));


// adjust the score downward for fan-out mismatches
if (fanouts && fanouts->count(&mem)) {
for (const auto& fanout : fanouts->at(&mem)) {
mem_score += (aligner->score_mismatch(fanout.first, fanout.first + 1,
alignment.quality().begin() + (fanout.first - alignment.sequence().begin()))
- aligner->score_exact_match(fanout.first, fanout.first + 1,
alignment.quality().begin() + (fanout.first - alignment.sequence().begin())));
}
}

#ifdef debug_mem_clusterer
cerr << "adding nodes for MEM " << mem << endl;
Expand Down Expand Up @@ -1029,7 +1041,8 @@ vector<MEMClusterer::cluster_t> MEMClusterer::HitGraph::clusters(const Alignment
int32_t max_qual_score,
int32_t log_likelihood_approx_factor,
size_t min_median_mem_coverage_for_split,
double suboptimal_edge_pruning_factor) {
double suboptimal_edge_pruning_factor,
const match_fanouts_t* fanouts) {

vector<cluster_t> to_return;
if (nodes.size() == 0) {
Expand Down Expand Up @@ -1208,18 +1221,21 @@ vector<MEMClusterer::cluster_t> MEMClusterer::clusters(const Alignment& alignmen
int32_t max_qual_score,
int32_t log_likelihood_approx_factor,
size_t min_median_mem_coverage_for_split,
double suboptimal_edge_pruning_factor) {
double suboptimal_edge_pruning_factor,
const match_fanouts_t* fanouts) {

HitGraph hit_graph = make_hit_graph(alignment, mems, aligner, min_mem_length);
HitGraph hit_graph = make_hit_graph(alignment, mems, aligner, min_mem_length, fanouts);
return hit_graph.clusters(alignment, aligner, max_qual_score, log_likelihood_approx_factor,
min_median_mem_coverage_for_split, suboptimal_edge_pruning_factor);
min_median_mem_coverage_for_split, suboptimal_edge_pruning_factor,
fanouts);

}

MEMClusterer::HitGraph NullClusterer::make_hit_graph(const Alignment& alignment, const vector<MaximalExactMatch>& mems,
const GSSWAligner* aligner, size_t min_mem_length) {
const GSSWAligner* aligner, size_t min_mem_length,
const match_fanouts_t* fanouts) {
// intialize the hit nodes, but do not add any edges, and ignore the min mem length
return HitGraph(mems, alignment, aligner, 1);
return HitGraph(mems, alignment, aligner, 1, fanouts);
}

vector<pair<pair<size_t, size_t>, int64_t>> NullClusterer::pair_clusters(const Alignment& alignment_1,
Expand Down Expand Up @@ -1650,9 +1666,10 @@ vector<pair<size_t, size_t>> SnarlOrientedDistanceMeasurer::exclude_merges(vecto


MEMClusterer::HitGraph OrientedDistanceClusterer::make_hit_graph(const Alignment& alignment, const vector<MaximalExactMatch>& mems,
const GSSWAligner* aligner, size_t min_mem_length) {
const GSSWAligner* aligner, size_t min_mem_length,
const match_fanouts_t* fanouts) {

HitGraph hit_graph(mems, alignment, aligner, min_mem_length);
HitGraph hit_graph(mems, alignment, aligner, min_mem_length, fanouts);

// Get all the distances between nodes, in a forrest of unrooted trees of
// nodes that we know are on a consistent strand.
Expand Down Expand Up @@ -3106,11 +3123,12 @@ TVSClusterer::TVSClusterer(const HandleGraph* handle_graph, MinimumDistanceIndex
}

MEMClusterer::HitGraph TVSClusterer::make_hit_graph(const Alignment& alignment, const vector<MaximalExactMatch>& mems,
const GSSWAligner* aligner, size_t min_mem_length) {
const GSSWAligner* aligner, size_t min_mem_length,
const match_fanouts_t* fanouts) {


// intialize with nodes
HitGraph hit_graph(mems, alignment, aligner, min_mem_length);
HitGraph hit_graph(mems, alignment, aligner, min_mem_length, fanouts);

// assumes that MEMs are given in lexicographic order by read interval
for (size_t i = 0; i < hit_graph.nodes.size(); i++) {
Expand Down Expand Up @@ -3353,10 +3371,11 @@ vector<pair<pair<size_t, size_t>, int64_t>> MinDistanceClusterer::pair_clusters(
MEMClusterer::HitGraph MinDistanceClusterer::make_hit_graph(const Alignment& alignment,
const vector<MaximalExactMatch>& mems,
const GSSWAligner* aligner,
size_t min_mem_length) {
size_t min_mem_length,
const match_fanouts_t* fanouts) {

// intialize with nodes
HitGraph hit_graph(mems, alignment, aligner, min_mem_length);
HitGraph hit_graph(mems, alignment, aligner, min_mem_length, fanouts);

// assumes that MEMs are given in lexicographic order by read interval
for (size_t i = 0, j_begin = 1; i < hit_graph.nodes.size(); ++i) {
Expand Down Expand Up @@ -3433,10 +3452,11 @@ GreedyMinDistanceClusterer::GreedyMinDistanceClusterer(MinimumDistanceIndex* dis
}

MEMClusterer::HitGraph GreedyMinDistanceClusterer::make_hit_graph(const Alignment& alignment, const vector<MaximalExactMatch>& mems,
const GSSWAligner* aligner, size_t min_mem_length) {
const GSSWAligner* aligner, size_t min_mem_length,
const match_fanouts_t* fanouts) {

// init the hit graph's nodes
HitGraph hit_graph(mems, alignment, aligner, min_mem_length);
HitGraph hit_graph(mems, alignment, aligner, min_mem_length, fanouts);

// we will initialize this with the next backward and forward comparisons for each hit node
vector<pair<int64_t, int64_t>> next_comparisons;
Expand Down Expand Up @@ -3585,10 +3605,11 @@ ComponentMinDistanceClusterer::ComponentMinDistanceClusterer(MinimumDistanceInde
}

MEMClusterer::HitGraph ComponentMinDistanceClusterer::make_hit_graph(const Alignment& alignment, const vector<MaximalExactMatch>& mems,
const GSSWAligner* aligner, size_t min_mem_length) {
const GSSWAligner* aligner, size_t min_mem_length,
const match_fanouts_t* fanouts) {

// init the hit graph's nodes
HitGraph hit_graph(mems, alignment, aligner, min_mem_length);
HitGraph hit_graph(mems, alignment, aligner, min_mem_length, fanouts);

// shim the hit graph nodes into the seed clusterer algorithm interface
vector<SnarlSeedClusterer::Seed> positions(hit_graph.nodes.size());
Expand Down
27 changes: 17 additions & 10 deletions src/cluster.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,10 @@ class MEMClusterer {
/// Each cluster is a vector of hits.
using cluster_t = vector<hit_t>;

/// Represents the mismatches that were allowed in "MEMs" from the fanout
/// match algorithm
using match_fanouts_t = unordered_map<const MaximalExactMatch*, deque<pair<string::const_iterator, char>>>;

/// Returns a vector of clusters. Each cluster is represented a vector of MEM hits. Each hit
/// contains a pointer to the original MEM and the position of that particular hit in the graph.
vector<cluster_t> clusters(const Alignment& alignment,
Expand All @@ -183,7 +187,8 @@ class MEMClusterer {
int32_t max_qual_score = 60,
int32_t log_likelihood_approx_factor = 0,
size_t min_median_mem_coverage_for_split = 0,
double suboptimal_edge_pruning_factor = .75);
double suboptimal_edge_pruning_factor = .75,
const match_fanouts_t* fanouts = nullptr);

/**
* Given two vectors of clusters and bounds on the distance between clusters,
Expand Down Expand Up @@ -217,7 +222,8 @@ class MEMClusterer {
/// Initializes a hit graph and adds edges to it, this must be implemented by any inheriting
/// class
virtual HitGraph make_hit_graph(const Alignment& alignment, const vector<MaximalExactMatch>& mems,
const GSSWAligner* aligner, size_t min_mem_length) = 0;
const GSSWAligner* aligner, size_t min_mem_length,
const match_fanouts_t* fanouts) = 0;

/// Once the distance between two hits has been estimated, estimate the score of the hit graph edge
/// connecting them
Expand All @@ -234,7 +240,7 @@ class MEMClusterer::HitGraph {

/// Initializes nodes in the hit graph, but does not add edges
HitGraph(const vector<MaximalExactMatch>& mems, const Alignment& alignment, const GSSWAligner* aligner,
size_t min_mem_length = 1, bool track_components = false);
size_t min_mem_length = 1, bool track_components = false, const match_fanouts_t* fanouts = nullptr);

/// Add an edge
void add_edge(size_t from, size_t to, int32_t weight, int64_t distance);
Expand All @@ -245,7 +251,8 @@ class MEMClusterer::HitGraph {
int32_t max_qual_score,
int32_t log_likelihood_approx_factor,
size_t min_median_mem_coverage_for_split,
double suboptimal_edge_pruning_factor);
double suboptimal_edge_pruning_factor,
const match_fanouts_t* fanouts);

vector<HitNode> nodes;

Expand Down Expand Up @@ -356,7 +363,7 @@ class NullClusterer : public MEMClusterer {
/// Concrete implementation of virtual method from MEMClusterer
/// Note: ignores the min_mem_length parameter
HitGraph make_hit_graph(const Alignment& alignment, const vector<MaximalExactMatch>& mems, const GSSWAligner* aligner,
size_t min_mem_length);
size_t min_mem_length, const match_fanouts_t* fanouts);
};


Expand Down Expand Up @@ -556,7 +563,7 @@ class OrientedDistanceClusterer : public MEMClusterer {

/// Concrete implementation of virtual method from MEMClusterer
HitGraph make_hit_graph(const Alignment& alignment, const vector<MaximalExactMatch>& mems, const GSSWAligner* aligner,
size_t min_mem_length);
size_t min_mem_length, const match_fanouts_t* fanouts);

OrientedDistanceMeasurer& distance_measurer;
size_t max_expected_dist_approx_error;
Expand Down Expand Up @@ -668,7 +675,7 @@ class TVSClusterer : public MEMClusterer {

/// Concrete implementation of virtual method from MEMClusterer
HitGraph make_hit_graph(const Alignment& alignment, const vector<MaximalExactMatch>& mems, const GSSWAligner* aligner,
size_t min_mem_length);
size_t min_mem_length, const match_fanouts_t* fanouts);

TargetValueSearch tvs;
};
Expand All @@ -695,7 +702,7 @@ class MinDistanceClusterer : public MEMClusterer {

/// Concrete implementation of virtual method from MEMClusterer
virtual HitGraph make_hit_graph(const Alignment& alignment, const vector<MaximalExactMatch>& mems, const GSSWAligner* aligner,
size_t min_mem_length);
size_t min_mem_length, const match_fanouts_t* fanouts);

const HandleGraph* handle_graph;
MinimumDistanceIndex* distance_index;
Expand All @@ -714,7 +721,7 @@ class GreedyMinDistanceClusterer : public MinDistanceClusterer {

/// Concrete implementation of virtual method from MEMClusterer, overides the inherited one from MinDistanceClusterer
HitGraph make_hit_graph(const Alignment& alignment, const vector<MaximalExactMatch>& mems, const GSSWAligner* aligner,
size_t min_mem_length);
size_t min_mem_length, const match_fanouts_t* fanouts);


/// How far apart do we expect the seeds to be on the read?
Expand Down Expand Up @@ -745,7 +752,7 @@ class ComponentMinDistanceClusterer : public MinDistanceClusterer {

/// Concrete implementation of virtual method from MEMClusterer, overides the inherited one from MinDistanceClusterer
HitGraph make_hit_graph(const Alignment& alignment, const vector<MaximalExactMatch>& mems, const GSSWAligner* aligner,
size_t min_mem_length);
size_t min_mem_length, const match_fanouts_t* fanouts);


/// Minimum distance between two seeds on the read
Expand Down
8 changes: 7 additions & 1 deletion src/multipath_alignment_graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -730,6 +730,12 @@ namespace vg {

#ifdef debug_multipath_alignment
cerr << "walking MEM hit " << hit_pos << " " << hit.first->sequence() << endl;
if (fanout_breaks && fanout_breaks->count(hit.first)) {
cerr << "fan-out breaks:" << endl;
for (auto fanout : fanout_breaks->at(hit.first)) {
cerr << "\t" << (fanout.first - hit.first->begin) << ": " << *fanout.first << " -> " << fanout.second << endl;
}
}
#endif

auto hit_range = injection_trans.equal_range(id(hit_pos));
Expand Down Expand Up @@ -804,7 +810,7 @@ namespace vg {
&& fanout_breaks->at(hit.first)[fanout_idx].first == read_iter) {
// we're at the next place where we substituted the read character
// for a different one
read_char = fanout_breaks->at(hit.first)[get<4>(back)].second;
read_char = fanout_breaks->at(hit.first)[fanout_idx].second;
#ifdef debug_multipath_alignment
cerr << "\tapplying fanout break to " << read_char << " instead of " << *read_iter << " at index " << (read_iter - begin) << " of MEM" << endl;
#endif
Expand Down
Loading

0 comments on commit 709f0ad

Please sign in to comment.