Skip to content
This repository has been archived by the owner on Apr 27, 2018. It is now read-only.

Commit

Permalink
1.0dev.69: add -mpc option, implement previously unimplemented -omax
Browse files Browse the repository at this point in the history
  • Loading branch information
bolosky committed Mar 10, 2015
1 parent 14ea58d commit 81b4222
Show file tree
Hide file tree
Showing 20 changed files with 468 additions and 100 deletions.
6 changes: 6 additions & 0 deletions SNAPLib/AlignerContext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,12 @@ AlignerContext::initialize()
noTruncation = options->noTruncation;
maxSecondaryAlignmentAdditionalEditDistance = options->maxSecondaryAlignmentAdditionalEditDistance;
maxSecondaryAlignments = options->maxSecondaryAlignments;
maxSecondaryAlignmentsPerContig = options->maxSecondaryAlignmentsPerContig;

if (maxSecondaryAlignmentAdditionalEditDistance < 0 && (maxSecondaryAlignments < 1000000 || maxSecondaryAlignmentsPerContig > 0)) {
WriteErrorMessage("You set -omax and/or -mpc without setting -om. They're meaningful only in the context of -om, so you probably didn't really mean to do that.\n");
soft_exit(1);
}
minReadLength = options->minReadLength;

if (index != NULL && (int)minReadLength < index->getSeedLength()) {
Expand Down
1 change: 1 addition & 0 deletions SNAPLib/AlignerContext.h
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ class AlignerContext : public TaskContextBase
bool noTruncation;
int maxSecondaryAlignmentAdditionalEditDistance;
int maxSecondaryAlignments;
int maxSecondaryAlignmentsPerContig;
unsigned minReadLength;


Expand Down
136 changes: 77 additions & 59 deletions SNAPLib/AlignerOptions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ AlignerOptions::AlignerOptions(
ignoreSecondaryAlignments(true),
maxSecondaryAlignmentAdditionalEditDistance(-1),
maxSecondaryAlignments(0x7fffffff),
maxSecondaryAlignmentsPerContig(-1), // -1 means don't limit
preserveClipping(false),
expansionFactor(1.0),
noUkkonen(false),
Expand Down Expand Up @@ -101,54 +102,58 @@ AlignerOptions::usage()
void
AlignerOptions::usageMessage()
{
WriteErrorMessage(
"Usage: \n%s\n"
"Options:\n"
" -o filename output alignments to filename in SAM or BAM format, depending on the file extension or\n"
" explicit type specifier (see below). Use a dash with an explicit type specifier to write to\n"
WriteErrorMessage(
"Usage: \n%s\n"
"Options:\n"
" -o filename output alignments to filename in SAM or BAM format, depending on the file extension or\n"
" explicit type specifier (see below). Use a dash with an explicit type specifier to write to\n"
" stdout, so for example -o -sam - would write SAM output to stdout\n"
" -d maximum edit distance allowed per read or pair (default: %d)\n"
" -n number of seeds to use per read\n"
" -sc Seed coverage (i.e., readSize/seedSize). Floating point. Exclusive with -n. (default uses -n)\n"
" -h maximum hits to consider per seed (default: %d)\n"
" -ms minimum seed matches per location (default: %d)\n"
" -t number of threads (default is one per core)\n"
" -b bind each thread to its processor (this is the default)\n"
" --b Don't bind each thread to its processor (note the double dash)\n"
" -P disables cache prefetching in the genome; may be helpful for machines\n"
" with small caches or lots of cores/cache\n"
" -so sort output file by alignment location\n"
" -sm memory to use for sorting in Gb\n"
" -x explore some hits of overly popular seeds (useful for filtering)\n"
" -f stop on first match within edit distance limit (filtering mode)\n"
" -F filter output (a=aligned only, s=single hit only (MAPQ >= %d), u=unaligned only, l=long enough to align (see -mrl))\n"
" -S suppress additional processing (sorted BAM output only)\n"
" i=index, d=duplicate marking\n"
" -d maximum edit distance allowed per read or pair (default: %d)\n"
" -n number of seeds to use per read\n"
" -sc Seed coverage (i.e., readSize/seedSize). Floating point. Exclusive with -n. (default uses -n)\n"
" -h maximum hits to consider per seed (default: %d)\n"
" -ms minimum seed matches per location (default: %d)\n"
" -t number of threads (default is one per core)\n"
" -b bind each thread to its processor (this is the default)\n"
" --b Don't bind each thread to its processor (note the double dash)\n"
" -P disables cache prefetching in the genome; may be helpful for machines\n"
" with small caches or lots of cores/cache\n"
" -so sort output file by alignment location\n"
" -sm memory to use for sorting in Gb\n"
" -x explore some hits of overly popular seeds (useful for filtering)\n"
" -f stop on first match within edit distance limit (filtering mode)\n"
" -F filter output (a=aligned only, s=single hit only (MAPQ >= %d), u=unaligned only, l=long enough to align (see -mrl))\n"
" -S suppress additional processing (sorted BAM output only)\n"
" i=index, d=duplicate marking\n"
#if USE_DEVTEAM_OPTIONS
" -I ignore IDs that don't match in the paired-end aligner\n"
" -I ignore IDs that don't match in the paired-end aligner\n"
#ifdef _MSC_VER // Only need this on Windows, since memory allocation is fast on Linux
" -B Insert barrier after per-thread memory allocation to improve timing accuracy\n"
" -B Insert barrier after per-thread memory allocation to improve timing accuracy\n"
#endif // _MSC_VER
#endif // USE_DEVTEAM_OPTIONS
" -Cxx must be followed by two + or - symbols saying whether to clip low-quality\n"
" bases from front and back of read respectively; default: back only (-C-+)\n"
" -M indicates that CIGAR strings in the generated SAM file should use M (alignment\n"
" match) rather than = and X (sequence (mis-)match). This is the default\n"
" -= use the new style CIGAR strings with = and X rather than M. The opposite of -M\n"
" -G specify a gap penalty to use when generating CIGAR strings\n"
" -pf specify the name of a file to contain the run speed\n"
" --hp Indicates not to use huge pages (this may speed up index load and slow down alignment) This is the default\n"
" -hp Indicates to use huge pages (this may speed up alignment and slow down index load).\n"
" -D Specifies the extra search depth (the edit distance beyond the best hit that SNAP uses to compute MAPQ). Default 2\n"
" -rg Specify the default read group if it is not specified in the input file\n"
" -R Specify the entire read group line for the SAM/BAM output. This must include an ID tag. If it doesn't start with\n"
" '@RG' SNAP will add that. Specify tabs by \\t. Two backslashes will generate a single backslash.\n"
" backslash followed by anything else is illegal. So, '-R @RG\\tID:foo\\tDS:my data' would generate reads\n"
" with defualt tag foo, and an @RG line that also included the DS:my data field.\n"
" -sa Include reads from SAM or BAM files with the secondary (0x100) or supplementary (0x800) flag set; default is to drop them.\n"
" -om Output multiple alignments. Takes as a parameter the maximum extra edit distance relative to the best alignment\n"
" to allow for secondary alignments\n"
" -omax Limit the number of alignments per read generated by -om.\n"
" -Cxx must be followed by two + or - symbols saying whether to clip low-quality\n"
" bases from front and back of read respectively; default: back only (-C-+)\n"
" -M indicates that CIGAR strings in the generated SAM file should use M (alignment\n"
" match) rather than = and X (sequence (mis-)match). This is the default\n"
" -= use the new style CIGAR strings with = and X rather than M. The opposite of -M\n"
" -G specify a gap penalty to use when generating CIGAR strings\n"
" -pf specify the name of a file to contain the run speed\n"
" --hp Indicates not to use huge pages (this may speed up index load and slow down alignment) This is the default\n"
" -hp Indicates to use huge pages (this may speed up alignment and slow down index load).\n"
" -D Specifies the extra search depth (the edit distance beyond the best hit that SNAP uses to compute MAPQ). Default 2\n"
" -rg Specify the default read group if it is not specified in the input file\n"
" -R Specify the entire read group line for the SAM/BAM output. This must include an ID tag. If it doesn't start with\n"
" '@RG' SNAP will add that. Specify tabs by \\t. Two backslashes will generate a single backslash.\n"
" backslash followed by anything else is illegal. So, '-R @RG\\tID:foo\\tDS:my data' would generate reads\n"
" with defualt tag foo, and an @RG line that also included the DS:my data field.\n"
" -sa Include reads from SAM or BAM files with the secondary (0x100) or supplementary (0x800) flag set; default is to drop them.\n"
" -om Output multiple alignments. Takes as a parameter the maximum extra edit distance relative to the best alignment\n"
" to allow for secondary alignments\n"
" -omax Limit the number of alignments per read generated by -om. This means that if -om would generate more\n"
" than -omax secondary alignments, SNAP will write out only the best -omax of them, where 'best' means\n"
" 'with the lowest edit distance'. Ties are broken arbitrarily.\n"
" -mpc Limit the number of alignments generated by -om to this many per contig (chromosome/FASTA entry);\n"
" 'mpc' means 'max per contig; default unlimited. This filter is applied prior to -omax.\n"
" -pc Preserve the soft clipping for reads coming from SAM or BAM files\n"
" -xf Increase expansion factor for BAM and GZ files (default %.1f)\n"
" -hdp Use Hadoop-style prefixes (reporter:status:...) on error messages, and emit hadoop-style progress messages\n"
Expand Down Expand Up @@ -428,24 +433,37 @@ AlignerOptions::parse(
n++;

return true;
} else if (strcmp(argv[n], "-omax") == 0) {
if (n + 1 >= argc) {
WriteErrorMessage("-omax requires an additional value\n");
return false;
}
//
// Check that the parameter is actually numeric. This is to avoid having someone do "-omax -anotherSwitch" and
// having the additional switche silently consumed here.
//
if (argv[n + 1][0] < '0' || argv[n + 1][0] > '9') {
WriteErrorMessage("-omax requires a numerical parameter.\n");
return false;
}
maxSecondaryAlignments = atoi(argv[n + 1]);
} else if (strcmp(argv[n], "-omax") == 0) {
if (n + 1 >= argc) {
WriteErrorMessage("-omax requires an additional value\n");
return false;
}

maxSecondaryAlignments = atoi(argv[n + 1]);

n++;
if (maxSecondaryAlignments <= 0) {
WriteErrorMessage("-omax must be strictly positive\n");
}

return true;
n++;

return true;
} else if (strcmp(argv[n], "-mpc") == 0) {
if (n + 1 >= argc) {
WriteErrorMessage("-mpc requires an additional value\n");
return false;
}

maxSecondaryAlignmentsPerContig = atoi(argv[n + 1]);

if (maxSecondaryAlignmentsPerContig <= 0) {
WriteErrorMessage("-mpc must be strictly positive\n");
return false;
}

n++;

return true;
} else if (strcmp(argv[n], "-wbs") == 0) {
if (n + 1 >= argc) {
WriteErrorMessage("-wbs requires an additional value\n");
Expand Down
1 change: 1 addition & 0 deletions SNAPLib/AlignerOptions.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ struct AlignerOptions : public AbstractOptions
bool ignoreSecondaryAlignments; // on input, default true
int maxSecondaryAlignmentAdditionalEditDistance;
int maxSecondaryAlignments;
int maxSecondaryAlignmentsPerContig;
bool preserveClipping;
float expansionFactor;
bool noUkkonen;
Expand Down
107 changes: 107 additions & 0 deletions SNAPLib/AlignmentResult.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
/*++
Module Name:
AlignmentResult.cpp
Abstract:
Code for SNAP genome alignment results
Authors:
Bill Bolosky, March, 2015
Environment:
Revision History:
--*/

#include "stdafx.h"
#include "AlignmentResult.h"
#include "GenomeIndex.h"



int
SingleAlignmentResult::compareByContigAndScore(const void *first_, const void *second_)
{
extern GenomeIndex *g_index; // Sorry, but no easy way to get this into here

const SingleAlignmentResult *first = (SingleAlignmentResult *)first_;
const SingleAlignmentResult *second = (SingleAlignmentResult *)second_;

int firstContig = g_index->getGenome()->getContigNumAtLocation(first->location);
int secondContig = g_index->getGenome()->getContigNumAtLocation(second->location);

if (firstContig < secondContig) {
return -1;
} else if (firstContig > secondContig) {
return 1;
} else if (first->score < second->score) {
return -1;
} else if (first->score > second->score) {
return 1;
} else {
return 0;
}
}

int
SingleAlignmentResult::compareByScore(const void *first_, const void *second_)
{
const SingleAlignmentResult *first = (SingleAlignmentResult *)first_;
const SingleAlignmentResult *second = (SingleAlignmentResult *)second_;

if (first->score < second->score) {
return -1;
} else if (first->score > second->score) {
return 1;
} else {
return 0;
}
}

int
PairedAlignmentResult::compareByContigAndScore(const void *first_, const void *second_)
{
extern GenomeIndex *g_index; // Sorry, but no easy way to get this into here

const PairedAlignmentResult *first = (PairedAlignmentResult *)first_;
const PairedAlignmentResult *second = (PairedAlignmentResult *)second_;

int firstContig = g_index->getGenome()->getContigNumAtLocation(first->location[0]);
int secondContig = g_index->getGenome()->getContigNumAtLocation(second->location[0]);

if (firstContig < secondContig) {
return -1;
} else if (firstContig > secondContig) {
return 1;
} else if (first->score < second->score) {
return -1;
} else if (first->score > second->score) {
return 1;
} else {
return 0;
}
}

int
PairedAlignmentResult::compareByScore(const void *first_, const void *second_)
{
const PairedAlignmentResult *first = (PairedAlignmentResult *)first_;
const PairedAlignmentResult *second = (PairedAlignmentResult *)second_;

int firstScore = first->score[0] + first->score[1];
int secondScore = second->score[0] + second->score[1];

if (firstScore < secondScore) {
return -1;
} else if (firstScore > secondScore) {
return 1;
} else {
return 0;
}
}
5 changes: 5 additions & 0 deletions SNAPLib/AlignmentResult.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ struct SingleAlignmentResult {

int mapq; // mapping quality, encoded like a Phred score (but as an integer, not ASCII Phred + 33).

static int compareByContigAndScore(const void *first, const void *second); // qsort()-style compare routine
static int compareByScore(const void *first, const void *second); // qsort()-style compare routine
};

// Does an AlignmentResult represent a single location?
Expand Down Expand Up @@ -85,4 +87,7 @@ struct PairedAlignmentResult {
_int64 nanosInAlignTogether;
unsigned nLVCalls;
unsigned nSmallHits;

static int compareByContigAndScore(const void *first, const void *second); // qsort()-style compare routine
static int compareByScore(const void *first, const void *second); // qsort()-style compare routine
};
Loading

0 comments on commit 81b4222

Please sign in to comment.