Skip to content

Commit

Permalink
Merge pull request #20 from prophyle/optimizations
Browse files Browse the repository at this point in the history
Optimizations
  • Loading branch information
karel-brinda authored Sep 27, 2024
2 parents 2523914 + d7b48f2 commit f7b520b
Show file tree
Hide file tree
Showing 18 changed files with 1,287 additions and 191 deletions.
7 changes: 4 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
.PHONY: all clean test cpptest verify quick-verify

CXX= g++
CXXFLAGS= -g -Wall -Wno-unused-function -std=c++17 -O2
CXXFLAGS= -g -Wall -Wno-unused-function -std=c++17 -O3
LDFLAGS= -lz -lpthread
SRC= src
UINT256= $(SRC)/uint256_t
SCRIPTS= scripts
DATA= data
TESTS= tests
Expand All @@ -24,13 +25,13 @@ verify: $(PROG) $(SCRIPTS)/verify.py $(DATA)/spneumoniae.fa
quick-verify: $(PROG) $(SCRIPTS)/verify.py $(DATA)/spneumoniae.fa
python $(SCRIPTS)/verify.py $(DATA)/spneumoniae.fa --quick --interpath $(DATA)/spyogenes.fa

$(PROG): $(SRC)/main.cpp $(SRC)/$(wildcard *.cpp *.h *.hpp) src/version.h
$(PROG): $(SRC)/main.cpp $(SRC)/$(wildcard *.cpp *.h *.hpp) src/version.h $(wildcard $(UINT256)/*.cpp $(UINT256)/*.h $(UINT256)/*.include)
./create-version.sh
$(CXX) $(CXXFLAGS) $(SRC)/main.cpp $(SRC)/kthread.c -o $@ $(LDFLAGS)


prophasmtest: $(TESTS)/unittest.cpp gtest-all.o $(SRC)/$(wildcard *.cpp *.h *.hpp) $(TESTS)/$(wildcard *.cpp *.h *.hpp)
$(CXX) $(CXXFLAGS) -isystem $(GTEST)/include -I $(GTEST)/include $(TESTS)/unittest.cpp gtest-all.o -pthread -o $@ $(LDFLAGS)
$(CXX) $(CXXFLAGS) -isystem $(GTEST)/include -I $(GTEST)/include $(TESTS)/unittest.cpp gtest-all.o -pthread -o $@ $(LDFLAGS)

gtest-all.o: $(GTEST)/src/gtest-all.cc $(wildcard *.cpp *.h *.hpp)
$(CXX) $(CXXFLAGS) -isystem $(GTEST)/include -I $(GTEST)/include -I $(GTEST) -DGTEST_CREATE_SHARED_LIBRARY=1 -c -pthread $(GTEST)/src/gtest-all.cc -o $@
Expand Down
17 changes: 14 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ ProphAsm2 is a versatile tool for computing simplitigs/SPSS
from *k-mer sets* and for *k-mer set operations*.
The new features compared to the original [ProphAsm](https://github.com/prophyle/prophasm)
include a largely speed and memory optimization, parallelization,
support for k-mer sizes up to 64 and support for minimum abundances.
support for k-mer sizes up to 128 and support for minimum abundances.

Various types of sequencing datasets can be used as the input for
ProphAsm, including genomes, pan-genomes, metagenomes or sequencing reads.
Expand Down Expand Up @@ -74,8 +74,19 @@ Set operations:
USAGE-BEGIN
-->
```
Usage: prophasm2 [options]
Command-line parameters:
-k INT K-mer size.
-i FILE Input FASTA file (can be used multiple times).
-o FILE Output FASTA file (if used, must be used as many times as -i).
-x FILE Compute intersection, subtract it, save it.
-s FILE Output file with k-mer statistics.
-t INT Number of threads (default 1).
-m INT Minimum abundance of k-mers to appear in the assembly (default 1).
-S Silent mode.
-u Do not consider k-mer and its reverse complement as equivalent.
Note that '-' can be used for standard input/output.
```
<!---
USAGE-END
Expand Down
4 changes: 2 additions & 2 deletions scripts/verify.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,14 +102,14 @@ def main():
print("Testing ProphAsm2 outputs valid intersection on files " + args.path + " and " + args.interpath)
for complements in [True]:
for m in range(1, 3):
for k in range(2, 65, 23 if args.quick else 1):
for k in range(2, 129, 23 if args.quick else 1):
success &= verify_intersection(args.path, args.interpath, k, complements, m)
print("")

print("Testing ProphAsm2 outputs valid simplitigs on file " + args.path)
for complements in [True, False]:
for m in range(1, 4):
for k in range(2, 65, 11 if args.quick else 1):
for k in range(2, 129, 11 if args.quick else 1):
success &= verify_instance(args.path, k, complements, m)
print("")

Expand Down
11 changes: 8 additions & 3 deletions src/khash.h
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ typedef unsigned long long khint64_t;
#endif
#endif /* klib_unused */

typedef khint32_t khint_t;
typedef khint64_t khint_t;
typedef khint_t khiter_t;

#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
Expand All @@ -176,6 +176,10 @@ typedef khint_t khiter_t;
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif

#ifndef kroundup64
#define kroundup64(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, (x)|=(x)>>32, ++(x))
#endif

#ifndef kcalloc
#define kcalloc(N,Z) calloc(N,Z)
#endif
Expand Down Expand Up @@ -246,7 +250,7 @@ static const double __ac_HASH_UPPER = 0.77;
khint32_t *new_flags = 0; \
khint_t j = 1; \
{ \
kroundup32(new_n_buckets); \
kroundup64(new_n_buckets); \
if (new_n_buckets < 4) new_n_buckets = 4; \
if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \
else { /* hash table size to be changed (shrink or expand); rehash */ \
Expand Down Expand Up @@ -382,7 +386,8 @@ static const double __ac_HASH_UPPER = 0.77;
@param key The integer [khint64_t]
@return The hash value [khint_t]
*/
#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
//#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
#define kh_int64_hash_func(key) (khint64_t)(__ac_Wang_hash(key))
/*! @function
@abstract 64-bit integer comparison function
*/
Expand Down
50 changes: 33 additions & 17 deletions src/khash_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,22 @@
#include "kmers.h"
#include "khash.h"

typedef unsigned char byte;

#define kh_int128_hash_func(key) kh_int64_hash_func((khint64_t)((key)>>65^(key)^(key)<<21))
#define kh_int128_hash_equal(a, b) ((a) == (b))
#define kh_int256_hash_func(key) kh_int128_hash_func((__uint128_t)((key)>>129^(key)^(key)<<35))
#define kh_int256_hash_equal(a, b) ((a) == (b))

#define KHASH_MAP_INIT_INT128(name, khval_t) \
KHASH_INIT(name, __uint128_t, khval_t, 1, kh_int128_hash_func, kh_int128_hash_equal)

#define KHASH_SET_INIT_INT128(name) \
KHASH_INIT(name, __uint128_t, char, 0, kh_int128_hash_func, kh_int128_hash_equal)

typedef unsigned char byte;
#define KHASH_MAP_INIT_INT256(name, khval_t) \
KHASH_INIT(name, uint256_t, khval_t, 1, kh_int256_hash_func, kh_int128_hash_equal)

KHASH_MAP_INIT_INT256(S256M, byte)
KHASH_MAP_INIT_INT128(S128M, byte)
KHASH_MAP_INIT_INT64(S64M, byte)
KHASH_SET_INIT_INT128(S128S)
KHASH_SET_INIT_INT64(S64S)

byte MINIMUM_ABUNDANCE = 1;
Expand All @@ -40,26 +41,35 @@ struct DifferenceInPlaceData##variant {
}; \
\
/* Determine whether the canonical k-mer is present.*/ \
bool containsKMer(kh_S##variant##_t *kMers, kmer##type##_t kMer, int k, bool complements) { \
if (complements) kMer = CanonicalKMer(kMer, k); \
inline bool containsCanonicalKMer(kh_S##variant##_t *kMers, kmer##type##_t kMer) { \
bool contains_key = kh_get_S##variant(kMers, kMer) != kh_end(kMers); \
if (MINIMUM_ABUNDANCE == 1) return contains_key; \
if (!contains_key) return false; \
return kh_val(kMers, kh_get_S##variant(kMers, kMer)) >= MINIMUM_ABUNDANCE; \
} \
\
/* Remove the canonical k-mer from the set.*/ \
void eraseKMer(kh_S##variant##_t *kMers, kmer##type##_t kMer, int k, bool complements) { \
/* Determine whether the canonical form of a k-mer is present.*/ \
inline bool containsKMer(kh_S##variant##_t *kMers, kmer##type##_t kMer, int k, bool complements) { \
if (complements) kMer = CanonicalKMer(kMer, k); \
return containsCanonicalKMer(kMers, kMer); \
} \
\
/* Remove the canonical k-mer from the set.*/ \
inline void eraseCanonicalKMer(kh_S##variant##_t *kMers, kmer##type##_t kMer) { \
auto key = kh_get_S##variant(kMers, kMer); \
if (key != kh_end(kMers)) { \
kh_del_S##variant(kMers, key); \
} \
} \
\
/* Insert the canonical k-mer into the set. */ \
void insertKMer(kh_S##variant##_t *kMers, kmer##type##_t kMer, int k, bool complements, bool force = false) { \
/* Remove the canonical form of a k-mer from the set.*/ \
inline void eraseKMer(kh_S##variant##_t *kMers, kmer##type##_t kMer, int k, bool complements) { \
if (complements) kMer = CanonicalKMer(kMer, k); \
eraseCanonicalKMer(kMers, kMer); \
} \
\
/* Insert the canonical k-mer into the set. */ \
inline void insertCanonicalKMer(kh_S##variant##_t *kMers, kmer##type##_t kMer, bool force=false) { \
int ret; \
if (MINIMUM_ABUNDANCE == (byte)1) { \
kh_put_S##variant(kMers, kMer, &ret); \
Expand All @@ -76,6 +86,12 @@ void insertKMer(kh_S##variant##_t *kMers, kmer##type##_t kMer, int k, bool compl
} \
} \
\
/* Insert the canonical k-mer into the set. */ \
inline void insertKMer(kh_S##variant##_t *kMers, kmer##type##_t kMer, int k, bool complements, bool force=false) { \
if (complements) kMer = CanonicalKMer(kMer, k); \
insertCanonicalKMer(kMers, kMer, force); \
} \
\
/* Parallel wrapper for differenceInPlace. */ \
void DifferenceInPlaceThread##variant(void *arg, long i, int _) { \
auto data = (DifferenceInPlaceData##variant*)arg; \
Expand All @@ -84,12 +100,12 @@ void DifferenceInPlaceThread##variant(void *arg, long i, int _) {

INIT_KHASH_UTILS(64, 64S)
INIT_KHASH_UTILS(64, 64M)
INIT_KHASH_UTILS(128, 128S)
INIT_KHASH_UTILS(128, 128M)
INIT_KHASH_UTILS(256, 256M)

/// Return the next k-mer in the k-mer set and update the index.
template <typename KHT, typename kmer_t>
kmer_t nextKMer(KHT *kMers, size_t &lastIndex, kmer_t &kMer) {
inline kmer_t nextKMer(KHT *kMers, size_t &lastIndex, kmer_t &kMer) {
for (size_t i = kh_begin(kMers) + lastIndex; i != kh_end(kMers); ++i, ++lastIndex) {
if (!kh_exist(kMers, i)) continue;
kMer = kh_key(kMers, i);
Expand All @@ -116,7 +132,7 @@ std::vector<kmer64_t> kMersToVec(kh_S64M_t *kMers) {

/// Compute the intersection of several k-mer sets.
template <typename KHT>
KHT *getIntersection(KHT* result, std::vector<KHT*> &kMerSets, int k, bool complements) {
KHT *getIntersection(KHT* result, std::vector<KHT*> &kMerSets) {
if (kMerSets.size() < 2) return result;
KHT* smallestSet = kMerSets[0];
for (size_t i = 1; i < kMerSets.size(); ++i) {
Expand All @@ -125,16 +141,16 @@ KHT *getIntersection(KHT* result, std::vector<KHT*> &kMerSets, int k, bool compl
for (auto i = kh_begin(smallestSet); i != kh_end(smallestSet); ++i) {
if (!kh_exist(smallestSet, i)) continue;
auto kMer = kh_key(smallestSet, i);
if (!containsKMer(smallestSet, kMer, k, complements)) continue;
if (!containsCanonicalKMer(smallestSet, kMer)) continue;
bool everywhere = true;
for (size_t i = 0; i < kMerSets.size(); ++i) if (kMerSets[i] != smallestSet) {
if (!containsKMer(kMerSets[i], kMer, k, complements)) {
if (!containsCanonicalKMer(kMerSets[i], kMer)) {
everywhere = false;
break;
}
}
if (everywhere) {
insertKMer(result, kMer, k, complements, true);
insertCanonicalKMer(result, kMer, true);
}
}
return result;
Expand Down
22 changes: 17 additions & 5 deletions src/kmers.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,15 @@
#include <iostream>
#include <cstdint>

#include "uint256_t/uint256_t.h"

typedef uint64_t kmer64_t;
typedef __uint128_t kmer128_t;
typedef uint256_t kmer256_t;

/// Convert the given basic nucleotide to int so it can be used for indexing in AC.
/// If non-existing nucleotide is given, return -1.
int NucleotideToInt (char c) {
inline int NucleotideToInt (char c) {
switch (c) {
case 'A': return 0;
case 'C': return 1;
Expand All @@ -25,13 +28,13 @@ int NucleotideToInt (char c) {

/// Compute the prefix of size d of the given k-mer.
template <typename kmer_t>
kmer_t BitPrefix(kmer_t kMer, int k, int d) {
inline kmer_t BitPrefix(kmer_t kMer, int k, int d) {
return kMer >> ((k - d) << kmer_t(1));
}

/// Compute the suffix of size d of the given k-mer.
template <typename kmer_t>
kmer_t BitSuffix(kmer_t kMer, int d) {
inline kmer_t BitSuffix(kmer_t kMer, int d) {
return kMer & ((kmer_t(1) << (d << kmer_t(1))) - kmer_t(1));
}

Expand Down Expand Up @@ -71,9 +74,17 @@ inline kmer128_t word_reverse_complement(kmer128_t w) {
w = ( w >> 64 ) | ( w << 64);
return ((U)-1) - w;
}
/// Compute the reverse complement of a word.
/// Copyright: Jellyfish GPL-3.0
inline kmer256_t word_reverse_complement(kmer256_t w) {
kmer128_t low = word_reverse_complement(w.lower());
kmer128_t high = word_reverse_complement(w.upper());
return kmer256_t(low, high);
}

constexpr int KMER_SIZE_64 = 64;
constexpr int KMER_SIZE_128 = 128;
constexpr int KMER_SIZE_256 = 256;
#define INIT_KMERS(type) \
\
/* Get the mask to mask k-mers. */ \
Expand All @@ -89,6 +100,7 @@ inline kmer##type##_t ReverseComplement(kmer##type##_t kMer, int k) {

INIT_KMERS(64)
INIT_KMERS(128)
INIT_KMERS(256)

/// Return the lexicographically smaller of the k-mer and its reverse complement.
template <typename kmer_t>
Expand All @@ -102,7 +114,7 @@ const char letters[4] {'A', 'C', 'G', 'T'};
/// Return the index-th nucleotide from the encoded k-mer.
template <typename kmer_t>
inline char NucleotideAtIndex(kmer_t encoded, int k, int index) {
return letters[(encoded >> ((k - index - kmer_t(1)) << kmer_t(1))) & kmer_t(3)];
return letters[(uint32_t)(encoded >> ((k - index - kmer_t(1)) << kmer_t(1))) & kmer_t(3)];
}

/// Convert the encoded KMer representation to string.
Expand All @@ -111,7 +123,7 @@ std::string NumberToKMer(kmer_t encoded, int length) {
std::string ret(length, 'N');
for (int i = 0; i < length; ++i) {
// The last two bits correspond to one nucleotide.
ret[length - i -1] = letters[encoded & 3];
ret[length - i -1] = letters[(uint32_t)(encoded & 3)];
// Move to the next letter.
encoded >>= 2;
}
Expand Down
Loading

0 comments on commit f7b520b

Please sign in to comment.