Skip to content

Commit

Permalink
Updated in-memory structure.
Browse files Browse the repository at this point in the history
  • Loading branch information
Simon Gog committed Jan 18, 2013
1 parent 95b5856 commit 4a46fbb
Show file tree
Hide file tree
Showing 3 changed files with 122 additions and 23 deletions.
22 changes: 16 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ all: ${BIN_DIR}/rosa_helping_structures.o \
${BIN_DIR}/rosa_sd2_delta_P3 \
${BIN_DIR}/rosa_sd2_delta_P4 \
${BIN_DIR}/rosa_sd2_delta_P5 \
${BIN_DIR}/fm16_huff_rrr \
${BIN_DIR}/fm64_huff_rrr \
${BIN_DIR}/create_pattern_files
# ${BIN_DIR}/rosa_sd2\
# ${BIN_DIR}/rosa_sd_load_only\
Expand All @@ -26,7 +28,6 @@ all: ${BIN_DIR}/rosa_helping_structures.o \
# ${BIN_DIR}/rosa_rrr63 \
# ${BIN_DIR}/rosa_gap \
# ${BIN_DIR}/fm_huff \
# ${BIN_DIR}/fm_huff_rrr \
# ${BIN_DIR}/fm_huff_rrr127 \
# ${BIN_DIR}/fm_rlmn

Expand Down Expand Up @@ -134,11 +135,20 @@ ${BIN_DIR}/rosa_sd2_delta_P5: ${BIN_DIR}/pattern_file.o ${BIN_DIR}/rosa_helping_
# ${BIN_DIR}/rosa_helping_functions.o ${BIN_DIR}/rosa_helping_structures.o ${BIN_DIR}/pattern_file.o \
# -lsdsl -ldivsufsort -ldivsufsort64
#
#${BIN_DIR}/fm_huff_rrr: ${BIN_DIR}/pattern_file.o ${BIN_DIR}/rosa_helping_functions.o ${BIN_DIR}/rosa_helping_structures.o
# g++ ${CFLAGS} -I${INCLUDE_PATH} -L${LIB_PATH} -DFM_HUFF_RRR \
# ${SRC_DIR}/rosa_main.cpp -o ${BIN_DIR}/fm_huff_rrr \
# ${BIN_DIR}/rosa_helping_functions.o ${BIN_DIR}/rosa_helping_structures.o ${BIN_DIR}/pattern_file.o \
# -lsdsl -ldivsufsort -ldivsufsort64
${BIN_DIR}/fm16_huff_rrr: ${BIN_DIR}/pattern_file.o ${BIN_DIR}/rosa_helping_functions.o ${BIN_DIR}/rosa_helping_structures.o ${BIN_DIR}/bu_interval.o ${SRC_DIR}/rosa_main.cpp
g++ ${CFLAGS} -I${INCLUDE_PATH} -L${LIB_PATH} -DFM16_HUFF_RRR \
${SRC_DIR}/rosa_main.cpp -o ${BIN_DIR}/fm16_huff_rrr \
${BIN_DIR}/rosa_helping_functions.o ${BIN_DIR}/rosa_helping_structures.o ${BIN_DIR}/pattern_file.o \
${BIN_DIR}/bu_interval.o \
-lsdsl -ldivsufsort -ldivsufsort64

${BIN_DIR}/fm64_huff_rrr: ${BIN_DIR}/pattern_file.o ${BIN_DIR}/rosa_helping_functions.o ${BIN_DIR}/rosa_helping_structures.o ${BIN_DIR}/bu_interval.o ${SRC_DIR}/rosa_main.cpp
g++ ${CFLAGS} -I${INCLUDE_PATH} -L${LIB_PATH} -DFM64_HUFF_RRR \
${SRC_DIR}/rosa_main.cpp -o ${BIN_DIR}/fm64_huff_rrr \
${BIN_DIR}/rosa_helping_functions.o ${BIN_DIR}/rosa_helping_structures.o ${BIN_DIR}/pattern_file.o \
${BIN_DIR}/bu_interval.o \
-lsdsl -ldivsufsort -ldivsufsort64

#
#${BIN_DIR}/fm_huff_rrr127: ${BIN_DIR}/pattern_file.o ${BIN_DIR}/rosa_helping_functions.o ${BIN_DIR}/rosa_helping_structures.o
# g++ ${CFLAGS} -I${INCLUDE_PATH} -L${LIB_PATH} -DFM_HUFF_RRR127 \
Expand Down
103 changes: 90 additions & 13 deletions src/in_memory_index.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#include <sdsl/int_vector.hpp>
#include <sdsl/util.hpp> // for basename
#include <sdsl/algorithms_for_string_matching.hpp> // for backward search
#include <sdsl/csa_construct.hpp>
#include <sdsl/construct.hpp>
#include <cstring> // remove?
#include <string>
#include <fstream>
Expand Down Expand Up @@ -39,11 +39,14 @@ class in_memory_index
Csa m_csa;
size_type m_b;
string m_file_name;
size_type m_k;
size_type m_lz_width;
mutable size_type m_count_disk_access;
mutable size_type m_count_gap_disk_access;
mutable size_type m_count_int_steps;
mutable size_type m_count_int_match;
mutable size_type m_count_queries;
mutable size_type m_count_block_length;

public:
const string& file_name;
Expand All @@ -52,7 +55,10 @@ class in_memory_index
const size_type& count_int_steps;
const size_type& count_int_match;
const size_type& count_queries;
const size_type& count_block_length;
const string tmp_cst_suffix;
const size_type &k;
const uint8_t &lz_width;

//! Constructor
/*!
Expand All @@ -66,41 +72,61 @@ class in_memory_index
* \Time complexity
* \f$ \Order{n \log\sigma} \f$, where n is the length of the text.
*/
in_memory_index(const char* file_name=NULL, size_type b=4000,
in_memory_index(const char* file_name=NULL, size_type b=4096, size_type f_fac_dens=1,
bool output_tikz=false, bool delete_tmp=false, const char* tmp_file_dir="./", const char* output_dir=NULL):
k(m_k),
lz_width(m_lz_width),
file_name(m_file_name),
count_disk_access(m_count_disk_access),
count_gap_disk_access(m_count_gap_disk_access),
count_int_steps(m_count_int_steps),
count_int_match(m_count_int_match),
count_queries(m_count_queries),
count_block_length(m_count_block_length),
tmp_cst_suffix("") {
m_k=0; m_lz_width = 0;
if (file_name == NULL)
return;
m_b = b;
m_file_name = string(file_name);
std::string tmp_file_dir2 = (util::dirname(tmp_file_dir)+"/"+util::basename(tmp_file_dir)+"/");
std::string tmp_dir = (util::dirname(tmp_file_dir)+"/"+util::basename(tmp_file_dir)+"/");
std::ifstream in(get_int_idx_filename().c_str());
if (!in) {
tMSS file_map;
construct_csa_of_reversed_text(m_file_name.c_str(), m_csa, file_map, delete_tmp, tmp_file_dir2, ("reversed_"+util::basename(string(file_name))).c_str());
string rev_file_name = get_rev_file_name(m_file_name);
ifstream rev_text_stream(rev_file_name.c_str());
if (!rev_text_stream) {
int_vector<8> text;
util::load_vector_from_file(text, m_file_name.c_str(), 1);
for (size_type i=0, j=text.size()-1; i < j and j < text.size(); ++i, --j) {
std::swap(text[i], text[j]);
}
util::store_to_plain_array<uint8_t>(text, rev_file_name.c_str());
}
cache_config config(delete_tmp, tmp_dir, (util::basename(rev_file_name)));
construct(m_csa, rev_file_name.c_str(), config, 1);
} else {
load(in);
in.close();
}
}

static string get_int_idx_filename(const char* file_name, size_type b, const char* output_dir=NULL) {
void factor_frequency(){}

string get_rev_file_name(const string& file_name) {
return file_name+"_rev";
}

static string get_int_idx_filename(const char* file_name, size_type b, size_type fac_dens=0, const char* output_dir=NULL) {
return get_output_dir(file_name, output_dir) + "/" + util::basename(file_name)
+ "."+util::to_string(b)+"." + INDEX_SUF +".int_idx";
+"." + INDEX_SUF +".int_idx";
}

string get_ext_idx_filename() {
return get_int_idx_filename();
}

static string get_ext_idx_filename(const char* file_name, size_type b, const char* output_dir=NULL) {
return get_int_idx_filename(file_name, b, output_dir);
static string get_ext_idx_filename(const char* file_name, size_type b, size_type fac_dens=0, const char* output_dir=NULL) {
return get_int_idx_filename(file_name, b, fac_dens, output_dir);
}

double get_ext_idx_size_in_mega_byte() {
Expand All @@ -115,6 +141,33 @@ class in_memory_index

}

//! Get the name of the file where the factorization of the index is stored.
string get_factorization_filename() const {
return "";
}

//! Get the name of the file where the factorization of the index is stored.
static string get_factorization_filename(const char* file_name, size_type b, const char* output_dir=NULL) {
return "";
}



size_type size()const{
return m_csa.size();
}

void reconstruct_text(const std::string delimiter=""){
}

void output_factors(){}

void output_tikz(){}

size_type greedy_parse(string tmp_dir, bit_vector &factor_borders, size_type fac_dens) {
return 0;
}

//! Count the number of occurrences of a pattern of length m
/*!
* \param pattern A pointer to the start of the pattern.
Expand All @@ -138,6 +191,28 @@ class in_memory_index
return rb-lb+1;
}

size_type locate(const unsigned char* pattern, size_type m, vector<size_type>& res)const {
size_type lb = 0, rb = m_csa.size()-1, d = 0;
const unsigned char* cp = pattern;//+m-1;
while (d < m and rb-lb+1 > 0) {
size_type lb2, rb2;
algorithm::backward_search(m_csa, lb, rb, *cp, lb2, rb2);
++d;
++cp;
lb = lb2; rb = rb2;
#ifdef OUTPUT_STATS
++m_count_disk_access; // a potential disk access for every step
#endif
}
if ( rb > lb+1 ){
res.resize(rb+1-lb);
for(size_type i=lb; i<=rb; ++i){
res[i-lb] = m_csa[i];
}
}
return rb-lb+1;
}

//! Match the pattern p reversed backwards against the pruned BWT until the interval <= b.
/*! \param pattern Pointer to the beginning of the pattern.
* \param m The length of the pattern.
Expand Down Expand Up @@ -169,11 +244,13 @@ class in_memory_index
return true;
}

size_type serialize(std::ostream& out)const {
size_type serialize(std::ostream& out, structure_tree_node* v=NULL, std::string name="")const {
structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this));
size_type written_bytes = 0;
written_bytes += m_csa.serialize(out);
written_bytes += util::write_member(m_b, out);
written_bytes += util::write_member(m_file_name, out);
written_bytes += m_csa.serialize(out, child, "csa");
written_bytes += util::write_member(m_b, out, child, "b");
written_bytes += util::write_member(m_file_name, out, child, "file_name");
structure_tree::add_size(child, written_bytes);
return written_bytes;
}

Expand Down
20 changes: 16 additions & 4 deletions src/rosa_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,31 @@
#include "in_memory_index.hpp"
typedef in_memory_index<csa_wt<wt_huff<>,64000,64000> > tIDX;
#endif
#ifdef FM_HUFF_RRR
#define INDEX_SUF "fm_huff_rrr"

#ifdef FM16_HUFF_RRR
#define INDEX_SUF "fm16_huff_rrr"
#include "in_memory_index.hpp"
#include <sdsl/rrr_vector.hpp>
typedef rrr_vector<63> tRRR;
typedef in_memory_index<csa_wt<wt_huff<tRRR, tRRR::rank_1_type, tRRR::select_1_type, tRRR::select_0_type>,64000,64000> > tIDX;
typedef in_memory_index<csa_wt<wt_huff<tRRR>,16,64000,text_order_sa_sampling<sd_vector<> > > > tIDX;
#endif

#ifdef FM64_HUFF_RRR
#define INDEX_SUF "fm64_huff_rrr"
#include "in_memory_index.hpp"
#include <sdsl/rrr_vector.hpp>
typedef rrr_vector<63> tRRR;
typedef in_memory_index<csa_wt<wt_huff<tRRR>,64,64000,text_order_sa_sampling<sd_vector<> > > > tIDX;
#endif



#ifdef FM_HUFF_RRR127
#define INDEX_SUF "fm_huff_rrr127"
#include "in_memory_index.hpp"
#include <sdsl/rrr_vector.hpp>
typedef rrr_vector<127> tRRR;
typedef in_memory_index<csa_wt<wt_huff<tRRR, tRRR::rank_1_type, tRRR::select_1_type, tRRR::select_0_type>,64000,64000> > tIDX;
typedef in_memory_index<csa_wt<wt_huff<tRRR, tRRR::rank_1_type, tRRR::select_1_type, tRRR::select_0_type>,64000,64000, text_order_sa_sampling<sd_vector<> > > > tIDX;
#endif
#ifdef FM_RL
#define INDEX_SUF "fm_rlmn"
Expand Down

0 comments on commit 4a46fbb

Please sign in to comment.