Skip to content

Commit

Permalink
Simplify facet count implementation.
Browse files Browse the repository at this point in the history
  • Loading branch information
kishorenc committed Jul 26, 2024
1 parent 058eb9d commit 6876408
Show file tree
Hide file tree
Showing 5 changed files with 33 additions and 681 deletions.
10 changes: 7 additions & 3 deletions include/facet_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@ class facet_index_t {
std::string facet_value;
uint32_t count;
uint32_t facet_id;

bool operator<(const facet_count_t& other) const {
return count > other.count;
}
};

const static size_t MAX_FACET_VAL_LEN = 255;
Expand All @@ -70,7 +74,7 @@ class facet_index_t {
struct facet_id_seq_ids_t {
void* seq_ids;
uint32_t facet_id;
std::list<facet_count_t>::iterator facet_count_it;
std::multiset<facet_count_t>::iterator facet_count_it;

facet_id_seq_ids_t() {
seq_ids = nullptr;
Expand All @@ -82,8 +86,8 @@ class facet_index_t {

struct facet_doc_ids_list_t {
std::map<std::string, facet_id_seq_ids_t> fvalue_seq_ids;
std::list<facet_count_t> counts;
std::map<uint32_t, std::list<facet_count_t>::iterator> count_map;
std::multiset<facet_count_t> counts;

posting_list_t* seq_id_hashes = nullptr;
spp::sparse_hash_map<uint32_t, int64_t> fhash_to_int64_map;

Expand Down
152 changes: 19 additions & 133 deletions src/facet_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@ void facet_index_t::insert(const std::string& field_name,
}

auto& seq_ids = seq_ids_it->second;
std::list<facet_count_t>& count_list = facet_index.counts;

if(fvalue_index_it == fvalue_index.end()) {
facet_id_seq_ids_t fis;
Expand All @@ -61,28 +60,7 @@ void facet_index_t::insert(const std::string& field_name,
if(facet_index.has_value_index) {
fis.seq_ids = ids_t::create(seq_ids);
auto new_count = ids_t::num_ids(fis.seq_ids);
auto& count_map = facet_index.count_map;
auto count_map_it = count_map.lower_bound(new_count);

count_list.emplace_front(fvalue.facet_value, new_count, facet_id);
fis.facet_count_it = count_list.begin();
auto curr = fis.facet_count_it;

if(count_map_it == facet_index.count_map.end()) {
// no element >= new_count, so it remains at head of list
facet_index.count_map.emplace(curr->count, curr);
} else if(count_map_it->second->count == new_count) {
// `curr` must be placed _before_ anchor node
// a) [7], 10, <7>, 3
auto& existing_node = count_map_it->second;
count_list.splice(existing_node, count_list, curr);
} else {
// b) [9], 12, 10, <10>, 7, 3
// `curr` placed _after_ anchor node
auto& gt_node = count_map_it->second;
count_list.splice(std::next(gt_node), count_list, curr);
count_map.emplace(curr->count, curr);
}
fis.facet_count_it = facet_index.counts.emplace(fvalue.facet_value, new_count, facet_id);
}

fvalue_index.emplace(fvalue.facet_value, fis);
Expand All @@ -94,14 +72,9 @@ void facet_index_t::insert(const std::string& field_name,
auto facet_count_it = fvalue_index_it->second.facet_count_it;

if(facet_count_it->facet_id == facet_id) {
auto& count_map = facet_index.count_map;
auto old_count = facet_count_it->count;
facet_count_it->count = ids_t::num_ids(fvalue_index_it->second.seq_ids);
auto new_count = facet_count_it->count;

// Relocate `curr` to a new sorted location within count list:
auto curr = facet_count_it;
update_count_nodes(count_list, count_map, old_count, new_count, curr);
auto facet_count_node = facet_index.counts.extract(facet_count_it);
facet_count_node.value().count = ids_t::num_ids(fvalue_index_it->second.seq_ids);
facet_index.counts.insert(std::move(facet_count_node));
} else {
LOG(ERROR) << "Wrong reference stored for facet " << fvalue.facet_value << " with facet_id " << facet_id;
}
Expand All @@ -116,90 +89,6 @@ void facet_index_t::insert(const std::string& field_name,
}
}

void facet_index_t::update_count_nodes(std::list<facet_count_t>& count_list,
std::map<uint32_t, std::list<facet_count_t>::iterator>& count_map,
uint32_t old_count, uint32_t new_count,
std::list<facet_count_t>::iterator& curr) {

auto count_map_it = count_map.lower_bound(new_count);

if(count_map_it == count_map.end()) {
// all elements are < new_count

// 5, 4, [4 -> 7]
// 5, 4, [4 -> 7], 3
// 5, [4 -> 7]
// [4 -> 5]

// delete count map entry if `curr` is the anchor for `old_count`
if(std::next(curr) == count_list.end() || std::next(curr)->count < old_count) {
count_map.erase(old_count);

// find a replacement for orig_count
if(curr != count_list.begin() && std::prev(curr)->count == old_count) {
count_map.emplace(old_count, std::prev(curr));
}
}

// `curr` placed at head of the count list
count_list.splice(count_list.begin(), count_list, curr);
count_map.emplace(new_count, curr);
} else if(count_map_it->first == new_count) {
// entry for new_count already exists in count map
// a) 10, 7, [5 -> 7], 3
// 10, 7, 5, [5 -> 7]
// 10, 7, 5, [5 -> 8]
// 10, 7, [5 -> 7]
// 10, 7, [5 -> 7], 5

// [9 -> 8], 8
// 10, [9 -> 8], 8

auto existing_node = count_map_it->second;

// delete count map entry if `curr` is the anchor for `old_count`
if(std::next(curr) == count_list.end() || std::next(curr)->count < old_count) {
count_map.erase(old_count);

// find a replacement for orig_count
if(curr != count_list.begin() && std::prev(curr)->count == old_count) {
count_map.emplace(old_count, std::prev(curr));
}
}

// `curr` placed before anchor node
count_list.splice(existing_node, count_list, curr);
} else {
// b) 10, 7, [7 -> 9], 3
// 10, 7, [7 -> 9]
// 10, [7 -> 9]
// 10, [7 -> 9], 7
// 10, 7, [5 -> 9], 3

// [10 -> 9]
// [5 -> 4], 2
// [5 -> 1], 2
// 5, 5, [5 -> 4], 5
// 5, 5, 5, [5 -> 4]

auto gt_node = count_map_it->second;

// delete old entry if `orig_count` iterator is same as `curr`
if(std::next(curr) == count_list.end() || std::next(curr)->count < old_count) {
count_map.erase(old_count);

// find a replacement for orig_count
if(curr != count_list.begin() && std::prev(curr)->count == old_count) {
count_map.emplace(old_count, std::prev(curr));
}
}

// `curr` placed after anchor node
count_list.splice(std::next(gt_node), count_list, curr);
count_map.emplace(new_count, curr);
}
}

bool facet_index_t::contains(const std::string& field_name) {
const auto& facet_field_it = facet_field_map.find(field_name);
if(facet_field_it == facet_field_map.end()) {
Expand Down Expand Up @@ -233,7 +122,6 @@ void facet_index_t::get_stringified_value(const nlohmann::json& value, const fie
}
else if(afield.is_bool()) {
bool raw_val = value.get<bool>();
auto fhash = (uint32_t)raw_val;
auto str_val = (raw_val == 1) ? "true" : "false";
values.emplace_back(str_val);
}
Expand Down Expand Up @@ -273,27 +161,24 @@ void facet_index_t::remove(const nlohmann::json& doc, const field& afield, const
void*& ids = fvalue_it->second.seq_ids;
if(ids && ids_t::contains(ids, seq_id)) {
ids_t::erase(ids, seq_id);
auto& count_list = facet_field_it->second.counts;
auto curr = fvalue_it->second.facet_count_it;
auto old_count = curr->count;
curr->count = ids_t::num_ids(ids);
auto new_count = curr->count;

// move the node lower in the count list
auto& count_map = facet_field_it->second.count_map;
update_count_nodes(count_list, count_map, old_count, new_count, curr);
auto new_count = ids_t::num_ids(ids);
auto& counts = facet_field_it->second.counts;

if(ids_t::num_ids(ids) == 0) {
if(new_count == 0) {
ids_t::destroy_list(ids);
dead_fvalues.push_back(fvalue_it->first);

//remove from int64 lookup map first
// remove from int64 lookup map first
auto& fhash_int64_map = facet_field_it->second.fhash_to_int64_map;
uint32_t fhash = fvalue_it->second.facet_id;
fhash_int64_map.erase(fhash);

count_map.erase(new_count);
count_list.erase(curr);
counts.erase(fvalue_it->second.facet_count_it);
} else {
// update count
auto count_node = counts.extract(fvalue_it->second.facet_count_it);
count_node.value().count = ids_t::num_ids(ids);
counts.insert(std::move(count_node));
}
}
}
Expand Down Expand Up @@ -343,7 +228,7 @@ size_t facet_index_t::intersect(facet& a_facet, const field& facet_field,
size_t max_facets = is_wildcard_no_filter_query ? std::min((size_t)max_facet_count, counter_list.size()) :
std::min((size_t)2 * max_facet_count, counter_list.size());

auto intersect_fn = [&] (std::list<facet_count_t>::const_iterator facet_count_it) {
auto intersect_fn = [&] (std::multiset<facet_count_t>::const_iterator facet_count_it) {
uint32_t count = 0;
uint32_t doc_id = 0;
if(has_facet_query) {
Expand Down Expand Up @@ -535,9 +420,11 @@ posting_list_t* facet_index_t::get_facet_hash_index(const std::string &field_nam
}

const spp::sparse_hash_map<uint32_t , int64_t >& facet_index_t::get_fhash_int64_map(const std::string& field_name) {
static const spp::sparse_hash_map<uint32_t, int64_t> empty_map{};
const auto facet_field_map_it = facet_field_map.find(field_name);

if(facet_field_map_it == facet_field_map.end()) {
return spp::sparse_hash_map<uint32_t , int64_t >{}; // field is not initialized or dropped
return empty_map; // field is not initialized or dropped
}

const auto& facet_index = facet_field_map_it->second;
Expand Down Expand Up @@ -597,7 +484,7 @@ void facet_index_t::check_for_high_cardinality(const string& field_name, size_t
auto num_facet_values = facet_field_map_it->second.fvalue_seq_ids.size();
bool is_sparse_field = false;

if(facet_field_map.size() > 100 && total_num_docs > 10*1000) {
if(total_num_docs > 10*1000) {
size_t num_docs_with_facet = facet_field_map_it->second.seq_id_hashes->num_ids();
if(num_docs_with_facet > 0 && num_docs_with_facet < 100) {
is_sparse_field = true;
Expand All @@ -613,7 +500,6 @@ void facet_index_t::check_for_high_cardinality(const string& field_name, size_t
}

facet_field_map_it->second.counts.clear();
facet_field_map_it->second.count_map.clear();
facet_field_map_it->second.has_value_index = false;
//LOG(INFO) << "Dropped value index for field " << field_name;
}
Expand Down
2 changes: 1 addition & 1 deletion src/index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -709,7 +709,7 @@ void Index::index_field_in_memory(const field& afield, std::vector<index_record>
std::unordered_map<uint32_t, std::vector<facet_value_id_t>> seq_id_to_fvalues;

size_t total_num_docs = seq_ids->num_ids();
if(afield.facet && total_num_docs > 10*1000 && search_schema.size() > 100) {
if(afield.facet && total_num_docs > 10*1000) {
facet_index_v4->check_for_high_cardinality(afield.name, total_num_docs);
}

Expand Down
12 changes: 6 additions & 6 deletions test/collection_optimized_faceting_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2646,15 +2646,15 @@ TEST_F(CollectionOptimizedFacetingTest, ValueIndexStatsMinMax) {

ASSERT_EQ(1, results["facet_counts"].size());
ASSERT_EQ(2, results["facet_counts"][0]["counts"].size());
ASSERT_EQ("9", results["facet_counts"][0]["counts"][0]["value"]);
ASSERT_EQ("9.2", results["facet_counts"][0]["counts"][1]["value"]);
ASSERT_EQ("9.3", results["facet_counts"][0]["counts"][0]["value"].get<std::string>());
ASSERT_EQ("9.2", results["facet_counts"][0]["counts"][1]["value"].get<std::string>());

//stats
ASSERT_EQ(5, results["facet_counts"][0]["stats"].size());
ASSERT_FLOAT_EQ(9.1, results["facet_counts"][0]["stats"]["avg"].get<double>());
ASSERT_FLOAT_EQ(9.25, results["facet_counts"][0]["stats"]["avg"].get<double>());
ASSERT_FLOAT_EQ(8.800000190734863, results["facet_counts"][0]["stats"]["min"].get<double>());
ASSERT_FLOAT_EQ(9.300000190734863, results["facet_counts"][0]["stats"]["max"].get<double>());
ASSERT_FLOAT_EQ(18.2, results["facet_counts"][0]["stats"]["sum"].get<double>());
ASSERT_FLOAT_EQ(18.5, results["facet_counts"][0]["stats"]["sum"].get<double>());
ASSERT_FLOAT_EQ(2, results["facet_counts"][0]["stats"]["total_values"].get<size_t>());
}

Expand Down Expand Up @@ -2744,9 +2744,9 @@ TEST_F(CollectionOptimizedFacetingTest, StringFacetsCountListOrderTest) {

ASSERT_EQ(1, results["facet_counts"].size());
ASSERT_EQ(2, results["facet_counts"][0]["counts"].size());
ASSERT_EQ("The Dark Knight", results["facet_counts"][0]["counts"][0]["value"]);
ASSERT_EQ("The Dark Knight", results["facet_counts"][0]["counts"][0]["value"].get<std::string>());
ASSERT_EQ(6, results["facet_counts"][0]["counts"][0]["count"]);
ASSERT_EQ("The Godfather", results["facet_counts"][0]["counts"][1]["value"]);
ASSERT_EQ("The Shawshank Redemption", results["facet_counts"][0]["counts"][1]["value"].get<std::string>());
ASSERT_EQ(2, results["facet_counts"][0]["counts"][1]["count"]);
}

Expand Down
Loading

0 comments on commit 6876408

Please sign in to comment.