Skip to content

Commit

Permalink
Revise perfect hash to align with libgrape-lite's pthash (#1992)
Browse files Browse the repository at this point in the history
Fixes #1852

Signed-off-by: vegetableysm <yuanshumin.ysm@alibaba-inc.com>
  • Loading branch information
vegetableysm authored Sep 2, 2024
1 parent a9344ae commit da21407
Show file tree
Hide file tree
Showing 12 changed files with 247 additions and 1,802 deletions.
4 changes: 2 additions & 2 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@
path = modules/graph/thirdparty/GraphAr
url = https://github.com/alibaba/GraphAr.git
shallow = true
[submodule "modules/graph/thirdparty/libgrape-lite"]
path = modules/graph/thirdparty/libgrape-lite
[submodule "thirdparty/libgrape-lite"]
path = thirdparty/libgrape-lite
url = https://github.com/alibaba/libgrape-lite.git
shallow = true
[submodule "modules/graph/thirdparty/powturbo"]
Expand Down
4 changes: 0 additions & 4 deletions NOTICE.txt
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,6 @@ This product includes software from the ClickHouse project
* Copyright 2016-2022 ClickHouse, Inc.
* https://github.com/ClickHouse/ClickHouse

This product includes software from the BBHash project
* Copyright (c) 2015 Guillaume Rizk
* https://github.com/rizkg/BBHash

This product includes software from the rax project (BSD, 2-clause)
* Copyright (c) 2017-2019, Salvatore Sanfilippo <antirez at gmail dot com>
* https://github.com/antirez/rax
1 change: 0 additions & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,6 @@ We thank the following excellent open-source projects:
- `skywalking-infra-e2e <https://github.com/apache/skywalking-infra-e2e>`_, a next-generation end-to-end testing framework.
- `skywalking-swck <https://github.com/apache/skywalking-swck>`_, a Kubernetes operator for Apache SkyWalking.
- `wyhash <https://github.com/alainesp/wy>`_, C++ wrapper around wyhash and wyrand.
- `BBHash <https://github.com/rizkg/BBHash>`_, a fast, minimal-memory perfect hash function.
- `rax <https://github.com/antirez/rax>`_, an ANSI C radix tree implementation.
- `MurmurHash3 <https://github.com/aappleby/smhasher>`_, a fast non-cryptographic hash function.

Expand Down
32 changes: 24 additions & 8 deletions modules/basic/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,34 @@ file(GLOB_RECURSE BASIC_SRC_FILES "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc")

add_library(vineyard_basic ${BASIC_SRC_FILES})
target_add_debuginfo(vineyard_basic)
find_package(MPI REQUIRED)
target_link_libraries(vineyard_basic PUBLIC vineyard_client
${ARROW_SHARED_LIB}
${GLOG_LIBRARIES}
${MPI_CXX_LIBRARIES}
)
target_include_directories(vineyard_basic PUBLIC ${ARROW_INCLUDE_DIR})
target_include_directories(vineyard_basic PUBLIC ${ARROW_INCLUDE_DIR} ${MPI_CXX_INCLUDE_PATH})

find_package(libgrapelite 0.3.4 QUIET)
if(LIBGRAPELITE_INCLUDE_DIRS)
message(STATUS "-- Found libgrape-lite: ${LIBGRAPELITE_INCLUDE_DIRS}")
target_include_directories(vineyard_basic PUBLIC ${LIBGRAPELITE_INCLUDE_DIRS})
else()
# use bundled libgrape-lite
message(STATUS "-- Building libgrape-lite from submodule: ${CMAKE_SOURCE_DIR}/thirdparty/libgrape-lite")
set(BUILD_LIBGRAPELITE_DOCS OFF CACHE BOOL "no libgrape-lite docs")
set(BUILD_LIBGRAPELITE_TESTS OFF CACHE BOOL "no libgrape-lite tests")
# use `add_subdirectory` so that libgrape-lite is built with the same CMAKE_BUILD_TYPE as vineyard itself,
# and to ensure that libgrapelite-targets-{debug/release}.cmake is generated during installation.
add_subdirectory("${CMAKE_SOURCE_DIR}/thirdparty/libgrape-lite"
"${CMAKE_SOURCE_DIR}/thirdparty/libgrape-lite"
)
target_include_directories(vineyard_basic PUBLIC
$<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}/thirdparty/libgrape-lite>
$<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}/thirdparty/libgrape-lite/thirdparty>
$<INSTALL_INTERFACE:include>
)
endif()

# install bundled thirdparty: flat_hash_map
install(DIRECTORY ${PROJECT_SOURCE_DIR}/thirdparty/flat_hash_map
Expand All @@ -64,13 +87,6 @@ install(DIRECTORY ${PROJECT_SOURCE_DIR}/thirdparty/wyhash
PATTERN "*.hpp" # select C++ template header files
)

install(DIRECTORY ${PROJECT_SOURCE_DIR}/thirdparty/BBHash
DESTINATION include/vineyard/contrib # target directory
FILES_MATCHING # install only matched files
PATTERN "*.h" # select header files
PATTERN "*.hpp" # select C++ template header files
)

# install bundled thirdparty: cityhash
install(DIRECTORY ${PROJECT_SOURCE_DIR}/thirdparty/cityhash
DESTINATION include/vineyard/contrib # target directory
Expand Down
106 changes: 69 additions & 37 deletions modules/basic/ds/hashmap.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,16 +29,7 @@ limitations under the License.
#include "client/ds/blob.h"
#include "client/ds/i_object.h"
#include "common/util/arrow.h" // IWYU pragma: keep

#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-variable"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#endif
#include "BBHash/BooPHF.h"
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
#include "grape/vertex_map/idxers/pthash_idxer.h"

namespace vineyard {

Expand Down Expand Up @@ -229,8 +220,6 @@ class PerfectHashmapBuilder : public PerfectHashmapBaseBuilder<K, V> {
public:
static_assert(std::is_pod<V>::value, "V in perfect hashmap must be POD type");

typedef boomphf::SingleHashFunctor<K> hasher_t;

explicit PerfectHashmapBuilder(Client& client)
: PerfectHashmapBaseBuilder<K, V>(client) {}

Expand All @@ -248,12 +237,21 @@ class PerfectHashmapBuilder : public PerfectHashmapBaseBuilder<K, V> {
const V* values, const size_t n_elements) {
this->set_num_elements_(n_elements);
this->set_ph_keys_(keys);
RETURN_ON_ERROR(detail::boomphf::build_keys(
bphf_, reinterpret_cast<const K*>(keys->data()), n_elements));
for (size_t i = 0; i < n_elements; ++i) {
this->builder_.add((reinterpret_cast<const K*>(keys->data()))[i]);
}

this->builder_.buildPhf();
std::unique_ptr<BlobWriter> writer;
size_t serialize_size = this->builder_.getSerializeSize();
RETURN_ON_ERROR(client.CreateBlob(serialize_size, writer));
this->builder_.finishInplace(writer->data(), serialize_size, this->idxer_);
writer->Seal(client, buf);

return this->allocateValues(
client, n_elements, [&](V* shuffled_values) -> Status {
return detail::boomphf::build_values(
bphf_, reinterpret_cast<const K*>(keys->data()), n_elements,
return detail::perfect_hash::build_values(
idxer_, reinterpret_cast<const K*>(keys->data()), n_elements,
values, shuffled_values);
});
}
Expand All @@ -266,11 +264,27 @@ class PerfectHashmapBuilder : public PerfectHashmapBaseBuilder<K, V> {
const V* values, const size_t n_elements) {
this->set_num_elements_(n_elements);
this->set_ph_keys_(keys);
RETURN_ON_ERROR(detail::boomphf::build_keys(bphf_, keys->GetArray()));
for (auto iter =
detail::perfect_hash::arrow_array_iterator<K, ArrowArrayType<K>>(
keys->GetArray()->begin());
iter !=
detail::perfect_hash::arrow_array_iterator<K, ArrowArrayType<K>>(
keys->GetArray()->end());
iter++) {
this->builder_.add(*iter);
}

this->builder_.buildPhf();
std::unique_ptr<BlobWriter> writer;
size_t serialize_size = this->builder_.getSerializeSize();
RETURN_ON_ERROR(client.CreateBlob(serialize_size, writer));
this->builder_.finishInplace(writer->data(), serialize_size, this->idxer_);
writer->Seal(client, buf);

return this->allocateValues(
client, n_elements, [&](V* shuffled_values) -> Status {
return detail::boomphf::build_values(bphf_, keys->GetArray(), values,
shuffled_values);
return detail::perfect_hash::build_values(idxer_, keys->GetArray(),
values, shuffled_values);
});
return Status::OK();
}
Expand All @@ -289,12 +303,21 @@ class PerfectHashmapBuilder : public PerfectHashmapBaseBuilder<K, V> {
const V begin_value, const size_t n_elements) {
this->set_num_elements_(n_elements);
this->set_ph_keys_(keys);
RETURN_ON_ERROR(detail::boomphf::build_keys(
bphf_, reinterpret_cast<const K*>(keys->data()), n_elements));
for (size_t i = 0; i < n_elements; ++i) {
this->builder_.add((reinterpret_cast<const K*>(keys->data()))[i]);
}

this->builder_.buildPhf();
std::unique_ptr<BlobWriter> writer;
size_t serialize_size = this->builder_.getSerializeSize();
RETURN_ON_ERROR(client.CreateBlob(serialize_size, writer));
this->builder_.finishInplace(writer->data(), serialize_size, this->idxer_);
writer->Seal(client, buf);

return this->allocateValues(
client, n_elements, [&](V* shuffled_values) -> Status {
return detail::boomphf::build_values(
bphf_, reinterpret_cast<const K*>(keys->data()), n_elements,
return detail::perfect_hash::build_values(
idxer_, reinterpret_cast<const K*>(keys->data()), n_elements,
begin_value, shuffled_values);
});
}
Expand All @@ -307,11 +330,27 @@ class PerfectHashmapBuilder : public PerfectHashmapBaseBuilder<K, V> {
const V begin_value, const size_t n_elements) {
this->set_num_elements_(n_elements);
this->set_ph_keys_(keys);
RETURN_ON_ERROR(detail::boomphf::build_keys(bphf_, keys->GetArray()));
for (auto iter =
detail::perfect_hash::arrow_array_iterator<K, ArrowArrayType<K>>(
keys->GetArray()->begin());
iter !=
detail::perfect_hash::arrow_array_iterator<K, ArrowArrayType<K>>(
keys->GetArray()->end());
iter++) {
this->builder_.add(*iter);
}

this->builder_.buildPhf();
std::unique_ptr<BlobWriter> writer;
size_t serialize_size = this->builder_.getSerializeSize();
RETURN_ON_ERROR(client.CreateBlob(serialize_size, writer));
this->builder_.finishInplace(writer->data(), serialize_size, this->idxer_);
writer->Seal(client, buf);

return this->allocateValues(
client, n_elements, [&](V* shuffled_values) -> Status {
return detail::boomphf::build_values(bphf_, keys->GetArray(),
begin_value, shuffled_values);
return detail::perfect_hash::build_values(
idxer_, keys->GetArray(), begin_value, shuffled_values);
});
return Status::OK();
}
Expand All @@ -323,15 +362,7 @@ class PerfectHashmapBuilder : public PerfectHashmapBaseBuilder<K, V> {
*
*/
Status Build(Client& client) override {
size_t size = detail::boomphf::bphf_serde::compute_size(bphf_);
std::unique_ptr<BlobWriter> blob_writer;
RETURN_ON_ERROR(client.CreateBlob(size, blob_writer));
char* dst = detail::boomphf::bphf_serde::ser(blob_writer->data(), bphf_);
RETURN_ON_ASSERT(dst == blob_writer->data() + size,
"boomphf serialization error: buffer size mismatched");
std::shared_ptr<Object> blob;
RETURN_ON_ERROR(blob_writer->Seal(client, blob));
this->set_ph_(std::dynamic_pointer_cast<Blob>(blob));
this->set_ph_(buf);
return Status::OK();
}

Expand Down Expand Up @@ -359,10 +390,11 @@ class PerfectHashmapBuilder : public PerfectHashmapBaseBuilder<K, V> {
return Status::OK();
}

boomphf::mphf<K, hasher_t> bphf_;
grape::PTHashIdxerBuilder<K, uint64_t> builder_;
grape::PTHashIdxer<K, uint64_t> idxer_;
std::shared_ptr<Object> buf;

const int concurrency_ = std::thread::hardware_concurrency();
const double gamma_ = 2.5f;
};

} // namespace vineyard
Expand Down
Loading

0 comments on commit da21407

Please sign in to comment.