Skip to content

Commit

Permalink
Add a Stable Hash Algorithm
Browse files Browse the repository at this point in the history
Use SipHash-2-4 to replace llvm::MD5 as the hashing implementation backing Fingerprints and the interface hash.
  • Loading branch information
CodaFi committed Jan 22, 2021
1 parent 31d43bf commit f5cb08e
Show file tree
Hide file tree
Showing 4 changed files with 671 additions and 0 deletions.
236 changes: 236 additions & 0 deletions include/swift/Basic/StableHasher.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
//===--- StableHasher.h - Stable Hashing ------------------------*- C++ -*-===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2021 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
//
// An implementation of a stable hashing for Swift.
//
// Derived from the reference implementation for SipHash 2-4:
// https://github.com/veorq/SipHash
//
// With inline buffering derived from the hash implementation in the Swift
// Standard Library.
//
//===----------------------------------------------------------------------===//

#ifndef SWIFT_BASIC_STABLEHASHER_H
#define SWIFT_BASIC_STABLEHASHER_H

#include "llvm/Support/Endian.h"
#include "llvm/ADT/StringRef.h"
#include <algorithm>
#include <cstring>
#include <string>
#include <vector>
#include <tuple>
#include <utility>

namespace swift {

/// A \c StableHasher is an implementation of a 128-bit stable hash - built for
/// speed.
///
/// A "stable" hash in this context refers to the idea that the output of this
/// hasher is deterministic across instantiations of the compiler. In order to
/// support this goal, this hasher disallows run-dependent or otherwise
/// unstable values from entering into the hash-combiner. For example, this
/// hasher will statically reject attempts to hash-combine pointers and
/// aggregates of pointers. Note that this relies on user cooperation as well.
/// The order of hash-combines is pivotal, thus e.g. collection values should
/// have a guaranteed order or be sorted before being hash-combined.
///
/// Stable hash values must also be independent of the host architecture. For
/// integral types and enumerations, the default hash-combiner will
/// automatically byte-swap to a common little-endian format.
///
/// This hasher also allows for extending the hash-combiner to user-defined
/// types. To do so, define a (partial) specialization of
/// \c swift::StableHasher::Combiner<T>
///
/// template <typename T>
/// struct swift::StableHasher::Combiner<std::optional<T>> {
/// static void combine(StableHasher &hasher, const std::optional<T> &O) {
/// if (!O.has_value()) {
/// hasher.combine(0);
/// } else {
/// hasher.combine(1);
/// swift::StableHasher::Combiner<T>::combine(hasher, O.value());
/// }
/// }
/// };
///
/// The current implementation is the 128-bit (extended) SipHash 2-4, which
/// has been empirically demonstrated to have the best throughput relative to
/// the other SipHash tunings.
class StableHasher final {
private:
struct State {
uint64_t v0 = 0x736F6D6570736575;
uint64_t v1 = 0x646F72616E646f6D;
uint64_t v2 = 0x6C7967656E657261;
uint64_t v3 = 0x7465646279746573;
} state;

// A buffer of up to 8 items that this hasher uses to amortize the cost
// of the hashing function for hash-combines shorter than 64-bits.
uint8_t byteBuffer[8] = {0};
// msb lsb
// +---------+-------+-------+-------+-------+-------+-------+-------+
// |byteCount| length (<= 56 bits) |
// +---------+-------+-------+-------+-------+-------+-------+-------+
uint64_t lengthAndByteCount = 0;

public:
static StableHasher defaultHasher() { StableHasher hasher{0, 0}; return hasher; }

explicit StableHasher(uint64_t leftSeed, uint64_t rightSeed) {
state.v3 ^= rightSeed;
state.v2 ^= leftSeed;
state.v1 ^= rightSeed;
state.v0 ^= leftSeed;

state.v1 ^= 0xEE;
}

public:
template <typename T> struct Combiner {
// static void combine(StableHasher &hasher, const T &Val);
};

public:
/// Consume this stable hasher and compute the final 128-bit stable hash value.
std::pair<uint64_t, uint64_t> finalize() &&;

template <uint64_t N> void combine(uint8_t (&bits)[N]) {
static_assert(N > 0, "Cannot append VLA");
static_assert(N <= 8, "Can only append up to 64 bits at a time");

lengthAndByteCount += N;

const uint64_t bufLen = getBufferLength();
const uint64_t available = sizeof(byteBuffer) - bufLen;

// Cram as many bytes into the buffer as we can.
const uint64_t nhead = std::min(N, available);
if (nhead == sizeof(byteBuffer)) {
// We have headroom available for all 64 bits. Eagerly compress the
// now-full buffer into our state.
std::copy(bits, bits + sizeof(byteBuffer), byteBuffer);
} else if (N >= available) {
// There was some excess - append as many bytes as we can hold and
// compress the buffer into our state.
std::copy(bits, bits + nhead, byteBuffer + bufLen);
} else {
// We have headroom available for these bits.
std::copy(bits, bits + N, byteBuffer + bufLen);
return setBufferLength(bufLen + N);
}

constexpr auto endian = llvm::support::endianness::little;
compress(llvm::support::endian::read<uint64_t>(byteBuffer, endian));

// Now reseed the buffer with the remaining bytes.
const uint64_t remainder = N - available;
std::copy(bits + available, bits + N, byteBuffer);
return setBufferLength(remainder);
}

template <
typename T,
typename std::enable_if<std::is_integral<T>::value>::type * = nullptr>
void combine(T bits) {
constexpr auto endian = llvm::support::endianness::little;
uint8_t buf[sizeof(T)] = {0};
bits = llvm::support::endian::byte_swap<T>(bits, endian);
std::memcpy(buf, &bits, sizeof(T));
combine<sizeof(T)>(buf);
}

template <
typename EnumType,
typename std::enable_if<std::is_enum<EnumType>::value>::type * = nullptr>
void combine(EnumType value) {
using Underlying = typename std::underlying_type<EnumType>::type;
return this->template combine<Underlying>(static_cast<Underlying>(value));
}

template <typename T>
auto combine(const T *ptr) -> decltype("Cannot hash-combine pointers!"){};

template <typename T, typename... Ts>
void combine(const T &arg, const Ts &... args) {
return combine_many(arg, args...);
}

template <typename T, typename U> void combine(const std::pair<T, U> &arg) {
return combine_many(arg.first, arg.second);
}

template <typename T> void combine(const std::basic_string<T> &arg) {
return combine_range(arg.begin(), arg.end());
}

void combine(llvm::StringRef arg) {
return combine_range(arg.begin(), arg.end());
}

template <typename T,
decltype(StableHasher::Combiner<T>::combine) * = nullptr>
void combine(const T &val) {
return StableHasher::Combiner<T>::combine(*this, val);
}

template <typename ValueT> void combine_range(ValueT first, ValueT last) {
combine(std::distance(first, last));
while (first != last) {
combine(*first++);
}
}

template <typename... Ts> void combine(const std::tuple<Ts...> &arg) {
return combine_tuple(arg, typename std::index_sequence_for<Ts...>{});
}

private:
template <typename... Ts, unsigned... Indices>
void combine_tuple(const std::tuple<Ts...> &arg,
std::index_sequence<Indices...> indices) {
return combine_many(hash_value(std::get<Indices>(arg))...);
}

// base case.
void combine_many() {}

// recursive case
template <typename T, typename... Ts>
void combine_many(const T &arg, const Ts &... args) {
combine(arg);
return combine_many(args...);
}

private:
/// Return the number of bytes in the inline buffer.
uint64_t getBufferLength() const { return lengthAndByteCount >> 56; }
/// Set the number of bytes in the inline buffer.
void setBufferLength(uint64_t newLen) {
lengthAndByteCount = getDigestLength() | (newLen << 56);
}

/// Return the number of bytes that have been hash-combined so far.
uint64_t getDigestLength() const {
return lengthAndByteCount & ~(uint64_t(0xFF) << 56);
}

void compress(uint64_t value);
};

} // namespace swift

#endif // SWIFT_BASIC_STABLEHASHER_H
102 changes: 102 additions & 0 deletions lib/Basic/StableHasher.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
//===--- StableHasher.cpp - Stable Hasher ---------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2021 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

#include "swift/Basic/StableHasher.h"

using namespace swift;

#define ROTATE_LEFT(ROTAND, DISTANCE) \
(uint64_t)((ROTAND) << (DISTANCE)) | ((ROTAND) >> (64 - (DISTANCE)))

namespace {
static inline void sip_round(uint64_t &v0, uint64_t &v1, uint64_t &v2,
uint64_t &v3) {
v0 += v1;
v1 = ROTATE_LEFT(v1, 13);
v1 ^= v0;
v0 = ROTATE_LEFT(v0, 32);
v2 += v3;
v3 = ROTATE_LEFT(v3, 16);
v3 ^= v2;
v0 += v3;
v3 = ROTATE_LEFT(v3, 21);
v3 ^= v0;
v2 += v1;
v1 = ROTATE_LEFT(v1, 17);
v1 ^= v2;
v2 = ROTATE_LEFT(v2, 32);
}
}; // end anonymous namespace

void StableHasher::compress(uint64_t value) {
state.v3 ^= value;
for (unsigned i = 0; i < 2; ++i) {
::sip_round(state.v0, state.v1, state.v2, state.v3);
}
state.v0 ^= value;
}

std::pair<uint64_t, uint64_t> StableHasher::finalize() && {
auto fillBitsFromBuffer = [](uint64_t fill, uint8_t *bits) {
uint64_t head = 0;
switch (fill) {
case 7:
head |= uint64_t(bits[6]) << 48;
LLVM_FALLTHROUGH;
case 6:
head |= uint64_t(bits[5]) << 40;
LLVM_FALLTHROUGH;
case 5:
head |= uint64_t(bits[4]) << 32;
LLVM_FALLTHROUGH;
case 4:
head |= uint64_t(bits[3]) << 24;
LLVM_FALLTHROUGH;
case 3:
head |= uint64_t(bits[2]) << 16;
LLVM_FALLTHROUGH;
case 2:
head |= uint64_t(bits[1]) << 8;
LLVM_FALLTHROUGH;
case 1:
head |= uint64_t(bits[0]);
break;
case 0:
break;
default:
break;
}
return head;
};

const uint64_t b = fillBitsFromBuffer(getBufferLength(), byteBuffer);
compress(((getDigestLength() & 0xFF) << 56) | b);

state.v2 ^= 0xEE;

for (unsigned i = 0; i < 4; ++i) {
::sip_round(state.v0, state.v1, state.v2, state.v3);
}

const uint64_t h1 = state.v0 ^ state.v1 ^ state.v2 ^ state.v3;

state.v1 ^= 0xDD;

for (unsigned i = 0; i < 4; ++i) {
::sip_round(state.v0, state.v1, state.v2, state.v3);
}

const uint64_t h2 = state.v0 ^ state.v1 ^ state.v2 ^ state.v3;

return std::make_pair(h1, h2);
}

1 change: 1 addition & 0 deletions unittests/Basic/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ add_swift_unittest(SwiftBasicTests
PrefixMapTest.cpp
RangeTest.cpp
SourceManagerTest.cpp
StableHasher.cpp
STLExtrasTest.cpp
StringExtrasTest.cpp
SuccessorMapTest.cpp
Expand Down
Loading

0 comments on commit f5cb08e

Please sign in to comment.