Skip to content

Commit

Permalink
Infer potential drifting clock by comparing validator's nominated clo…
Browse files Browse the repository at this point in the history
…se time with close time externalized
  • Loading branch information
marta-lokhova committed Jul 31, 2024
1 parent 4860c68 commit 3b2b2d9
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 0 deletions.
64 changes: 64 additions & 0 deletions src/herder/HerderImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,11 @@ using namespace std;
namespace stellar
{

// Roughly ~10 minutes of consensus
constexpr uint32 const CLOSE_TIME_DRIFT_LEDGER_WINDOW_SIZE = 120;
// 10 seconds of drift threshold
constexpr uint32 const CLOSE_TIME_DRIFT_SECONDS_THRESHOLD = 10;

constexpr uint32 const TRANSACTION_QUEUE_TIMEOUT_LEDGERS = 4;
constexpr uint32 const TRANSACTION_QUEUE_BAN_LEDGERS = 10;
constexpr uint32 const TRANSACTION_QUEUE_SIZE_MULTIPLIER = 2;
Expand Down Expand Up @@ -399,13 +404,49 @@ HerderImpl::writeDebugTxSet(LedgerCloseData const& lcd)
}
}

void
recordExternalizeAndCheckCloseTimeDrift(
uint64 slotIndex, StellarValue const& value,
std::map<uint32_t, std::pair<uint64_t, std::optional<uint64_t>>>& ctMap)
{
auto it = ctMap.find(slotIndex);
if (it != ctMap.end())
{
it->second.second = value.closeTime;
}

if (ctMap.size() >= CLOSE_TIME_DRIFT_LEDGER_WINDOW_SIZE)
{
medida::Histogram h(medida::SamplingInterface::SampleType::kSliding);
for (auto const& [ledgerSeq, closeTimePair] : ctMap)
{
auto const& [localCT, externalizedCT] = closeTimePair;
if (externalizedCT)
{
h.Update(*externalizedCT - localCT);
}
}
auto drift = static_cast<int>(h.GetSnapshot().get75thPercentile());
if (std::abs(drift) > CLOSE_TIME_DRIFT_SECONDS_THRESHOLD)
{
CLOG_WARNING(Herder, POSSIBLY_BAD_LOCAL_CLOCK);
CLOG_WARNING(Herder, "Close time local drift is: {}", drift);
}

ctMap.clear();
}
}

void
HerderImpl::valueExternalized(uint64 slotIndex, StellarValue const& value,
bool isLatestSlot)
{
ZoneScoped;
const int DUMP_SCP_TIMEOUT_SECONDS = 20;

recordExternalizeAndCheckCloseTimeDrift(slotIndex, value,
mDriftCTSlidingWindow);

if (isLatestSlot)
{
// called both here and at the end (this one is in case of an exception)
Expand Down Expand Up @@ -1321,6 +1362,29 @@ HerderImpl::triggerNextLedger(uint32_t ledgerSeqToTrigger,
// so this is the most appropriate value to use as closeTime.
uint64_t nextCloseTime =
VirtualClock::to_time_t(mApp.getClock().system_now());
if (ledgerSeqToTrigger == lcl.header.ledgerSeq + 1)
{
auto it = mDriftCTSlidingWindow.find(ledgerSeqToTrigger);
if (it == mDriftCTSlidingWindow.end())
{
// Record local close time _before_ it gets adjusted to be valid
// below
mDriftCTSlidingWindow[ledgerSeqToTrigger] =
std::make_pair(nextCloseTime, std::nullopt);
while (mDriftCTSlidingWindow.size() >
CLOSE_TIME_DRIFT_LEDGER_WINDOW_SIZE)
{
mDriftCTSlidingWindow.erase(mDriftCTSlidingWindow.begin());
}
}
else
{
CLOG_WARNING(Herder,
"Herder::triggerNextLedger called twice on ledger {}",
ledgerSeqToTrigger);
}
}

if (nextCloseTime <= lcl.header.scpValue.closeTime)
{
nextCloseTime = lcl.header.scpValue.closeTime + 1;
Expand Down
5 changes: 5 additions & 0 deletions src/herder/HerderImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,11 @@ class HerderImpl : public Herder
// restores SCP state based on the last messages saved on disk
void restoreSCPState();

// Map SCP slots to local time of nomination and the time slot was
// externalized by the network
std::map<uint32_t, std::pair<uint64_t, std::optional<uint64_t>>>
mDriftCTSlidingWindow;

// saves upgrade parameters
void persistUpgrades();
void restoreUpgrades();
Expand Down
3 changes: 3 additions & 0 deletions src/main/ErrorMessages.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,7 @@ constexpr auto const POSSIBLY_CORRUPTED_QUORUM_SET =
"Check your QUORUM_SET for corrupted nodes";
constexpr auto const UPGRADE_STELLAR_CORE =
"Upgrade this stellar-core installation to newest version";
constexpr auto const POSSIBLY_BAD_LOCAL_CLOCK =
"It is possible the clock on the local node has drifted. Run NTP "
"periodically to avoid this.";
}

0 comments on commit 3b2b2d9

Please sign in to comment.