Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Report drifting close time on local node #4101

Merged
merged 1 commit into from
Aug 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 64 additions & 0 deletions src/herder/HerderImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,11 @@ using namespace std;
namespace stellar
{

// Roughly ~10 minutes of consensus
constexpr uint32 const CLOSE_TIME_DRIFT_LEDGER_WINDOW_SIZE = 120;
// 10 seconds of drift threshold
constexpr uint32 const CLOSE_TIME_DRIFT_SECONDS_THRESHOLD = 10;

constexpr uint32 const TRANSACTION_QUEUE_TIMEOUT_LEDGERS = 4;
constexpr uint32 const TRANSACTION_QUEUE_BAN_LEDGERS = 10;
constexpr uint32 const TRANSACTION_QUEUE_SIZE_MULTIPLIER = 2;
Expand Down Expand Up @@ -399,13 +404,49 @@ HerderImpl::writeDebugTxSet(LedgerCloseData const& lcd)
}
}

void
recordExternalizeAndCheckCloseTimeDrift(
uint64 slotIndex, StellarValue const& value,
std::map<uint32_t, std::pair<uint64_t, std::optional<uint64_t>>>& ctMap)
{
auto it = ctMap.find(slotIndex);
if (it != ctMap.end())
{
it->second.second = value.closeTime;
}

if (ctMap.size() >= CLOSE_TIME_DRIFT_LEDGER_WINDOW_SIZE)
{
medida::Histogram h(medida::SamplingInterface::SampleType::kSliding);
SirTyson marked this conversation as resolved.
Show resolved Hide resolved
for (auto const& [ledgerSeq, closeTimePair] : ctMap)
{
auto const& [localCT, externalizedCT] = closeTimePair;
if (externalizedCT)
{
h.Update(*externalizedCT - localCT);
}
}
auto drift = static_cast<int>(h.GetSnapshot().get75thPercentile());
if (std::abs(drift) > CLOSE_TIME_DRIFT_SECONDS_THRESHOLD)
{
CLOG_WARNING(Herder, POSSIBLY_BAD_LOCAL_CLOCK);
CLOG_WARNING(Herder, "Close time local drift is: {}", drift);
}

ctMap.clear();
}
}

void
HerderImpl::valueExternalized(uint64 slotIndex, StellarValue const& value,
bool isLatestSlot)
{
ZoneScoped;
const int DUMP_SCP_TIMEOUT_SECONDS = 20;

recordExternalizeAndCheckCloseTimeDrift(slotIndex, value,
mDriftCTSlidingWindow);

if (isLatestSlot)
{
// called both here and at the end (this one is in case of an exception)
Expand Down Expand Up @@ -1321,6 +1362,29 @@ HerderImpl::triggerNextLedger(uint32_t ledgerSeqToTrigger,
// so this is the most appropriate value to use as closeTime.
uint64_t nextCloseTime =
VirtualClock::to_time_t(mApp.getClock().system_now());
if (ledgerSeqToTrigger == lcl.header.ledgerSeq + 1)
{
auto it = mDriftCTSlidingWindow.find(ledgerSeqToTrigger);
if (it == mDriftCTSlidingWindow.end())
{
// Record local close time _before_ it gets adjusted to be valid
// below
mDriftCTSlidingWindow[ledgerSeqToTrigger] =
std::make_pair(nextCloseTime, std::nullopt);
while (mDriftCTSlidingWindow.size() >
CLOSE_TIME_DRIFT_LEDGER_WINDOW_SIZE)
{
mDriftCTSlidingWindow.erase(mDriftCTSlidingWindow.begin());
}
}
else
{
CLOG_WARNING(Herder,
SirTyson marked this conversation as resolved.
Show resolved Hide resolved
"Herder::triggerNextLedger called twice on ledger {}",
ledgerSeqToTrigger);
}
}

if (nextCloseTime <= lcl.header.scpValue.closeTime)
{
nextCloseTime = lcl.header.scpValue.closeTime + 1;
Expand Down
5 changes: 5 additions & 0 deletions src/herder/HerderImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,11 @@ class HerderImpl : public Herder
// restores SCP state based on the last messages saved on disk
void restoreSCPState();

// Map SCP slots to local time of nomination and the time slot was
// externalized by the network
std::map<uint32_t, std::pair<uint64_t, std::optional<uint64_t>>>
mDriftCTSlidingWindow;

// saves upgrade parameters
void persistUpgrades();
void restoreUpgrades();
Expand Down
3 changes: 3 additions & 0 deletions src/main/ErrorMessages.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,7 @@ constexpr auto const POSSIBLY_CORRUPTED_QUORUM_SET =
"Check your QUORUM_SET for corrupted nodes";
constexpr auto const UPGRADE_STELLAR_CORE =
"Upgrade this stellar-core installation to newest version";
constexpr auto const POSSIBLY_BAD_LOCAL_CLOCK =
"It is possible the clock on the local node has drifted. Run NTP "
"periodically to avoid this.";
}
Loading