Skip to content

Commit

Permalink
Merge pull request #15 from graydon/corrected-quantile-calculation
Browse files Browse the repository at this point in the history
Correct quantile calculation
  • Loading branch information
graydon authored Mar 1, 2020
2 parents d10f3c8 + bdf24a6 commit 9e0470f
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 32 deletions.
87 changes: 64 additions & 23 deletions src/medida/stats/snapshot.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <cmath>
#include <cstddef>
#include <stdexcept>
#include <cassert>

namespace medida {
namespace stats {
Expand Down Expand Up @@ -133,31 +134,71 @@ std::vector<double> Snapshot::Impl::getValues() const {
}


double Snapshot::Impl::getValue(double quantile) const {
if (quantile < 0.0 || quantile > 1.0) {
throw std::invalid_argument("quantile is not in [0..1]");
}

if (values_.empty()) {
return 0.0;
}

auto pos = quantile * (values_.size() + 1);

if (pos < 1) {
return values_.front();
}

if (pos >= values_.size()) {
return values_.back();
}

double lower = values_[pos - 1];
double upper = values_[pos];
return lower + (pos - std::floor(pos)) * (upper - lower);
double Snapshot::Impl::getValue(double quantile) const
{
// Calculating a quantile is _mostly_ just about scaling the requested
// quantile from the range it's given in [0.0, 1.0] to an index value in the
// range of valid indices for the sorted data. Unfortunately there are two
// complications:
//
// 1. If the scaled quantile doesn't land exactly on an integer value, you
// have to interpolate "somehow" between the values at ceiling and
// floor indices. It turns out there's little agreement in the world of
// stats about which form of interpolation is best or how to achieve
// it. R itself has 9 variants available, but the "most popular" (and
// its default) appears to be algorithm R7 from Hyndman and Fan (1996).
//
// 2. Even "textbook" algorithms like R7 are described using 1-based
// indexing, which makes it somewhat non-obvious to transcribe directly
// or even copy from other scientific languages (that do 1-based) into
// C++ 0-based indexing. So we have to try our own hand at implementing
// it "from intent" rather than copying code directly from elsewhere.
//
// We've tested this with enough test vectors from R to convince ourselves
// it's a faithful implementation.
//
// https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/quantile
// https://en.wikipedia.org/wiki/Quantile#Estimating_quantiles_from_a_sample

if (quantile < 0.0 || quantile > 1.0)
{
throw std::invalid_argument("quantile is not in [0..1]");
}

if (values_.empty())
{
return 0.0;
}

// Step 1: define range of actually-allowed indexes: [0, max_idx]
size_t max_idx = values_.size() - 1;

// Step 2: calculate "ideal" fractional index (with 1.0 => max_idx).
double ideal_index = quantile * max_idx;

// Step 3: calculate ideal-index floor and integral low and hi indexes.
double floor_ideal = std::floor(ideal_index);
assert(floor_ideal >= 0.0);
size_t lo_idx = static_cast<size_t>(floor_ideal);
assert(lo_idx <= max_idx);
size_t hi_idx = lo_idx + 1;

// Step 4: if there's no upper sample to interpolate with, just return
// the highest one.
if (hi_idx > max_idx)
{
return values_.back();
}

// Step 5: return linear interpolation of elements at lo_idx and hi_idx.
double delta = ideal_index - floor_ideal;
assert(delta >= 0.0);
assert(delta < 1.0);
double lower = values_.at(lo_idx);
double upper = values_.at(hi_idx);
return lower + (delta * (upper - lower));
}


double Snapshot::Impl::getMedian() const {
return getValue(kMEDIAN_Q);
}
Expand Down
10 changes: 5 additions & 5 deletions test/stats/test_snapshot.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,27 +34,27 @@ TEST_F(SnapshotTest, hasAMedian) {


TEST_F(SnapshotTest, hasAp75) {
EXPECT_DOUBLE_EQ(4.5, snapshot.get75thPercentile());
EXPECT_DOUBLE_EQ(4, snapshot.get75thPercentile());
}


TEST_F(SnapshotTest, hasAp95) {
EXPECT_DOUBLE_EQ(5.0, snapshot.get95thPercentile());
EXPECT_DOUBLE_EQ(4.7999999999999998, snapshot.get95thPercentile());
}


TEST_F(SnapshotTest, hasAp98) {
EXPECT_DOUBLE_EQ(5.0, snapshot.get98thPercentile());
EXPECT_DOUBLE_EQ(4.9199999999999999, snapshot.get98thPercentile());
}


TEST_F(SnapshotTest, hasAp99) {
EXPECT_DOUBLE_EQ(5.0, snapshot.get99thPercentile());
EXPECT_DOUBLE_EQ(4.96, snapshot.get99thPercentile());
}


TEST_F(SnapshotTest, hasAp999) {
EXPECT_DOUBLE_EQ(5.0, snapshot.get999thPercentile());
EXPECT_DOUBLE_EQ(4.9960000000000004, snapshot.get999thPercentile());
}


Expand Down
4 changes: 2 additions & 2 deletions test/test_histogram.cc
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ TEST(HistogramTest, aHistogramWith1000Elements) {

auto snapshot = histogram.GetSnapshot();
EXPECT_NEAR(500.5, snapshot.getMedian(), 0.0001);
EXPECT_NEAR(750.75, snapshot.get75thPercentile(), 0.0001);
EXPECT_NEAR(990.99, snapshot.get99thPercentile(), 0.0001);
EXPECT_NEAR(750.25, snapshot.get75thPercentile(), 0.0001);
EXPECT_NEAR(990.00999999999999, snapshot.get99thPercentile(), 0.0001);
EXPECT_EQ(1000, snapshot.size());
}
4 changes: 2 additions & 2 deletions test/test_timer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,8 @@ TEST_F(TimerTest, timingASeriesOfEvents) {

auto snapshot = timer.GetSnapshot();
EXPECT_NEAR(20.0, snapshot.getMedian(), 0.001);
EXPECT_NEAR(35.0, snapshot.get75thPercentile(), 0.001);
EXPECT_NEAR(40.0, snapshot.get99thPercentile(), 0.001);
EXPECT_NEAR(30.0, snapshot.get75thPercentile(), 0.001);
EXPECT_NEAR(39.600000000000001, snapshot.get99thPercentile(), 0.001);
EXPECT_EQ(5, snapshot.size());
}

Expand Down

0 comments on commit 9e0470f

Please sign in to comment.