diff --git a/src/medida/stats/snapshot.cc b/src/medida/stats/snapshot.cc index 48cd592..d8c8242 100644 --- a/src/medida/stats/snapshot.cc +++ b/src/medida/stats/snapshot.cc @@ -8,6 +8,7 @@ #include #include #include +#include namespace medida { namespace stats { @@ -133,31 +134,71 @@ std::vector Snapshot::Impl::getValues() const { } -double Snapshot::Impl::getValue(double quantile) const { - if (quantile < 0.0 || quantile > 1.0) { - throw std::invalid_argument("quantile is not in [0..1]"); - } - - if (values_.empty()) { - return 0.0; - } - - auto pos = quantile * (values_.size() + 1); - - if (pos < 1) { - return values_.front(); - } - - if (pos >= values_.size()) { - return values_.back(); - } - - double lower = values_[pos - 1]; - double upper = values_[pos]; - return lower + (pos - std::floor(pos)) * (upper - lower); +double Snapshot::Impl::getValue(double quantile) const +{ + // Calculating a quantile is _mostly_ just about scaling the requested + // quantile from the range it's given in [0.0, 1.0] to an index value in the + // range of valid indices for the sorted data. Unfortunately there are two + // complications: + // + // 1. If the scaled quantile doesn't land exactly on an integer value, you + // have to interpolate "somehow" between the values at ceiling and + // floor indices. It turns out there's little agreement in the world of + // stats about which form of interpolation is best or how to achieve + // it. R itself has 9 variants available, but the "most popular" (and + // its default) appears to be algorithm R7 from Hyndman and Fan (1996). + // + // 2. Even "textbook" algorithms like R7 are described using 1-based + // indexing, which makes it somewhat non-obvious to transcribe directly + // or even copy from other scientific languages (that do 1-based) into + // C++ 0-based indexing. So we have to try our own hand at implementing + // it "from intent" rather than copying code directly from elsewhere. + // + // We've tested this with enough test vectors from R to convince ourselves + // it's a faithful implementation. + // + // https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/quantile + // https://en.wikipedia.org/wiki/Quantile#Estimating_quantiles_from_a_sample + + if (quantile < 0.0 || quantile > 1.0) + { + throw std::invalid_argument("quantile is not in [0..1]"); + } + + if (values_.empty()) + { + return 0.0; + } + + // Step 1: define range of actually-allowed indexes: [0, max_idx] + size_t max_idx = values_.size() - 1; + + // Step 2: calculate "ideal" fractional index (with 1.0 => max_idx). + double ideal_index = quantile * max_idx; + + // Step 3: calculate ideal-index floor and integral low and hi indexes. + double floor_ideal = std::floor(ideal_index); + assert(floor_ideal >= 0.0); + size_t lo_idx = static_cast(floor_ideal); + assert(lo_idx <= max_idx); + size_t hi_idx = lo_idx + 1; + + // Step 4: if there's no upper sample to interpolate with, just return + // the highest one. + if (hi_idx > max_idx) + { + return values_.back(); + } + + // Step 5: return linear interpolation of elements at lo_idx and hi_idx. + double delta = ideal_index - floor_ideal; + assert(delta >= 0.0); + assert(delta < 1.0); + double lower = values_.at(lo_idx); + double upper = values_.at(hi_idx); + return lower + (delta * (upper - lower)); } - double Snapshot::Impl::getMedian() const { return getValue(kMEDIAN_Q); } diff --git a/test/stats/test_snapshot.cc b/test/stats/test_snapshot.cc index 742cfa3..9b68ea3 100644 --- a/test/stats/test_snapshot.cc +++ b/test/stats/test_snapshot.cc @@ -34,27 +34,27 @@ TEST_F(SnapshotTest, hasAMedian) { TEST_F(SnapshotTest, hasAp75) { - EXPECT_DOUBLE_EQ(4.5, snapshot.get75thPercentile()); + EXPECT_DOUBLE_EQ(4, snapshot.get75thPercentile()); } TEST_F(SnapshotTest, hasAp95) { - EXPECT_DOUBLE_EQ(5.0, snapshot.get95thPercentile()); + EXPECT_DOUBLE_EQ(4.7999999999999998, snapshot.get95thPercentile()); } TEST_F(SnapshotTest, hasAp98) { - EXPECT_DOUBLE_EQ(5.0, snapshot.get98thPercentile()); + EXPECT_DOUBLE_EQ(4.9199999999999999, snapshot.get98thPercentile()); } TEST_F(SnapshotTest, hasAp99) { - EXPECT_DOUBLE_EQ(5.0, snapshot.get99thPercentile()); + EXPECT_DOUBLE_EQ(4.96, snapshot.get99thPercentile()); } TEST_F(SnapshotTest, hasAp999) { - EXPECT_DOUBLE_EQ(5.0, snapshot.get999thPercentile()); + EXPECT_DOUBLE_EQ(4.9960000000000004, snapshot.get999thPercentile()); } diff --git a/test/test_histogram.cc b/test/test_histogram.cc index b883dcf..bb19dae 100644 --- a/test/test_histogram.cc +++ b/test/test_histogram.cc @@ -46,7 +46,7 @@ TEST(HistogramTest, aHistogramWith1000Elements) { auto snapshot = histogram.GetSnapshot(); EXPECT_NEAR(500.5, snapshot.getMedian(), 0.0001); - EXPECT_NEAR(750.75, snapshot.get75thPercentile(), 0.0001); - EXPECT_NEAR(990.99, snapshot.get99thPercentile(), 0.0001); + EXPECT_NEAR(750.25, snapshot.get75thPercentile(), 0.0001); + EXPECT_NEAR(990.00999999999999, snapshot.get99thPercentile(), 0.0001); EXPECT_EQ(1000, snapshot.size()); } diff --git a/test/test_timer.cc b/test/test_timer.cc index 8b1d23c..5f9d660 100644 --- a/test/test_timer.cc +++ b/test/test_timer.cc @@ -76,8 +76,8 @@ TEST_F(TimerTest, timingASeriesOfEvents) { auto snapshot = timer.GetSnapshot(); EXPECT_NEAR(20.0, snapshot.getMedian(), 0.001); - EXPECT_NEAR(35.0, snapshot.get75thPercentile(), 0.001); - EXPECT_NEAR(40.0, snapshot.get99thPercentile(), 0.001); + EXPECT_NEAR(30.0, snapshot.get75thPercentile(), 0.001); + EXPECT_NEAR(39.600000000000001, snapshot.get99thPercentile(), 0.001); EXPECT_EQ(5, snapshot.size()); }