Merge pull request #15 from graydon/corrected-quantile-calculation

Correct quantile calculation
stellar · Mar 1, 2020 · 9e0470f · 9e0470f
2 parents d10f3c8 + bdf24a6
commit 9e0470f
Show file tree

Hide file tree

Showing 4 changed files with 73 additions and 32 deletions.
diff --git a/src/medida/stats/snapshot.cc b/src/medida/stats/snapshot.cc
@@ -8,6 +8,7 @@
 #include <cmath>
 #include <cstddef>
 #include <stdexcept>
+#include <cassert>
 
 namespace medida {
 namespace stats {
@@ -133,31 +134,71 @@ std::vector<double> Snapshot::Impl::getValues() const {
 }
 
 
-double Snapshot::Impl::getValue(double quantile) const {
-  if (quantile < 0.0 || quantile > 1.0) {
-    throw std::invalid_argument("quantile is not in [0..1]");
-  }
-
-  if (values_.empty()) {
-    return 0.0;
-  }
-
-  auto pos = quantile * (values_.size() + 1);
-
-  if (pos < 1) {
-    return values_.front();
-  }
-
-  if (pos >= values_.size()) {
-    return values_.back();
-  }
-
-  double lower = values_[pos - 1];
-  double upper = values_[pos];
-  return lower + (pos - std::floor(pos)) * (upper - lower);
+double Snapshot::Impl::getValue(double quantile) const
+{
+    // Calculating a quantile is _mostly_ just about scaling the requested
+    // quantile from the range it's given in [0.0, 1.0] to an index value in the
+    // range of valid indices for the sorted data. Unfortunately there are two
+    // complications:
+    //
+    //   1. If the scaled quantile doesn't land exactly on an integer value, you
+    //      have to interpolate "somehow" between the values at ceiling and
+    //      floor indices. It turns out there's little agreement in the world of
+    //      stats about which form of interpolation is best or how to achieve
+    //      it. R itself has 9 variants available, but the "most popular" (and
+    //      its default) appears to be algorithm R7 from Hyndman and Fan (1996).
+    //
+    //   2. Even "textbook" algorithms like R7 are described using 1-based
+    //      indexing, which makes it somewhat non-obvious to transcribe directly
+    //      or even copy from other scientific languages (that do 1-based) into
+    //      C++ 0-based indexing. So we have to try our own hand at implementing
+    //      it "from intent" rather than copying code directly from elsewhere.
+    //
+    // We've tested this with enough test vectors from R to convince ourselves
+    // it's a faithful implementation.
+    //
+    // https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/quantile
+    // https://en.wikipedia.org/wiki/Quantile#Estimating_quantiles_from_a_sample
+
+    if (quantile < 0.0 || quantile > 1.0)
+    {
+        throw std::invalid_argument("quantile is not in [0..1]");
+    }
+
+    if (values_.empty())
+    {
+        return 0.0;
+    }
+
+    // Step 1: define range of actually-allowed indexes: [0, max_idx]
+    size_t max_idx = values_.size() - 1;
+
+    // Step 2: calculate "ideal" fractional index (with 1.0 => max_idx).
+    double ideal_index = quantile * max_idx;
+
+    // Step 3: calculate ideal-index floor and integral low and hi indexes.
+    double floor_ideal = std::floor(ideal_index);
+    assert(floor_ideal >= 0.0);
+    size_t lo_idx = static_cast<size_t>(floor_ideal);
+    assert(lo_idx <= max_idx);
+    size_t hi_idx = lo_idx + 1;
+
+    // Step 4: if there's no upper sample to interpolate with, just return
+    // the highest one.
+    if (hi_idx > max_idx)
+    {
+        return values_.back();
+    }
+
+    // Step 5: return linear interpolation of elements at lo_idx and hi_idx.
+    double delta = ideal_index - floor_ideal;
+    assert(delta >= 0.0);
+    assert(delta < 1.0);
+    double lower = values_.at(lo_idx);
+    double upper = values_.at(hi_idx);
+    return lower + (delta * (upper - lower));
 }
 
-
 double Snapshot::Impl::getMedian() const {
   return getValue(kMEDIAN_Q);
 }

diff --git a/test/stats/test_snapshot.cc b/test/stats/test_snapshot.cc
@@ -34,27 +34,27 @@ TEST_F(SnapshotTest, hasAMedian) {
 
 
 TEST_F(SnapshotTest, hasAp75) {
-  EXPECT_DOUBLE_EQ(4.5, snapshot.get75thPercentile());
+  EXPECT_DOUBLE_EQ(4, snapshot.get75thPercentile());
 }
 
 
 TEST_F(SnapshotTest, hasAp95) {
-  EXPECT_DOUBLE_EQ(5.0, snapshot.get95thPercentile());
+  EXPECT_DOUBLE_EQ(4.7999999999999998, snapshot.get95thPercentile());
 }
 
 
 TEST_F(SnapshotTest, hasAp98) {
-  EXPECT_DOUBLE_EQ(5.0, snapshot.get98thPercentile());
+  EXPECT_DOUBLE_EQ(4.9199999999999999, snapshot.get98thPercentile());
 }
 
 
 TEST_F(SnapshotTest, hasAp99) {
-  EXPECT_DOUBLE_EQ(5.0, snapshot.get99thPercentile());
+  EXPECT_DOUBLE_EQ(4.96, snapshot.get99thPercentile());
 }
 
 
 TEST_F(SnapshotTest, hasAp999) {
-  EXPECT_DOUBLE_EQ(5.0, snapshot.get999thPercentile());
+  EXPECT_DOUBLE_EQ(4.9960000000000004, snapshot.get999thPercentile());
 }
 
 

diff --git a/test/test_histogram.cc b/test/test_histogram.cc
@@ -46,7 +46,7 @@ TEST(HistogramTest, aHistogramWith1000Elements) {
 
   auto snapshot = histogram.GetSnapshot();
   EXPECT_NEAR(500.5, snapshot.getMedian(), 0.0001);
-  EXPECT_NEAR(750.75, snapshot.get75thPercentile(), 0.0001);
-  EXPECT_NEAR(990.99, snapshot.get99thPercentile(), 0.0001);
+  EXPECT_NEAR(750.25, snapshot.get75thPercentile(), 0.0001);
+  EXPECT_NEAR(990.00999999999999, snapshot.get99thPercentile(), 0.0001);
   EXPECT_EQ(1000, snapshot.size());
 }
diff --git a/test/test_timer.cc b/test/test_timer.cc
@@ -76,8 +76,8 @@ TEST_F(TimerTest, timingASeriesOfEvents) {
 
   auto snapshot = timer.GetSnapshot();
   EXPECT_NEAR(20.0, snapshot.getMedian(), 0.001);
-  EXPECT_NEAR(35.0, snapshot.get75thPercentile(), 0.001);
-  EXPECT_NEAR(40.0, snapshot.get99thPercentile(), 0.001);
+  EXPECT_NEAR(30.0, snapshot.get75thPercentile(), 0.001);
+  EXPECT_NEAR(39.600000000000001, snapshot.get99thPercentile(), 0.001);
   EXPECT_EQ(5, snapshot.size());
 }