You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@bookkeeper.apache.org by si...@apache.org on 2019/04/10 01:47:51 UTC

[bookkeeper] branch branch-4.8 updated: ISSUE #2053: Bugfix for Percentile Calculation in FastCodahale Timer Implementation

This is an automated email from the ASF dual-hosted git repository.

sijie pushed a commit to branch branch-4.8
in repository https://gitbox.apache.org/repos/asf/bookkeeper.git


The following commit(s) were added to refs/heads/branch-4.8 by this push:
     new 3e48a98  ISSUE #2053: Bugfix for Percentile Calculation in FastCodahale Timer Implementation
3e48a98 is described below

commit 3e48a98f1861700c8bebb0048864c869552e12c7
Author: Nicolas Michael <gi...@nmichael.de>
AuthorDate: Tue Apr 9 18:44:52 2019 -0700

    ISSUE #2053: Bugfix for Percentile Calculation in FastCodahale Timer Implementation
    
    This bugfix for the FastCodahale timer implementation ensures that percentiles provided by a FastSnapshot are calculated correctly even if the total count of events (provided by FastTimer) is out of sync with the recorded events in the percentile buckets.
    
    ### Motivation
    
    FastCodahale Timer implementation may miscalculate percentiles if snapshots of values are slightly out of sync, and if only few events have been recorded.
    
    FastCodahale Timers use fine-grained locking and are meant to tolerate that (some) values change while being recorded or while snapshots are created. Currently, the total count of requests is not synchronized with the number of requests recorded in percentile buckets. If a snapshot is created while the total count of the timer has been incremented beyond the sum of values in the percentile buckets, the percentile calculation may produce wrong values.
    
    For example, if 3 percentile values have been recorded, but the overall count is 4, then the percentile calculation would be based on 4 values. This becomes most obvious if a percentile > .75 (e.g. p95) is being calculated. For this, the implementation will try to find 0.95 * 4 values, which is more than the 3 values recorded in the buckets. Since no bucket fulfills the criteria, the bound of the last (overflow) bucket will be returned, i.e. Long.MAX_VALUE.
    
    ### Changes
    
    FastSnapshots now bases the percentile calculation on the sum of values in the percentile buckets rather than a count provided by the caller (i.e. FastTimer). This ensures that percentiles are calculated correctly without the need of having all counters fully synchronized.
    
    Master Issue: #2053
    
    
    Reviewers: Jia Zhai <zh...@apache.org>, Sijie Guo <si...@apache.org>
    
    This closes #2054 from nicmichael/fast-codahale-bugfix, closes #2053
    
    (cherry picked from commit 848f8527f9d4745753b2f1e22ac1b8e981ea0d1d)
    Signed-off-by: Sijie Guo <si...@apache.org>
---
 .../bookkeeper/stats/codahale/FastSnapshot.java    | 19 ++++++++++++++++--
 .../bookkeeper/stats/codahale/FastTimerTest.java   | 23 ++++++++++++++++++++++
 2 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/bookkeeper-stats-providers/codahale-metrics-provider/src/main/java/org/apache/bookkeeper/stats/codahale/FastSnapshot.java b/bookkeeper-stats-providers/codahale-metrics-provider/src/main/java/org/apache/bookkeeper/stats/codahale/FastSnapshot.java
index ee6466e..f6ceb88 100644
--- a/bookkeeper-stats-providers/codahale-metrics-provider/src/main/java/org/apache/bookkeeper/stats/codahale/FastSnapshot.java
+++ b/bookkeeper-stats-providers/codahale-metrics-provider/src/main/java/org/apache/bookkeeper/stats/codahale/FastSnapshot.java
@@ -32,6 +32,7 @@ public class FastSnapshot extends Snapshot {
     private final long max;
     private final long sum;
     private final long cnt;
+    private final long pcnt;
     private final long[] values;
 
     @SuppressFBWarnings(
@@ -43,18 +44,19 @@ public class FastSnapshot extends Snapshot {
         this.max = max;
         this.sum = sum;
         this.cnt = cnt;
+        this.pcnt = values != null ? sumOf(values) : 0;
         this.values = values;
     }
 
     @Override
     public double getValue(double quantile) {
-        if (cnt == 0 || values == null) {
+        if (pcnt == 0 || values == null) {
             return 0;
         }
         long qcnt = 0;
         for (int i = 0; i < values.length; i++) {
             qcnt += values[i];
-            if (((double) qcnt) / ((double) cnt) > quantile) {
+            if (((double) qcnt) / ((double) pcnt) > quantile) {
                 return timer.getBucketBound(i);
             }
         }
@@ -105,4 +107,17 @@ public class FastSnapshot extends Snapshot {
         // values in this snapshot represent percentile buckets, but not discrete values
     }
 
+    /**
+     * Calculates the sum of values of an array.
+     * @param a an array of values
+     * @return the sum of all array values
+     */
+    private long sumOf(long[] a) {
+        long sum = 0;
+        for (long x : a) {
+            sum += x;
+        }
+        return sum;
+    }
+
 }
\ No newline at end of file
diff --git a/bookkeeper-stats-providers/codahale-metrics-provider/src/test/java/org/apache/bookkeeper/stats/codahale/FastTimerTest.java b/bookkeeper-stats-providers/codahale-metrics-provider/src/test/java/org/apache/bookkeeper/stats/codahale/FastTimerTest.java
index 34ae5c3..b3a744f 100644
--- a/bookkeeper-stats-providers/codahale-metrics-provider/src/test/java/org/apache/bookkeeper/stats/codahale/FastTimerTest.java
+++ b/bookkeeper-stats-providers/codahale-metrics-provider/src/test/java/org/apache/bookkeeper/stats/codahale/FastTimerTest.java
@@ -216,4 +216,27 @@ public class FastTimerTest {
         assertEquals("FastSnapshot.getMean()", 10, (int) Math.round(s.getMean() / 1000000));
     }
 
+    @Test
+    public void testSnapshotOutOfSync() {
+        FastTimer t = getMockedFastTimer(1, FastTimer.Buckets.fine);
+        t.update(t.getBucketBound(0) - 1, TimeUnit.NANOSECONDS); // add value to 1st bucket
+        t.update(t.getBucketBound(1) - 1, TimeUnit.NANOSECONDS); // add value to 2nd bucket
+        t.update(t.getBucketBound(2) - 1, TimeUnit.NANOSECONDS); // add value to 3rd bucket
+        incSec(); // advance mocked time to next second
+        Snapshot s1 = t.getSnapshot();
+        long[] buckets = new long[t.getNumberOfBuckets()];
+        buckets[0] = 1;
+        buckets[1] = 1;
+        buckets[2] = 1;
+        Snapshot s2 = new FastSnapshot(t,
+                t.getBucketBound(0) - 1,
+                t.getBucketBound(2) - 1,
+                t.getBucketBound(0) + t.getBucketBound(1) + t.getBucketBound(2) + 3,
+                4, // count (4) is out of sync with number of recorded events in buckets (3)
+                buckets);
+        assertEquals("FastSnapshot.getMin()", s1.getMin(), s2.getMin());
+        assertEquals("FastSnapshot.getMax()", s1.getMax(), s2.getMax());
+        assertEquals("FastSnapshot.getValue(0.95)", (long) s1.getValue(0.95), (long) s2.getValue(0.95));
+    }
+
 }