You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kylin.apache.org by sh...@apache.org on 2018/11/15 12:21:42 UTC

[kylin] branch master updated: KYLIN-3656 Improve HLLCounter performance

This is an automated email from the ASF dual-hosted git repository.

shaofengshi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kylin.git


The following commit(s) were added to refs/heads/master by this push:
     new 889dc50  KYLIN-3656  Improve HLLCounter performance
889dc50 is described below

commit 889dc503c6b538e60d180055b9b39b84d15d9ba0
Author: chenchang <ba...@gmail.com>
AuthorDate: Thu Nov 15 17:32:06 2018 +0800

    KYLIN-3656  Improve HLLCounter performance
---
 .../apache/kylin/measure/hllc/DenseRegister.java   |  5 ++
 .../org/apache/kylin/measure/hllc/HLLCounter.java  | 30 +++++---
 .../apache/kylin/measure/hllc/HLLCounterTest.java  | 83 ++++++++++++++++++++++
 3 files changed, 110 insertions(+), 8 deletions(-)

diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/DenseRegister.java b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/DenseRegister.java
index d34fef6..5c192cc 100644
--- a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/DenseRegister.java
+++ b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/DenseRegister.java
@@ -35,6 +35,11 @@ public class DenseRegister implements Register, java.io.Serializable {
         this.register = new byte[m];
     }
 
+    public void  copyFrom(DenseRegister r){
+        assert m == r.m;
+        System.arraycopy(r.register, 0, register, 0 , register.length);
+    }
+
     public void set(int pos, byte value) {
         register[pos] = value;
     }
diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCounter.java b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCounter.java
index 80bbb2a..875c7eb 100644
--- a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCounter.java
+++ b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCounter.java
@@ -32,6 +32,14 @@ import java.util.Map;
 @SuppressWarnings("serial")
 public class HLLCounter implements Serializable, Comparable<HLLCounter> {
 
+    static double[] harmonicMean;
+
+    static {
+        harmonicMean = new double[256];
+        for (int i = 1; i < 256; i++)
+            harmonicMean[i] = 1.0 / (1L << i);
+    }
+
     // not final for test purpose
     static double OVERFLOW_FACTOR = 0.01;
 
@@ -57,7 +65,11 @@ public class HLLCounter implements Serializable, Comparable<HLLCounter> {
 
     public HLLCounter(HLLCounter another) {
         this(another.p, another.getRegisterType(), another.hashFunc);
-        merge(another);
+        if(another.getRegisterType() == RegisterType.DENSE){
+            ((DenseRegister)register).copyFrom((DenseRegister)another.register);
+        }else {
+            merge(another);
+        }
     }
 
     public HLLCounter(int p, RegisterType type) {
@@ -202,6 +214,8 @@ public class HLLCounter implements Serializable, Comparable<HLLCounter> {
         int zeroBuckets;
 
         public HLLCSnapshot(HLLCounter hllc) {
+            int[] registerNums = new int[256];
+
             p = (byte) hllc.p;
             registerSum = 0;
             zeroBuckets = 0;
@@ -215,14 +229,14 @@ public class HLLCounter implements Serializable, Comparable<HLLCounter> {
                 dr = (DenseRegister) register;
             }
             byte[] registers = dr.getRawRegister();
-            for (int i = 0; i < hllc.m; i++) {
-                if (registers[i] == 0) {
-                    registerSum++;
-                    zeroBuckets++;
-                } else {
-                    registerSum += 1.0 / (1L << registers[i]);
-                }
+            for (int i = 0; i < hllc.m; i ++) {
+                registerNums[registers[i]] ++;
             }
+            zeroBuckets = registerNums[0];
+            for (int i= 1; i < 256; i ++)
+                registerSum += registerNums[i] * harmonicMean[i];
+
+            registerSum += zeroBuckets;
         }
 
         public long getCountEstimate() {
diff --git a/core-metadata/src/test/java/org/apache/kylin/measure/hllc/HLLCounterTest.java b/core-metadata/src/test/java/org/apache/kylin/measure/hllc/HLLCounterTest.java
index 92f2aab..789addf 100644
--- a/core-metadata/src/test/java/org/apache/kylin/measure/hllc/HLLCounterTest.java
+++ b/core-metadata/src/test/java/org/apache/kylin/measure/hllc/HLLCounterTest.java
@@ -22,6 +22,7 @@ import static org.junit.Assert.assertTrue;
 
 import java.io.IOException;
 import java.nio.ByteBuffer;
+import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.Random;
 import java.util.Set;
@@ -68,6 +69,88 @@ public class HLLCounterTest {
         assertTrue(hllc.getCountEstimate() > 10 * 0.9);
     }
 
+    /**
+     * evaluation getCountEstimate of HLLCounter
+     * cost time : 1341[old] -> 206[new]
+     */
+    @Test
+    public void countPerformanceWithLargeCardinality(){
+        int cardinality = 10_000_000;
+        HLLCounter hllc = generateTestCounter(2009, cardinality);
+        final int testCount = 5000;
+        countEstimatePerformance(hllc, cardinality, testCount);
+    }
+
+    /**
+     * evaluation getCountEstimate of HLLCounter
+     * cost time : 1396[old] -> 274[new]
+     */
+    @Test
+    public void countPerformanceSmallCardinality(){
+        int cardinality = 300_000;
+        HLLCounter hllc = generateTestCounter(2009, cardinality);
+        final int testCount = 5000;
+        countEstimatePerformance(hllc, cardinality, testCount);
+    }
+
+    /**
+     * evaluation constructor of HLLCounter
+     * cost time : 1577[old] -> 490[new]
+     */
+    @Test
+    public void createHLLCPerformance(){
+        int cardinality = 3_000_000;
+        HLLCounter hllc = generateTestCounter(2009, cardinality);
+        final int testCount = 30000;
+
+        HLLCounter hllc2 = null;
+        long start = System.currentTimeMillis();
+        for (int i = 0; i < testCount; i++){
+            hllc2 = new HLLCounter(hllc);
+        }
+        long totalTime = System.currentTimeMillis() - start;
+        System.out.println("constructor of HLLCounter cost time : " + totalTime);
+
+        long estimate = hllc2.getCountEstimate();
+        assertTrue(estimate > 0.9 * cardinality && estimate < 1.1 * cardinality);
+        System.out.println("estimate is " + estimate);
+    }
+
+    /**
+     * Simulate mutli call to method [getCountEstimate of HLLCounter] and get
+     * duration for method getCountEstimate of HLLCounter
+     */
+    private void countEstimatePerformance(HLLCounter hllc, int realCount, int callTimes) {
+        long start = System.currentTimeMillis();
+        for (int i = 0; i < callTimes; i++)
+            hllc.getCountEstimate();
+        long totalTime = System.currentTimeMillis() - start;
+        System.out.println("getCountEstimate of HLLCounter cost time : " + totalTime);
+
+        long estimate = hllc.getCountEstimate();
+        assertTrue(estimate > realCount * 0.9 && estimate < realCount * 1.1);
+        System.out.println("estimate is " + estimate);
+    }
+
+    private HLLCounter generateTestCounter(int seed, int maxDistinctCounts) {
+        long start = System.currentTimeMillis();
+        Random rand1 = new Random(seed);
+        Set<Integer> rawData = new HashSet<>();
+        while (rawData.size() < maxDistinctCounts)
+            rawData.add(rand1.nextInt());
+        ArrayList<Integer> testData = new ArrayList<>(rawData);
+        assertEquals(maxDistinctCounts, testData.size());
+
+        HLLCounter hllc = new HLLCounter(16, RegisterType.DENSE);
+
+        for (int j = 0; j < testData.size(); j++) {
+            hllc.add(testData.get(j));
+        }
+        long totalTime = System.currentTimeMillis() - start;
+        System.out.println("generate data cost time : " + totalTime);
+        return hllc;
+    }
+
     @Test
     public void countTest() throws IOException {
         int n = 10;