You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@kylin.apache.org by GitBox <gi...@apache.org> on 2018/11/15 12:21:39 UTC

[GitHub] shaofengshi closed pull request #345: KYLIN-3656 Improve HLLCounter performance

shaofengshi closed pull request #345: KYLIN-3656  Improve HLLCounter performance
URL: https://github.com/apache/kylin/pull/345
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/DenseRegister.java b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/DenseRegister.java
index d34fef63ab..5c192cc7fd 100644
--- a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/DenseRegister.java
+++ b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/DenseRegister.java
@@ -35,6 +35,11 @@ public DenseRegister(int p) {
         this.register = new byte[m];
     }
 
+    public void  copyFrom(DenseRegister r){
+        assert m == r.m;
+        System.arraycopy(r.register, 0, register, 0 , register.length);
+    }
+
     public void set(int pos, byte value) {
         register[pos] = value;
     }
diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCounter.java b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCounter.java
index 80bbb2a9c1..875c7eb09e 100644
--- a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCounter.java
+++ b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCounter.java
@@ -32,6 +32,14 @@
 @SuppressWarnings("serial")
 public class HLLCounter implements Serializable, Comparable<HLLCounter> {
 
+    static double[] harmonicMean;
+
+    static {
+        harmonicMean = new double[256];
+        for (int i = 1; i < 256; i++)
+            harmonicMean[i] = 1.0 / (1L << i);
+    }
+
     // not final for test purpose
     static double OVERFLOW_FACTOR = 0.01;
 
@@ -57,7 +65,11 @@ public HLLCounter(int p, HashFunction hashFunc) {
 
     public HLLCounter(HLLCounter another) {
         this(another.p, another.getRegisterType(), another.hashFunc);
-        merge(another);
+        if(another.getRegisterType() == RegisterType.DENSE){
+            ((DenseRegister)register).copyFrom((DenseRegister)another.register);
+        }else {
+            merge(another);
+        }
     }
 
     public HLLCounter(int p, RegisterType type) {
@@ -202,6 +214,8 @@ public String toString() {
         int zeroBuckets;
 
         public HLLCSnapshot(HLLCounter hllc) {
+            int[] registerNums = new int[256];
+
             p = (byte) hllc.p;
             registerSum = 0;
             zeroBuckets = 0;
@@ -215,14 +229,14 @@ public HLLCSnapshot(HLLCounter hllc) {
                 dr = (DenseRegister) register;
             }
             byte[] registers = dr.getRawRegister();
-            for (int i = 0; i < hllc.m; i++) {
-                if (registers[i] == 0) {
-                    registerSum++;
-                    zeroBuckets++;
-                } else {
-                    registerSum += 1.0 / (1L << registers[i]);
-                }
+            for (int i = 0; i < hllc.m; i ++) {
+                registerNums[registers[i]] ++;
             }
+            zeroBuckets = registerNums[0];
+            for (int i= 1; i < 256; i ++)
+                registerSum += registerNums[i] * harmonicMean[i];
+
+            registerSum += zeroBuckets;
         }
 
         public long getCountEstimate() {
diff --git a/core-metadata/src/test/java/org/apache/kylin/measure/hllc/HLLCounterTest.java b/core-metadata/src/test/java/org/apache/kylin/measure/hllc/HLLCounterTest.java
index 92f2aab270..789addfb8d 100644
--- a/core-metadata/src/test/java/org/apache/kylin/measure/hllc/HLLCounterTest.java
+++ b/core-metadata/src/test/java/org/apache/kylin/measure/hllc/HLLCounterTest.java
@@ -22,6 +22,7 @@
 
 import java.io.IOException;
 import java.nio.ByteBuffer;
+import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.Random;
 import java.util.Set;
@@ -68,6 +69,88 @@ public void tesSparseEstimate() throws IOException {
         assertTrue(hllc.getCountEstimate() > 10 * 0.9);
     }
 
+    /**
+     * evaluation getCountEstimate of HLLCounter
+     * cost time : 1341[old] -> 206[new]
+     */
+    @Test
+    public void countPerformanceWithLargeCardinality(){
+        int cardinality = 10_000_000;
+        HLLCounter hllc = generateTestCounter(2009, cardinality);
+        final int testCount = 5000;
+        countEstimatePerformance(hllc, cardinality, testCount);
+    }
+
+    /**
+     * evaluation getCountEstimate of HLLCounter
+     * cost time : 1396[old] -> 274[new]
+     */
+    @Test
+    public void countPerformanceSmallCardinality(){
+        int cardinality = 300_000;
+        HLLCounter hllc = generateTestCounter(2009, cardinality);
+        final int testCount = 5000;
+        countEstimatePerformance(hllc, cardinality, testCount);
+    }
+
+    /**
+     * evaluation constructor of HLLCounter
+     * cost time : 1577[old] -> 490[new]
+     */
+    @Test
+    public void createHLLCPerformance(){
+        int cardinality = 3_000_000;
+        HLLCounter hllc = generateTestCounter(2009, cardinality);
+        final int testCount = 30000;
+
+        HLLCounter hllc2 = null;
+        long start = System.currentTimeMillis();
+        for (int i = 0; i < testCount; i++){
+            hllc2 = new HLLCounter(hllc);
+        }
+        long totalTime = System.currentTimeMillis() - start;
+        System.out.println("constructor of HLLCounter cost time : " + totalTime);
+
+        long estimate = hllc2.getCountEstimate();
+        assertTrue(estimate > 0.9 * cardinality && estimate < 1.1 * cardinality);
+        System.out.println("estimate is " + estimate);
+    }
+
+    /**
+     * Simulate mutli call to method [getCountEstimate of HLLCounter] and get
+     * duration for method getCountEstimate of HLLCounter
+     */
+    private void countEstimatePerformance(HLLCounter hllc, int realCount, int callTimes) {
+        long start = System.currentTimeMillis();
+        for (int i = 0; i < callTimes; i++)
+            hllc.getCountEstimate();
+        long totalTime = System.currentTimeMillis() - start;
+        System.out.println("getCountEstimate of HLLCounter cost time : " + totalTime);
+
+        long estimate = hllc.getCountEstimate();
+        assertTrue(estimate > realCount * 0.9 && estimate < realCount * 1.1);
+        System.out.println("estimate is " + estimate);
+    }
+
+    private HLLCounter generateTestCounter(int seed, int maxDistinctCounts) {
+        long start = System.currentTimeMillis();
+        Random rand1 = new Random(seed);
+        Set<Integer> rawData = new HashSet<>();
+        while (rawData.size() < maxDistinctCounts)
+            rawData.add(rand1.nextInt());
+        ArrayList<Integer> testData = new ArrayList<>(rawData);
+        assertEquals(maxDistinctCounts, testData.size());
+
+        HLLCounter hllc = new HLLCounter(16, RegisterType.DENSE);
+
+        for (int j = 0; j < testData.size(); j++) {
+            hllc.add(testData.get(j));
+        }
+        long totalTime = System.currentTimeMillis() - start;
+        System.out.println("generate data cost time : " + totalTime);
+        return hllc;
+    }
+
     @Test
     public void countTest() throws IOException {
         int n = 10;


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services