You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kylin.apache.org by sh...@apache.org on 2018/11/15 12:21:42 UTC
[kylin] branch master updated: KYLIN-3656 Improve HLLCounter
performance
This is an automated email from the ASF dual-hosted git repository.
shaofengshi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kylin.git
The following commit(s) were added to refs/heads/master by this push:
new 889dc50 KYLIN-3656 Improve HLLCounter performance
889dc50 is described below
commit 889dc503c6b538e60d180055b9b39b84d15d9ba0
Author: chenchang <ba...@gmail.com>
AuthorDate: Thu Nov 15 17:32:06 2018 +0800
KYLIN-3656 Improve HLLCounter performance
---
.../apache/kylin/measure/hllc/DenseRegister.java | 5 ++
.../org/apache/kylin/measure/hllc/HLLCounter.java | 30 +++++---
.../apache/kylin/measure/hllc/HLLCounterTest.java | 83 ++++++++++++++++++++++
3 files changed, 110 insertions(+), 8 deletions(-)
diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/DenseRegister.java b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/DenseRegister.java
index d34fef6..5c192cc 100644
--- a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/DenseRegister.java
+++ b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/DenseRegister.java
@@ -35,6 +35,11 @@ public class DenseRegister implements Register, java.io.Serializable {
this.register = new byte[m];
}
+ public void copyFrom(DenseRegister r){
+ assert m == r.m;
+ System.arraycopy(r.register, 0, register, 0 , register.length);
+ }
+
public void set(int pos, byte value) {
register[pos] = value;
}
diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCounter.java b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCounter.java
index 80bbb2a..875c7eb 100644
--- a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCounter.java
+++ b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCounter.java
@@ -32,6 +32,14 @@ import java.util.Map;
@SuppressWarnings("serial")
public class HLLCounter implements Serializable, Comparable<HLLCounter> {
+ static double[] harmonicMean;
+
+ static {
+ harmonicMean = new double[256];
+ for (int i = 1; i < 256; i++)
+ harmonicMean[i] = 1.0 / (1L << i);
+ }
+
// not final for test purpose
static double OVERFLOW_FACTOR = 0.01;
@@ -57,7 +65,11 @@ public class HLLCounter implements Serializable, Comparable<HLLCounter> {
public HLLCounter(HLLCounter another) {
this(another.p, another.getRegisterType(), another.hashFunc);
- merge(another);
+ if(another.getRegisterType() == RegisterType.DENSE){
+ ((DenseRegister)register).copyFrom((DenseRegister)another.register);
+ }else {
+ merge(another);
+ }
}
public HLLCounter(int p, RegisterType type) {
@@ -202,6 +214,8 @@ public class HLLCounter implements Serializable, Comparable<HLLCounter> {
int zeroBuckets;
public HLLCSnapshot(HLLCounter hllc) {
+ int[] registerNums = new int[256];
+
p = (byte) hllc.p;
registerSum = 0;
zeroBuckets = 0;
@@ -215,14 +229,14 @@ public class HLLCounter implements Serializable, Comparable<HLLCounter> {
dr = (DenseRegister) register;
}
byte[] registers = dr.getRawRegister();
- for (int i = 0; i < hllc.m; i++) {
- if (registers[i] == 0) {
- registerSum++;
- zeroBuckets++;
- } else {
- registerSum += 1.0 / (1L << registers[i]);
- }
+ for (int i = 0; i < hllc.m; i ++) {
+ registerNums[registers[i]] ++;
}
+ zeroBuckets = registerNums[0];
+ for (int i= 1; i < 256; i ++)
+ registerSum += registerNums[i] * harmonicMean[i];
+
+ registerSum += zeroBuckets;
}
public long getCountEstimate() {
diff --git a/core-metadata/src/test/java/org/apache/kylin/measure/hllc/HLLCounterTest.java b/core-metadata/src/test/java/org/apache/kylin/measure/hllc/HLLCounterTest.java
index 92f2aab..789addf 100644
--- a/core-metadata/src/test/java/org/apache/kylin/measure/hllc/HLLCounterTest.java
+++ b/core-metadata/src/test/java/org/apache/kylin/measure/hllc/HLLCounterTest.java
@@ -22,6 +22,7 @@ import static org.junit.Assert.assertTrue;
import java.io.IOException;
import java.nio.ByteBuffer;
+import java.util.ArrayList;
import java.util.HashSet;
import java.util.Random;
import java.util.Set;
@@ -68,6 +69,88 @@ public class HLLCounterTest {
assertTrue(hllc.getCountEstimate() > 10 * 0.9);
}
+ /**
+ * evaluation getCountEstimate of HLLCounter
+ * cost time : 1341[old] -> 206[new]
+ */
+ @Test
+ public void countPerformanceWithLargeCardinality(){
+ int cardinality = 10_000_000;
+ HLLCounter hllc = generateTestCounter(2009, cardinality);
+ final int testCount = 5000;
+ countEstimatePerformance(hllc, cardinality, testCount);
+ }
+
+ /**
+ * evaluation getCountEstimate of HLLCounter
+ * cost time : 1396[old] -> 274[new]
+ */
+ @Test
+ public void countPerformanceSmallCardinality(){
+ int cardinality = 300_000;
+ HLLCounter hllc = generateTestCounter(2009, cardinality);
+ final int testCount = 5000;
+ countEstimatePerformance(hllc, cardinality, testCount);
+ }
+
+ /**
+ * evaluation constructor of HLLCounter
+ * cost time : 1577[old] -> 490[new]
+ */
+ @Test
+ public void createHLLCPerformance(){
+ int cardinality = 3_000_000;
+ HLLCounter hllc = generateTestCounter(2009, cardinality);
+ final int testCount = 30000;
+
+ HLLCounter hllc2 = null;
+ long start = System.currentTimeMillis();
+ for (int i = 0; i < testCount; i++){
+ hllc2 = new HLLCounter(hllc);
+ }
+ long totalTime = System.currentTimeMillis() - start;
+ System.out.println("constructor of HLLCounter cost time : " + totalTime);
+
+ long estimate = hllc2.getCountEstimate();
+ assertTrue(estimate > 0.9 * cardinality && estimate < 1.1 * cardinality);
+ System.out.println("estimate is " + estimate);
+ }
+
+ /**
+ * Simulate mutli call to method [getCountEstimate of HLLCounter] and get
+ * duration for method getCountEstimate of HLLCounter
+ */
+ private void countEstimatePerformance(HLLCounter hllc, int realCount, int callTimes) {
+ long start = System.currentTimeMillis();
+ for (int i = 0; i < callTimes; i++)
+ hllc.getCountEstimate();
+ long totalTime = System.currentTimeMillis() - start;
+ System.out.println("getCountEstimate of HLLCounter cost time : " + totalTime);
+
+ long estimate = hllc.getCountEstimate();
+ assertTrue(estimate > realCount * 0.9 && estimate < realCount * 1.1);
+ System.out.println("estimate is " + estimate);
+ }
+
+ private HLLCounter generateTestCounter(int seed, int maxDistinctCounts) {
+ long start = System.currentTimeMillis();
+ Random rand1 = new Random(seed);
+ Set<Integer> rawData = new HashSet<>();
+ while (rawData.size() < maxDistinctCounts)
+ rawData.add(rand1.nextInt());
+ ArrayList<Integer> testData = new ArrayList<>(rawData);
+ assertEquals(maxDistinctCounts, testData.size());
+
+ HLLCounter hllc = new HLLCounter(16, RegisterType.DENSE);
+
+ for (int j = 0; j < testData.size(); j++) {
+ hllc.add(testData.get(j));
+ }
+ long totalTime = System.currentTimeMillis() - start;
+ System.out.println("generate data cost time : " + totalTime);
+ return hllc;
+ }
+
@Test
public void countTest() throws IOException {
int n = 10;