You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kylin.apache.org by li...@apache.org on 2016/12/14 07:30:47 UTC
[1/5] kylin git commit: KYLIN-1832 HyperLogLog performance
optimization
Repository: kylin
Updated Branches:
refs/heads/master 530365131 -> e6e330a8b
http://git-wip-us.apache.org/repos/asf/kylin/blob/f05404d5/core-metadata/src/test/java/org/apache/kylin/measure/hll/HyperLogLogCounterTest.java
----------------------------------------------------------------------
diff --git a/core-metadata/src/test/java/org/apache/kylin/measure/hll/HyperLogLogCounterTest.java b/core-metadata/src/test/java/org/apache/kylin/measure/hll/HyperLogLogCounterTest.java
deleted file mode 100644
index 5b7c565..0000000
--- a/core-metadata/src/test/java/org/apache/kylin/measure/hll/HyperLogLogCounterTest.java
+++ /dev/null
@@ -1,265 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
-*/
-
-package org.apache.kylin.measure.hll;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.HashSet;
-import java.util.Random;
-import java.util.Set;
-
-import org.apache.kylin.common.util.Bytes;
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounter;
-import org.junit.Assert;
-import org.junit.Test;
-
-/**
- * @author yangli9
- *
- */
-public class HyperLogLogCounterTest {
-
- ByteBuffer buf = ByteBuffer.allocate(1024 * 1024);
- Random rand1 = new Random(1);
- Random rand2 = new Random(2);
- Random rand3 = new Random(3);
- int errorCount1 = 0;
- int errorCount2 = 0;
- int errorCount3 = 0;
-
- @Test
- public void testOneAdd() throws IOException {
- HyperLogLogPlusCounter hllc = new HyperLogLogPlusCounter(14);
- HyperLogLogPlusCounter one = new HyperLogLogPlusCounter(14);
- for (int i = 0; i < 1000000; i++) {
- one.clear();
- one.add(rand1.nextInt());
- hllc.merge(one);
- }
- assertTrue(hllc.getCountEstimate() > 1000000 * 0.9);
- }
-
- @Test
- public void testPeekLength() throws IOException {
- HyperLogLogPlusCounter hllc = new HyperLogLogPlusCounter(10);
- HyperLogLogPlusCounter copy = new HyperLogLogPlusCounter(10);
- byte[] value = new byte[10];
- for (int i = 0; i < 200000; i++) {
- rand1.nextBytes(value);
- hllc.add(value);
-
- buf.clear();
- hllc.writeRegisters(buf);
-
- int len = buf.position();
- buf.position(0);
- assertEquals(len, hllc.peekLength(buf));
-
- copy.readRegisters(buf);
- assertEquals(len, buf.position());
- assertEquals(hllc, copy);
- }
- buf.clear();
- }
-
- private Set<String> generateTestData(int n) {
- Set<String> testData = new HashSet<String>();
- for (int i = 0; i < n; i++) {
- String[] samples = generateSampleData();
- for (String sample : samples) {
- testData.add(sample);
- }
- }
- return testData;
- }
-
- // simulate the visit (=visitor+id)
- private String[] generateSampleData() {
-
- StringBuilder buf = new StringBuilder();
- for (int i = 0; i < 19; i++) {
- buf.append(Math.abs(rand1.nextInt()) % 10);
- }
- String header = buf.toString();
-
- int size = Math.abs(rand3.nextInt()) % 9 + 1;
- String[] samples = new String[size];
- for (int k = 0; k < size; k++) {
- buf = new StringBuilder(header);
- buf.append("-");
- for (int i = 0; i < 10; i++) {
- buf.append(Math.abs(rand3.nextInt()) % 10);
- }
- samples[k] = buf.toString();
- }
-
- return samples;
- }
-
- @Test
- public void countTest() throws IOException {
- int n = 10;
- for (int i = 0; i < 5; i++) {
- count(n);
- n *= 10;
- }
- }
-
- private void count(int n) throws IOException {
- Set<String> testSet = generateTestData(n);
-
- HyperLogLogPlusCounter hllc = newHLLC();
- for (String testData : testSet) {
- hllc.add(Bytes.toBytes(testData));
- }
- long estimate = hllc.getCountEstimate();
- double errorRate = hllc.getErrorRate();
- double actualError = (double) Math.abs(testSet.size() - estimate) / testSet.size();
- System.out.println(estimate);
- System.out.println(testSet.size());
- System.out.println(errorRate);
- System.out.println("=" + actualError);
- Assert.assertTrue(actualError < errorRate * 3.0);
-
- checkSerialize(hllc);
- }
-
- private void checkSerialize(HyperLogLogPlusCounter hllc) throws IOException {
- long estimate = hllc.getCountEstimate();
- buf.clear();
- hllc.writeRegisters(buf);
- buf.flip();
- hllc.readRegisters(buf);
- Assert.assertEquals(estimate, hllc.getCountEstimate());
- }
-
- @Test
- public void mergeTest() throws IOException {
- double error = 0;
- int n = 100;
- for (int i = 0; i < n; i++) {
- double e = merge(i);
- error += e;
- }
- System.out.println("Total average error is " + error / n);
-
- System.out.println(" errorRateCount1 is " + errorCount1 + "!");
- System.out.println(" errorRateCount2 is " + errorCount2 + "!");
- System.out.println(" errorRateCount3 is " + errorCount3 + "!");
-
- Assert.assertTrue(errorCount1 <= n * 0.30);
- Assert.assertTrue(errorCount2 <= n * 0.05);
- Assert.assertTrue(errorCount3 <= n * 0.02);
- }
-
- private double merge(int round) throws IOException {
- int ln = 20;
- int dn = 100 * (round + 1);
- Set<String> testSet = new HashSet<String>();
- HyperLogLogPlusCounter[] hllcs = new HyperLogLogPlusCounter[ln];
- for (int i = 0; i < ln; i++) {
- hllcs[i] = newHLLC();
- for (int k = 0; k < dn; k++) {
- String[] samples = generateSampleData();
- for (String data : samples) {
- testSet.add(data);
- hllcs[i].add(Bytes.toBytes(data));
- }
- }
- }
- HyperLogLogPlusCounter mergeHllc = newHLLC();
- for (HyperLogLogPlusCounter hllc : hllcs) {
- mergeHllc.merge(serDes(hllc));
- }
-
- double errorRate = mergeHllc.getErrorRate();
- long estimate = mergeHllc.getCountEstimate();
- double actualError = Math.abs((double) (testSet.size() - estimate) / testSet.size());
-
- System.out.println(testSet.size() + "-" + estimate + " ~ " + actualError);
- Assert.assertTrue(actualError < 0.1);
-
- if (actualError > errorRate) {
- errorCount1++;
- }
- if (actualError > 2 * errorRate) {
- errorCount2++;
- }
- if (actualError > 3 * errorRate) {
- errorCount3++;
- }
-
- return actualError;
- }
-
- private HyperLogLogPlusCounter serDes(HyperLogLogPlusCounter hllc) throws IOException {
- buf.clear();
- hllc.writeRegisters(buf);
- buf.flip();
- HyperLogLogPlusCounter copy = new HyperLogLogPlusCounter(hllc.getPrecision());
- copy.readRegisters(buf);
- Assert.assertEquals(copy.getCountEstimate(), hllc.getCountEstimate());
- return copy;
- }
-
- @Test
- public void testPerformance() throws IOException {
- int N = 3; // reduce N HLLC into one
- int M = 1000; // for M times, use 100000 for real perf test
-
- HyperLogLogPlusCounter samples[] = new HyperLogLogPlusCounter[N];
- for (int i = 0; i < N; i++) {
- samples[i] = newHLLC();
- for (String str : generateTestData(10000))
- samples[i].add(str);
- }
-
- System.out.println("Perf test running ... ");
- long start = System.currentTimeMillis();
- HyperLogLogPlusCounter sum = newHLLC();
- for (int i = 0; i < M; i++) {
- sum.clear();
- for (int j = 0; j < N; j++) {
- sum.merge(samples[j]);
- checkSerialize(sum);
- }
- }
- long duration = System.currentTimeMillis() - start;
- System.out.println("Perf test result: " + duration / 1000 + " seconds");
- }
-
- @Test
- public void testEquivalence() {
- byte[] a = new byte[] { 0, 3, 4, 42, 2, 2 };
- byte[] b = new byte[] { 3, 4, 42 };
- HyperLogLogPlusCounter ha = new HyperLogLogPlusCounter();
- HyperLogLogPlusCounter hb = new HyperLogLogPlusCounter();
- ha.add(a, 1, 3);
- hb.add(b);
-
- Assert.assertTrue(ha.getCountEstimate() == hb.getCountEstimate());
- }
-
- private HyperLogLogPlusCounter newHLLC() {
- return new HyperLogLogPlusCounter(16);
- }
-}
http://git-wip-us.apache.org/repos/asf/kylin/blob/f05404d5/core-metadata/src/test/java/org/apache/kylin/measure/hll2/HyperLogLogCounterNewTest.java
----------------------------------------------------------------------
diff --git a/core-metadata/src/test/java/org/apache/kylin/measure/hll2/HyperLogLogCounterNewTest.java b/core-metadata/src/test/java/org/apache/kylin/measure/hll2/HyperLogLogCounterNewTest.java
new file mode 100644
index 0000000..feb8c8e
--- /dev/null
+++ b/core-metadata/src/test/java/org/apache/kylin/measure/hll2/HyperLogLogCounterNewTest.java
@@ -0,0 +1,301 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+package org.apache.kylin.measure.hll2;
+
+import org.apache.kylin.common.util.Bytes;
+import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterOld;
+import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterNew;
+import org.apache.kylin.measure.hllc.RegisterType;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.HashSet;
+import java.util.Random;
+import java.util.Set;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Created by xiefan on 16-12-12.
+ */
+public class HyperLogLogCounterNewTest {
+ ByteBuffer buf = ByteBuffer.allocate(1024 * 1024);
+ Random rand1 = new Random(1);
+ Random rand2 = new Random(2);
+ Random rand3 = new Random(3);
+ int errorCount1 = 0;
+ int errorCount2 = 0;
+ int errorCount3 = 0;
+
+ @Test
+ public void testOneAdd() throws IOException {
+ HyperLogLogPlusCounterNew hllc = new HyperLogLogPlusCounterNew(14);
+ HyperLogLogPlusCounterNew one = new HyperLogLogPlusCounterNew(14);
+ for (int i = 0; i < 1000000; i++) {
+ one.clear();
+ one.add(rand1.nextInt());
+ hllc.merge(one);
+ }
+ System.out.println(hllc.getCountEstimate());
+ assertTrue(hllc.getCountEstimate() > 1000000 * 0.9);
+ }
+
+ @Test
+ public void tesSparseEstimate() throws IOException {
+ HyperLogLogPlusCounterNew hllc = new HyperLogLogPlusCounterNew(14);
+ for (int i = 0; i < 10; i++) {
+ hllc.add(i);
+ }
+ System.out.println(hllc.getCountEstimate());
+ assertTrue(hllc.getCountEstimate() > 10 * 0.9);
+ }
+
+ @Test
+ public void countTest() throws IOException {
+ int n = 10;
+ for (int i = 0; i < 5; i++) {
+ count(n);
+ n *= 10;
+ }
+ }
+
+ @Test
+ public void mergeTest() throws IOException {
+ double error = 0;
+ int n = 100;
+ for (int i = 0; i < n; i++) {
+ double e = merge(i);
+ error += e;
+ }
+ System.out.println("Total average error is " + error / n);
+
+ System.out.println(" errorRateCount1 is " + errorCount1 + "!");
+ System.out.println(" errorRateCount2 is " + errorCount2 + "!");
+ System.out.println(" errorRateCount3 is " + errorCount3 + "!");
+
+ Assert.assertTrue(errorCount1 <= n * 0.30);
+ Assert.assertTrue(errorCount2 <= n * 0.05);
+ Assert.assertTrue(errorCount3 <= n * 0.02);
+ }
+
+ /*
+ Compare the estimates of the old and new HLL counter implementations (sparse, then dense)
+ */
+ @Test
+ public void compareResult() {
+ int p = 12; //4096
+ int m = 1 << p;
+
+ for (int t = 0; t < 5; t++) {
+ //compare sparse
+ HyperLogLogPlusCounterOld oldCounter = new HyperLogLogPlusCounterOld(p);
+ HyperLogLogPlusCounterNew newCounter = new HyperLogLogPlusCounterNew(p);
+
+ for (int i = 0; i < 20; i++) {
+ //int r = rand1.nextInt();
+ oldCounter.add(i);
+ newCounter.add(i);
+ }
+ assertEquals(RegisterType.SPARSE, newCounter.getRegisterType());
+ assertEquals(oldCounter.getCountEstimate(), newCounter.getCountEstimate());
+ //compare dense
+ for (int i = 0; i < m; i++) {
+ oldCounter.add(i);
+ newCounter.add(i);
+ }
+ assertEquals(RegisterType.DENSE, newCounter.getRegisterType());
+ assertEquals(oldCounter.getCountEstimate(), newCounter.getCountEstimate());
+ }
+
+ }
+
+ @Test
+ public void testPeekLength() throws IOException {
+ HyperLogLogPlusCounterNew hllc = new HyperLogLogPlusCounterNew(10);
+ HyperLogLogPlusCounterNew copy = new HyperLogLogPlusCounterNew(10);
+ byte[] value = new byte[10];
+ for (int i = 0; i < 200000; i++) {
+ rand1.nextBytes(value);
+ hllc.add(value);
+
+ buf.clear();
+ hllc.writeRegisters(buf);
+
+ int len = buf.position();
+ buf.position(0);
+ assertEquals(len, hllc.peekLength(buf));
+
+ copy.readRegisters(buf);
+ assertEquals(len, buf.position());
+ assertEquals(hllc, copy);
+ }
+ buf.clear();
+ }
+
+ @Test
+ public void testEquivalence() {
+ byte[] a = new byte[] { 0, 3, 4, 42, 2, 2 };
+ byte[] b = new byte[] { 3, 4, 42 };
+ HyperLogLogPlusCounterNew ha = new HyperLogLogPlusCounterNew();
+ HyperLogLogPlusCounterNew hb = new HyperLogLogPlusCounterNew();
+ ha.add(a, 1, 3);
+ hb.add(b);
+
+ Assert.assertTrue(ha.getCountEstimate() == hb.getCountEstimate());
+ }
+
+ @Test
+ public void testAutoChangeToSparse() {
+ int p = 15;
+ int m = 1 << p;
+ HyperLogLogPlusCounterNew counter = new HyperLogLogPlusCounterNew(p);
+ assertEquals(RegisterType.SPARSE, counter.getRegisterType());
+ double over = HyperLogLogPlusCounterNew.overflowFactor * m;
+ int overFlow = (int) over + 1000;
+ for (int i = 0; i < overFlow; i++)
+ counter.add(i);
+ assertEquals(RegisterType.DENSE, counter.getRegisterType());
+ }
+
+ @Test
+ public void testSerialilze() throws Exception {
+ //test sparse serialize
+ int p = 15;
+ int m = 1 << p;
+ HyperLogLogPlusCounterNew counter = new HyperLogLogPlusCounterNew(p);
+ counter.add(123);
+ assertEquals(RegisterType.SPARSE, counter.getRegisterType());
+ checkSerialize(counter);
+ //test dense serialize
+ double over = HyperLogLogPlusCounterNew.overflowFactor * m;
+ int overFlow = (int) over + 1000;
+ for (int i = 0; i < overFlow; i++)
+ counter.add(i);
+ assertEquals(RegisterType.DENSE, counter.getRegisterType());
+ checkSerialize(counter);
+ }
+
+ private Set<String> generateTestData(int n) {
+ Set<String> testData = new HashSet<String>();
+ for (int i = 0; i < n; i++) {
+ String[] samples = generateSampleData();
+ for (String sample : samples) {
+ testData.add(sample);
+ }
+ }
+ return testData;
+ }
+
+ // simulate the visit (=visitor+id)
+ private String[] generateSampleData() {
+
+ StringBuilder buf = new StringBuilder();
+ for (int i = 0; i < 19; i++) {
+ buf.append(Math.abs(rand1.nextInt()) % 10);
+ }
+ String header = buf.toString();
+
+ int size = Math.abs(rand3.nextInt()) % 9 + 1;
+ String[] samples = new String[size];
+ for (int k = 0; k < size; k++) {
+ buf = new StringBuilder(header);
+ buf.append("-");
+ for (int i = 0; i < 10; i++) {
+ buf.append(Math.abs(rand3.nextInt()) % 10);
+ }
+ samples[k] = buf.toString();
+ }
+
+ return samples;
+ }
+
+ private double merge(int round) throws IOException {
+ int ln = 20;
+ int dn = 100 * (round + 1);
+ Set<String> testSet = new HashSet<String>();
+ HyperLogLogPlusCounterNew[] hllcs = new HyperLogLogPlusCounterNew[ln];
+ for (int i = 0; i < ln; i++) {
+ hllcs[i] = newHLLC();
+ for (int k = 0; k < dn; k++) {
+ String[] samples = generateSampleData();
+ for (String data : samples) {
+ testSet.add(data);
+ hllcs[i].add(Bytes.toBytes(data));
+ }
+ }
+ }
+ HyperLogLogPlusCounterNew mergeHllc = newHLLC();
+ for (HyperLogLogPlusCounterNew hllc : hllcs) {
+ mergeHllc.merge(hllc);
+ }
+
+ double errorRate = mergeHllc.getErrorRate();
+ long estimate = mergeHllc.getCountEstimate();
+ double actualError = Math.abs((double) (testSet.size() - estimate) / testSet.size());
+
+ System.out.println(testSet.size() + "-" + estimate + " ~ " + actualError);
+ Assert.assertTrue(actualError < 0.1);
+
+ if (actualError > errorRate) {
+ errorCount1++;
+ }
+ if (actualError > 2 * errorRate) {
+ errorCount2++;
+ }
+ if (actualError > 3 * errorRate) {
+ errorCount3++;
+ }
+
+ return actualError;
+ }
+
+ private HyperLogLogPlusCounterNew newHLLC() {
+ return new HyperLogLogPlusCounterNew(16);
+ }
+
+ private void count(int n) throws IOException {
+ Set<String> testSet = generateTestData(n);
+
+ HyperLogLogPlusCounterNew hllc = newHLLC();
+ for (String testData : testSet) {
+ hllc.add(Bytes.toBytes(testData));
+ }
+ long estimate = hllc.getCountEstimate();
+ double errorRate = hllc.getErrorRate();
+ double actualError = (double) Math.abs(testSet.size() - estimate) / testSet.size();
+ System.out.println(estimate);
+ System.out.println(testSet.size());
+ System.out.println(errorRate);
+ System.out.println("=" + actualError);
+ Assert.assertTrue(actualError < errorRate * 3.0);
+
+ checkSerialize(hllc);
+ }
+
+ private void checkSerialize(HyperLogLogPlusCounterNew hllc) throws IOException {
+ long estimate = hllc.getCountEstimate();
+ buf.clear();
+ hllc.writeRegisters(buf);
+ buf.flip();
+ hllc.readRegisters(buf);
+ Assert.assertEquals(estimate, hllc.getCountEstimate());
+ }
+}
http://git-wip-us.apache.org/repos/asf/kylin/blob/f05404d5/core-metadata/src/test/java/org/apache/kylin/measure/hll2/NewHyperLogLogBenchmarkTest.java
----------------------------------------------------------------------
diff --git a/core-metadata/src/test/java/org/apache/kylin/measure/hll2/NewHyperLogLogBenchmarkTest.java b/core-metadata/src/test/java/org/apache/kylin/measure/hll2/NewHyperLogLogBenchmarkTest.java
new file mode 100644
index 0000000..bfb87f9
--- /dev/null
+++ b/core-metadata/src/test/java/org/apache/kylin/measure/hll2/NewHyperLogLogBenchmarkTest.java
@@ -0,0 +1,288 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+package org.apache.kylin.measure.hll2;
+
+import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterOld;
+import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterNew;
+import org.apache.kylin.measure.hllc.RegisterType;
+import org.junit.Test;
+
+import java.nio.ByteBuffer;
+import java.util.Random;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Created by xiefan on 16-12-12.
+ */
+public class NewHyperLogLogBenchmarkTest {
+
+ public static final Random rand = new Random(1);
+
+ final int testTimes = 10000;
+
+ @Test
+ public void denseToDenseRegisterMergeBenchmark() throws Exception {
+ final int p = 15;
+ int m = 1 << p;
+
+ System.out.println("m : " + m);
+ double oldFactor = HyperLogLogPlusCounterNew.overflowFactor;
+ HyperLogLogPlusCounterNew.overflowFactor = 1.1; //keep sparse
+ for (int cardinality : getTestDataDivide(m)) {
+ final HyperLogLogPlusCounterOld oldCounter = new HyperLogLogPlusCounterOld(p);
+ final HyperLogLogPlusCounterOld oldCounter2 = getRandOldCounter(p, cardinality);
+ long oldTime = runTestCase(new TestCase() {
+ @Override
+ public void run() {
+
+ for (int i = 0; i < testTimes; i++) {
+ oldCounter.merge(oldCounter2);
+ }
+ }
+ });
+ final HyperLogLogPlusCounterNew newCounter = new HyperLogLogPlusCounterNew(p, RegisterType.DENSE);
+ final HyperLogLogPlusCounterNew newCounter2 = new HyperLogLogPlusCounterNew(p, RegisterType.DENSE);
+ for (int i = 0; i < testTimes; i++)
+ newCounter2.add(i);
+ long newTime = runTestCase(new TestCase() {
+ @Override
+ public void run() {
+ for (int i = 0; i < testTimes; i++) {
+ newCounter.merge(newCounter2);
+ }
+ }
+ });
+ assertEquals(RegisterType.DENSE, newCounter.getRegisterType());
+ assertEquals(RegisterType.DENSE, newCounter2.getRegisterType());
+ System.out.println("----------------------------");
+ System.out.println("cardinality : " + cardinality);
+ System.out.println("old time : " + oldTime);
+ System.out.println("new time : " + newTime);
+ }
+ HyperLogLogPlusCounterNew.overflowFactor = oldFactor;
+ }
+
+ @Test
+ public void sparseToSparseMergeBenchmark() throws Exception {
+ final int p = 15;
+ int m = 1 << p;
+ System.out.println("m : " + m);
+ double oldFactor = HyperLogLogPlusCounterNew.overflowFactor;
+ HyperLogLogPlusCounterNew.overflowFactor = 1.1; //keep sparse
+ for (int cardinality : getTestDataDivide(m)) {
+ final HyperLogLogPlusCounterOld oldCounter = new HyperLogLogPlusCounterOld(p);
+ final HyperLogLogPlusCounterOld oldCounter2 = getRandOldCounter(p, cardinality);
+ long oldTime = runTestCase(new TestCase() {
+ @Override
+ public void run() {
+
+ for (int i = 0; i < testTimes; i++) {
+ oldCounter.merge(oldCounter2);
+ }
+ }
+ });
+ final HyperLogLogPlusCounterNew newCounter = new HyperLogLogPlusCounterNew(p);
+ final HyperLogLogPlusCounterNew newCounter2 = getRandNewCounter(p, cardinality);
+ long newTime = runTestCase(new TestCase() {
+ @Override
+ public void run() {
+ for (int i = 0; i < testTimes; i++) {
+ newCounter.merge(newCounter2);
+ }
+ }
+ });
+ assertEquals(RegisterType.SPARSE, newCounter.getRegisterType());
+ assertEquals(RegisterType.SPARSE, newCounter2.getRegisterType());
+ System.out.println("----------------------------");
+ System.out.println("cardinality : " + cardinality);
+ System.out.println("old time : " + oldTime);
+ System.out.println("new time : " + newTime);
+ }
+ HyperLogLogPlusCounterNew.overflowFactor = oldFactor;
+ }
+
+ @Test
+ public void sparseToDenseRegisterMergeBenchmark() throws Exception {
+ final int p = 15;
+ int m = 1 << p;
+ System.out.println("m : " + m);
+ double oldFactor = HyperLogLogPlusCounterNew.overflowFactor;
+ HyperLogLogPlusCounterNew.overflowFactor = 1.1; //keep sparse
+ for (int cardinality : getTestDataDivide(m)) {
+ System.out.println("----------------------------");
+ System.out.println("cardinality : " + cardinality);
+ final HyperLogLogPlusCounterOld oldCounter = new HyperLogLogPlusCounterOld(p);
+ final HyperLogLogPlusCounterOld oldCounter2 = getRandOldCounter(p, cardinality);
+ long oldTime = runTestCase(new TestCase() {
+ @Override
+ public void run() {
+ for (int i = 0; i < testTimes; i++) {
+ oldCounter.merge(oldCounter2);
+ }
+ }
+ });
+ final HyperLogLogPlusCounterNew newCounter = new HyperLogLogPlusCounterNew(p, RegisterType.DENSE);
+ final HyperLogLogPlusCounterNew newCounter2 = getRandNewCounter(p, cardinality);
+ long newTime = runTestCase(new TestCase() {
+ @Override
+ public void run() {
+ for (int i = 0; i < testTimes; i++) {
+ newCounter.merge(newCounter2);
+ }
+ }
+ });
+ assertEquals(RegisterType.DENSE, newCounter.getRegisterType());
+ assertEquals(RegisterType.SPARSE, newCounter2.getRegisterType());
+ System.out.println("old time : " + oldTime);
+ System.out.println("new time : " + newTime);
+ }
+ HyperLogLogPlusCounterNew.overflowFactor = oldFactor;
+ }
+
+ @Test
+ public void sparseSerializeBenchmark() throws Exception {
+ final int p = 15;
+ int m = 1 << p;
+ double oldFactor = HyperLogLogPlusCounterNew.overflowFactor;
+ HyperLogLogPlusCounterNew.overflowFactor = 1.1; //keep sparse
+ for (int cardinality : getTestDataDivide(m)) {
+ System.out.println("----------------------------");
+ System.out.println("cardinality : " + cardinality);
+ final HyperLogLogPlusCounterOld oldCounter = getRandOldCounter(p, cardinality);
+ long oldTime = runTestCase(new TestCase() {
+ @Override
+ public void run() throws Exception {
+ ByteBuffer buf = ByteBuffer.allocate(1024 * 1024);
+ long totalBytes = 0;
+ for (int i = 0; i < testTimes; i++) {
+ buf.clear();
+ oldCounter.writeRegisters(buf);
+ totalBytes += buf.position();
+ buf.flip();
+ oldCounter.readRegisters(buf);
+ }
+ System.out.println("old serialize bytes : " + totalBytes / testTimes + "B");
+ }
+ });
+ final HyperLogLogPlusCounterNew newCounter = getRandNewCounter(p, cardinality);
+ long newTime = runTestCase(new TestCase() {
+ @Override
+ public void run() throws Exception {
+ ByteBuffer buf = ByteBuffer.allocate(1024 * 1024);
+ long totalBytes = 0;
+ for (int i = 0; i < testTimes; i++) {
+ buf.clear();
+ newCounter.writeRegisters(buf);
+ totalBytes += buf.position();
+ buf.flip();
+ newCounter.readRegisters(buf);
+ }
+ System.out.println("new serialize bytes : " + totalBytes / testTimes + "B");
+ }
+ });
+ assertEquals(RegisterType.SPARSE, newCounter.getRegisterType());
+ System.out.println("old serialize time : " + oldTime);
+ System.out.println("new serialize time : " + newTime);
+ }
+ HyperLogLogPlusCounterNew.overflowFactor = oldFactor;
+ }
+
+ @Test
+ public void denseSerializeBenchmark() throws Exception {
+ final int p = 15;
+ int m = 1 << p;
+ double oldFactor = HyperLogLogPlusCounterNew.overflowFactor;
+ HyperLogLogPlusCounterNew.overflowFactor = 0; //force dense
+ for (int cardinality : getTestDataDivide(m)) {
+ System.out.println("----------------------------");
+ System.out.println("cardinality : " + cardinality);
+ final HyperLogLogPlusCounterOld oldCounter = getRandOldCounter(p, cardinality);
+ long oldTime = runTestCase(new TestCase() {
+ @Override
+ public void run() throws Exception {
+ ByteBuffer buf = ByteBuffer.allocate(1024 * 1024);
+ long totalBytes = 0;
+ for (int i = 0; i < testTimes; i++) {
+ buf.clear();
+ oldCounter.writeRegisters(buf);
+ totalBytes += buf.position();
+ buf.flip();
+ oldCounter.readRegisters(buf);
+ }
+ System.out.println("old serialize bytes : " + totalBytes / testTimes + "B");
+ }
+ });
+ final HyperLogLogPlusCounterNew newCounter = getRandNewCounter(p, cardinality, RegisterType.DENSE);
+ long newTime = runTestCase(new TestCase() {
+ @Override
+ public void run() throws Exception {
+ ByteBuffer buf = ByteBuffer.allocate(1024 * 1024);
+ long totalBytes = 0;
+ for (int i = 0; i < testTimes; i++) {
+ buf.clear();
+ newCounter.writeRegisters(buf);
+ totalBytes += buf.position();
+ buf.flip();
+ newCounter.readRegisters(buf);
+ }
+ System.out.println("new serialize bytes : " + totalBytes / testTimes + "B");
+ }
+ });
+ assertEquals(RegisterType.DENSE, newCounter.getRegisterType());
+ System.out.println("old serialize time : " + oldTime);
+ System.out.println("new serialize time : " + newTime);
+ }
+ HyperLogLogPlusCounterNew.overflowFactor = oldFactor;
+ }
+
+ interface TestCase {
+ void run() throws Exception;
+ }
+
+ public long runTestCase(TestCase testCase) throws Exception {
+ long startTime = System.currentTimeMillis();
+ testCase.run();
+ return System.currentTimeMillis() - startTime;
+ }
+
+ public HyperLogLogPlusCounterOld getRandOldCounter(int p, int num) {
+ HyperLogLogPlusCounterOld c = new HyperLogLogPlusCounterOld(p);
+ for (int i = 0; i < num; i++)
+ c.add(i);
+ return c;
+ }
+
+ public HyperLogLogPlusCounterNew getRandNewCounter(int p, int num) {
+ HyperLogLogPlusCounterNew c = new HyperLogLogPlusCounterNew(p);
+ for (int i = 0; i < num; i++)
+ c.add(i);
+ return c;
+ }
+
+ public HyperLogLogPlusCounterNew getRandNewCounter(int p, int num, RegisterType type) {
+ HyperLogLogPlusCounterNew c = new HyperLogLogPlusCounterNew(p, type);
+ for (int i = 0; i < num; i++)
+ c.add(i);
+ return c;
+ }
+
+ public static int[] getTestDataDivide(int m) {
+ return new int[] { 1, 5, 10, 100, m / 200, m / 100, m / 50, m / 20, m / 10, m };
+ }
+}
http://git-wip-us.apache.org/repos/asf/kylin/blob/f05404d5/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/CubeStatsReader.java
----------------------------------------------------------------------
diff --git a/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/CubeStatsReader.java b/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/CubeStatsReader.java
index 21af1e6..5445491 100644
--- a/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/CubeStatsReader.java
+++ b/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/CubeStatsReader.java
@@ -53,7 +53,7 @@ import org.apache.kylin.cube.kv.CubeDimEncMap;
import org.apache.kylin.cube.kv.RowKeyEncoder;
import org.apache.kylin.cube.model.CubeDesc;
import org.apache.kylin.engine.mr.HadoopUtil;
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounter;
+import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterNew;
import org.apache.kylin.metadata.datatype.DataType;
import org.apache.kylin.metadata.model.FunctionDesc;
import org.apache.kylin.metadata.model.MeasureDesc;
@@ -76,7 +76,7 @@ public class CubeStatsReader {
final int samplingPercentage;
final int mapperNumberOfFirstBuild; // becomes meaningless after merge
final double mapperOverlapRatioOfFirstBuild; // becomes meaningless after merge
- final Map<Long, HyperLogLogPlusCounter> cuboidRowEstimatesHLL;
+ final Map<Long, HyperLogLogPlusCounterNew> cuboidRowEstimatesHLL;
final CuboidScheduler cuboidScheduler;
public CubeStatsReader(CubeSegment cubeSegment, KylinConfig kylinConfig) throws IOException {
@@ -96,7 +96,7 @@ public class CubeStatsReader {
int percentage = 100;
int mapperNumber = 0;
double mapperOverlapRatio = 0;
- Map<Long, HyperLogLogPlusCounter> counterMap = Maps.newHashMap();
+ Map<Long, HyperLogLogPlusCounterNew> counterMap = Maps.newHashMap();
LongWritable key = (LongWritable) ReflectionUtils.newInstance(reader.getKeyClass(), hadoopConf);
BytesWritable value = (BytesWritable) ReflectionUtils.newInstance(reader.getValueClass(), hadoopConf);
@@ -108,7 +108,7 @@ public class CubeStatsReader {
} else if (key.get() == -2) {
mapperNumber = Bytes.toInt(value.getBytes());
} else if (key.get() > 0) {
- HyperLogLogPlusCounter hll = new HyperLogLogPlusCounter(kylinConfig.getCubeStatsHLLPrecision());
+ HyperLogLogPlusCounterNew hll = new HyperLogLogPlusCounterNew(kylinConfig.getCubeStatsHLLPrecision());
ByteArray byteArray = new ByteArray(value.getBytes());
hll.readRegisters(byteArray.asBuffer());
counterMap.put(key.get(), hll);
@@ -161,9 +161,9 @@ public class CubeStatsReader {
return mapperOverlapRatioOfFirstBuild;
}
- public static Map<Long, Long> getCuboidRowCountMapFromSampling(Map<Long, HyperLogLogPlusCounter> hllcMap, int samplingPercentage) {
+ public static Map<Long, Long> getCuboidRowCountMapFromSampling(Map<Long, HyperLogLogPlusCounterNew> hllcMap, int samplingPercentage) {
Map<Long, Long> cuboidRowCountMap = Maps.newHashMap();
- for (Map.Entry<Long, HyperLogLogPlusCounter> entry : hllcMap.entrySet()) {
+ for (Map.Entry<Long, HyperLogLogPlusCounterNew> entry : hllcMap.entrySet()) {
// No need to adjust according sampling percentage. Assumption is that data set is far
// more than cardinality. Even a percentage of the data should already see all cardinalities.
cuboidRowCountMap.put(entry.getKey(), entry.getValue().getCountEstimate());
http://git-wip-us.apache.org/repos/asf/kylin/blob/f05404d5/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/CubeStatsWriter.java
----------------------------------------------------------------------
diff --git a/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/CubeStatsWriter.java b/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/CubeStatsWriter.java
index 74a2107..219cdf2 100644
--- a/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/CubeStatsWriter.java
+++ b/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/CubeStatsWriter.java
@@ -33,17 +33,17 @@ import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.kylin.common.util.Bytes;
import org.apache.kylin.measure.BufferedMeasureCodec;
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounter;
+import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterNew;
public class CubeStatsWriter {
public static void writeCuboidStatistics(Configuration conf, Path outputPath, //
- Map<Long, HyperLogLogPlusCounter> cuboidHLLMap, int samplingPercentage) throws IOException {
+ Map<Long, HyperLogLogPlusCounterNew> cuboidHLLMap, int samplingPercentage) throws IOException {
writeCuboidStatistics(conf, outputPath, cuboidHLLMap, samplingPercentage, 0, 0);
}
public static void writeCuboidStatistics(Configuration conf, Path outputPath, //
- Map<Long, HyperLogLogPlusCounter> cuboidHLLMap, int samplingPercentage, int mapperNumber, double mapperOverlapRatio) throws IOException {
+ Map<Long, HyperLogLogPlusCounterNew> cuboidHLLMap, int samplingPercentage, int mapperNumber, double mapperOverlapRatio) throws IOException {
Path seqFilePath = new Path(outputPath, BatchConstants.CFG_STATISTICS_CUBOID_ESTIMATION_FILENAME);
List<Long> allCuboids = new ArrayList<Long>();
http://git-wip-us.apache.org/repos/asf/kylin/blob/f05404d5/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/FactDistinctColumnsReducer.java
----------------------------------------------------------------------
diff --git a/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/FactDistinctColumnsReducer.java b/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/FactDistinctColumnsReducer.java
index 776d750..0d388c7 100644
--- a/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/FactDistinctColumnsReducer.java
+++ b/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/FactDistinctColumnsReducer.java
@@ -47,7 +47,7 @@ import org.apache.kylin.engine.mr.KylinReducer;
import org.apache.kylin.engine.mr.common.AbstractHadoopJob;
import org.apache.kylin.engine.mr.common.BatchConstants;
import org.apache.kylin.engine.mr.common.CubeStatsWriter;
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounter;
+import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterNew;
import org.apache.kylin.metadata.model.TblColRef;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -64,7 +64,7 @@ public class FactDistinctColumnsReducer extends KylinReducer<SelfDefineSortableK
private List<TblColRef> columnList;
private String statisticsOutput = null;
private List<Long> baseCuboidRowCountInMappers;
- protected Map<Long, HyperLogLogPlusCounter> cuboidHLLMap = null;
+ protected Map<Long, HyperLogLogPlusCounterNew> cuboidHLLMap = null;
protected long baseCuboidId;
protected CubeDesc cubeDesc;
private long totalRowsBeforeMerge = 0;
@@ -156,7 +156,7 @@ public class FactDistinctColumnsReducer extends KylinReducer<SelfDefineSortableK
// for hll
long cuboidId = Bytes.toLong(key.getBytes(), 1, Bytes.SIZEOF_LONG);
for (Text value : values) {
- HyperLogLogPlusCounter hll = new HyperLogLogPlusCounter(cubeConfig.getCubeStatsHLLPrecision());
+ HyperLogLogPlusCounterNew hll = new HyperLogLogPlusCounterNew(cubeConfig.getCubeStatsHLLPrecision());
ByteBuffer bf = ByteBuffer.wrap(value.getBytes(), 0, value.getLength());
hll.readRegisters(bf);
@@ -270,7 +270,7 @@ public class FactDistinctColumnsReducer extends KylinReducer<SelfDefineSortableK
if (isStatistics) {
// output the hll info
long grandTotal = 0;
- for (HyperLogLogPlusCounter hll : cuboidHLLMap.values()) {
+ for (HyperLogLogPlusCounterNew hll : cuboidHLLMap.values()) {
grandTotal += hll.getCountEstimate();
}
double mapperOverlapRatio = grandTotal == 0 ? 0 : (double) totalRowsBeforeMerge / grandTotal;
http://git-wip-us.apache.org/repos/asf/kylin/blob/f05404d5/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/FactDistinctHiveColumnsMapper.java
----------------------------------------------------------------------
diff --git a/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/FactDistinctHiveColumnsMapper.java b/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/FactDistinctHiveColumnsMapper.java
index a5c8fc0..c0575f1 100644
--- a/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/FactDistinctHiveColumnsMapper.java
+++ b/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/FactDistinctHiveColumnsMapper.java
@@ -29,7 +29,7 @@ import org.apache.kylin.common.util.Bytes;
import org.apache.kylin.cube.cuboid.CuboidScheduler;
import org.apache.kylin.engine.mr.common.BatchConstants;
import org.apache.kylin.measure.BufferedMeasureCodec;
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounter;
+import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterNew;
import org.apache.kylin.metadata.model.TblColRef;
import com.google.common.collect.Lists;
@@ -45,7 +45,7 @@ public class FactDistinctHiveColumnsMapper<KEYIN> extends FactDistinctColumnsMap
protected CuboidScheduler cuboidScheduler = null;
protected int nRowKey;
private Integer[][] allCuboidsBitSet = null;
- private HyperLogLogPlusCounter[] allCuboidsHLL = null;
+ private HyperLogLogPlusCounterNew[] allCuboidsHLL = null;
private Long[] cuboidIds;
private HashFunction hf = null;
private int rowCount = 0;
@@ -76,9 +76,9 @@ public class FactDistinctHiveColumnsMapper<KEYIN> extends FactDistinctColumnsMap
allCuboidsBitSet = allCuboidsBitSetList.toArray(new Integer[cuboidIdList.size()][]);
cuboidIds = cuboidIdList.toArray(new Long[cuboidIdList.size()]);
- allCuboidsHLL = new HyperLogLogPlusCounter[cuboidIds.length];
+ allCuboidsHLL = new HyperLogLogPlusCounterNew[cuboidIds.length];
for (int i = 0; i < cuboidIds.length; i++) {
- allCuboidsHLL[i] = new HyperLogLogPlusCounter(cubeDesc.getConfig().getCubeStatsHLLPrecision());
+ allCuboidsHLL[i] = new HyperLogLogPlusCounterNew(cubeDesc.getConfig().getCubeStatsHLLPrecision());
}
hf = Hashing.murmur3_32();
@@ -207,7 +207,7 @@ public class FactDistinctHiveColumnsMapper<KEYIN> extends FactDistinctColumnsMap
if (collectStatistics) {
ByteBuffer hllBuf = ByteBuffer.allocate(BufferedMeasureCodec.DEFAULT_BUFFER_SIZE);
// output each cuboid's hll to reducer, key is 0 - cuboidId
- HyperLogLogPlusCounter hll;
+ HyperLogLogPlusCounterNew hll;
for (int i = 0; i < cuboidIds.length; i++) {
hll = allCuboidsHLL[i];
http://git-wip-us.apache.org/repos/asf/kylin/blob/f05404d5/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/MergeStatisticsStep.java
----------------------------------------------------------------------
diff --git a/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/MergeStatisticsStep.java b/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/MergeStatisticsStep.java
index 88f6ba2..e839989 100644
--- a/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/MergeStatisticsStep.java
+++ b/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/MergeStatisticsStep.java
@@ -47,7 +47,7 @@ import org.apache.kylin.job.exception.ExecuteException;
import org.apache.kylin.job.execution.AbstractExecutable;
import org.apache.kylin.job.execution.ExecutableContext;
import org.apache.kylin.job.execution.ExecuteResult;
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounter;
+import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterNew;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -56,7 +56,7 @@ import com.google.common.collect.Maps;
public class MergeStatisticsStep extends AbstractExecutable {
private static final Logger logger = LoggerFactory.getLogger(MergeStatisticsStep.class);
- protected Map<Long, HyperLogLogPlusCounter> cuboidHLLMap = Maps.newHashMap();
+ protected Map<Long, HyperLogLogPlusCounterNew> cuboidHLLMap = Maps.newHashMap();
public MergeStatisticsStep() {
super();
@@ -100,7 +100,7 @@ public class MergeStatisticsStep extends AbstractExecutable {
// sampling percentage;
averageSamplingPercentage += Bytes.toInt(value.getBytes());
} else if (key.get() > 0) {
- HyperLogLogPlusCounter hll = new HyperLogLogPlusCounter(kylinConf.getCubeStatsHLLPrecision());
+ HyperLogLogPlusCounterNew hll = new HyperLogLogPlusCounterNew(kylinConf.getCubeStatsHLLPrecision());
ByteArray byteArray = new ByteArray(value.getBytes());
hll.readRegisters(byteArray.asBuffer());
http://git-wip-us.apache.org/repos/asf/kylin/blob/f05404d5/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/CubeSamplingTest.java
----------------------------------------------------------------------
diff --git a/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/CubeSamplingTest.java b/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/CubeSamplingTest.java
index 89d23fa..cae3b62 100644
--- a/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/CubeSamplingTest.java
+++ b/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/CubeSamplingTest.java
@@ -24,7 +24,7 @@ import java.util.List;
import org.apache.commons.lang.RandomStringUtils;
import org.apache.kylin.common.util.ByteArray;
import org.apache.kylin.common.util.Bytes;
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounter;
+import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterNew;
import org.junit.Before;
import org.junit.Test;
@@ -45,7 +45,7 @@ public class CubeSamplingTest {
private Integer[][] allCuboidsBitSet;
private HashFunction hf = null;
private long baseCuboidId;
- private HyperLogLogPlusCounter[] allCuboidsHLL = null;
+ private HyperLogLogPlusCounterNew[] allCuboidsHLL = null;
private final byte[] seperator = Bytes.toBytes(",");
@Before
@@ -61,9 +61,9 @@ public class CubeSamplingTest {
allCuboidsBitSet = allCuboidsBitSetList.toArray(new Integer[allCuboidsBitSetList.size()][]);
System.out.println("Totally have " + allCuboidsBitSet.length + " cuboids.");
- allCuboidsHLL = new HyperLogLogPlusCounter[allCuboids.size()];
+ allCuboidsHLL = new HyperLogLogPlusCounterNew[allCuboids.size()];
for (int i = 0; i < allCuboids.size(); i++) {
- allCuboidsHLL[i] = new HyperLogLogPlusCounter(14);
+ allCuboidsHLL[i] = new HyperLogLogPlusCounterNew(14);
}
// hf = Hashing.goodFastHash(32);
http://git-wip-us.apache.org/repos/asf/kylin/blob/f05404d5/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/FactDistinctColumnsReducerTest.java
----------------------------------------------------------------------
diff --git a/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/FactDistinctColumnsReducerTest.java b/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/FactDistinctColumnsReducerTest.java
index ca8684f..a00db94 100644
--- a/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/FactDistinctColumnsReducerTest.java
+++ b/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/FactDistinctColumnsReducerTest.java
@@ -28,7 +28,7 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.kylin.engine.mr.HadoopUtil;
import org.apache.kylin.engine.mr.common.CubeStatsWriter;
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounter;
+import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterNew;
import org.junit.Test;
import com.google.common.collect.Maps;
@@ -48,7 +48,7 @@ public class FactDistinctColumnsReducerTest {
}
System.out.println(outputPath);
- Map<Long, HyperLogLogPlusCounter> cuboidHLLMap = Maps.newHashMap();
+ Map<Long, HyperLogLogPlusCounterNew> cuboidHLLMap = Maps.newHashMap();
CubeStatsWriter.writeCuboidStatistics(conf, outputPath, cuboidHLLMap, 100);
FileSystem.getLocal(conf).delete(outputPath, true);
http://git-wip-us.apache.org/repos/asf/kylin/blob/f05404d5/engine-spark/src/main/java/org/apache/kylin/engine/spark/SparkCubing.java
----------------------------------------------------------------------
diff --git a/engine-spark/src/main/java/org/apache/kylin/engine/spark/SparkCubing.java b/engine-spark/src/main/java/org/apache/kylin/engine/spark/SparkCubing.java
index 10c74f3..76212c8 100644
--- a/engine-spark/src/main/java/org/apache/kylin/engine/spark/SparkCubing.java
+++ b/engine-spark/src/main/java/org/apache/kylin/engine/spark/SparkCubing.java
@@ -83,7 +83,7 @@ import org.apache.kylin.engine.spark.cube.DefaultTupleConverter;
import org.apache.kylin.engine.spark.util.IteratorUtils;
import org.apache.kylin.measure.BufferedMeasureCodec;
import org.apache.kylin.measure.MeasureAggregators;
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounter;
+import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterNew;
import org.apache.kylin.metadata.model.FunctionDesc;
import org.apache.kylin.metadata.model.IJoinedFlatTableDesc;
import org.apache.kylin.metadata.model.MeasureDesc;
@@ -241,15 +241,15 @@ public class SparkCubing extends AbstractApplication {
}
}
- private Map<Long, HyperLogLogPlusCounter> sampling(final JavaRDD<List<String>> rowJavaRDD, final String cubeName, String segmentId) throws Exception {
+ private Map<Long, HyperLogLogPlusCounterNew> sampling(final JavaRDD<List<String>> rowJavaRDD, final String cubeName, String segmentId) throws Exception {
CubeInstance cubeInstance = CubeManager.getInstance(KylinConfig.getInstanceFromEnv()).reloadCubeLocal(cubeName);
CubeSegment cubeSegment = cubeInstance.getSegmentById(segmentId);
CubeDesc cubeDesc = cubeInstance.getDescriptor();
CuboidScheduler cuboidScheduler = new CuboidScheduler(cubeDesc);
List<Long> allCuboidIds = cuboidScheduler.getAllCuboidIds();
- final HashMap<Long, HyperLogLogPlusCounter> zeroValue = Maps.newHashMap();
+ final HashMap<Long, HyperLogLogPlusCounterNew> zeroValue = Maps.newHashMap();
for (Long id : allCuboidIds) {
- zeroValue.put(id, new HyperLogLogPlusCounter(cubeDesc.getConfig().getCubeStatsHLLPrecision()));
+ zeroValue.put(id, new HyperLogLogPlusCounterNew(cubeDesc.getConfig().getCubeStatsHLLPrecision()));
}
CubeJoinedFlatTableEnrich flatDesc = new CubeJoinedFlatTableEnrich(EngineFactory.getJoinedFlatTableDesc(cubeSegment), cubeDesc);
@@ -278,12 +278,12 @@ public class SparkCubing extends AbstractApplication {
row_hashcodes[i] = new ByteArray();
}
- final HashMap<Long, HyperLogLogPlusCounter> samplingResult = rowJavaRDD.aggregate(zeroValue, new Function2<HashMap<Long, HyperLogLogPlusCounter>, List<String>, HashMap<Long, HyperLogLogPlusCounter>>() {
+ final HashMap<Long, HyperLogLogPlusCounterNew> samplingResult = rowJavaRDD.aggregate(zeroValue, new Function2<HashMap<Long, HyperLogLogPlusCounterNew>, List<String>, HashMap<Long, HyperLogLogPlusCounterNew>>() {
final HashFunction hashFunction = Hashing.murmur3_128();
@Override
- public HashMap<Long, HyperLogLogPlusCounter> call(HashMap<Long, HyperLogLogPlusCounter> v1, List<String> v2) throws Exception {
+ public HashMap<Long, HyperLogLogPlusCounterNew> call(HashMap<Long, HyperLogLogPlusCounterNew> v1, List<String> v2) throws Exception {
for (int i = 0; i < nRowKey; i++) {
Hasher hc = hashFunction.newHasher();
String colValue = v2.get(rowKeyColumnIndexes[i]);
@@ -296,7 +296,7 @@ public class SparkCubing extends AbstractApplication {
for (Map.Entry<Long, Integer[]> entry : allCuboidsBitSet.entrySet()) {
Hasher hc = hashFunction.newHasher();
- HyperLogLogPlusCounter counter = v1.get(entry.getKey());
+ HyperLogLogPlusCounterNew counter = v1.get(entry.getKey());
final Integer[] cuboidBitSet = entry.getValue();
for (int position = 0; position < cuboidBitSet.length; position++) {
hc.putBytes(row_hashcodes[cuboidBitSet[position]].array());
@@ -305,14 +305,14 @@ public class SparkCubing extends AbstractApplication {
}
return v1;
}
- }, new Function2<HashMap<Long, HyperLogLogPlusCounter>, HashMap<Long, HyperLogLogPlusCounter>, HashMap<Long, HyperLogLogPlusCounter>>() {
+ }, new Function2<HashMap<Long, HyperLogLogPlusCounterNew>, HashMap<Long, HyperLogLogPlusCounterNew>, HashMap<Long, HyperLogLogPlusCounterNew>>() {
@Override
- public HashMap<Long, HyperLogLogPlusCounter> call(HashMap<Long, HyperLogLogPlusCounter> v1, HashMap<Long, HyperLogLogPlusCounter> v2) throws Exception {
+ public HashMap<Long, HyperLogLogPlusCounterNew> call(HashMap<Long, HyperLogLogPlusCounterNew> v1, HashMap<Long, HyperLogLogPlusCounterNew> v2) throws Exception {
Preconditions.checkArgument(v1.size() == v2.size());
Preconditions.checkArgument(v1.size() > 0);
- for (Map.Entry<Long, HyperLogLogPlusCounter> entry : v1.entrySet()) {
- final HyperLogLogPlusCounter counter1 = entry.getValue();
- final HyperLogLogPlusCounter counter2 = v2.get(entry.getKey());
+ for (Map.Entry<Long, HyperLogLogPlusCounterNew> entry : v1.entrySet()) {
+ final HyperLogLogPlusCounterNew counter1 = entry.getValue();
+ final HyperLogLogPlusCounterNew counter2 = v2.get(entry.getKey());
counter1.merge(Preconditions.checkNotNull(counter2, "counter cannot be null"));
}
return v1;
@@ -470,7 +470,7 @@ public class SparkCubing extends AbstractApplication {
ClassUtil.addClasspath(confPath);
}
- private byte[][] createHTable(String cubeName, String segmentId, Map<Long, HyperLogLogPlusCounter> samplingResult) throws Exception {
+ private byte[][] createHTable(String cubeName, String segmentId, Map<Long, HyperLogLogPlusCounterNew> samplingResult) throws Exception {
final KylinConfig kylinConfig = KylinConfig.getInstanceFromEnv();
final CubeInstance cubeInstance = CubeManager.getInstance(kylinConfig).getCube(cubeName);
final CubeSegment cubeSegment = cubeInstance.getSegmentById(segmentId);
@@ -614,7 +614,7 @@ public class SparkCubing extends AbstractApplication {
}
});
- final Map<Long, HyperLogLogPlusCounter> samplingResult = sampling(rowJavaRDD, cubeName, segmentId);
+ final Map<Long, HyperLogLogPlusCounterNew> samplingResult = sampling(rowJavaRDD, cubeName, segmentId);
final byte[][] splitKeys = createHTable(cubeName, segmentId, samplingResult);
final String hfile = build(rowJavaRDD, cubeName, segmentId, splitKeys);
http://git-wip-us.apache.org/repos/asf/kylin/blob/f05404d5/source-hive/src/main/java/org/apache/kylin/source/hive/cardinality/ColumnCardinalityMapper.java
----------------------------------------------------------------------
diff --git a/source-hive/src/main/java/org/apache/kylin/source/hive/cardinality/ColumnCardinalityMapper.java b/source-hive/src/main/java/org/apache/kylin/source/hive/cardinality/ColumnCardinalityMapper.java
index 06a07ca..230249f 100644
--- a/source-hive/src/main/java/org/apache/kylin/source/hive/cardinality/ColumnCardinalityMapper.java
+++ b/source-hive/src/main/java/org/apache/kylin/source/hive/cardinality/ColumnCardinalityMapper.java
@@ -35,18 +35,18 @@ import org.apache.kylin.engine.mr.MRUtil;
import org.apache.kylin.engine.mr.common.AbstractHadoopJob;
import org.apache.kylin.engine.mr.common.BatchConstants;
import org.apache.kylin.measure.BufferedMeasureCodec;
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounter;
+import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterNew;
import org.apache.kylin.metadata.MetadataManager;
import org.apache.kylin.metadata.model.ColumnDesc;
import org.apache.kylin.metadata.model.TableDesc;
/**
* @author Jack
- *
+ *
*/
public class ColumnCardinalityMapper<T> extends KylinMapper<T, Object, IntWritable, BytesWritable> {
- private Map<Integer, HyperLogLogPlusCounter> hllcMap = new HashMap<Integer, HyperLogLogPlusCounter>();
+ private Map<Integer, HyperLogLogPlusCounterNew> hllcMap = new HashMap<Integer, HyperLogLogPlusCounterNew>();
public static final String DEFAULT_DELIM = ",";
private int counter = 0;
@@ -87,9 +87,9 @@ public class ColumnCardinalityMapper<T> extends KylinMapper<T, Object, IntWritab
counter++;
}
- private HyperLogLogPlusCounter getHllc(Integer key) {
+ private HyperLogLogPlusCounterNew getHllc(Integer key) {
if (!hllcMap.containsKey(key)) {
- hllcMap.put(key, new HyperLogLogPlusCounter());
+ hllcMap.put(key, new HyperLogLogPlusCounterNew());
}
return hllcMap.get(key);
}
@@ -100,7 +100,7 @@ public class ColumnCardinalityMapper<T> extends KylinMapper<T, Object, IntWritab
ByteBuffer buf = ByteBuffer.allocate(BufferedMeasureCodec.DEFAULT_BUFFER_SIZE);
while (it.hasNext()) {
int key = it.next();
- HyperLogLogPlusCounter hllc = hllcMap.get(key);
+ HyperLogLogPlusCounterNew hllc = hllcMap.get(key);
buf.clear();
hllc.writeRegisters(buf);
buf.flip();
http://git-wip-us.apache.org/repos/asf/kylin/blob/f05404d5/source-hive/src/main/java/org/apache/kylin/source/hive/cardinality/ColumnCardinalityReducer.java
----------------------------------------------------------------------
diff --git a/source-hive/src/main/java/org/apache/kylin/source/hive/cardinality/ColumnCardinalityReducer.java b/source-hive/src/main/java/org/apache/kylin/source/hive/cardinality/ColumnCardinalityReducer.java
index ea66999..32cc6d9 100644
--- a/source-hive/src/main/java/org/apache/kylin/source/hive/cardinality/ColumnCardinalityReducer.java
+++ b/source-hive/src/main/java/org/apache/kylin/source/hive/cardinality/ColumnCardinalityReducer.java
@@ -32,7 +32,7 @@ import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.kylin.engine.mr.KylinReducer;
import org.apache.kylin.measure.BufferedMeasureCodec;
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounter;
+import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterNew;
/**
* @author Jack
@@ -41,7 +41,7 @@ import org.apache.kylin.measure.hllc.HyperLogLogPlusCounter;
public class ColumnCardinalityReducer extends KylinReducer<IntWritable, BytesWritable, IntWritable, LongWritable> {
public static final int ONE = 1;
- private Map<Integer, HyperLogLogPlusCounter> hllcMap = new HashMap<Integer, HyperLogLogPlusCounter>();
+ private Map<Integer, HyperLogLogPlusCounterNew> hllcMap = new HashMap<Integer, HyperLogLogPlusCounterNew>();
@Override
protected void setup(Context context) throws IOException {
@@ -53,16 +53,16 @@ public class ColumnCardinalityReducer extends KylinReducer<IntWritable, BytesWri
int skey = key.get();
for (BytesWritable v : values) {
ByteBuffer buffer = ByteBuffer.wrap(v.getBytes());
- HyperLogLogPlusCounter hll = new HyperLogLogPlusCounter();
+ HyperLogLogPlusCounterNew hll = new HyperLogLogPlusCounterNew();
hll.readRegisters(buffer);
getHllc(skey).merge(hll);
hll.clear();
}
}
- private HyperLogLogPlusCounter getHllc(Integer key) {
+ private HyperLogLogPlusCounterNew getHllc(Integer key) {
if (!hllcMap.containsKey(key)) {
- hllcMap.put(key, new HyperLogLogPlusCounter());
+ hllcMap.put(key, new HyperLogLogPlusCounterNew());
}
return hllcMap.get(key);
}
@@ -78,7 +78,7 @@ public class ColumnCardinalityReducer extends KylinReducer<IntWritable, BytesWri
it = keys.iterator();
while (it.hasNext()) {
int key = it.next();
- HyperLogLogPlusCounter hllc = hllcMap.get(key);
+ HyperLogLogPlusCounterNew hllc = hllcMap.get(key);
ByteBuffer buf = ByteBuffer.allocate(BufferedMeasureCodec.DEFAULT_BUFFER_SIZE);
buf.clear();
hllc.writeRegisters(buf);
http://git-wip-us.apache.org/repos/asf/kylin/blob/f05404d5/source-hive/src/test/java/org/apache/kylin/source/hive/cardinality/ColumnCardinalityReducerTest.java
----------------------------------------------------------------------
diff --git a/source-hive/src/test/java/org/apache/kylin/source/hive/cardinality/ColumnCardinalityReducerTest.java b/source-hive/src/test/java/org/apache/kylin/source/hive/cardinality/ColumnCardinalityReducerTest.java
index d27860a..410543a 100644
--- a/source-hive/src/test/java/org/apache/kylin/source/hive/cardinality/ColumnCardinalityReducerTest.java
+++ b/source-hive/src/test/java/org/apache/kylin/source/hive/cardinality/ColumnCardinalityReducerTest.java
@@ -35,7 +35,7 @@ import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
import org.apache.hadoop.mrunit.types.Pair;
import org.apache.kylin.common.util.Bytes;
import org.apache.kylin.measure.BufferedMeasureCodec;
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounter;
+import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterNew;
import org.junit.Before;
import org.junit.Test;
@@ -57,7 +57,7 @@ public class ColumnCardinalityReducerTest {
}
private byte[] getBytes(String str) throws IOException {
- HyperLogLogPlusCounter hllc = new HyperLogLogPlusCounter();
+ HyperLogLogPlusCounterNew hllc = new HyperLogLogPlusCounterNew();
StringTokenizer tokenizer = new StringTokenizer(str, ColumnCardinalityMapper.DEFAULT_DELIM);
int i = 0;
while (tokenizer.hasMoreTokens()) {
[4/5] kylin git commit: KYLIN-1832 code review
Posted by li...@apache.org.
http://git-wip-us.apache.org/repos/asf/kylin/blob/e6e330a8/core-metadata/src/test/java/org/apache/kylin/measure/hll/HyperLogLogCounterOldTest.java
----------------------------------------------------------------------
diff --git a/core-metadata/src/test/java/org/apache/kylin/measure/hll/HyperLogLogCounterOldTest.java b/core-metadata/src/test/java/org/apache/kylin/measure/hll/HyperLogLogCounterOldTest.java
deleted file mode 100644
index 5d17fea..0000000
--- a/core-metadata/src/test/java/org/apache/kylin/measure/hll/HyperLogLogCounterOldTest.java
+++ /dev/null
@@ -1,265 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
-*/
-
-package org.apache.kylin.measure.hll;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.HashSet;
-import java.util.Random;
-import java.util.Set;
-
-import org.apache.kylin.common.util.Bytes;
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterOld;
-import org.junit.Assert;
-import org.junit.Test;
-
-/**
- * @author yangli9
- *
- */
-public class HyperLogLogCounterOldTest {
-
- ByteBuffer buf = ByteBuffer.allocate(1024 * 1024);
- Random rand1 = new Random(1);
- Random rand2 = new Random(2);
- Random rand3 = new Random(3);
- int errorCount1 = 0;
- int errorCount2 = 0;
- int errorCount3 = 0;
-
- @Test
- public void testOneAdd() throws IOException {
- HyperLogLogPlusCounterOld hllc = new HyperLogLogPlusCounterOld(14);
- HyperLogLogPlusCounterOld one = new HyperLogLogPlusCounterOld(14);
- for (int i = 0; i < 1000000; i++) {
- one.clear();
- one.add(rand1.nextInt());
- hllc.merge(one);
- }
- assertTrue(hllc.getCountEstimate() > 1000000 * 0.9);
- }
-
- @Test
- public void testPeekLength() throws IOException {
- HyperLogLogPlusCounterOld hllc = new HyperLogLogPlusCounterOld(10);
- HyperLogLogPlusCounterOld copy = new HyperLogLogPlusCounterOld(10);
- byte[] value = new byte[10];
- for (int i = 0; i < 200000; i++) {
- rand1.nextBytes(value);
- hllc.add(value);
-
- buf.clear();
- hllc.writeRegisters(buf);
-
- int len = buf.position();
- buf.position(0);
- assertEquals(len, hllc.peekLength(buf));
-
- copy.readRegisters(buf);
- assertEquals(len, buf.position());
- assertEquals(hllc, copy);
- }
- buf.clear();
- }
-
- private Set<String> generateTestData(int n) {
- Set<String> testData = new HashSet<String>();
- for (int i = 0; i < n; i++) {
- String[] samples = generateSampleData();
- for (String sample : samples) {
- testData.add(sample);
- }
- }
- return testData;
- }
-
- // simulate the visit (=visitor+id)
- private String[] generateSampleData() {
-
- StringBuilder buf = new StringBuilder();
- for (int i = 0; i < 19; i++) {
- buf.append(Math.abs(rand1.nextInt()) % 10);
- }
- String header = buf.toString();
-
- int size = Math.abs(rand3.nextInt()) % 9 + 1;
- String[] samples = new String[size];
- for (int k = 0; k < size; k++) {
- buf = new StringBuilder(header);
- buf.append("-");
- for (int i = 0; i < 10; i++) {
- buf.append(Math.abs(rand3.nextInt()) % 10);
- }
- samples[k] = buf.toString();
- }
-
- return samples;
- }
-
- @Test
- public void countTest() throws IOException {
- int n = 10;
- for (int i = 0; i < 5; i++) {
- count(n);
- n *= 10;
- }
- }
-
- private void count(int n) throws IOException {
- Set<String> testSet = generateTestData(n);
-
- HyperLogLogPlusCounterOld hllc = newHLLC();
- for (String testData : testSet) {
- hllc.add(Bytes.toBytes(testData));
- }
- long estimate = hllc.getCountEstimate();
- double errorRate = hllc.getErrorRate();
- double actualError = (double) Math.abs(testSet.size() - estimate) / testSet.size();
- System.out.println(estimate);
- System.out.println(testSet.size());
- System.out.println(errorRate);
- System.out.println("=" + actualError);
- Assert.assertTrue(actualError < errorRate * 3.0);
-
- checkSerialize(hllc);
- }
-
- private void checkSerialize(HyperLogLogPlusCounterOld hllc) throws IOException {
- long estimate = hllc.getCountEstimate();
- buf.clear();
- hllc.writeRegisters(buf);
- buf.flip();
- hllc.readRegisters(buf);
- Assert.assertEquals(estimate, hllc.getCountEstimate());
- }
-
- @Test
- public void mergeTest() throws IOException {
- double error = 0;
- int n = 100;
- for (int i = 0; i < n; i++) {
- double e = merge(i);
- error += e;
- }
- System.out.println("Total average error is " + error / n);
-
- System.out.println(" errorRateCount1 is " + errorCount1 + "!");
- System.out.println(" errorRateCount2 is " + errorCount2 + "!");
- System.out.println(" errorRateCount3 is " + errorCount3 + "!");
-
- Assert.assertTrue(errorCount1 <= n * 0.30);
- Assert.assertTrue(errorCount2 <= n * 0.05);
- Assert.assertTrue(errorCount3 <= n * 0.02);
- }
-
- private double merge(int round) throws IOException {
- int ln = 20;
- int dn = 100 * (round + 1);
- Set<String> testSet = new HashSet<String>();
- HyperLogLogPlusCounterOld[] hllcs = new HyperLogLogPlusCounterOld[ln];
- for (int i = 0; i < ln; i++) {
- hllcs[i] = newHLLC();
- for (int k = 0; k < dn; k++) {
- String[] samples = generateSampleData();
- for (String data : samples) {
- testSet.add(data);
- hllcs[i].add(Bytes.toBytes(data));
- }
- }
- }
- HyperLogLogPlusCounterOld mergeHllc = newHLLC();
- for (HyperLogLogPlusCounterOld hllc : hllcs) {
- mergeHllc.merge(serDes(hllc));
- }
-
- double errorRate = mergeHllc.getErrorRate();
- long estimate = mergeHllc.getCountEstimate();
- double actualError = Math.abs((double) (testSet.size() - estimate) / testSet.size());
-
- System.out.println(testSet.size() + "-" + estimate + " ~ " + actualError);
- Assert.assertTrue(actualError < 0.1);
-
- if (actualError > errorRate) {
- errorCount1++;
- }
- if (actualError > 2 * errorRate) {
- errorCount2++;
- }
- if (actualError > 3 * errorRate) {
- errorCount3++;
- }
-
- return actualError;
- }
-
- private HyperLogLogPlusCounterOld serDes(HyperLogLogPlusCounterOld hllc) throws IOException {
- buf.clear();
- hllc.writeRegisters(buf);
- buf.flip();
- HyperLogLogPlusCounterOld copy = new HyperLogLogPlusCounterOld(hllc.getPrecision());
- copy.readRegisters(buf);
- Assert.assertEquals(copy.getCountEstimate(), hllc.getCountEstimate());
- return copy;
- }
-
- @Test
- public void testPerformance() throws IOException {
- int N = 3; // reduce N HLLC into one
- int M = 1000; // for M times, use 100000 for real perf test
-
- HyperLogLogPlusCounterOld samples[] = new HyperLogLogPlusCounterOld[N];
- for (int i = 0; i < N; i++) {
- samples[i] = newHLLC();
- for (String str : generateTestData(10000))
- samples[i].add(str);
- }
-
- System.out.println("Perf test running ... ");
- long start = System.currentTimeMillis();
- HyperLogLogPlusCounterOld sum = newHLLC();
- for (int i = 0; i < M; i++) {
- sum.clear();
- for (int j = 0; j < N; j++) {
- sum.merge(samples[j]);
- checkSerialize(sum);
- }
- }
- long duration = System.currentTimeMillis() - start;
- System.out.println("Perf test result: " + duration / 1000 + " seconds");
- }
-
- @Test
- public void testEquivalence() {
- byte[] a = new byte[] { 0, 3, 4, 42, 2, 2 };
- byte[] b = new byte[] { 3, 4, 42 };
- HyperLogLogPlusCounterOld ha = new HyperLogLogPlusCounterOld();
- HyperLogLogPlusCounterOld hb = new HyperLogLogPlusCounterOld();
- ha.add(a, 1, 3);
- hb.add(b);
-
- Assert.assertTrue(ha.getCountEstimate() == hb.getCountEstimate());
- }
-
- private HyperLogLogPlusCounterOld newHLLC() {
- return new HyperLogLogPlusCounterOld(16);
- }
-}
http://git-wip-us.apache.org/repos/asf/kylin/blob/e6e330a8/core-metadata/src/test/java/org/apache/kylin/measure/hll2/HyperLogLogCounterNewTest.java
----------------------------------------------------------------------
diff --git a/core-metadata/src/test/java/org/apache/kylin/measure/hll2/HyperLogLogCounterNewTest.java b/core-metadata/src/test/java/org/apache/kylin/measure/hll2/HyperLogLogCounterNewTest.java
deleted file mode 100644
index feb8c8e..0000000
--- a/core-metadata/src/test/java/org/apache/kylin/measure/hll2/HyperLogLogCounterNewTest.java
+++ /dev/null
@@ -1,301 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
-*/
-package org.apache.kylin.measure.hll2;
-
-import org.apache.kylin.common.util.Bytes;
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterOld;
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterNew;
-import org.apache.kylin.measure.hllc.RegisterType;
-import org.junit.Assert;
-import org.junit.Test;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.HashSet;
-import java.util.Random;
-import java.util.Set;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-/**
- * Created by xiefan on 16-12-12.
- */
-public class HyperLogLogCounterNewTest {
- ByteBuffer buf = ByteBuffer.allocate(1024 * 1024);
- Random rand1 = new Random(1);
- Random rand2 = new Random(2);
- Random rand3 = new Random(3);
- int errorCount1 = 0;
- int errorCount2 = 0;
- int errorCount3 = 0;
-
- @Test
- public void testOneAdd() throws IOException {
- HyperLogLogPlusCounterNew hllc = new HyperLogLogPlusCounterNew(14);
- HyperLogLogPlusCounterNew one = new HyperLogLogPlusCounterNew(14);
- for (int i = 0; i < 1000000; i++) {
- one.clear();
- one.add(rand1.nextInt());
- hllc.merge(one);
- }
- System.out.println(hllc.getCountEstimate());
- assertTrue(hllc.getCountEstimate() > 1000000 * 0.9);
- }
-
- @Test
- public void tesSparseEstimate() throws IOException {
- HyperLogLogPlusCounterNew hllc = new HyperLogLogPlusCounterNew(14);
- for (int i = 0; i < 10; i++) {
- hllc.add(i);
- }
- System.out.println(hllc.getCountEstimate());
- assertTrue(hllc.getCountEstimate() > 10 * 0.9);
- }
-
- @Test
- public void countTest() throws IOException {
- int n = 10;
- for (int i = 0; i < 5; i++) {
- count(n);
- n *= 10;
- }
- }
-
- @Test
- public void mergeTest() throws IOException {
- double error = 0;
- int n = 100;
- for (int i = 0; i < n; i++) {
- double e = merge(i);
- error += e;
- }
- System.out.println("Total average error is " + error / n);
-
- System.out.println(" errorRateCount1 is " + errorCount1 + "!");
- System.out.println(" errorRateCount2 is " + errorCount2 + "!");
- System.out.println(" errorRateCount3 is " + errorCount3 + "!");
-
- Assert.assertTrue(errorCount1 <= n * 0.30);
- Assert.assertTrue(errorCount2 <= n * 0.05);
- Assert.assertTrue(errorCount3 <= n * 0.02);
- }
-
- /*
- compare the result of two different hll counter
- */
- @Test
- public void compareResult() {
- int p = 12; //4096
- int m = 1 << p;
-
- for (int t = 0; t < 5; t++) {
- //compare sparse
- HyperLogLogPlusCounterOld oldCounter = new HyperLogLogPlusCounterOld(p);
- HyperLogLogPlusCounterNew newCounter = new HyperLogLogPlusCounterNew(p);
-
- for (int i = 0; i < 20; i++) {
- //int r = rand1.nextInt();
- oldCounter.add(i);
- newCounter.add(i);
- }
- assertEquals(RegisterType.SPARSE, newCounter.getRegisterType());
- assertEquals(oldCounter.getCountEstimate(), newCounter.getCountEstimate());
- //compare dense
- for (int i = 0; i < m; i++) {
- oldCounter.add(i);
- newCounter.add(i);
- }
- assertEquals(RegisterType.DENSE, newCounter.getRegisterType());
- assertEquals(oldCounter.getCountEstimate(), newCounter.getCountEstimate());
- }
-
- }
-
- @Test
- public void testPeekLength() throws IOException {
- HyperLogLogPlusCounterNew hllc = new HyperLogLogPlusCounterNew(10);
- HyperLogLogPlusCounterNew copy = new HyperLogLogPlusCounterNew(10);
- byte[] value = new byte[10];
- for (int i = 0; i < 200000; i++) {
- rand1.nextBytes(value);
- hllc.add(value);
-
- buf.clear();
- hllc.writeRegisters(buf);
-
- int len = buf.position();
- buf.position(0);
- assertEquals(len, hllc.peekLength(buf));
-
- copy.readRegisters(buf);
- assertEquals(len, buf.position());
- assertEquals(hllc, copy);
- }
- buf.clear();
- }
-
- @Test
- public void testEquivalence() {
- byte[] a = new byte[] { 0, 3, 4, 42, 2, 2 };
- byte[] b = new byte[] { 3, 4, 42 };
- HyperLogLogPlusCounterNew ha = new HyperLogLogPlusCounterNew();
- HyperLogLogPlusCounterNew hb = new HyperLogLogPlusCounterNew();
- ha.add(a, 1, 3);
- hb.add(b);
-
- Assert.assertTrue(ha.getCountEstimate() == hb.getCountEstimate());
- }
-
- @Test
- public void testAutoChangeToSparse() {
- int p = 15;
- int m = 1 << p;
- HyperLogLogPlusCounterNew counter = new HyperLogLogPlusCounterNew(p);
- assertEquals(RegisterType.SPARSE, counter.getRegisterType());
- double over = HyperLogLogPlusCounterNew.overflowFactor * m;
- int overFlow = (int) over + 1000;
- for (int i = 0; i < overFlow; i++)
- counter.add(i);
- assertEquals(RegisterType.DENSE, counter.getRegisterType());
- }
-
- @Test
- public void testSerialilze() throws Exception {
- //test sparse serialize
- int p = 15;
- int m = 1 << p;
- HyperLogLogPlusCounterNew counter = new HyperLogLogPlusCounterNew(p);
- counter.add(123);
- assertEquals(RegisterType.SPARSE, counter.getRegisterType());
- checkSerialize(counter);
- //test dense serialize
- double over = HyperLogLogPlusCounterNew.overflowFactor * m;
- int overFlow = (int) over + 1000;
- for (int i = 0; i < overFlow; i++)
- counter.add(i);
- assertEquals(RegisterType.DENSE, counter.getRegisterType());
- checkSerialize(counter);
- }
-
- private Set<String> generateTestData(int n) {
- Set<String> testData = new HashSet<String>();
- for (int i = 0; i < n; i++) {
- String[] samples = generateSampleData();
- for (String sample : samples) {
- testData.add(sample);
- }
- }
- return testData;
- }
-
- // simulate the visit (=visitor+id)
- private String[] generateSampleData() {
-
- StringBuilder buf = new StringBuilder();
- for (int i = 0; i < 19; i++) {
- buf.append(Math.abs(rand1.nextInt()) % 10);
- }
- String header = buf.toString();
-
- int size = Math.abs(rand3.nextInt()) % 9 + 1;
- String[] samples = new String[size];
- for (int k = 0; k < size; k++) {
- buf = new StringBuilder(header);
- buf.append("-");
- for (int i = 0; i < 10; i++) {
- buf.append(Math.abs(rand3.nextInt()) % 10);
- }
- samples[k] = buf.toString();
- }
-
- return samples;
- }
-
- private double merge(int round) throws IOException {
- int ln = 20;
- int dn = 100 * (round + 1);
- Set<String> testSet = new HashSet<String>();
- HyperLogLogPlusCounterNew[] hllcs = new HyperLogLogPlusCounterNew[ln];
- for (int i = 0; i < ln; i++) {
- hllcs[i] = newHLLC();
- for (int k = 0; k < dn; k++) {
- String[] samples = generateSampleData();
- for (String data : samples) {
- testSet.add(data);
- hllcs[i].add(Bytes.toBytes(data));
- }
- }
- }
- HyperLogLogPlusCounterNew mergeHllc = newHLLC();
- for (HyperLogLogPlusCounterNew hllc : hllcs) {
- mergeHllc.merge(hllc);
- }
-
- double errorRate = mergeHllc.getErrorRate();
- long estimate = mergeHllc.getCountEstimate();
- double actualError = Math.abs((double) (testSet.size() - estimate) / testSet.size());
-
- System.out.println(testSet.size() + "-" + estimate + " ~ " + actualError);
- Assert.assertTrue(actualError < 0.1);
-
- if (actualError > errorRate) {
- errorCount1++;
- }
- if (actualError > 2 * errorRate) {
- errorCount2++;
- }
- if (actualError > 3 * errorRate) {
- errorCount3++;
- }
-
- return actualError;
- }
-
- private HyperLogLogPlusCounterNew newHLLC() {
- return new HyperLogLogPlusCounterNew(16);
- }
-
- private void count(int n) throws IOException {
- Set<String> testSet = generateTestData(n);
-
- HyperLogLogPlusCounterNew hllc = newHLLC();
- for (String testData : testSet) {
- hllc.add(Bytes.toBytes(testData));
- }
- long estimate = hllc.getCountEstimate();
- double errorRate = hllc.getErrorRate();
- double actualError = (double) Math.abs(testSet.size() - estimate) / testSet.size();
- System.out.println(estimate);
- System.out.println(testSet.size());
- System.out.println(errorRate);
- System.out.println("=" + actualError);
- Assert.assertTrue(actualError < errorRate * 3.0);
-
- checkSerialize(hllc);
- }
-
- private void checkSerialize(HyperLogLogPlusCounterNew hllc) throws IOException {
- long estimate = hllc.getCountEstimate();
- buf.clear();
- hllc.writeRegisters(buf);
- buf.flip();
- hllc.readRegisters(buf);
- Assert.assertEquals(estimate, hllc.getCountEstimate());
- }
-}
http://git-wip-us.apache.org/repos/asf/kylin/blob/e6e330a8/core-metadata/src/test/java/org/apache/kylin/measure/hll2/NewHyperLogLogBenchmarkTest.java
----------------------------------------------------------------------
diff --git a/core-metadata/src/test/java/org/apache/kylin/measure/hll2/NewHyperLogLogBenchmarkTest.java b/core-metadata/src/test/java/org/apache/kylin/measure/hll2/NewHyperLogLogBenchmarkTest.java
deleted file mode 100644
index bfb87f9..0000000
--- a/core-metadata/src/test/java/org/apache/kylin/measure/hll2/NewHyperLogLogBenchmarkTest.java
+++ /dev/null
@@ -1,288 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
-*/
-package org.apache.kylin.measure.hll2;
-
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterOld;
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterNew;
-import org.apache.kylin.measure.hllc.RegisterType;
-import org.junit.Test;
-
-import java.nio.ByteBuffer;
-import java.util.Random;
-
-import static org.junit.Assert.assertEquals;
-
-/**
- * Created by xiefan on 16-12-12.
- */
-public class NewHyperLogLogBenchmarkTest {
-
- public static final Random rand = new Random(1);
-
- final int testTimes = 10000;
-
- @Test
- public void denseToDenseRegisterMergeBenchmark() throws Exception {
- final int p = 15;
- int m = 1 << p;
-
- System.out.println("m : " + m);
- double oldFactor = HyperLogLogPlusCounterNew.overflowFactor;
- HyperLogLogPlusCounterNew.overflowFactor = 1.1; //keep sparse
- for (int cardinality : getTestDataDivide(m)) {
- final HyperLogLogPlusCounterOld oldCounter = new HyperLogLogPlusCounterOld(p);
- final HyperLogLogPlusCounterOld oldCounter2 = getRandOldCounter(p, cardinality);
- long oldTime = runTestCase(new TestCase() {
- @Override
- public void run() {
-
- for (int i = 0; i < testTimes; i++) {
- oldCounter.merge(oldCounter2);
- }
- }
- });
- final HyperLogLogPlusCounterNew newCounter = new HyperLogLogPlusCounterNew(p, RegisterType.DENSE);
- final HyperLogLogPlusCounterNew newCounter2 = new HyperLogLogPlusCounterNew(p, RegisterType.DENSE);
- for (int i = 0; i < testTimes; i++)
- newCounter2.add(i);
- long newTime = runTestCase(new TestCase() {
- @Override
- public void run() {
- for (int i = 0; i < testTimes; i++) {
- newCounter.merge(newCounter2);
- }
- }
- });
- assertEquals(RegisterType.DENSE, newCounter.getRegisterType());
- assertEquals(RegisterType.DENSE, newCounter2.getRegisterType());
- System.out.println("----------------------------");
- System.out.println("cardinality : " + cardinality);
- System.out.println("old time : " + oldTime);
- System.out.println("new time : " + newTime);
- }
- HyperLogLogPlusCounterNew.overflowFactor = oldFactor;
- }
-
- @Test
- public void sparseToSparseMergeBenchmark() throws Exception {
- final int p = 15;
- int m = 1 << p;
- System.out.println("m : " + m);
- double oldFactor = HyperLogLogPlusCounterNew.overflowFactor;
- HyperLogLogPlusCounterNew.overflowFactor = 1.1; //keep sparse
- for (int cardinality : getTestDataDivide(m)) {
- final HyperLogLogPlusCounterOld oldCounter = new HyperLogLogPlusCounterOld(p);
- final HyperLogLogPlusCounterOld oldCounter2 = getRandOldCounter(p, cardinality);
- long oldTime = runTestCase(new TestCase() {
- @Override
- public void run() {
-
- for (int i = 0; i < testTimes; i++) {
- oldCounter.merge(oldCounter2);
- }
- }
- });
- final HyperLogLogPlusCounterNew newCounter = new HyperLogLogPlusCounterNew(p);
- final HyperLogLogPlusCounterNew newCounter2 = getRandNewCounter(p, cardinality);
- long newTime = runTestCase(new TestCase() {
- @Override
- public void run() {
- for (int i = 0; i < testTimes; i++) {
- newCounter.merge(newCounter2);
- }
- }
- });
- assertEquals(RegisterType.SPARSE, newCounter.getRegisterType());
- assertEquals(RegisterType.SPARSE, newCounter2.getRegisterType());
- System.out.println("----------------------------");
- System.out.println("cardinality : " + cardinality);
- System.out.println("old time : " + oldTime);
- System.out.println("new time : " + newTime);
- }
- HyperLogLogPlusCounterNew.overflowFactor = oldFactor;
- }
-
- @Test
- public void sparseToDenseRegisterMergeBenchmark() throws Exception {
- final int p = 15;
- int m = 1 << p;
- System.out.println("m : " + m);
- double oldFactor = HyperLogLogPlusCounterNew.overflowFactor;
- HyperLogLogPlusCounterNew.overflowFactor = 1.1; //keep sparse
- for (int cardinality : getTestDataDivide(m)) {
- System.out.println("----------------------------");
- System.out.println("cardinality : " + cardinality);
- final HyperLogLogPlusCounterOld oldCounter = new HyperLogLogPlusCounterOld(p);
- final HyperLogLogPlusCounterOld oldCounter2 = getRandOldCounter(p, cardinality);
- long oldTime = runTestCase(new TestCase() {
- @Override
- public void run() {
- for (int i = 0; i < testTimes; i++) {
- oldCounter.merge(oldCounter2);
- }
- }
- });
- final HyperLogLogPlusCounterNew newCounter = new HyperLogLogPlusCounterNew(p, RegisterType.DENSE);
- final HyperLogLogPlusCounterNew newCounter2 = getRandNewCounter(p, cardinality);
- long newTime = runTestCase(new TestCase() {
- @Override
- public void run() {
- for (int i = 0; i < testTimes; i++) {
- newCounter.merge(newCounter2);
- }
- }
- });
- assertEquals(RegisterType.DENSE, newCounter.getRegisterType());
- assertEquals(RegisterType.SPARSE, newCounter2.getRegisterType());
- System.out.println("old time : " + oldTime);
- System.out.println("new time : " + newTime);
- }
- HyperLogLogPlusCounterNew.overflowFactor = oldFactor;
- }
-
- @Test
- public void sparseSerializeBenchmark() throws Exception {
- final int p = 15;
- int m = 1 << p;
- double oldFactor = HyperLogLogPlusCounterNew.overflowFactor;
- HyperLogLogPlusCounterNew.overflowFactor = 1.1; //keep sparse
- for (int cardinality : getTestDataDivide(m)) {
- System.out.println("----------------------------");
- System.out.println("cardinality : " + cardinality);
- final HyperLogLogPlusCounterOld oldCounter = getRandOldCounter(p, cardinality);
- long oldTime = runTestCase(new TestCase() {
- @Override
- public void run() throws Exception {
- ByteBuffer buf = ByteBuffer.allocate(1024 * 1024);
- long totalBytes = 0;
- for (int i = 0; i < testTimes; i++) {
- buf.clear();
- oldCounter.writeRegisters(buf);
- totalBytes += buf.position();
- buf.flip();
- oldCounter.readRegisters(buf);
- }
- System.out.println("old serialize bytes : " + totalBytes / testTimes + "B");
- }
- });
- final HyperLogLogPlusCounterNew newCounter = getRandNewCounter(p, cardinality);
- long newTime = runTestCase(new TestCase() {
- @Override
- public void run() throws Exception {
- ByteBuffer buf = ByteBuffer.allocate(1024 * 1024);
- long totalBytes = 0;
- for (int i = 0; i < testTimes; i++) {
- buf.clear();
- newCounter.writeRegisters(buf);
- totalBytes += buf.position();
- buf.flip();
- newCounter.readRegisters(buf);
- }
- System.out.println("new serialize bytes : " + totalBytes / testTimes + "B");
- }
- });
- assertEquals(RegisterType.SPARSE, newCounter.getRegisterType());
- System.out.println("old serialize time : " + oldTime);
- System.out.println("new serialize time : " + newTime);
- }
- HyperLogLogPlusCounterNew.overflowFactor = oldFactor;
- }
-
- @Test
- public void denseSerializeBenchmark() throws Exception {
- final int p = 15;
- int m = 1 << p;
- double oldFactor = HyperLogLogPlusCounterNew.overflowFactor;
- HyperLogLogPlusCounterNew.overflowFactor = 0; //keep sparse
- for (int cardinality : getTestDataDivide(m)) {
- System.out.println("----------------------------");
- System.out.println("cardinality : " + cardinality);
- final HyperLogLogPlusCounterOld oldCounter = getRandOldCounter(p, cardinality);
- long oldTime = runTestCase(new TestCase() {
- @Override
- public void run() throws Exception {
- ByteBuffer buf = ByteBuffer.allocate(1024 * 1024);
- long totalBytes = 0;
- for (int i = 0; i < testTimes; i++) {
- buf.clear();
- oldCounter.writeRegisters(buf);
- totalBytes += buf.position();
- buf.flip();
- oldCounter.readRegisters(buf);
- }
- System.out.println("old serialize bytes : " + totalBytes / testTimes + "B");
- }
- });
- final HyperLogLogPlusCounterNew newCounter = getRandNewCounter(p, cardinality, RegisterType.DENSE);
- long newTime = runTestCase(new TestCase() {
- @Override
- public void run() throws Exception {
- ByteBuffer buf = ByteBuffer.allocate(1024 * 1024);
- long totalBytes = 0;
- for (int i = 0; i < testTimes; i++) {
- buf.clear();
- newCounter.writeRegisters(buf);
- totalBytes += buf.position();
- buf.flip();
- newCounter.readRegisters(buf);
- }
- System.out.println("new serialize bytes : " + totalBytes / testTimes + "B");
- }
- });
- assertEquals(RegisterType.DENSE, newCounter.getRegisterType());
- System.out.println("old serialize time : " + oldTime);
- System.out.println("new serialize time : " + newTime);
- }
- HyperLogLogPlusCounterNew.overflowFactor = oldFactor;
- }
-
- interface TestCase {
- void run() throws Exception;
- }
-
- public long runTestCase(TestCase testCase) throws Exception {
- long startTime = System.currentTimeMillis();
- testCase.run();
- return System.currentTimeMillis() - startTime;
- }
-
- public HyperLogLogPlusCounterOld getRandOldCounter(int p, int num) {
- HyperLogLogPlusCounterOld c = new HyperLogLogPlusCounterOld(p);
- for (int i = 0; i < num; i++)
- c.add(i);
- return c;
- }
-
- public HyperLogLogPlusCounterNew getRandNewCounter(int p, int num) {
- HyperLogLogPlusCounterNew c = new HyperLogLogPlusCounterNew(p);
- for (int i = 0; i < num; i++)
- c.add(i);
- return c;
- }
-
- public HyperLogLogPlusCounterNew getRandNewCounter(int p, int num, RegisterType type) {
- HyperLogLogPlusCounterNew c = new HyperLogLogPlusCounterNew(p, type);
- for (int i = 0; i < num; i++)
- c.add(i);
- return c;
- }
-
- public static int[] getTestDataDivide(int m) {
- return new int[] { 1, 5, 10, 100, m / 200, m / 100, m / 50, m / 20, m / 10, m };
- }
-}
http://git-wip-us.apache.org/repos/asf/kylin/blob/e6e330a8/core-metadata/src/test/java/org/apache/kylin/measure/hllc/HLLCounterOldTest.java
----------------------------------------------------------------------
diff --git a/core-metadata/src/test/java/org/apache/kylin/measure/hllc/HLLCounterOldTest.java b/core-metadata/src/test/java/org/apache/kylin/measure/hllc/HLLCounterOldTest.java
new file mode 100644
index 0000000..c4a97cd
--- /dev/null
+++ b/core-metadata/src/test/java/org/apache/kylin/measure/hllc/HLLCounterOldTest.java
@@ -0,0 +1,266 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package org.apache.kylin.measure.hllc;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.HashSet;
+import java.util.Random;
+import java.util.Set;
+
+import org.apache.kylin.common.util.Bytes;
+import org.apache.kylin.measure.hllc.HLLCounterOld;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * @author yangli9
+ *
+ */
+@SuppressWarnings("deprecation")
+public class HLLCounterOldTest {
+
+ ByteBuffer buf = ByteBuffer.allocate(1024 * 1024);
+ Random rand1 = new Random(1);
+ Random rand2 = new Random(2);
+ Random rand3 = new Random(3);
+ int errorCount1 = 0;
+ int errorCount2 = 0;
+ int errorCount3 = 0;
+
+ @Test
+ public void testOneAdd() throws IOException {
+ HLLCounterOld hllc = new HLLCounterOld(14);
+ HLLCounterOld one = new HLLCounterOld(14);
+ for (int i = 0; i < 1000000; i++) {
+ one.clear();
+ one.add(rand1.nextInt());
+ hllc.merge(one);
+ }
+ assertTrue(hllc.getCountEstimate() > 1000000 * 0.9);
+ }
+
+ @Test
+ public void testPeekLength() throws IOException {
+ HLLCounterOld hllc = new HLLCounterOld(10);
+ HLLCounterOld copy = new HLLCounterOld(10);
+ byte[] value = new byte[10];
+ for (int i = 0; i < 200000; i++) {
+ rand1.nextBytes(value);
+ hllc.add(value);
+
+ buf.clear();
+ hllc.writeRegisters(buf);
+
+ int len = buf.position();
+ buf.position(0);
+ assertEquals(len, hllc.peekLength(buf));
+
+ copy.readRegisters(buf);
+ assertEquals(len, buf.position());
+ assertEquals(hllc, copy);
+ }
+ buf.clear();
+ }
+
+ private Set<String> generateTestData(int n) {
+ Set<String> testData = new HashSet<String>();
+ for (int i = 0; i < n; i++) {
+ String[] samples = generateSampleData();
+ for (String sample : samples) {
+ testData.add(sample);
+ }
+ }
+ return testData;
+ }
+
+ // simulate the visit (=visitor+id)
+ private String[] generateSampleData() {
+
+ StringBuilder buf = new StringBuilder();
+ for (int i = 0; i < 19; i++) {
+ buf.append(Math.abs(rand1.nextInt()) % 10);
+ }
+ String header = buf.toString();
+
+ int size = Math.abs(rand3.nextInt()) % 9 + 1;
+ String[] samples = new String[size];
+ for (int k = 0; k < size; k++) {
+ buf = new StringBuilder(header);
+ buf.append("-");
+ for (int i = 0; i < 10; i++) {
+ buf.append(Math.abs(rand3.nextInt()) % 10);
+ }
+ samples[k] = buf.toString();
+ }
+
+ return samples;
+ }
+
+ @Test
+ public void countTest() throws IOException {
+ int n = 10;
+ for (int i = 0; i < 5; i++) {
+ count(n);
+ n *= 10;
+ }
+ }
+
+ private void count(int n) throws IOException {
+ Set<String> testSet = generateTestData(n);
+
+ HLLCounterOld hllc = newHLLC();
+ for (String testData : testSet) {
+ hllc.add(Bytes.toBytes(testData));
+ }
+ long estimate = hllc.getCountEstimate();
+ double errorRate = hllc.getErrorRate();
+ double actualError = (double) Math.abs(testSet.size() - estimate) / testSet.size();
+ System.out.println(estimate);
+ System.out.println(testSet.size());
+ System.out.println(errorRate);
+ System.out.println("=" + actualError);
+ Assert.assertTrue(actualError < errorRate * 3.0);
+
+ checkSerialize(hllc);
+ }
+
+ private void checkSerialize(HLLCounterOld hllc) throws IOException {
+ long estimate = hllc.getCountEstimate();
+ buf.clear();
+ hllc.writeRegisters(buf);
+ buf.flip();
+ hllc.readRegisters(buf);
+ Assert.assertEquals(estimate, hllc.getCountEstimate());
+ }
+
+ @Test
+ public void mergeTest() throws IOException {
+ double error = 0;
+ int n = 100;
+ for (int i = 0; i < n; i++) {
+ double e = merge(i);
+ error += e;
+ }
+ System.out.println("Total average error is " + error / n);
+
+ System.out.println(" errorRateCount1 is " + errorCount1 + "!");
+ System.out.println(" errorRateCount2 is " + errorCount2 + "!");
+ System.out.println(" errorRateCount3 is " + errorCount3 + "!");
+
+ Assert.assertTrue(errorCount1 <= n * 0.30);
+ Assert.assertTrue(errorCount2 <= n * 0.05);
+ Assert.assertTrue(errorCount3 <= n * 0.02);
+ }
+
+ private double merge(int round) throws IOException {
+ int ln = 20;
+ int dn = 100 * (round + 1);
+ Set<String> testSet = new HashSet<String>();
+ HLLCounterOld[] hllcs = new HLLCounterOld[ln];
+ for (int i = 0; i < ln; i++) {
+ hllcs[i] = newHLLC();
+ for (int k = 0; k < dn; k++) {
+ String[] samples = generateSampleData();
+ for (String data : samples) {
+ testSet.add(data);
+ hllcs[i].add(Bytes.toBytes(data));
+ }
+ }
+ }
+ HLLCounterOld mergeHllc = newHLLC();
+ for (HLLCounterOld hllc : hllcs) {
+ mergeHllc.merge(serDes(hllc));
+ }
+
+ double errorRate = mergeHllc.getErrorRate();
+ long estimate = mergeHllc.getCountEstimate();
+ double actualError = Math.abs((double) (testSet.size() - estimate) / testSet.size());
+
+ System.out.println(testSet.size() + "-" + estimate + " ~ " + actualError);
+ Assert.assertTrue(actualError < 0.1);
+
+ if (actualError > errorRate) {
+ errorCount1++;
+ }
+ if (actualError > 2 * errorRate) {
+ errorCount2++;
+ }
+ if (actualError > 3 * errorRate) {
+ errorCount3++;
+ }
+
+ return actualError;
+ }
+
+ private HLLCounterOld serDes(HLLCounterOld hllc) throws IOException {
+ buf.clear();
+ hllc.writeRegisters(buf);
+ buf.flip();
+ HLLCounterOld copy = new HLLCounterOld(hllc.getPrecision());
+ copy.readRegisters(buf);
+ Assert.assertEquals(copy.getCountEstimate(), hllc.getCountEstimate());
+ return copy;
+ }
+
+ @Test
+ public void testPerformance() throws IOException {
+ int N = 3; // reduce N HLLC into one
+ int M = 1000; // for M times, use 100000 for real perf test
+
+ HLLCounterOld samples[] = new HLLCounterOld[N];
+ for (int i = 0; i < N; i++) {
+ samples[i] = newHLLC();
+ for (String str : generateTestData(10000))
+ samples[i].add(str);
+ }
+
+ System.out.println("Perf test running ... ");
+ long start = System.currentTimeMillis();
+ HLLCounterOld sum = newHLLC();
+ for (int i = 0; i < M; i++) {
+ sum.clear();
+ for (int j = 0; j < N; j++) {
+ sum.merge(samples[j]);
+ checkSerialize(sum);
+ }
+ }
+ long duration = System.currentTimeMillis() - start;
+ System.out.println("Perf test result: " + duration / 1000 + " seconds");
+ }
+
+ @Test
+ public void testEquivalence() {
+ byte[] a = new byte[] { 0, 3, 4, 42, 2, 2 };
+ byte[] b = new byte[] { 3, 4, 42 };
+ HLLCounterOld ha = new HLLCounterOld();
+ HLLCounterOld hb = new HLLCounterOld();
+ ha.add(a, 1, 3);
+ hb.add(b);
+
+ Assert.assertTrue(ha.getCountEstimate() == hb.getCountEstimate());
+ }
+
+ private HLLCounterOld newHLLC() {
+ return new HLLCounterOld(16);
+ }
+}
http://git-wip-us.apache.org/repos/asf/kylin/blob/e6e330a8/core-metadata/src/test/java/org/apache/kylin/measure/hllc/HLLCounterTest.java
----------------------------------------------------------------------
diff --git a/core-metadata/src/test/java/org/apache/kylin/measure/hllc/HLLCounterTest.java b/core-metadata/src/test/java/org/apache/kylin/measure/hllc/HLLCounterTest.java
new file mode 100644
index 0000000..26ad4a7
--- /dev/null
+++ b/core-metadata/src/test/java/org/apache/kylin/measure/hllc/HLLCounterTest.java
@@ -0,0 +1,316 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+package org.apache.kylin.measure.hllc;
+
+import org.apache.kylin.common.util.Bytes;
+import org.apache.kylin.measure.hllc.HLLCounterOld;
+import org.apache.kylin.measure.hllc.HLLCounter;
+import org.apache.kylin.measure.hllc.RegisterType;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.HashSet;
+import java.util.Random;
+import java.util.Set;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Created by xiefan on 16-12-12.
+ */
+@SuppressWarnings("deprecation")
+public class HLLCounterTest {
+ // Shared 1 MB scratch buffer used by the serialization round trips in this class.
+ ByteBuffer buf = ByteBuffer.allocate(1024 * 1024);
+ // Fixed seeds: the generated test data is the same sequence on every run.
+ Random rand1 = new Random(1);
+ // NOTE(review): rand2 is not referenced anywhere in this class.
+ Random rand2 = new Random(2);
+ Random rand3 = new Random(3);
+ // Number of merge() rounds whose actual error exceeded 1x, 2x and 3x the
+ // error rate reported by the merged counter.
+ int errorCount1 = 0;
+ int errorCount2 = 0;
+ int errorCount3 = 0;
+
+    /**
+     * Merges one million single-value counters into one accumulator and
+     * checks that the estimate ends up above 90% of one million.
+     */
+    @Test
+    public void testOneAdd() throws IOException {
+        final int precision = 14;
+        final int additions = 1000000;
+        HLLCounter accumulator = new HLLCounter(precision);
+        HLLCounter single = new HLLCounter(precision);
+
+        int done = 0;
+        while (done < additions) {
+            // Reuse one counter: reset it, add one random int, fold it in.
+            single.clear();
+            single.add(rand1.nextInt());
+            accumulator.merge(single);
+            done++;
+        }
+
+        System.out.println(accumulator.getCountEstimate());
+        assertTrue(accumulator.getCountEstimate() > additions * 0.9);
+    }
+
+    /**
+     * Adds the ints 0..9 to a fresh counter and checks that the estimate is
+     * above 90% of ten.
+     */
+    @Test
+    public void tesSparseEstimate() throws IOException {
+        final int values = 10;
+        HLLCounter counter = new HLLCounter(14);
+        int next = 0;
+        while (next < values) {
+            counter.add(next);
+            next++;
+        }
+        System.out.println(counter.getCountEstimate());
+        assertTrue(counter.getCountEstimate() > values * 0.9);
+    }
+
+    /** Runs {@link #count(int)} for n = 10, 100, 1000, 10000 and 100000. */
+    @Test
+    public void countTest() throws IOException {
+        for (int size = 10; size <= 100000; size *= 10) {
+            count(size);
+        }
+    }
+
+    /**
+     * Runs 100 merge rounds, prints the average error, and bounds how many
+     * rounds may exceed 1x, 2x and 3x the reported error rate
+     * (at most 30%, 5% and 2% of the rounds respectively).
+     */
+    @Test
+    public void mergeTest() throws IOException {
+        final int rounds = 100;
+        double errorSum = 0;
+        for (int round = 0; round < rounds; round++) {
+            errorSum += merge(round);
+        }
+        System.out.println("Total average error is " + errorSum / rounds);
+
+        System.out.println(" errorRateCount1 is " + errorCount1 + "!");
+        System.out.println(" errorRateCount2 is " + errorCount2 + "!");
+        System.out.println(" errorRateCount3 is " + errorCount3 + "!");
+
+        Assert.assertTrue(errorCount1 <= rounds * 0.30);
+        Assert.assertTrue(errorCount2 <= rounds * 0.05);
+        Assert.assertTrue(errorCount3 <= rounds * 0.02);
+    }
+
+    /**
+     * Cross-checks the old and the new counter implementation. In each of five
+     * trials both receive the same random ints, first few enough that the new
+     * counter is SPARSE, then enough that it is DENSE. In both phases the two
+     * estimates must match, and registers written by the old counter must be
+     * readable by a new counter with the same resulting estimate.
+     */
+    @Test
+    public void compareResult() throws IOException {
+        final int p = 12; //4096
+        final int registerCount = 1 << p;
+
+        final ByteBuffer scratch = ByteBuffer.allocate(1024 * 1024);
+
+        for (int trial = 0; trial < 5; trial++) {
+            HLLCounterOld oldCounter = new HLLCounterOld(p);
+            HLLCounter newCounter = new HLLCounter(p);
+            HLLCounter newCounter2 = new HLLCounter(p);
+
+            //compare sparse
+            addSameRandomInts(oldCounter, newCounter, 20);
+            assertEquals(RegisterType.SPARSE, newCounter.getRegisterType());
+            assertEquals(oldCounter.getCountEstimate(), newCounter.getCountEstimate());
+            assertNewReadsOldBytes(scratch, oldCounter, newCounter2);
+
+            //compare dense
+            addSameRandomInts(oldCounter, newCounter, registerCount / 2);
+            assertEquals(RegisterType.DENSE, newCounter.getRegisterType());
+            assertEquals(oldCounter.getCountEstimate(), newCounter.getCountEstimate());
+            assertNewReadsOldBytes(scratch, oldCounter, newCounter2);
+        }
+    }
+
+    /** Adds the same {@code howMany} ints drawn from rand1 to both counters. */
+    private void addSameRandomInts(HLLCounterOld oldCounter, HLLCounter newCounter, int howMany) {
+        for (int added = 0; added < howMany; added++) {
+            int r = rand1.nextInt();
+            oldCounter.add(r);
+            newCounter.add(r);
+        }
+    }
+
+    /** Writes the old counter's registers, reads them into {@code target}, and compares estimates. */
+    private void assertNewReadsOldBytes(ByteBuffer scratch, HLLCounterOld source, HLLCounter target) throws IOException {
+        scratch.clear();
+        source.writeRegisters(scratch);
+        scratch.flip();
+        target.readRegisters(scratch);
+        assertEquals(source.getCountEstimate(), target.getCountEstimate());
+    }
+
+    /**
+     * After each of 200000 random 10-byte additions, serializes the counter and
+     * checks that peekLength reports the number of bytes written, that reading
+     * consumes exactly that many bytes, and that the re-read counter equals
+     * the original.
+     */
+    @Test
+    public void testPeekLength() throws IOException {
+        final int precision = 10;
+        HLLCounter source = new HLLCounter(precision);
+        HLLCounter restored = new HLLCounter(precision);
+        byte[] value = new byte[10];
+
+        int iteration = 0;
+        while (iteration < 200000) {
+            rand1.nextBytes(value);
+            source.add(value);
+
+            buf.clear();
+            source.writeRegisters(buf);
+
+            // Position after writing is the serialized length; go back to the start to read.
+            final int written = buf.position();
+            buf.rewind();
+            assertEquals(written, source.peekLength(buf));
+
+            restored.readRegisters(buf);
+            assertEquals(written, buf.position());
+            assertEquals(source, restored);
+
+            iteration++;
+        }
+        buf.clear();
+    }
+
+    /**
+     * Adding a slice of a larger array must give the same estimate as adding
+     * a standalone array that holds the same bytes.
+     */
+    @Test
+    public void testEquivalence() {
+        byte[] a = new byte[] { 0, 3, 4, 42, 2, 2 };
+        byte[] b = new byte[] { 3, 4, 42 };
+        HLLCounter ha = new HLLCounter();
+        HLLCounter hb = new HLLCounter();
+        ha.add(a, 1, 3); // presumably (array, offset, length), i.e. bytes {3, 4, 42} - confirm
+        hb.add(b);
+
+        // assertEquals reports both estimates on failure; assertTrue(x == y) would not.
+        assertEquals(hb.getCountEstimate(), ha.getCountEstimate());
+    }
+
+    /**
+     * Despite the method name, this checks the switch from SPARSE to DENSE:
+     * a fresh counter starts SPARSE and is DENSE after more than
+     * OVERFLOW_FACTOR * m distinct ints were added.
+     */
+    @Test
+    public void testAutoChangeToSparse() {
+        final int p = 15;
+        final int registerCount = 1 << p;
+        HLLCounter counter = new HLLCounter(p);
+        assertEquals(RegisterType.SPARSE, counter.getRegisterType());
+
+        // Go 1000 values past OVERFLOW_FACTOR * registerCount.
+        final int valuesToAdd = (int) (HLLCounter.OVERFLOW_FACTOR * registerCount) + 1000;
+        int value = 0;
+        while (value < valuesToAdd) {
+            counter.add(value);
+            value++;
+        }
+        assertEquals(RegisterType.DENSE, counter.getRegisterType());
+    }
+
+    /**
+     * Checks the serialization round trip for one counter in both register
+     * types: first while it is SPARSE (one value added), then after enough
+     * values were added to make it DENSE.
+     */
+    @Test
+    public void testSerialilze() throws Exception {
+        final int p = 15;
+        final int registerCount = 1 << p;
+        HLLCounter counter = new HLLCounter(p);
+
+        //test sparse serialize
+        counter.add(123);
+        assertEquals(RegisterType.SPARSE, counter.getRegisterType());
+        checkSerialize(counter);
+
+        //test dense serialize
+        final int valuesToAdd = (int) (HLLCounter.OVERFLOW_FACTOR * registerCount) + 1000;
+        int value = 0;
+        while (value < valuesToAdd) {
+            counter.add(value);
+            value++;
+        }
+        assertEquals(RegisterType.DENSE, counter.getRegisterType());
+        checkSerialize(counter);
+    }
+
+    /**
+     * Collects the samples of {@code n} calls to generateSampleData() into a set.
+     *
+     * @param n number of sample batches to generate
+     * @return the distinct generated strings
+     */
+    private Set<String> generateTestData(int n) {
+        Set<String> generated = new HashSet<String>();
+        int batch = 0;
+        while (batch < n) {
+            for (String sample : generateSampleData()) {
+                generated.add(sample);
+            }
+            batch++;
+        }
+        return generated;
+    }
+
+    /**
+     * Simulates visits (= visitor + id): builds one 19-digit header from rand1
+     * and returns 1 to 9 strings of the form {@code header-<10 digits>}, with
+     * the count and the 10-digit suffixes drawn from rand3.
+     *
+     * @return the generated sample strings, never empty
+     */
+    private String[] generateSampleData() {
+
+        StringBuilder buf = new StringBuilder();
+        for (int i = 0; i < 19; i++) {
+            // abs(x % 10) instead of abs(x) % 10: Math.abs(Integer.MIN_VALUE) is still
+            // negative and would append "-8". All other inputs give the same digit.
+            buf.append(Math.abs(rand1.nextInt() % 10));
+        }
+        String header = buf.toString();
+
+        // Same reasoning: abs(x) % 9 + 1 could yield -1 and a NegativeArraySizeException.
+        int size = Math.abs(rand3.nextInt() % 9) + 1;
+        String[] samples = new String[size];
+        for (int k = 0; k < size; k++) {
+            buf = new StringBuilder(header);
+            buf.append("-");
+            for (int i = 0; i < 10; i++) {
+                buf.append(Math.abs(rand3.nextInt() % 10));
+            }
+            samples[k] = buf.toString();
+        }
+
+        return samples;
+    }
+
+    /**
+     * Fills 20 counters with generated samples, merges them into one, and
+     * compares the merged estimate with the exact distinct count. Asserts that
+     * the actual error is below 10% and bumps errorCount1/2/3 when it exceeds
+     * 1x/2x/3x the error rate reported by the merged counter.
+     *
+     * @param round zero-based round number; each counter gets 100 * (round + 1) sample batches
+     * @return the actual relative error of the merged estimate
+     */
+    private double merge(int round) throws IOException {
+        final int counterCount = 20;
+        final int batchesPerCounter = 100 * (round + 1);
+        Set<String> distinct = new HashSet<String>();
+        HLLCounter[] parts = new HLLCounter[counterCount];
+        for (int c = 0; c < counterCount; c++) {
+            HLLCounter part = newHLLC();
+            parts[c] = part;
+            for (int batch = 0; batch < batchesPerCounter; batch++) {
+                for (String data : generateSampleData()) {
+                    distinct.add(data);
+                    part.add(Bytes.toBytes(data));
+                }
+            }
+        }
+
+        HLLCounter merged = newHLLC();
+        for (int c = 0; c < parts.length; c++) {
+            merged.merge(parts[c]);
+        }
+
+        double errorRate = merged.getErrorRate();
+        long estimate = merged.getCountEstimate();
+        double actualError = Math.abs((double) (distinct.size() - estimate) / distinct.size());
+
+        System.out.println(distinct.size() + "-" + estimate + " ~ " + actualError);
+        Assert.assertTrue(actualError < 0.1);
+
+        errorCount1 += (actualError > errorRate) ? 1 : 0;
+        errorCount2 += (actualError > 2 * errorRate) ? 1 : 0;
+        errorCount3 += (actualError > 3 * errorRate) ? 1 : 0;
+
+        return actualError;
+    }
+
+    /** Creates an empty counter with the constructor argument 16 used by count() and merge(). */
+    private HLLCounter newHLLC() {
+        final int p = 16;
+        return new HLLCounter(p);
+    }
+
+    /**
+     * Adds a generated data set to a fresh counter, prints the estimate, the
+     * exact size, the reported error rate and the actual error, asserts that
+     * the actual error is below three times the reported rate, and finally
+     * checks the serialization round trip.
+     *
+     * @param n number of sample batches to generate
+     */
+    private void count(int n) throws IOException {
+        Set<String> distinct = generateTestData(n);
+
+        HLLCounter counter = newHLLC();
+        for (String item : distinct) {
+            counter.add(Bytes.toBytes(item));
+        }
+
+        long estimate = counter.getCountEstimate();
+        double errorRate = counter.getErrorRate();
+        double actualError = (double) Math.abs(distinct.size() - estimate) / distinct.size();
+
+        System.out.println(estimate);
+        System.out.println(distinct.size());
+        System.out.println(errorRate);
+        System.out.println("=" + actualError);
+        Assert.assertTrue(actualError < errorRate * 3.0);
+
+        checkSerialize(counter);
+    }
+
+    /**
+     * Writes the counter's registers to the shared buffer, reads them back
+     * into the same counter, and asserts that the estimate did not change.
+     *
+     * @param hllc the counter to round-trip; it is overwritten by what is read back
+     */
+    private void checkSerialize(HLLCounter hllc) throws IOException {
+        final long estimateBefore = hllc.getCountEstimate();
+
+        buf.clear();
+        hllc.writeRegisters(buf);
+        buf.flip();
+        hllc.readRegisters(buf);
+
+        Assert.assertEquals(estimateBefore, hllc.getCountEstimate());
+    }
+}
http://git-wip-us.apache.org/repos/asf/kylin/blob/e6e330a8/core-metadata/src/test/java/org/apache/kylin/measure/hllc/NewHyperLogLogBenchmarkTest.java
----------------------------------------------------------------------
diff --git a/core-metadata/src/test/java/org/apache/kylin/measure/hllc/NewHyperLogLogBenchmarkTest.java b/core-metadata/src/test/java/org/apache/kylin/measure/hllc/NewHyperLogLogBenchmarkTest.java
new file mode 100644
index 0000000..586c007
--- /dev/null
+++ b/core-metadata/src/test/java/org/apache/kylin/measure/hllc/NewHyperLogLogBenchmarkTest.java
@@ -0,0 +1,291 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+package org.apache.kylin.measure.hllc;
+
+import org.apache.kylin.measure.hllc.HLLCounterOld;
+import org.apache.kylin.measure.hllc.HLLCounter;
+import org.apache.kylin.measure.hllc.RegisterType;
+import org.junit.Test;
+
+import java.nio.ByteBuffer;
+import java.util.Random;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Created by xiefan on 16-12-12.
+ */
+@SuppressWarnings("deprecation")
+public class NewHyperLogLogBenchmarkTest {
+
+ // NOTE(review): rand is not referenced anywhere in this class; the counters
+ // are filled with sequential ints.
+ public static final Random rand = new Random(1);
+
+ // Number of merge / serialize repetitions timed per test case.
+ final int testTimes = 10000;
+
+    /**
+     * For each cardinality from {@link #getTestDataDivide(int)}, times
+     * {@code testTimes} merges of a filled counter into an empty one, once with
+     * the old implementation and once with two DENSE new counters, and prints
+     * both timings.
+     */
+    @Test
+    public void denseToDenseRegisterMergeBenchmark() throws Exception {
+        final int p = 15;
+        int m = 1 << p;
+
+        System.out.println("denseToDenseRegisterMergeBenchmark(), m : " + m);
+        double oldFactor = HLLCounter.OVERFLOW_FACTOR;
+        // Same override as the sibling benchmarks. The new counters below are created
+        // DENSE explicitly, so this setting is not what makes them dense.
+        HLLCounter.OVERFLOW_FACTOR = 1.1;
+        try {
+            for (int cardinality : getTestDataDivide(m)) {
+                final HLLCounterOld oldCounter = new HLLCounterOld(p);
+                final HLLCounterOld oldCounter2 = getRandOldCounter(p, cardinality);
+                long oldTime = runTestCase(new TestCase() {
+                    @Override
+                    public void run() {
+
+                        for (int i = 0; i < testTimes; i++) {
+                            oldCounter.merge(oldCounter2);
+                        }
+                    }
+                });
+                final HLLCounter newCounter = new HLLCounter(p, RegisterType.DENSE);
+                // Fill with `cardinality` values, like oldCounter2. The previous code always
+                // added testTimes values, so old and new timings were not comparable.
+                final HLLCounter newCounter2 = getRandNewCounter(p, cardinality, RegisterType.DENSE);
+                long newTime = runTestCase(new TestCase() {
+                    @Override
+                    public void run() {
+                        for (int i = 0; i < testTimes; i++) {
+                            newCounter.merge(newCounter2);
+                        }
+                    }
+                });
+                assertEquals(RegisterType.DENSE, newCounter.getRegisterType());
+                assertEquals(RegisterType.DENSE, newCounter2.getRegisterType());
+                System.out.println("----------------------------");
+                System.out.println("cardinality : " + cardinality);
+                System.out.println("old time : " + oldTime);
+                System.out.println("new time : " + newTime);
+            }
+        } finally {
+            // Restore the static setting even when an assertion fails, so other tests are unaffected.
+            HLLCounter.OVERFLOW_FACTOR = oldFactor;
+        }
+    }
+
+    /**
+     * For each cardinality from {@link #getTestDataDivide(int)}, times
+     * {@code testTimes} merges of a filled counter into an empty one, once with
+     * the old implementation and once with two new counters that must both
+     * still be SPARSE afterwards, and prints both timings.
+     */
+    @Test
+    public void sparseToSparseMergeBenchmark() throws Exception {
+        final int p = 15;
+        int m = 1 << p;
+        System.out.println("sparseToSparseMergeBenchmark(), m : " + m);
+        double oldFactor = HLLCounter.OVERFLOW_FACTOR;
+        HLLCounter.OVERFLOW_FACTOR = 1.1; //keep sparse
+        try {
+            for (int cardinality : getTestDataDivide(m)) {
+                final HLLCounterOld oldCounter = new HLLCounterOld(p);
+                final HLLCounterOld oldCounter2 = getRandOldCounter(p, cardinality);
+                long oldTime = runTestCase(new TestCase() {
+                    @Override
+                    public void run() {
+
+                        for (int i = 0; i < testTimes; i++) {
+                            oldCounter.merge(oldCounter2);
+                        }
+                    }
+                });
+                final HLLCounter newCounter = new HLLCounter(p);
+                final HLLCounter newCounter2 = getRandNewCounter(p, cardinality);
+                long newTime = runTestCase(new TestCase() {
+                    @Override
+                    public void run() {
+                        for (int i = 0; i < testTimes; i++) {
+                            newCounter.merge(newCounter2);
+                        }
+                    }
+                });
+                assertEquals(RegisterType.SPARSE, newCounter.getRegisterType());
+                assertEquals(RegisterType.SPARSE, newCounter2.getRegisterType());
+                System.out.println("----------------------------");
+                System.out.println("cardinality : " + cardinality);
+                System.out.println("old time : " + oldTime);
+                System.out.println("new time : " + newTime);
+            }
+        } finally {
+            // Restore the static setting even when an assertion fails, so other tests are unaffected.
+            HLLCounter.OVERFLOW_FACTOR = oldFactor;
+        }
+    }
+
+    /**
+     * For each cardinality from {@link #getTestDataDivide(int)}, times
+     * {@code testTimes} merges of a filled counter into an empty one, once with
+     * the old implementation and once merging a SPARSE new counter into a
+     * DENSE new counter, and prints both timings.
+     */
+    @Test
+    public void sparseToDenseRegisterMergeBenchmark() throws Exception {
+        final int p = 15;
+        int m = 1 << p;
+        System.out.println("sparseToDenseRegisterMergeBenchmark(), m : " + m);
+        double oldFactor = HLLCounter.OVERFLOW_FACTOR;
+        HLLCounter.OVERFLOW_FACTOR = 1.1; //keep sparse
+        try {
+            for (int cardinality : getTestDataDivide(m)) {
+                System.out.println("----------------------------");
+                System.out.println("cardinality : " + cardinality);
+                final HLLCounterOld oldCounter = new HLLCounterOld(p);
+                final HLLCounterOld oldCounter2 = getRandOldCounter(p, cardinality);
+                long oldTime = runTestCase(new TestCase() {
+                    @Override
+                    public void run() {
+                        for (int i = 0; i < testTimes; i++) {
+                            oldCounter.merge(oldCounter2);
+                        }
+                    }
+                });
+                final HLLCounter newCounter = new HLLCounter(p, RegisterType.DENSE);
+                final HLLCounter newCounter2 = getRandNewCounter(p, cardinality);
+                long newTime = runTestCase(new TestCase() {
+                    @Override
+                    public void run() {
+                        for (int i = 0; i < testTimes; i++) {
+                            newCounter.merge(newCounter2);
+                        }
+                    }
+                });
+                assertEquals(RegisterType.DENSE, newCounter.getRegisterType());
+                assertEquals(RegisterType.SPARSE, newCounter2.getRegisterType());
+                System.out.println("old time : " + oldTime);
+                System.out.println("new time : " + newTime);
+            }
+        } finally {
+            // Restore the static setting even when an assertion fails, so other tests are unaffected.
+            HLLCounter.OVERFLOW_FACTOR = oldFactor;
+        }
+    }
+
+    /**
+     * For each cardinality from {@link #getTestDataDivide(int)}, times
+     * {@code testTimes} write/read round trips of an old counter and of a
+     * SPARSE new counter, printing the average serialized size and the
+     * elapsed time of both.
+     */
+    @Test
+    public void sparseSerializeBenchmark() throws Exception {
+        final int p = 15;
+        int m = 1 << p;
+        double oldFactor = HLLCounter.OVERFLOW_FACTOR;
+        HLLCounter.OVERFLOW_FACTOR = 1.1; //keep sparse
+        try {
+            System.out.println("sparseSerializeBenchmark()");
+            for (int cardinality : getTestDataDivide(m)) {
+                System.out.println("----------------------------");
+                System.out.println("cardinality : " + cardinality);
+                final HLLCounterOld oldCounter = getRandOldCounter(p, cardinality);
+                long oldTime = runTestCase(new TestCase() {
+                    @Override
+                    public void run() throws Exception {
+                        ByteBuffer buf = ByteBuffer.allocate(1024 * 1024);
+                        long totalBytes = 0;
+                        for (int i = 0; i < testTimes; i++) {
+                            buf.clear();
+                            oldCounter.writeRegisters(buf);
+                            totalBytes += buf.position();
+                            buf.flip();
+                            oldCounter.readRegisters(buf);
+                        }
+                        System.out.println("old serialize bytes : " + totalBytes / testTimes + "B");
+                    }
+                });
+                final HLLCounter newCounter = getRandNewCounter(p, cardinality);
+                long newTime = runTestCase(new TestCase() {
+                    @Override
+                    public void run() throws Exception {
+                        ByteBuffer buf = ByteBuffer.allocate(1024 * 1024);
+                        long totalBytes = 0;
+                        for (int i = 0; i < testTimes; i++) {
+                            buf.clear();
+                            newCounter.writeRegisters(buf);
+                            totalBytes += buf.position();
+                            buf.flip();
+                            newCounter.readRegisters(buf);
+                        }
+                        System.out.println("new serialize bytes : " + totalBytes / testTimes + "B");
+                    }
+                });
+                assertEquals(RegisterType.SPARSE, newCounter.getRegisterType());
+                System.out.println("old serialize time : " + oldTime);
+                System.out.println("new serialize time : " + newTime);
+            }
+        } finally {
+            // Restore the static setting even when an assertion fails, so other tests are unaffected.
+            HLLCounter.OVERFLOW_FACTOR = oldFactor;
+        }
+    }
+
+    /**
+     * For each cardinality from {@link #getTestDataDivide(int)}, times
+     * {@code testTimes} write/read round trips of an old counter and of a
+     * DENSE new counter, printing the average serialized size and the
+     * elapsed time of both.
+     */
+    @Test
+    public void denseSerializeBenchmark() throws Exception {
+        final int p = 15;
+        final int m = 1 << p;
+        double oldFactor = HLLCounter.OVERFLOW_FACTOR;
+        // The original comment said "keep sparse", but a factor of 0 presumably makes any
+        // counter overflow to dense at once, which is what this benchmark wants - confirm.
+        HLLCounter.OVERFLOW_FACTOR = 0;
+        try {
+            System.out.println("denseSerializeBenchmark()");
+            for (int cardinality : getTestDataDivide(m)) {
+                System.out.println("----------------------------");
+                System.out.println("cardinality : " + cardinality);
+                final HLLCounterOld oldCounter = getRandOldCounter(p, cardinality);
+                long oldTime = runTestCase(new TestCase() {
+                    @Override
+                    public void run() throws Exception {
+                        ByteBuffer buf = ByteBuffer.allocate(1024 * 1024);
+                        long totalBytes = 0;
+                        for (int i = 0; i < testTimes; i++) {
+                            buf.clear();
+                            oldCounter.writeRegisters(buf);
+                            totalBytes += buf.position();
+                            buf.flip();
+                            oldCounter.readRegisters(buf);
+                        }
+                        System.out.println("old serialize bytes : " + totalBytes / testTimes + "B");
+                    }
+                });
+                final HLLCounter newCounter = getRandNewCounter(p, cardinality, RegisterType.DENSE);
+                long newTime = runTestCase(new TestCase() {
+                    @Override
+                    public void run() throws Exception {
+                        ByteBuffer buf = ByteBuffer.allocate(1024 * 1024);
+                        long totalBytes = 0;
+                        for (int i = 0; i < testTimes; i++) {
+                            buf.clear();
+                            newCounter.writeRegisters(buf);
+                            totalBytes += buf.position();
+                            buf.flip();
+                            newCounter.readRegisters(buf);
+                        }
+                        System.out.println("new serialize bytes : " + totalBytes / testTimes + "B");
+                    }
+                });
+                assertEquals(RegisterType.DENSE, newCounter.getRegisterType());
+                System.out.println("old serialize time : " + oldTime);
+                System.out.println("new serialize time : " + newTime);
+            }
+        } finally {
+            // Restore the static setting even when an assertion fails, so other tests are unaffected.
+            HLLCounter.OVERFLOW_FACTOR = oldFactor;
+        }
+    }
+
+ // A unit of work whose wall-clock duration is measured by runTestCase().
+ interface TestCase {
+ // Executes the work being timed; any exception propagates to the caller.
+ void run() throws Exception;
+ }
+
+    /**
+     * Runs the given test case once and measures it with the wall clock.
+     *
+     * @param testCase the work to execute
+     * @return elapsed time in milliseconds
+     * @throws Exception whatever the test case throws
+     */
+    public long runTestCase(TestCase testCase) throws Exception {
+        final long begin = System.currentTimeMillis();
+        testCase.run();
+        final long end = System.currentTimeMillis();
+        return end - begin;
+    }
+
+    /**
+     * Builds an old-style counter holding the ints 0 .. num-1. Despite the
+     * name, the values are sequential, not random.
+     *
+     * @param p   precision passed to the counter constructor
+     * @param num how many consecutive ints to add
+     */
+    public HLLCounterOld getRandOldCounter(int p, int num) {
+        HLLCounterOld counter = new HLLCounterOld(p);
+        int next = 0;
+        while (next < num) {
+            counter.add(next);
+            next++;
+        }
+        return counter;
+    }
+
+    /**
+     * Builds a new-style counter with the default register type, holding the
+     * ints 0 .. num-1. Despite the name, the values are sequential, not random.
+     *
+     * @param p   precision passed to the counter constructor
+     * @param num how many consecutive ints to add
+     */
+    public HLLCounter getRandNewCounter(int p, int num) {
+        HLLCounter counter = new HLLCounter(p);
+        int next = 0;
+        while (next < num) {
+            counter.add(next);
+            next++;
+        }
+        return counter;
+    }
+
+    /**
+     * Builds a new-style counter of the requested register type, holding the
+     * ints 0 .. num-1. Despite the name, the values are sequential, not random.
+     *
+     * @param p    precision passed to the counter constructor
+     * @param num  how many consecutive ints to add
+     * @param type register type passed to the counter constructor
+     */
+    public HLLCounter getRandNewCounter(int p, int num, RegisterType type) {
+        HLLCounter counter = new HLLCounter(p, type);
+        int next = 0;
+        while (next < num) {
+            counter.add(next);
+            next++;
+        }
+        return counter;
+    }
+
+    /**
+     * Returns the cardinalities the benchmarks iterate over: the fixed values
+     * 1, 5, 10 and 100, followed by m/200, m/100, m/50, m/20, m/10 and m
+     * (integer division).
+     *
+     * @param m the value the scaled cardinalities are derived from
+     */
+    public static int[] getTestDataDivide(int m) {
+        final int[] fixed = { 1, 5, 10, 100 };
+        final int[] divisors = { 200, 100, 50, 20, 10, 1 };
+
+        int[] cardinalities = new int[fixed.length + divisors.length];
+        System.arraycopy(fixed, 0, cardinalities, 0, fixed.length);
+        for (int i = 0; i < divisors.length; i++) {
+            cardinalities[fixed.length + i] = m / divisors[i];
+        }
+        return cardinalities;
+    }
+}
http://git-wip-us.apache.org/repos/asf/kylin/blob/e6e330a8/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/CubeStatsReader.java
----------------------------------------------------------------------
diff --git a/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/CubeStatsReader.java b/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/CubeStatsReader.java
index 5445491..ffba181 100644
--- a/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/CubeStatsReader.java
+++ b/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/CubeStatsReader.java
@@ -53,7 +53,7 @@ import org.apache.kylin.cube.kv.CubeDimEncMap;
import org.apache.kylin.cube.kv.RowKeyEncoder;
import org.apache.kylin.cube.model.CubeDesc;
import org.apache.kylin.engine.mr.HadoopUtil;
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterNew;
+import org.apache.kylin.measure.hllc.HLLCounter;
import org.apache.kylin.metadata.datatype.DataType;
import org.apache.kylin.metadata.model.FunctionDesc;
import org.apache.kylin.metadata.model.MeasureDesc;
@@ -76,7 +76,7 @@ public class CubeStatsReader {
final int samplingPercentage;
final int mapperNumberOfFirstBuild; // becomes meaningless after merge
final double mapperOverlapRatioOfFirstBuild; // becomes meaningless after merge
- final Map<Long, HyperLogLogPlusCounterNew> cuboidRowEstimatesHLL;
+ final Map<Long, HLLCounter> cuboidRowEstimatesHLL;
final CuboidScheduler cuboidScheduler;
public CubeStatsReader(CubeSegment cubeSegment, KylinConfig kylinConfig) throws IOException {
@@ -96,7 +96,7 @@ public class CubeStatsReader {
int percentage = 100;
int mapperNumber = 0;
double mapperOverlapRatio = 0;
- Map<Long, HyperLogLogPlusCounterNew> counterMap = Maps.newHashMap();
+ Map<Long, HLLCounter> counterMap = Maps.newHashMap();
LongWritable key = (LongWritable) ReflectionUtils.newInstance(reader.getKeyClass(), hadoopConf);
BytesWritable value = (BytesWritable) ReflectionUtils.newInstance(reader.getValueClass(), hadoopConf);
@@ -108,7 +108,7 @@ public class CubeStatsReader {
} else if (key.get() == -2) {
mapperNumber = Bytes.toInt(value.getBytes());
} else if (key.get() > 0) {
- HyperLogLogPlusCounterNew hll = new HyperLogLogPlusCounterNew(kylinConfig.getCubeStatsHLLPrecision());
+ HLLCounter hll = new HLLCounter(kylinConfig.getCubeStatsHLLPrecision());
ByteArray byteArray = new ByteArray(value.getBytes());
hll.readRegisters(byteArray.asBuffer());
counterMap.put(key.get(), hll);
@@ -161,9 +161,9 @@ public class CubeStatsReader {
return mapperOverlapRatioOfFirstBuild;
}
- public static Map<Long, Long> getCuboidRowCountMapFromSampling(Map<Long, HyperLogLogPlusCounterNew> hllcMap, int samplingPercentage) {
+ public static Map<Long, Long> getCuboidRowCountMapFromSampling(Map<Long, HLLCounter> hllcMap, int samplingPercentage) {
Map<Long, Long> cuboidRowCountMap = Maps.newHashMap();
- for (Map.Entry<Long, HyperLogLogPlusCounterNew> entry : hllcMap.entrySet()) {
+ for (Map.Entry<Long, HLLCounter> entry : hllcMap.entrySet()) {
// No need to adjust according sampling percentage. Assumption is that data set is far
// more than cardinality. Even a percentage of the data should already see all cardinalities.
cuboidRowCountMap.put(entry.getKey(), entry.getValue().getCountEstimate());
http://git-wip-us.apache.org/repos/asf/kylin/blob/e6e330a8/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/CubeStatsWriter.java
----------------------------------------------------------------------
diff --git a/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/CubeStatsWriter.java b/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/CubeStatsWriter.java
index 219cdf2..8f400c3 100644
--- a/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/CubeStatsWriter.java
+++ b/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/CubeStatsWriter.java
@@ -33,17 +33,17 @@ import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.kylin.common.util.Bytes;
import org.apache.kylin.measure.BufferedMeasureCodec;
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterNew;
+import org.apache.kylin.measure.hllc.HLLCounter;
public class CubeStatsWriter {
public static void writeCuboidStatistics(Configuration conf, Path outputPath, //
- Map<Long, HyperLogLogPlusCounterNew> cuboidHLLMap, int samplingPercentage) throws IOException {
+ Map<Long, HLLCounter> cuboidHLLMap, int samplingPercentage) throws IOException {
writeCuboidStatistics(conf, outputPath, cuboidHLLMap, samplingPercentage, 0, 0);
}
public static void writeCuboidStatistics(Configuration conf, Path outputPath, //
- Map<Long, HyperLogLogPlusCounterNew> cuboidHLLMap, int samplingPercentage, int mapperNumber, double mapperOverlapRatio) throws IOException {
+ Map<Long, HLLCounter> cuboidHLLMap, int samplingPercentage, int mapperNumber, double mapperOverlapRatio) throws IOException {
Path seqFilePath = new Path(outputPath, BatchConstants.CFG_STATISTICS_CUBOID_ESTIMATION_FILENAME);
List<Long> allCuboids = new ArrayList<Long>();
http://git-wip-us.apache.org/repos/asf/kylin/blob/e6e330a8/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/FactDistinctColumnsReducer.java
----------------------------------------------------------------------
diff --git a/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/FactDistinctColumnsReducer.java b/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/FactDistinctColumnsReducer.java
index 0d388c7..3115fe4 100644
--- a/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/FactDistinctColumnsReducer.java
+++ b/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/FactDistinctColumnsReducer.java
@@ -47,7 +47,7 @@ import org.apache.kylin.engine.mr.KylinReducer;
import org.apache.kylin.engine.mr.common.AbstractHadoopJob;
import org.apache.kylin.engine.mr.common.BatchConstants;
import org.apache.kylin.engine.mr.common.CubeStatsWriter;
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterNew;
+import org.apache.kylin.measure.hllc.HLLCounter;
import org.apache.kylin.metadata.model.TblColRef;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -64,7 +64,7 @@ public class FactDistinctColumnsReducer extends KylinReducer<SelfDefineSortableK
private List<TblColRef> columnList;
private String statisticsOutput = null;
private List<Long> baseCuboidRowCountInMappers;
- protected Map<Long, HyperLogLogPlusCounterNew> cuboidHLLMap = null;
+ protected Map<Long, HLLCounter> cuboidHLLMap = null;
protected long baseCuboidId;
protected CubeDesc cubeDesc;
private long totalRowsBeforeMerge = 0;
@@ -156,7 +156,7 @@ public class FactDistinctColumnsReducer extends KylinReducer<SelfDefineSortableK
// for hll
long cuboidId = Bytes.toLong(key.getBytes(), 1, Bytes.SIZEOF_LONG);
for (Text value : values) {
- HyperLogLogPlusCounterNew hll = new HyperLogLogPlusCounterNew(cubeConfig.getCubeStatsHLLPrecision());
+ HLLCounter hll = new HLLCounter(cubeConfig.getCubeStatsHLLPrecision());
ByteBuffer bf = ByteBuffer.wrap(value.getBytes(), 0, value.getLength());
hll.readRegisters(bf);
@@ -270,7 +270,7 @@ public class FactDistinctColumnsReducer extends KylinReducer<SelfDefineSortableK
if (isStatistics) {
// output the hll info
long grandTotal = 0;
- for (HyperLogLogPlusCounterNew hll : cuboidHLLMap.values()) {
+ for (HLLCounter hll : cuboidHLLMap.values()) {
grandTotal += hll.getCountEstimate();
}
double mapperOverlapRatio = grandTotal == 0 ? 0 : (double) totalRowsBeforeMerge / grandTotal;
http://git-wip-us.apache.org/repos/asf/kylin/blob/e6e330a8/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/FactDistinctHiveColumnsMapper.java
----------------------------------------------------------------------
diff --git a/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/FactDistinctHiveColumnsMapper.java b/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/FactDistinctHiveColumnsMapper.java
index c0575f1..5692c76 100644
--- a/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/FactDistinctHiveColumnsMapper.java
+++ b/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/FactDistinctHiveColumnsMapper.java
@@ -29,7 +29,7 @@ import org.apache.kylin.common.util.Bytes;
import org.apache.kylin.cube.cuboid.CuboidScheduler;
import org.apache.kylin.engine.mr.common.BatchConstants;
import org.apache.kylin.measure.BufferedMeasureCodec;
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterNew;
+import org.apache.kylin.measure.hllc.HLLCounter;
import org.apache.kylin.metadata.model.TblColRef;
import com.google.common.collect.Lists;
@@ -45,7 +45,7 @@ public class FactDistinctHiveColumnsMapper<KEYIN> extends FactDistinctColumnsMap
protected CuboidScheduler cuboidScheduler = null;
protected int nRowKey;
private Integer[][] allCuboidsBitSet = null;
- private HyperLogLogPlusCounterNew[] allCuboidsHLL = null;
+ private HLLCounter[] allCuboidsHLL = null;
private Long[] cuboidIds;
private HashFunction hf = null;
private int rowCount = 0;
@@ -76,9 +76,9 @@ public class FactDistinctHiveColumnsMapper<KEYIN> extends FactDistinctColumnsMap
allCuboidsBitSet = allCuboidsBitSetList.toArray(new Integer[cuboidIdList.size()][]);
cuboidIds = cuboidIdList.toArray(new Long[cuboidIdList.size()]);
- allCuboidsHLL = new HyperLogLogPlusCounterNew[cuboidIds.length];
+ allCuboidsHLL = new HLLCounter[cuboidIds.length];
for (int i = 0; i < cuboidIds.length; i++) {
- allCuboidsHLL[i] = new HyperLogLogPlusCounterNew(cubeDesc.getConfig().getCubeStatsHLLPrecision());
+ allCuboidsHLL[i] = new HLLCounter(cubeDesc.getConfig().getCubeStatsHLLPrecision());
}
hf = Hashing.murmur3_32();
@@ -207,7 +207,7 @@ public class FactDistinctHiveColumnsMapper<KEYIN> extends FactDistinctColumnsMap
if (collectStatistics) {
ByteBuffer hllBuf = ByteBuffer.allocate(BufferedMeasureCodec.DEFAULT_BUFFER_SIZE);
// output each cuboid's hll to reducer, key is 0 - cuboidId
- HyperLogLogPlusCounterNew hll;
+ HLLCounter hll;
for (int i = 0; i < cuboidIds.length; i++) {
hll = allCuboidsHLL[i];
http://git-wip-us.apache.org/repos/asf/kylin/blob/e6e330a8/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/MergeStatisticsStep.java
----------------------------------------------------------------------
diff --git a/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/MergeStatisticsStep.java b/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/MergeStatisticsStep.java
index e839989..811fc24 100644
--- a/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/MergeStatisticsStep.java
+++ b/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/MergeStatisticsStep.java
@@ -47,7 +47,7 @@ import org.apache.kylin.job.exception.ExecuteException;
import org.apache.kylin.job.execution.AbstractExecutable;
import org.apache.kylin.job.execution.ExecutableContext;
import org.apache.kylin.job.execution.ExecuteResult;
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterNew;
+import org.apache.kylin.measure.hllc.HLLCounter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -56,7 +56,7 @@ import com.google.common.collect.Maps;
public class MergeStatisticsStep extends AbstractExecutable {
private static final Logger logger = LoggerFactory.getLogger(MergeStatisticsStep.class);
- protected Map<Long, HyperLogLogPlusCounterNew> cuboidHLLMap = Maps.newHashMap();
+ protected Map<Long, HLLCounter> cuboidHLLMap = Maps.newHashMap();
public MergeStatisticsStep() {
super();
@@ -100,7 +100,7 @@ public class MergeStatisticsStep extends AbstractExecutable {
// sampling percentage;
averageSamplingPercentage += Bytes.toInt(value.getBytes());
} else if (key.get() > 0) {
- HyperLogLogPlusCounterNew hll = new HyperLogLogPlusCounterNew(kylinConf.getCubeStatsHLLPrecision());
+ HLLCounter hll = new HLLCounter(kylinConf.getCubeStatsHLLPrecision());
ByteArray byteArray = new ByteArray(value.getBytes());
hll.readRegisters(byteArray.asBuffer());
http://git-wip-us.apache.org/repos/asf/kylin/blob/e6e330a8/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/CubeSamplingTest.java
----------------------------------------------------------------------
diff --git a/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/CubeSamplingTest.java b/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/CubeSamplingTest.java
index cae3b62..beec00f 100644
--- a/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/CubeSamplingTest.java
+++ b/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/CubeSamplingTest.java
@@ -24,7 +24,7 @@ import java.util.List;
import org.apache.commons.lang.RandomStringUtils;
import org.apache.kylin.common.util.ByteArray;
import org.apache.kylin.common.util.Bytes;
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterNew;
+import org.apache.kylin.measure.hllc.HLLCounter;
import org.junit.Before;
import org.junit.Test;
@@ -45,7 +45,7 @@ public class CubeSamplingTest {
private Integer[][] allCuboidsBitSet;
private HashFunction hf = null;
private long baseCuboidId;
- private HyperLogLogPlusCounterNew[] allCuboidsHLL = null;
+ private HLLCounter[] allCuboidsHLL = null;
private final byte[] seperator = Bytes.toBytes(",");
@Before
@@ -61,9 +61,9 @@ public class CubeSamplingTest {
allCuboidsBitSet = allCuboidsBitSetList.toArray(new Integer[allCuboidsBitSetList.size()][]);
System.out.println("Totally have " + allCuboidsBitSet.length + " cuboids.");
- allCuboidsHLL = new HyperLogLogPlusCounterNew[allCuboids.size()];
+ allCuboidsHLL = new HLLCounter[allCuboids.size()];
for (int i = 0; i < allCuboids.size(); i++) {
- allCuboidsHLL[i] = new HyperLogLogPlusCounterNew(14);
+ allCuboidsHLL[i] = new HLLCounter(14);
}
// hf = Hashing.goodFastHash(32);
http://git-wip-us.apache.org/repos/asf/kylin/blob/e6e330a8/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/FactDistinctColumnsReducerTest.java
----------------------------------------------------------------------
diff --git a/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/FactDistinctColumnsReducerTest.java b/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/FactDistinctColumnsReducerTest.java
index a00db94..f6f790e 100644
--- a/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/FactDistinctColumnsReducerTest.java
+++ b/engine-mr/src/test/java/org/apache/kylin/engine/mr/steps/FactDistinctColumnsReducerTest.java
@@ -28,7 +28,7 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.kylin.engine.mr.HadoopUtil;
import org.apache.kylin.engine.mr.common.CubeStatsWriter;
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterNew;
+import org.apache.kylin.measure.hllc.HLLCounter;
import org.junit.Test;
import com.google.common.collect.Maps;
@@ -48,7 +48,7 @@ public class FactDistinctColumnsReducerTest {
}
System.out.println(outputPath);
- Map<Long, HyperLogLogPlusCounterNew> cuboidHLLMap = Maps.newHashMap();
+ Map<Long, HLLCounter> cuboidHLLMap = Maps.newHashMap();
CubeStatsWriter.writeCuboidStatistics(conf, outputPath, cuboidHLLMap, 100);
FileSystem.getLocal(conf).delete(outputPath, true);
[2/5] kylin git commit: KYLIN-1832 HyperLogLog performance
optimization
Posted by li...@apache.org.
KYLIN-1832 HyperLogLog performance optimization
Signed-off-by: Li Yang <li...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/kylin/repo
Commit: http://git-wip-us.apache.org/repos/asf/kylin/commit/f05404d5
Tree: http://git-wip-us.apache.org/repos/asf/kylin/tree/f05404d5
Diff: http://git-wip-us.apache.org/repos/asf/kylin/diff/f05404d5
Branch: refs/heads/master
Commit: f05404d5576b52c70cf26eb1bccde1c27cd3852f
Parents: 5303651
Author: xiefan46 <95...@qq.com>
Authored: Fri Dec 9 16:53:04 2016 +0800
Committer: Li Yang <li...@apache.org>
Committed: Wed Dec 14 11:07:42 2016 +0800
----------------------------------------------------------------------
.../org/apache/kylin/cube/util/CubingUtils.java | 14 +-
.../apache/kylin/gridtable/UnitTestSupport.java | 22 +-
.../benchmark/GTScannerBenchmark2.java | 4 +-
.../gridtable/AggregationCacheMemSizeTest.java | 4 +-
.../metadata/measure/MeasureCodecTest.java | 4 +-
.../org/apache/kylin/measure/MeasureType.java | 2 +-
.../kylin/measure/MeasureTypeFactory.java | 2 +-
.../kylin/measure/hllc/DenseRegister.java | 91 +++++
.../kylin/measure/hllc/HLLCAggregator.java | 10 +-
.../kylin/measure/hllc/HLLCMeasureType.java | 20 +-
.../kylin/measure/hllc/HLLCSerializer.java | 16 +-
.../measure/hllc/HLLDistinctCountAggFunc.java | 22 +-
.../measure/hllc/HyperLogLogPlusCounter.java | 392 -------------------
.../measure/hllc/HyperLogLogPlusCounterNew.java | 388 ++++++++++++++++++
.../measure/hllc/HyperLogLogPlusCounterOld.java | 392 +++++++++++++++++++
.../org/apache/kylin/measure/hllc/Register.java | 37 ++
.../apache/kylin/measure/hllc/RegisterType.java | 25 ++
.../kylin/measure/hllc/SparseRegister.java | 98 +++++
.../measure/AggregatorMemEstimateTest.java | 4 +-
.../measure/hll/HyperLogLogCounterOldTest.java | 265 +++++++++++++
.../measure/hll/HyperLogLogCounterTest.java | 265 -------------
.../measure/hll2/HyperLogLogCounterNewTest.java | 301 ++++++++++++++
.../hll2/NewHyperLogLogBenchmarkTest.java | 288 ++++++++++++++
.../kylin/engine/mr/common/CubeStatsReader.java | 12 +-
.../kylin/engine/mr/common/CubeStatsWriter.java | 6 +-
.../mr/steps/FactDistinctColumnsReducer.java | 8 +-
.../mr/steps/FactDistinctHiveColumnsMapper.java | 10 +-
.../engine/mr/steps/MergeStatisticsStep.java | 6 +-
.../kylin/engine/mr/steps/CubeSamplingTest.java | 8 +-
.../steps/FactDistinctColumnsReducerTest.java | 4 +-
.../apache/kylin/engine/spark/SparkCubing.java | 28 +-
.../cardinality/ColumnCardinalityMapper.java | 12 +-
.../cardinality/ColumnCardinalityReducer.java | 12 +-
.../ColumnCardinalityReducerTest.java | 4 +-
34 files changed, 2002 insertions(+), 774 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/kylin/blob/f05404d5/core-cube/src/main/java/org/apache/kylin/cube/util/CubingUtils.java
----------------------------------------------------------------------
diff --git a/core-cube/src/main/java/org/apache/kylin/cube/util/CubingUtils.java b/core-cube/src/main/java/org/apache/kylin/cube/util/CubingUtils.java
index 413b907..35139a4 100644
--- a/core-cube/src/main/java/org/apache/kylin/cube/util/CubingUtils.java
+++ b/core-cube/src/main/java/org/apache/kylin/cube/util/CubingUtils.java
@@ -38,7 +38,7 @@ import org.apache.kylin.dict.DictionaryGenerator;
import org.apache.kylin.dict.DictionaryInfo;
import org.apache.kylin.dict.DictionaryManager;
import org.apache.kylin.dict.IterableDictionaryValueEnumerator;
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounter;
+import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterNew;
import org.apache.kylin.metadata.model.IJoinedFlatTableDesc;
import org.apache.kylin.metadata.model.TblColRef;
import org.apache.kylin.source.ReadableTable;
@@ -59,7 +59,7 @@ public class CubingUtils {
private static Logger logger = LoggerFactory.getLogger(CubingUtils.class);
- public static Map<Long, HyperLogLogPlusCounter> sampling(CubeDesc cubeDesc, IJoinedFlatTableDesc flatDescIn, Iterable<List<String>> streams) {
+ public static Map<Long, HyperLogLogPlusCounterNew> sampling(CubeDesc cubeDesc, IJoinedFlatTableDesc flatDescIn, Iterable<List<String>> streams) {
final CubeJoinedFlatTableEnrich flatDesc = new CubeJoinedFlatTableEnrich(flatDescIn, cubeDesc);
final int rowkeyLength = cubeDesc.getRowkey().getRowKeyColumns().length;
final List<Long> allCuboidIds = new CuboidScheduler(cubeDesc).getAllCuboidIds();
@@ -84,9 +84,9 @@ public class CubingUtils {
return result;
}
});
- final Map<Long, HyperLogLogPlusCounter> result = Maps.newHashMapWithExpectedSize(allCuboidIds.size());
+ final Map<Long, HyperLogLogPlusCounterNew> result = Maps.newHashMapWithExpectedSize(allCuboidIds.size());
for (Long cuboidId : allCuboidIds) {
- result.put(cuboidId, new HyperLogLogPlusCounter(cubeDesc.getConfig().getCubeStatsHLLPrecision()));
+ result.put(cuboidId, new HyperLogLogPlusCounterNew(cubeDesc.getConfig().getCubeStatsHLLPrecision()));
Integer[] cuboidBitSet = new Integer[Long.bitCount(cuboidId)];
long mask = Long.highestOneBit(baseCuboidId);
@@ -118,9 +118,9 @@ public class CubingUtils {
}
}
- for (Map.Entry<Long, HyperLogLogPlusCounter> longHyperLogLogPlusCounterEntry : result.entrySet()) {
- Long cuboidId = longHyperLogLogPlusCounterEntry.getKey();
- HyperLogLogPlusCounter counter = longHyperLogLogPlusCounterEntry.getValue();
+ for (Map.Entry<Long, HyperLogLogPlusCounterNew> longHyperLogLogPlusCounterNewEntry : result.entrySet()) {
+ Long cuboidId = longHyperLogLogPlusCounterNewEntry.getKey();
+ HyperLogLogPlusCounterNew counter = longHyperLogLogPlusCounterNewEntry.getValue();
Hasher hc = hf.newHasher();
final Integer[] cuboidBitSet = allCuboidsBitSet.get(cuboidId);
for (int position = 0; position < cuboidBitSet.length; position++) {
http://git-wip-us.apache.org/repos/asf/kylin/blob/f05404d5/core-cube/src/main/java/org/apache/kylin/gridtable/UnitTestSupport.java
----------------------------------------------------------------------
diff --git a/core-cube/src/main/java/org/apache/kylin/gridtable/UnitTestSupport.java b/core-cube/src/main/java/org/apache/kylin/gridtable/UnitTestSupport.java
index 3396fd2..6cbf237 100644
--- a/core-cube/src/main/java/org/apache/kylin/gridtable/UnitTestSupport.java
+++ b/core-cube/src/main/java/org/apache/kylin/gridtable/UnitTestSupport.java
@@ -26,7 +26,7 @@ import java.util.List;
import org.apache.kylin.common.util.DateFormat;
import org.apache.kylin.common.util.ImmutableBitSet;
import org.apache.kylin.gridtable.GTInfo.Builder;
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounter;
+import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterNew;
import org.apache.kylin.metadata.datatype.DataType;
import org.apache.kylin.metadata.datatype.LongMutable;
@@ -106,16 +106,16 @@ public class UnitTestSupport {
String d_01_15 = datePlus("2015-01-15", i * 4);
String d_01_16 = datePlus("2015-01-16", i * 4);
String d_01_17 = datePlus("2015-01-17", i * 4);
- result.add(newRec(info, d_01_14, "Yang", "Food", new LongMutable(10), new BigDecimal("10.5"), new HyperLogLogPlusCounter(14)));
- result.add(newRec(info, d_01_14, "Luke", "Food", new LongMutable(10), new BigDecimal("10.5"), new HyperLogLogPlusCounter(14)));
- result.add(newRec(info, d_01_15, "Xu", "Food", new LongMutable(10), new BigDecimal("10.5"), new HyperLogLogPlusCounter(14)));
- result.add(newRec(info, d_01_15, "Dong", "Food", new LongMutable(10), new BigDecimal("10.5"), new HyperLogLogPlusCounter(14)));
- result.add(newRec(info, d_01_15, "Jason", "Food", new LongMutable(10), new BigDecimal("10.5"), new HyperLogLogPlusCounter(14)));
- result.add(newRec(info, d_01_16, "Mahone", "Food", new LongMutable(10), new BigDecimal("10.5"), new HyperLogLogPlusCounter(14)));
- result.add(newRec(info, d_01_16, "Shaofeng", "Food", new LongMutable(10), new BigDecimal("10.5"), new HyperLogLogPlusCounter(14)));
- result.add(newRec(info, d_01_16, "Qianhao", "Food", new LongMutable(10), new BigDecimal("10.5"), new HyperLogLogPlusCounter(14)));
- result.add(newRec(info, d_01_16, "George", "Food", new LongMutable(10), new BigDecimal("10.5"), new HyperLogLogPlusCounter(14)));
- result.add(newRec(info, d_01_17, "Kejia", "Food", new LongMutable(10), new BigDecimal("10.5"), new HyperLogLogPlusCounter(14)));
+ result.add(newRec(info, d_01_14, "Yang", "Food", new LongMutable(10), new BigDecimal("10.5"), new HyperLogLogPlusCounterNew(14)));
+ result.add(newRec(info, d_01_14, "Luke", "Food", new LongMutable(10), new BigDecimal("10.5"), new HyperLogLogPlusCounterNew(14)));
+ result.add(newRec(info, d_01_15, "Xu", "Food", new LongMutable(10), new BigDecimal("10.5"), new HyperLogLogPlusCounterNew(14)));
+ result.add(newRec(info, d_01_15, "Dong", "Food", new LongMutable(10), new BigDecimal("10.5"), new HyperLogLogPlusCounterNew(14)));
+ result.add(newRec(info, d_01_15, "Jason", "Food", new LongMutable(10), new BigDecimal("10.5"), new HyperLogLogPlusCounterNew(14)));
+ result.add(newRec(info, d_01_16, "Mahone", "Food", new LongMutable(10), new BigDecimal("10.5"), new HyperLogLogPlusCounterNew(14)));
+ result.add(newRec(info, d_01_16, "Shaofeng", "Food", new LongMutable(10), new BigDecimal("10.5"), new HyperLogLogPlusCounterNew(14)));
+ result.add(newRec(info, d_01_16, "Qianhao", "Food", new LongMutable(10), new BigDecimal("10.5"), new HyperLogLogPlusCounterNew(14)));
+ result.add(newRec(info, d_01_16, "George", "Food", new LongMutable(10), new BigDecimal("10.5"), new HyperLogLogPlusCounterNew(14)));
+ result.add(newRec(info, d_01_17, "Kejia", "Food", new LongMutable(10), new BigDecimal("10.5"), new HyperLogLogPlusCounterNew(14)));
}
return result;
}
http://git-wip-us.apache.org/repos/asf/kylin/blob/f05404d5/core-cube/src/main/java/org/apache/kylin/gridtable/benchmark/GTScannerBenchmark2.java
----------------------------------------------------------------------
diff --git a/core-cube/src/main/java/org/apache/kylin/gridtable/benchmark/GTScannerBenchmark2.java b/core-cube/src/main/java/org/apache/kylin/gridtable/benchmark/GTScannerBenchmark2.java
index 40a5e01..f80bd24 100644
--- a/core-cube/src/main/java/org/apache/kylin/gridtable/benchmark/GTScannerBenchmark2.java
+++ b/core-cube/src/main/java/org/apache/kylin/gridtable/benchmark/GTScannerBenchmark2.java
@@ -34,7 +34,7 @@ import org.apache.kylin.gridtable.GTScanRequest;
import org.apache.kylin.gridtable.GTScanRequestBuilder;
import org.apache.kylin.gridtable.IGTScanner;
import org.apache.kylin.gridtable.benchmark.SortedGTRecordGenerator.Randomizer;
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounter;
+import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterNew;
import org.apache.kylin.metadata.datatype.DataType;
import org.apache.kylin.metadata.filter.ColumnTupleFilter;
import org.apache.kylin.metadata.filter.CompareTupleFilter;
@@ -80,7 +80,7 @@ public class GTScannerBenchmark2 {
gen.addDimension(100, 4, null);
gen.addMeasure(8);
gen.addMeasure(8, new Randomizer() {
- HyperLogLogPlusCounter hllc = new HyperLogLogPlusCounter(12);
+ HyperLogLogPlusCounterNew hllc = new HyperLogLogPlusCounterNew(12);
@Override
public int fillRandom(Random rand, byte[] array, int offset) {
http://git-wip-us.apache.org/repos/asf/kylin/blob/f05404d5/core-cube/src/test/java/org/apache/kylin/gridtable/AggregationCacheMemSizeTest.java
----------------------------------------------------------------------
diff --git a/core-cube/src/test/java/org/apache/kylin/gridtable/AggregationCacheMemSizeTest.java b/core-cube/src/test/java/org/apache/kylin/gridtable/AggregationCacheMemSizeTest.java
index 00c0bd0..66a6b51 100644
--- a/core-cube/src/test/java/org/apache/kylin/gridtable/AggregationCacheMemSizeTest.java
+++ b/core-cube/src/test/java/org/apache/kylin/gridtable/AggregationCacheMemSizeTest.java
@@ -26,7 +26,7 @@ import org.apache.kylin.measure.basic.LongSumAggregator;
import org.apache.kylin.measure.bitmap.BitmapAggregator;
import org.apache.kylin.measure.bitmap.BitmapCounter;
import org.apache.kylin.measure.hllc.HLLCAggregator;
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounter;
+import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterNew;
import org.apache.kylin.metadata.datatype.DoubleMutable;
import org.apache.kylin.metadata.datatype.LongMutable;
import org.github.jamm.MemoryMeter;
@@ -105,7 +105,7 @@ public class AggregationCacheMemSizeTest {
private HLLCAggregator createHLLCAggr() {
HLLCAggregator hllcAggregator = new HLLCAggregator(14);
- hllcAggregator.aggregate(new HyperLogLogPlusCounter(14));
+ hllcAggregator.aggregate(new HyperLogLogPlusCounterNew(14));
return hllcAggregator;
}
http://git-wip-us.apache.org/repos/asf/kylin/blob/f05404d5/core-cube/src/test/java/org/apache/kylin/metadata/measure/MeasureCodecTest.java
----------------------------------------------------------------------
diff --git a/core-cube/src/test/java/org/apache/kylin/metadata/measure/MeasureCodecTest.java b/core-cube/src/test/java/org/apache/kylin/metadata/measure/MeasureCodecTest.java
index 18680ec..cd1aa96 100644
--- a/core-cube/src/test/java/org/apache/kylin/metadata/measure/MeasureCodecTest.java
+++ b/core-cube/src/test/java/org/apache/kylin/metadata/measure/MeasureCodecTest.java
@@ -26,7 +26,7 @@ import java.nio.ByteBuffer;
import org.apache.kylin.common.util.LocalFileMetadataTestCase;
import org.apache.kylin.measure.BufferedMeasureCodec;
import org.apache.kylin.measure.bitmap.BitmapCounter;
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounter;
+import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterNew;
import org.apache.kylin.metadata.datatype.DoubleMutable;
import org.apache.kylin.metadata.datatype.LongMutable;
import org.apache.kylin.metadata.model.FunctionDesc;
@@ -57,7 +57,7 @@ public class MeasureCodecTest extends LocalFileMetadataTestCase {
DoubleMutable d = new DoubleMutable(1.0);
LongMutable l = new LongMutable(2);
BigDecimal b = new BigDecimal("333.1234");
- HyperLogLogPlusCounter hllc = new HyperLogLogPlusCounter(16);
+ HyperLogLogPlusCounterNew hllc = new HyperLogLogPlusCounterNew(16);
hllc.add("1234567");
hllc.add("abcdefg");
BitmapCounter bitmap = new BitmapCounter();
http://git-wip-us.apache.org/repos/asf/kylin/blob/f05404d5/core-metadata/src/main/java/org/apache/kylin/measure/MeasureType.java
----------------------------------------------------------------------
diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/MeasureType.java b/core-metadata/src/main/java/org/apache/kylin/measure/MeasureType.java
index de1b442..031636e 100644
--- a/core-metadata/src/main/java/org/apache/kylin/measure/MeasureType.java
+++ b/core-metadata/src/main/java/org/apache/kylin/measure/MeasureType.java
@@ -36,7 +36,7 @@ import org.apache.kylin.metadata.tuple.TupleInfo;
* MeasureType captures how a kind of aggregation is defined, how it is calculated
* during cube build, and how it is involved in query and storage scan.
*
- * @param <T> the Java type of aggregation data object, e.g. HyperLogLogPlusCounter
+ * @param <T> the Java type of aggregation data object, e.g. HyperLogLogPlusCounterOld
*/
abstract public class MeasureType<T> {
http://git-wip-us.apache.org/repos/asf/kylin/blob/f05404d5/core-metadata/src/main/java/org/apache/kylin/measure/MeasureTypeFactory.java
----------------------------------------------------------------------
diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/MeasureTypeFactory.java b/core-metadata/src/main/java/org/apache/kylin/measure/MeasureTypeFactory.java
index c5bd482..d94dec9 100644
--- a/core-metadata/src/main/java/org/apache/kylin/measure/MeasureTypeFactory.java
+++ b/core-metadata/src/main/java/org/apache/kylin/measure/MeasureTypeFactory.java
@@ -62,7 +62,7 @@ import com.google.common.collect.Maps;
}
</pre>
*
- * @param <T> the Java type of aggregation data object, e.g. HyperLogLogPlusCounter
+ * @param <T> the Java type of aggregation data object, e.g. HyperLogLogPlusCounterOld
*/
abstract public class MeasureTypeFactory<T> {
http://git-wip-us.apache.org/repos/asf/kylin/blob/f05404d5/core-metadata/src/main/java/org/apache/kylin/measure/hllc/DenseRegister.java
----------------------------------------------------------------------
diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/DenseRegister.java b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/DenseRegister.java
new file mode 100644
index 0000000..26ee6ab
--- /dev/null
+++ b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/DenseRegister.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+package org.apache.kylin.measure.hllc;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Map;
+
+/**
+ * Register implementation keeping one byte per slot in an array of 2^p entries. Created by xiefan on 16-12-9.
+ */
+public class DenseRegister implements Register {
+ private int p; // NOTE(review): never assigned or read in this class; the constructor's p parameter only sizes m
+
+ private int m; // number of slots, set to 1 << p by the constructor
+
+ private byte[] register; // one byte per slot, length m
+
+ public DenseRegister(int p) { // allocates 2^p slots; a new Java byte array starts out zero-filled
+ this.m = 1 << p;
+ this.register = new byte[m];
+ }
+
+ public void set(int pos, byte value) {
+ register[pos] = value; // unconditional overwrite, no comparison with the existing value
+ }
+
+ @Override
+ public Byte get(int pos) {
+ return register[pos]; // primitive byte is autoboxed to Byte on return
+ }
+
+ @Override
+ public void merge(Register another) { // keeps, per slot, the larger of this register's and another's value
+ if (another instanceof DenseRegister) {
+ DenseRegister dr = (DenseRegister) another;
+ for (int i = 0; i < register.length; i++) { // bound is this array's length; assumes dr has at least as many slots - TODO confirm callers use equal p
+ if (dr.register[i] > register[i])
+ register[i] = dr.register[i];
+ }
+ } else { // every non-DenseRegister argument is cast to SparseRegister; any other Register type fails with ClassCastException
+ SparseRegister sr = (SparseRegister) another;
+ Collection<Map.Entry<Integer, Byte>> allValue = sr.getAllValue();
+ for (Map.Entry<Integer, Byte> entry : allValue) {
+ if (entry.getValue() > register[entry.getKey()]) // boxed Byte value and Integer key are unboxed here
+ register[entry.getKey()] = entry.getValue();
+ }
+ }
+ }
+
+ @Override
+ public void clear() {
+ byte zero = (byte) 0;
+ Arrays.fill(register, zero); // resets every slot to 0
+ }
+
+ @Override
+ public int getSize() { // counts slots holding a value greater than 0, not the array length
+ int size = 0;
+ for (int i = 0; i < m; i++) {
+ if (register[i] > 0)
+ size++;
+ }
+ return size;
+ }
+
+ @Override
+ public int getHashCode() {
+ return Arrays.hashCode(register); // hash computed from the slot contents
+ }
+
+ public byte[] getRawRegister() {
+ return this.register; // the internal array itself, not a copy - writes by the caller are visible to this register
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/kylin/blob/f05404d5/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCAggregator.java
----------------------------------------------------------------------
diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCAggregator.java b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCAggregator.java
index aea2df1..ca73285 100644
--- a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCAggregator.java
+++ b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCAggregator.java
@@ -23,10 +23,10 @@ import org.apache.kylin.measure.MeasureAggregator;
/**
*/
@SuppressWarnings("serial")
-public class HLLCAggregator extends MeasureAggregator<HyperLogLogPlusCounter> {
+public class HLLCAggregator extends MeasureAggregator<HyperLogLogPlusCounterNew> {
final int precision;
- HyperLogLogPlusCounter sum = null;
+ HyperLogLogPlusCounterNew sum = null;
public HLLCAggregator(int precision) {
this.precision = precision;
@@ -38,15 +38,15 @@ public class HLLCAggregator extends MeasureAggregator<HyperLogLogPlusCounter> {
}
@Override
- public void aggregate(HyperLogLogPlusCounter value) {
+ public void aggregate(HyperLogLogPlusCounterNew value) {
if (sum == null)
- sum = new HyperLogLogPlusCounter(value);
+ sum = new HyperLogLogPlusCounterNew(value);
else
sum.merge(value);
}
@Override
- public HyperLogLogPlusCounter getState() {
+ public HyperLogLogPlusCounterNew getState() {
return sum;
}
http://git-wip-us.apache.org/repos/asf/kylin/blob/f05404d5/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCMeasureType.java
----------------------------------------------------------------------
diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCMeasureType.java b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCMeasureType.java
index 0e58dca..481fa4e 100644
--- a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCMeasureType.java
+++ b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCMeasureType.java
@@ -33,15 +33,15 @@ import org.apache.kylin.metadata.model.TblColRef;
import com.google.common.collect.ImmutableMap;
-public class HLLCMeasureType extends MeasureType<HyperLogLogPlusCounter> {
+public class HLLCMeasureType extends MeasureType<HyperLogLogPlusCounterNew> {
public static final String FUNC_COUNT_DISTINCT = FunctionDesc.FUNC_COUNT_DISTINCT;
public static final String DATATYPE_HLLC = "hllc";
- public static class Factory extends MeasureTypeFactory<HyperLogLogPlusCounter> {
+ public static class Factory extends MeasureTypeFactory<HyperLogLogPlusCounterNew> {
@Override
- public MeasureType<HyperLogLogPlusCounter> createMeasureType(String funcName, DataType dataType) {
+ public MeasureType<HyperLogLogPlusCounterNew> createMeasureType(String funcName, DataType dataType) {
return new HLLCMeasureType(funcName, dataType);
}
@@ -56,7 +56,7 @@ public class HLLCMeasureType extends MeasureType<HyperLogLogPlusCounter> {
}
@Override
- public Class<? extends DataTypeSerializer<HyperLogLogPlusCounter>> getAggrDataTypeSerializer() {
+ public Class<? extends DataTypeSerializer<HyperLogLogPlusCounterNew>> getAggrDataTypeSerializer() {
return HLLCSerializer.class;
}
}
@@ -91,13 +91,13 @@ public class HLLCMeasureType extends MeasureType<HyperLogLogPlusCounter> {
}
@Override
- public MeasureIngester<HyperLogLogPlusCounter> newIngester() {
- return new MeasureIngester<HyperLogLogPlusCounter>() {
- HyperLogLogPlusCounter current = new HyperLogLogPlusCounter(dataType.getPrecision());
+ public MeasureIngester<HyperLogLogPlusCounterNew> newIngester() {
+ return new MeasureIngester<HyperLogLogPlusCounterNew>() {
+ HyperLogLogPlusCounterNew current = new HyperLogLogPlusCounterNew(dataType.getPrecision());
@Override
- public HyperLogLogPlusCounter valueOf(String[] values, MeasureDesc measureDesc, Map<TblColRef, Dictionary<String>> dictionaryMap) {
- HyperLogLogPlusCounter hllc = current;
+ public HyperLogLogPlusCounterNew valueOf(String[] values, MeasureDesc measureDesc, Map<TblColRef, Dictionary<String>> dictionaryMap) {
+ HyperLogLogPlusCounterNew hllc = current;
hllc.clear();
for (String v : values) {
if (v != null)
@@ -109,7 +109,7 @@ public class HLLCMeasureType extends MeasureType<HyperLogLogPlusCounter> {
}
@Override
- public MeasureAggregator<HyperLogLogPlusCounter> newAggregator() {
+ public MeasureAggregator<HyperLogLogPlusCounterNew> newAggregator() {
return new HLLCAggregator(dataType.getPrecision());
}
http://git-wip-us.apache.org/repos/asf/kylin/blob/f05404d5/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCSerializer.java
----------------------------------------------------------------------
diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCSerializer.java b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCSerializer.java
index 4d08b6f..1d01abc 100644
--- a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCSerializer.java
+++ b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCSerializer.java
@@ -28,10 +28,10 @@ import org.apache.kylin.metadata.datatype.DataTypeSerializer;
* @author yangli9
*
*/
-public class HLLCSerializer extends DataTypeSerializer<HyperLogLogPlusCounter> {
+public class HLLCSerializer extends DataTypeSerializer<HyperLogLogPlusCounterNew> {
// be thread-safe and avoid repeated obj creation
- private ThreadLocal<HyperLogLogPlusCounter> current = new ThreadLocal<HyperLogLogPlusCounter>();
+ private ThreadLocal<HyperLogLogPlusCounterNew> current = new ThreadLocal<HyperLogLogPlusCounterNew>();
private int precision;
@@ -40,7 +40,7 @@ public class HLLCSerializer extends DataTypeSerializer<HyperLogLogPlusCounter> {
}
@Override
- public void serialize(HyperLogLogPlusCounter value, ByteBuffer out) {
+ public void serialize(HyperLogLogPlusCounterNew value, ByteBuffer out) {
try {
value.writeRegisters(out);
} catch (IOException e) {
@@ -48,18 +48,18 @@ public class HLLCSerializer extends DataTypeSerializer<HyperLogLogPlusCounter> {
}
}
- private HyperLogLogPlusCounter current() {
- HyperLogLogPlusCounter hllc = current.get();
+ private HyperLogLogPlusCounterNew current() {
+ HyperLogLogPlusCounterNew hllc = current.get();
if (hllc == null) {
- hllc = new HyperLogLogPlusCounter(precision);
+ hllc = new HyperLogLogPlusCounterNew(precision);
current.set(hllc);
}
return hllc;
}
@Override
- public HyperLogLogPlusCounter deserialize(ByteBuffer in) {
- HyperLogLogPlusCounter hllc = current();
+ public HyperLogLogPlusCounterNew deserialize(ByteBuffer in) {
+ HyperLogLogPlusCounterNew hllc = current();
try {
hllc.readRegisters(in);
} catch (IOException e) {
http://git-wip-us.apache.org/repos/asf/kylin/blob/f05404d5/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLDistinctCountAggFunc.java
----------------------------------------------------------------------
diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLDistinctCountAggFunc.java b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLDistinctCountAggFunc.java
index 8f2a0fa..a72ad09 100644
--- a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLDistinctCountAggFunc.java
+++ b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLDistinctCountAggFunc.java
@@ -31,21 +31,21 @@ public class HLLDistinctCountAggFunc {
private static final Logger logger = LoggerFactory.getLogger(HLLDistinctCountAggFunc.class);
- public static HyperLogLogPlusCounter init() {
+ public static HyperLogLogPlusCounterNew init() {
return null;
}
- public static HyperLogLogPlusCounter initAdd(Object v) {
+ public static HyperLogLogPlusCounterNew initAdd(Object v) {
if (v instanceof Long) { // holistic case
long l = (Long) v;
return new FixedValueHLLCMockup(l);
} else {
- HyperLogLogPlusCounter c = (HyperLogLogPlusCounter) v;
- return new HyperLogLogPlusCounter(c);
+ HyperLogLogPlusCounterNew c = (HyperLogLogPlusCounterNew) v;
+ return new HyperLogLogPlusCounterNew(c);
}
}
- public static HyperLogLogPlusCounter add(HyperLogLogPlusCounter counter, Object v) {
+ public static HyperLogLogPlusCounterNew add(HyperLogLogPlusCounterNew counter, Object v) {
if (v instanceof Long) { // holistic case
long l = (Long) v;
if (counter == null) {
@@ -58,9 +58,9 @@ public class HLLDistinctCountAggFunc {
return counter;
}
} else {
- HyperLogLogPlusCounter c = (HyperLogLogPlusCounter) v;
+ HyperLogLogPlusCounterNew c = (HyperLogLogPlusCounterNew) v;
if (counter == null) {
- return new HyperLogLogPlusCounter(c);
+ return new HyperLogLogPlusCounterNew(c);
} else {
counter.merge(c);
return counter;
@@ -68,16 +68,16 @@ public class HLLDistinctCountAggFunc {
}
}
- public static HyperLogLogPlusCounter merge(HyperLogLogPlusCounter counter0, Object counter1) {
+ public static HyperLogLogPlusCounterNew merge(HyperLogLogPlusCounterNew counter0, Object counter1) {
return add(counter0, counter1);
}
- public static long result(HyperLogLogPlusCounter counter) {
+ public static long result(HyperLogLogPlusCounterNew counter) {
return counter == null ? 0L : counter.getCountEstimate();
}
@SuppressWarnings("serial")
- private static class FixedValueHLLCMockup extends HyperLogLogPlusCounter {
+ private static class FixedValueHLLCMockup extends HyperLogLogPlusCounterNew {
private Long value = null;
@@ -107,7 +107,7 @@ public class HLLDistinctCountAggFunc {
}
@Override
- public void merge(HyperLogLogPlusCounter another) {
+ public void merge(HyperLogLogPlusCounterNew another) {
throw new UnsupportedOperationException();
}
http://git-wip-us.apache.org/repos/asf/kylin/blob/f05404d5/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HyperLogLogPlusCounter.java
----------------------------------------------------------------------
diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HyperLogLogPlusCounter.java b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HyperLogLogPlusCounter.java
deleted file mode 100644
index 00407f9..0000000
--- a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HyperLogLogPlusCounter.java
+++ /dev/null
@@ -1,392 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
-*/
-
-package org.apache.kylin.measure.hllc;
-
-import java.io.IOException;
-import java.io.Serializable;
-import java.nio.ByteBuffer;
-import java.nio.charset.Charset;
-import java.util.Arrays;
-
-import org.apache.kylin.common.util.BytesUtil;
-
-import com.google.common.hash.HashFunction;
-import com.google.common.hash.Hashing;
-
-/**
- * About compression, test on HLLC data shows
- *
- * - LZF compression ratio is around 65%-80%, fast
- * - GZIP compression ratio is around 41%-46%, very slow
- *
- * @author yangli9
- */
-@SuppressWarnings("serial")
-public class HyperLogLogPlusCounter implements Serializable, Comparable<HyperLogLogPlusCounter> {
-
- private final int p;
- private final int m;
- private final HashFunction hashFunc;
- byte[] registers;
- int singleBucket;
-
- public HyperLogLogPlusCounter() {
- this(10);
- }
-
- public HyperLogLogPlusCounter(int p) {
- this(p, Hashing.murmur3_128());
- }
-
- public HyperLogLogPlusCounter(HyperLogLogPlusCounter another) {
- this(another.p, another.hashFunc);
- merge(another);
- }
-
- /** The larger p is, the more storage (2^p bytes), the better accuracy */
- private HyperLogLogPlusCounter(int p, HashFunction hashFunc) {
- this.p = p;
- this.m = 1 << p;//(int) Math.pow(2, p);
- this.hashFunc = hashFunc;
- this.registers = new byte[m];
- this.singleBucket = -1;
- }
-
- public void clear() {
- byte zero = (byte) 0;
- if (singleBucket == -1) {
- //nothing
- } else if (singleBucket >= 0) {
- registers[singleBucket] = 0;
- } else {
- Arrays.fill(registers, zero);
- }
- singleBucket = -1;
- }
-
- public void add(int value) {
- add(hashFunc.hashInt(value).asLong());
- }
-
- public void add(String value) {
- add(hashFunc.hashString(value, Charset.defaultCharset()).asLong());
- }
-
- public void add(byte[] value) {
- add(hashFunc.hashBytes(value).asLong());
- }
-
- public void add(byte[] value, int offset, int length) {
- add(hashFunc.hashBytes(value, offset, length).asLong());
- }
-
- protected void add(long hash) {
- int bucketMask = m - 1;
- int bucket = (int) (hash & bucketMask);
- int firstOnePos = Long.numberOfLeadingZeros(hash | bucketMask) + 1;
-
- if (firstOnePos > registers[bucket])
- registers[bucket] = (byte) firstOnePos;
-
- if (singleBucket == -1)
- singleBucket = bucket;
- else
- singleBucket = Integer.MIN_VALUE;
- }
-
- public void merge(HyperLogLogPlusCounter another) {
- assert this.p == another.p;
- assert this.hashFunc == another.hashFunc;
-
- // quick path for single value HLLC
- if (another.singleBucket == -1) {
- return;
- } else if (another.singleBucket >= 0) {
- int b = another.singleBucket;
- if (registers[b] < another.registers[b])
- registers[b] = another.registers[b];
- } else {
- // normal path
- for (int i = 0; i < m; i++) {
- if (registers[i] < another.registers[i])
- registers[i] = another.registers[i];
- }
- }
- singleBucket = Integer.MIN_VALUE;
- }
-
- public long getCountEstimate() {
- return new HLLCSnapshot(this).getCountEstimate();
- }
-
- public int getPrecision() {
- return this.p;
- }
-
- public double getErrorRate() {
- return 1.04 / Math.sqrt(m);
- }
-
- private int size() {
- if (singleBucket == -1) {
- return 0;
- } else if (singleBucket >= 0) {
- return 1;
- } else {
- int size = 0;
- for (int i = 0; i < m; i++) {
- if (registers[i] > 0)
- size++;
- }
- return size;
- }
- }
-
- @Override
- public String toString() {
- return "" + getCountEstimate();
- }
-
- // ============================================================================
-
- // a memory efficient snapshot of HLL registers which can yield count
- // estimate later
- public static class HLLCSnapshot {
- byte p;
- double registerSum;
- int zeroBuckets;
-
- public HLLCSnapshot(HyperLogLogPlusCounter hllc) {
- p = (byte) hllc.p;
- registerSum = 0;
- zeroBuckets = 0;
-
- byte[] registers = hllc.registers;
- for (int i = 0; i < hllc.m; i++) {
- if (registers[i] == 0) {
- registerSum++;
- zeroBuckets++;
- } else {
- registerSum += 1.0 / (1L << registers[i]);
- }
- }
- }
-
- public long getCountEstimate() {
- int m = 1 << p;
- double alpha = 0.7213 / (1 + 1.079 / m);
- double estimate = alpha * m * m / registerSum;
-
- // small cardinality adjustment
- if (zeroBuckets >= m * 0.07) { // (reference presto's HLL impl)
- estimate = m * Math.log(m * 1.0 / zeroBuckets);
- } else if (HyperLogLogPlusTable.isBiasCorrection(m, estimate)) {
- estimate = HyperLogLogPlusTable.biasCorrection(p, estimate);
- }
-
- return Math.round(estimate);
- }
- }
-
- // ============================================================================
-
- public void writeRegisters(final ByteBuffer out) throws IOException {
-
- final int indexLen = getRegisterIndexSize();
- int size = size();
-
- // decide output scheme -- map (3*size bytes) or array (2^p bytes)
- byte scheme;
- if (5 + (indexLen + 1) * size < m) // 5 is max len of vint
- scheme = 0; // map
- else
- scheme = 1; // array
- out.put(scheme);
-
- if (scheme == 0) { // map scheme
- BytesUtil.writeVInt(size, out);
- if (singleBucket == -1) {
- // no non-zero register
- } else if (singleBucket >= 0) {
- writeUnsigned(singleBucket, indexLen, out);
- out.put(registers[singleBucket]);
- } else {
- for (int i = 0; i < m; i++) {
- if (registers[i] > 0) {
- writeUnsigned(i, indexLen, out);
- out.put(registers[i]);
- }
- }
- }
- } else if (scheme == 1) { // array scheme
- out.put(registers);
- } else
- throw new IllegalStateException();
- }
-
- public void readRegisters(ByteBuffer in) throws IOException {
- byte scheme = in.get();
-
- if (scheme == 0) { // map scheme
- clear();
- int size = BytesUtil.readVInt(in);
- if (size > m)
- throw new IllegalArgumentException("register size (" + size + ") cannot be larger than m (" + m + ")");
- int indexLen = getRegisterIndexSize();
- int key = 0;
- for (int i = 0; i < size; i++) {
- key = readUnsigned(in, indexLen);
- registers[key] = in.get();
- }
-
- if (size == 0)
- singleBucket = -1;
- else if (size == 1)
- singleBucket = key;
- else
- singleBucket = Integer.MIN_VALUE;
-
- } else if (scheme == 1) { // array scheme
- in.get(registers);
- singleBucket = Integer.MIN_VALUE;
- } else
- throw new IllegalStateException();
- }
-
- public int peekLength(ByteBuffer in) {
- int mark = in.position();
- int len;
-
- byte scheme = in.get();
- if (scheme == 0) { // map scheme
- int size = BytesUtil.readVInt(in);
- int indexLen = getRegisterIndexSize();
- len = in.position() - mark + (indexLen + 1) * size;
- } else {
- len = in.position() - mark + m;
- }
-
- in.position(mark);
- return len;
- }
-
- public int maxLength() {
- return 1 + m;
- }
-
- public void writeRegistersArray(final ByteBuffer out) {
- out.put(this.registers);
- }
-
- public void readRegistersArray(ByteBuffer in) {
- in.get(registers, 0, m);
- singleBucket = Integer.MIN_VALUE;
- }
-
- private int getRegisterIndexSize() {
- return (p - 1) / 8 + 1; // 2 when p=16, 3 when p=17
- }
-
- @Override
- public int hashCode() {
- final int prime = 31;
- int result = 1;
- result = prime * result + ((hashFunc == null) ? 0 : hashFunc.hashCode());
- result = prime * result + p;
- result = prime * result + Arrays.hashCode(registers);
- return result;
- }
-
- @Override
- public boolean equals(Object obj) {
- if (this == obj)
- return true;
- if (obj == null)
- return false;
- if (getClass() != obj.getClass())
- return false;
- HyperLogLogPlusCounter other = (HyperLogLogPlusCounter) obj;
- if (hashFunc == null) {
- if (other.hashFunc != null)
- return false;
- } else if (!hashFunc.equals(other.hashFunc))
- return false;
- if (p != other.p)
- return false;
- if (!Arrays.equals(registers, other.registers))
- return false;
- return true;
- }
-
- @Override
- public int compareTo(HyperLogLogPlusCounter o) {
- if (o == null)
- return 1;
-
- long e1 = this.getCountEstimate();
- long e2 = o.getCountEstimate();
-
- if (e1 == e2)
- return 0;
- else if (e1 > e2)
- return 1;
- else
- return -1;
- }
-
- public static void main(String[] args) throws IOException {
- dumpErrorRates();
- }
-
- static void dumpErrorRates() {
- for (int p = 10; p <= 18; p++) {
- double rate = new HyperLogLogPlusCounter(p).getErrorRate();
- double er = Math.round(rate * 10000) / 100D;
- double er2 = Math.round(rate * 2 * 10000) / 100D;
- double er3 = Math.round(rate * 3 * 10000) / 100D;
- long size = Math.round(Math.pow(2, p));
- System.out.println("HLLC" + p + ",\t" + size + " bytes,\t68% err<" + er + "%" + ",\t95% err<" + er2 + "%" + ",\t99.7% err<" + er3 + "%");
- }
- }
-
- /**
- *
- * @param num
- * @param size
- * @param out
- */
- public static void writeUnsigned(int num, int size, ByteBuffer out) {
- for (int i = 0; i < size; i++) {
- out.put((byte) num);
- num >>>= 8;
- }
- }
-
- public static int readUnsigned(ByteBuffer in, int size) {
- int integer = 0;
- int mask = 0xff;
- int shift = 0;
- for (int i = 0; i < size; i++) {
- integer |= (in.get() << shift) & mask;
- mask = mask << 8;
- shift += 8;
- }
- return integer;
- }
-}
http://git-wip-us.apache.org/repos/asf/kylin/blob/f05404d5/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HyperLogLogPlusCounterNew.java
----------------------------------------------------------------------
diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HyperLogLogPlusCounterNew.java b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HyperLogLogPlusCounterNew.java
new file mode 100644
index 0000000..d7329f6
--- /dev/null
+++ b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HyperLogLogPlusCounterNew.java
@@ -0,0 +1,388 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package org.apache.kylin.measure.hllc;
+
+import com.google.common.hash.HashFunction;
+import com.google.common.hash.Hashing;
+import org.apache.kylin.common.util.BytesUtil;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
+import java.util.Collection;
+import java.util.Map;
+
+@SuppressWarnings("serial")
+public class HyperLogLogPlusCounterNew implements Serializable, Comparable<HyperLogLogPlusCounterNew> {
+
+ private int p;
+
+ private int m;
+
+ private HashFunction hashFunc = Hashing.murmur3_128();
+
+ private Register register;
+
+ public static double overflowFactor = 0.01;
+
+ public HyperLogLogPlusCounterNew(int p, RegisterType type, HashFunction hashFunc) {
+ this.p = p;
+ this.m = 1 << p;//(int) Math.pow(2, p);
+ this.hashFunc = hashFunc;
+ if (type == RegisterType.SPARSE) {
+ double over = overflowFactor * m;
+ this.register = new SparseRegister((int) over);
+ } else {
+ this.register = new DenseRegister(p);
+ }
+ }
+
+ public HyperLogLogPlusCounterNew() {
+ this(10, RegisterType.SPARSE, Hashing.murmur3_128());
+ }
+
+ public HyperLogLogPlusCounterNew(int p) {
+ this(p, RegisterType.SPARSE, Hashing.murmur3_128());
+ }
+
+ public HyperLogLogPlusCounterNew(int p, RegisterType type) {
+ this(p, type, Hashing.murmur3_128());
+ }
+
+ public HyperLogLogPlusCounterNew(int p, HashFunction hashFunc) {
+ this(p, RegisterType.SPARSE, hashFunc);
+ }
+
+ public HyperLogLogPlusCounterNew(HyperLogLogPlusCounterNew another) {
+ this(another.p, another.hashFunc);
+ merge(another);
+ }
+
+ public void add(int value) {
+ add(hashFunc.hashInt(value).asLong());
+ }
+
+ public void add(String value) {
+ add(hashFunc.hashString(value, Charset.defaultCharset()).asLong());
+ }
+
+ public void add(byte[] value) {
+ add(hashFunc.hashBytes(value).asLong());
+ }
+
+ public void add(byte[] value, int offset, int length) {
+ add(hashFunc.hashBytes(value, offset, length).asLong());
+ }
+
+ protected void add(long hash) {
+ int bucketMask = m - 1;
+ int bucket = (int) (hash & bucketMask);
+ int firstOnePos = Long.numberOfLeadingZeros(hash | bucketMask) + 1;
+ Byte b = register.get(bucket);
+ if (b == null || (byte) firstOnePos > b) {
+ register.set(bucket, (byte) firstOnePos);
+ }
+ if (register instanceof SparseRegister) {
+ if (((SparseRegister) register).isOverThreshold()) {
+ register = ((SparseRegister) register).toDense(p);
+ }
+ }
+ }
+
+ public void merge(HyperLogLogPlusCounterNew another) {
+ assert this.p == another.p;
+ assert this.hashFunc == another.hashFunc;
+ if (register instanceof SparseRegister && another.register instanceof SparseRegister) {
+ register.merge(another.register);
+ if (((SparseRegister) register).isOverThreshold()) {
+ register = ((SparseRegister) register).toDense(p);
+ }
+ } else if (register instanceof SparseRegister && another.register instanceof DenseRegister) {
+ register = ((SparseRegister) register).toDense(p);
+ register.merge(another.register);
+ } else {
+ register.merge(another.register);
+ }
+ }
+
+ public long getCountEstimate() {
+ return new HLLCSnapshot(this).getCountEstimate();
+ }
+
+ public int getPrecision() {
+ return this.p;
+ }
+
+ public double getErrorRate() {
+ return 1.04 / Math.sqrt(m);
+ }
+
+ @Override
+ public String toString() {
+ return "" + getCountEstimate();
+ }
+
+ // ============================================================================
+
+ // a memory efficient snapshot of HLL registers which can yield count
+ // estimate later
+ public static class HLLCSnapshot {
+ byte p;
+ double registerSum;
+ int zeroBuckets;
+
+ public HLLCSnapshot(HyperLogLogPlusCounterNew hllc) {
+ p = (byte) hllc.p;
+ registerSum = 0;
+ zeroBuckets = 0;
+ Register register = hllc.getRegister();
+ DenseRegister dr;
+ if (register instanceof SparseRegister) {
+ dr = ((SparseRegister) register).toDense(p);
+ } else {
+ dr = (DenseRegister) register;
+ }
+ byte[] registers = dr.getRawRegister();
+ for (int i = 0; i < hllc.m; i++) {
+ if (registers[i] == 0) {
+ registerSum++;
+ zeroBuckets++;
+ } else {
+ registerSum += 1.0 / (1L << registers[i]);
+ }
+ }
+ }
+
+ public long getCountEstimate() {
+ int m = 1 << p;
+ double alpha = 0.7213 / (1 + 1.079 / m);
+ double estimate = alpha * m * m / registerSum;
+
+ // small cardinality adjustment
+ if (zeroBuckets >= m * 0.07) { // (reference presto's HLL impl)
+ estimate = m * Math.log(m * 1.0 / zeroBuckets);
+ } else if (HyperLogLogPlusTable.isBiasCorrection(m, estimate)) {
+ estimate = HyperLogLogPlusTable.biasCorrection(p, estimate);
+ }
+
+ return Math.round(estimate);
+ }
+ }
+
+ public static void main(String[] args) throws IOException {
+ dumpErrorRates();
+ }
+
+ static void dumpErrorRates() {
+ for (int p = 10; p <= 18; p++) {
+ double rate = new HyperLogLogPlusCounterNew(p, RegisterType.SPARSE).getErrorRate();
+ double er = Math.round(rate * 10000) / 100D;
+ double er2 = Math.round(rate * 2 * 10000) / 100D;
+ double er3 = Math.round(rate * 3 * 10000) / 100D;
+ long size = Math.round(Math.pow(2, p));
+ System.out.println("HLLC" + p + ",\t" + size + " bytes,\t68% err<" + er + "%" + ",\t95% err<" + er2 + "%" + ",\t99.7% err<" + er3 + "%");
+ }
+ }
+
+ public Register getRegister() {
+ return register;
+ }
+
+ public void clear() {
+ register.clear();
+ }
+
+ public RegisterType getRegisterType() {
+ if (register instanceof SparseRegister)
+ return RegisterType.SPARSE;
+ else
+ return RegisterType.DENSE;
+ }
+
+ // ============================================================================
+
+ public void writeRegisters(final ByteBuffer out) throws IOException {
+
+ final int indexLen = getRegisterIndexSize();
+ int size = size();
+
+ // decide output scheme -- map (3*size bytes) or array (2^p bytes)
+ byte scheme;
+ //byte type;
+ if (register instanceof SparseRegister || 5 + (indexLen + 1) * size < m) {
+ scheme = 0; //map
+ } else {
+ scheme = 1; // array
+ }
+ out.put(scheme);
+ if (scheme == 0) { // map scheme
+ BytesUtil.writeVInt(size, out);
+ if (register instanceof SparseRegister) { //sparse\u3000register
+ Collection<Map.Entry<Integer, Byte>> allValue = ((SparseRegister) register).getAllValue();
+ for (Map.Entry<Integer, Byte> entry : allValue) {
+ writeUnsigned(entry.getKey(), indexLen, out);
+ out.put(entry.getValue());
+ }
+ } else { //dense register
+ byte[] registers = ((DenseRegister) register).getRawRegister();
+ for (int i = 0; i < m; i++) {
+ if (registers[i] > 0) {
+ writeUnsigned(i, indexLen, out);
+ out.put(registers[i]);
+ }
+ }
+ }
+ } else if (scheme == 1) { // array scheme
+ out.put(((DenseRegister) register).getRawRegister());
+ } else
+ throw new IllegalStateException();
+ }
+
+ public void readRegisters(ByteBuffer in) throws IOException {
+ byte scheme = in.get();
+ if (scheme == 0) { // map scheme
+ clear();
+ int size = BytesUtil.readVInt(in);
+ if (size > m)
+ throw new IllegalArgumentException("register size (" + size + ") cannot be larger than m (" + m + ")");
+ double over = overflowFactor * m;
+ if (size > (int) over) {
+ this.register = new DenseRegister(p);
+ } else {
+ this.register = new SparseRegister((int) over);//default is sparse
+ }
+ int indexLen = getRegisterIndexSize();
+ int key = 0;
+ for (int i = 0; i < size; i++) {
+ key = readUnsigned(in, indexLen);
+ register.set(key, in.get());
+ }
+ } else if (scheme == 1) { // array scheme
+ this.register = new DenseRegister(p);
+ for (int i = 0; i < m; i++) {
+ register.set(i, in.get());
+ }
+ } else
+ throw new IllegalStateException();
+ }
+
+ public int peekLength(ByteBuffer in) {
+ int mark = in.position();
+ int len;
+ byte scheme = in.get();
+ if (scheme == 0) { // map scheme
+ int size = BytesUtil.readVInt(in);
+ int indexLen = getRegisterIndexSize();
+ len = in.position() - mark + (indexLen + 1) * size;
+ } else {
+ len = in.position() - mark + m;
+ }
+
+ in.position(mark);
+ return len;
+ }
+
+ public int maxLength() {
+ return 1 + m;
+ }
+
+ private int getRegisterIndexSize() {
+ return (p - 1) / 8 + 1; // 2 when p=16, 3 when p=17
+ }
+
+ @Override
+ public int hashCode() {
+ final int prime = 31;
+ int result = 1;
+ result = prime * result + ((hashFunc == null) ? 0 : hashFunc.hashCode());
+ result = prime * result + p;
+ result = prime * result + register.getHashCode();
+ return result;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj)
+ return true;
+ if (obj == null)
+ return false;
+ if (getClass() != obj.getClass())
+ return false;
+ HyperLogLogPlusCounterNew other = (HyperLogLogPlusCounterNew) obj;
+ if (hashFunc == null) {
+ if (other.hashFunc != null)
+ return false;
+ } else if (!hashFunc.equals(other.hashFunc))
+ return false;
+ if (p != other.p)
+ return false;
+ if (this.getRegisterType() != other.getRegisterType())
+ return false;
+ if (register.getHashCode() != other.register.getHashCode())
+ return false;
+ return true;
+ }
+
+ @Override
+ public int compareTo(HyperLogLogPlusCounterNew o) {
+ if (o == null)
+ return 1;
+
+ long e1 = this.getCountEstimate();
+ long e2 = o.getCountEstimate();
+
+ if (e1 == e2)
+ return 0;
+ else if (e1 > e2)
+ return 1;
+ else
+ return -1;
+ }
+
+ /**
+ *
+ * @param num
+ * @param size
+ * @param out
+ */
+ public static void writeUnsigned(int num, int size, ByteBuffer out) {
+ for (int i = 0; i < size; i++) {
+ out.put((byte) num);
+ num >>>= 8;
+ }
+ }
+
+ public static int readUnsigned(ByteBuffer in, int size) {
+ int integer = 0;
+ int mask = 0xff;
+ int shift = 0;
+ for (int i = 0; i < size; i++) {
+ integer |= (in.get() << shift) & mask;
+ mask = mask << 8;
+ shift += 8;
+ }
+ return integer;
+ }
+
+ private int size() {
+ return register.getSize();
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/kylin/blob/f05404d5/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HyperLogLogPlusCounterOld.java
----------------------------------------------------------------------
diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HyperLogLogPlusCounterOld.java b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HyperLogLogPlusCounterOld.java
new file mode 100644
index 0000000..cb5533e
--- /dev/null
+++ b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HyperLogLogPlusCounterOld.java
@@ -0,0 +1,392 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package org.apache.kylin.measure.hllc;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
+import java.util.Arrays;
+
+import org.apache.kylin.common.util.BytesUtil;
+
+import com.google.common.hash.HashFunction;
+import com.google.common.hash.Hashing;
+
+/**
+ * About compression, test on HLLC data shows
+ *
+ * - LZF compression ratio is around 65%-80%, fast
+ * - GZIP compression ratio is around 41%-46%, very slow
+ *
+ * @author yangli9
+ */
+@SuppressWarnings("serial")
+public class HyperLogLogPlusCounterOld implements Serializable, Comparable<HyperLogLogPlusCounterOld> {
+
+ private final int p;
+ private final int m;
+ private final HashFunction hashFunc;
+ byte[] registers;
+ int singleBucket;
+
+ public HyperLogLogPlusCounterOld() {
+ this(10);
+ }
+
+ public HyperLogLogPlusCounterOld(int p) {
+ this(p, Hashing.murmur3_128());
+ }
+
+ public HyperLogLogPlusCounterOld(HyperLogLogPlusCounterOld another) {
+ this(another.p, another.hashFunc);
+ merge(another);
+ }
+
+ /** The larger p is, the more storage (2^p bytes), the better accuracy */
+ private HyperLogLogPlusCounterOld(int p, HashFunction hashFunc) {
+ this.p = p;
+ this.m = 1 << p;//(int) Math.pow(2, p);
+ this.hashFunc = hashFunc;
+ this.registers = new byte[m];
+ this.singleBucket = -1;
+ }
+
+ public void clear() {
+ byte zero = (byte) 0;
+ if (singleBucket == -1) {
+ //nothing
+ } else if (singleBucket >= 0) {
+ registers[singleBucket] = 0;
+ } else {
+ Arrays.fill(registers, zero);
+ }
+ singleBucket = -1;
+ }
+
+ public void add(int value) {
+ add(hashFunc.hashInt(value).asLong());
+ }
+
+ public void add(String value) {
+ add(hashFunc.hashString(value, Charset.defaultCharset()).asLong());
+ }
+
+ public void add(byte[] value) {
+ add(hashFunc.hashBytes(value).asLong());
+ }
+
+ public void add(byte[] value, int offset, int length) {
+ add(hashFunc.hashBytes(value, offset, length).asLong());
+ }
+
+ protected void add(long hash) {
+ int bucketMask = m - 1;
+ int bucket = (int) (hash & bucketMask);
+ int firstOnePos = Long.numberOfLeadingZeros(hash | bucketMask) + 1;
+
+ if (firstOnePos > registers[bucket])
+ registers[bucket] = (byte) firstOnePos;
+
+ if (singleBucket == -1)
+ singleBucket = bucket;
+ else
+ singleBucket = Integer.MIN_VALUE;
+ }
+
+ public void merge(HyperLogLogPlusCounterOld another) {
+ assert this.p == another.p;
+ assert this.hashFunc == another.hashFunc;
+
+ // quick path for single value HLLC
+ if (another.singleBucket == -1) {
+ return;
+ } else if (another.singleBucket >= 0) {
+ int b = another.singleBucket;
+ if (registers[b] < another.registers[b])
+ registers[b] = another.registers[b];
+ } else {
+ // normal path
+ for (int i = 0; i < m; i++) {
+ if (registers[i] < another.registers[i])
+ registers[i] = another.registers[i];
+ }
+ }
+ singleBucket = Integer.MIN_VALUE;
+ }
+
+ public long getCountEstimate() {
+ return new HLLCSnapshot(this).getCountEstimate();
+ }
+
+ public int getPrecision() {
+ return this.p;
+ }
+
+ public double getErrorRate() {
+ return 1.04 / Math.sqrt(m);
+ }
+
+ private int size() {
+ if (singleBucket == -1) {
+ return 0;
+ } else if (singleBucket >= 0) {
+ return 1;
+ } else {
+ int size = 0;
+ for (int i = 0; i < m; i++) {
+ if (registers[i] > 0)
+ size++;
+ }
+ return size;
+ }
+ }
+
+ @Override
+ public String toString() {
+ return "" + getCountEstimate();
+ }
+
+ // ============================================================================
+
+ // a memory efficient snapshot of HLL registers which can yield count
+ // estimate later
+ public static class HLLCSnapshot {
+ byte p;
+ double registerSum;
+ int zeroBuckets;
+
+ public HLLCSnapshot(HyperLogLogPlusCounterOld hllc) {
+ p = (byte) hllc.p;
+ registerSum = 0;
+ zeroBuckets = 0;
+
+ byte[] registers = hllc.registers;
+ for (int i = 0; i < hllc.m; i++) {
+ if (registers[i] == 0) {
+ registerSum++;
+ zeroBuckets++;
+ } else {
+ registerSum += 1.0 / (1L << registers[i]);
+ }
+ }
+ }
+
+ public long getCountEstimate() {
+ int m = 1 << p;
+ double alpha = 0.7213 / (1 + 1.079 / m);
+ double estimate = alpha * m * m / registerSum;
+
+ // small cardinality adjustment
+ if (zeroBuckets >= m * 0.07) { // (reference presto's HLL impl)
+ estimate = m * Math.log(m * 1.0 / zeroBuckets);
+ } else if (HyperLogLogPlusTable.isBiasCorrection(m, estimate)) {
+ estimate = HyperLogLogPlusTable.biasCorrection(p, estimate);
+ }
+
+ return Math.round(estimate);
+ }
+ }
+
+ // ============================================================================
+
+ public void writeRegisters(final ByteBuffer out) throws IOException {
+
+ final int indexLen = getRegisterIndexSize();
+ int size = size();
+
+ // decide output scheme -- map (3*size bytes) or array (2^p bytes)
+ byte scheme;
+ if (5 + (indexLen + 1) * size < m) // 5 is max len of vint
+ scheme = 0; // map
+ else
+ scheme = 1; // array
+ out.put(scheme);
+
+ if (scheme == 0) { // map scheme
+ BytesUtil.writeVInt(size, out);
+ if (singleBucket == -1) {
+ // no non-zero register
+ } else if (singleBucket >= 0) {
+ writeUnsigned(singleBucket, indexLen, out);
+ out.put(registers[singleBucket]);
+ } else {
+ for (int i = 0; i < m; i++) {
+ if (registers[i] > 0) {
+ writeUnsigned(i, indexLen, out);
+ out.put(registers[i]);
+ }
+ }
+ }
+ } else if (scheme == 1) { // array scheme
+ out.put(registers);
+ } else
+ throw new IllegalStateException();
+ }
+
+ public void readRegisters(ByteBuffer in) throws IOException {
+ byte scheme = in.get();
+
+ if (scheme == 0) { // map scheme
+ clear();
+ int size = BytesUtil.readVInt(in);
+ if (size > m)
+ throw new IllegalArgumentException("register size (" + size + ") cannot be larger than m (" + m + ")");
+ int indexLen = getRegisterIndexSize();
+ int key = 0;
+ for (int i = 0; i < size; i++) {
+ key = readUnsigned(in, indexLen);
+ registers[key] = in.get();
+ }
+
+ if (size == 0)
+ singleBucket = -1;
+ else if (size == 1)
+ singleBucket = key;
+ else
+ singleBucket = Integer.MIN_VALUE;
+
+ } else if (scheme == 1) { // array scheme
+ in.get(registers);
+ singleBucket = Integer.MIN_VALUE;
+ } else
+ throw new IllegalStateException();
+ }
+
+ public int peekLength(ByteBuffer in) {
+ int mark = in.position();
+ int len;
+
+ byte scheme = in.get();
+ if (scheme == 0) { // map scheme
+ int size = BytesUtil.readVInt(in);
+ int indexLen = getRegisterIndexSize();
+ len = in.position() - mark + (indexLen + 1) * size;
+ } else {
+ len = in.position() - mark + m;
+ }
+
+ in.position(mark);
+ return len;
+ }
+
+ public int maxLength() {
+ return 1 + m;
+ }
+
+ /*public void writeRegistersArray(final ByteBuffer out) {
+ out.put(this.registers);
+ }
+
+ public void readRegistersArray(ByteBuffer in) {
+ in.get(registers, 0, m);
+ singleBucket = Integer.MIN_VALUE;
+ }*/
+
+ private int getRegisterIndexSize() {
+ return (p - 1) / 8 + 1; // 2 when p=16, 3 when p=17
+ }
+
+ @Override
+ public int hashCode() {
+ final int prime = 31;
+ int result = 1;
+ result = prime * result + ((hashFunc == null) ? 0 : hashFunc.hashCode());
+ result = prime * result + p;
+ result = prime * result + Arrays.hashCode(registers);
+ return result;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj)
+ return true;
+ if (obj == null)
+ return false;
+ if (getClass() != obj.getClass())
+ return false;
+ HyperLogLogPlusCounterOld other = (HyperLogLogPlusCounterOld) obj;
+ if (hashFunc == null) {
+ if (other.hashFunc != null)
+ return false;
+ } else if (!hashFunc.equals(other.hashFunc))
+ return false;
+ if (p != other.p)
+ return false;
+ if (!Arrays.equals(registers, other.registers))
+ return false;
+ return true;
+ }
+
+ @Override
+ public int compareTo(HyperLogLogPlusCounterOld o) {
+ if (o == null)
+ return 1;
+
+ long e1 = this.getCountEstimate();
+ long e2 = o.getCountEstimate();
+
+ if (e1 == e2)
+ return 0;
+ else if (e1 > e2)
+ return 1;
+ else
+ return -1;
+ }
+
+ public static void main(String[] args) throws IOException {
+ dumpErrorRates();
+ }
+
+ static void dumpErrorRates() {
+ for (int p = 10; p <= 18; p++) {
+ double rate = new HyperLogLogPlusCounterOld(p).getErrorRate();
+ double er = Math.round(rate * 10000) / 100D;
+ double er2 = Math.round(rate * 2 * 10000) / 100D;
+ double er3 = Math.round(rate * 3 * 10000) / 100D;
+ long size = Math.round(Math.pow(2, p));
+ System.out.println("HLLC" + p + ",\t" + size + " bytes,\t68% err<" + er + "%" + ",\t95% err<" + er2 + "%" + ",\t99.7% err<" + er3 + "%");
+ }
+ }
+
+ /**
+ *
+ * @param num
+ * @param size
+ * @param out
+ */
+ public static void writeUnsigned(int num, int size, ByteBuffer out) {
+ for (int i = 0; i < size; i++) {
+ out.put((byte) num);
+ num >>>= 8;
+ }
+ }
+
+ public static int readUnsigned(ByteBuffer in, int size) {
+ int integer = 0;
+ int mask = 0xff;
+ int shift = 0;
+ for (int i = 0; i < size; i++) {
+ integer |= (in.get() << shift) & mask;
+ mask = mask << 8;
+ shift += 8;
+ }
+ return integer;
+ }
+}
http://git-wip-us.apache.org/repos/asf/kylin/blob/f05404d5/core-metadata/src/main/java/org/apache/kylin/measure/hllc/Register.java
----------------------------------------------------------------------
diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/Register.java b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/Register.java
new file mode 100644
index 0000000..79c4bba
--- /dev/null
+++ b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/Register.java
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+package org.apache.kylin.measure.hllc;
+
+/**
+ * Created by xiefan on 16-12-9.
+ *
+ * Contract for register storage addressed by an integer position and holding byte values.
+ * The notes on the methods below describe what SparseRegister (added in the same commit) does;
+ * other implementations are not visible here and should be checked separately.
+ */
+public interface Register {
+
+ void set(int pos, byte value); // stores value at position pos
+
+ Byte get(int pos); // boxed return type: SparseRegister returns null for a position that was never set
+
+ void merge(Register another); // folds another register into this one; SparseRegister keeps the larger value per position
+
+ void clear(); // SparseRegister empties its backing map
+
+ int getSize(); // SparseRegister reports the number of stored entries
+
+ int getHashCode(); // hash derived from stored content, exposed separately from Object#hashCode
+
+}
http://git-wip-us.apache.org/repos/asf/kylin/blob/f05404d5/core-metadata/src/main/java/org/apache/kylin/measure/hllc/RegisterType.java
----------------------------------------------------------------------
diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/RegisterType.java b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/RegisterType.java
new file mode 100644
index 0000000..fec9939
--- /dev/null
+++ b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/RegisterType.java
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+package org.apache.kylin.measure.hllc;
+
+/**
+ * Created by xiefan on 16-12-9.
+ *
+ * Names two register representations, SPARSE and DENSE.
+ * NOTE(review): presumably these correspond to SparseRegister and DenseRegister; the usage
+ * site is not part of this file, so confirm against the counter implementation.
+ */
+public enum RegisterType {
+ SPARSE, DENSE
+}
http://git-wip-us.apache.org/repos/asf/kylin/blob/f05404d5/core-metadata/src/main/java/org/apache/kylin/measure/hllc/SparseRegister.java
----------------------------------------------------------------------
diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/SparseRegister.java b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/SparseRegister.java
new file mode 100644
index 0000000..d241e81
--- /dev/null
+++ b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/SparseRegister.java
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+package org.apache.kylin.measure.hllc;
+
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Map;
+import java.util.TreeMap;
+
+/**
+ * Created by xiefan on 16-12-9.
+ *
+ * {@link Register} implementation that stores only the positions that were explicitly set,
+ * in a sorted map from position to value. Positions that were never set read back as
+ * {@code null}.
+ */
+public class SparseRegister implements Register {
+
+    /** Entry count above which {@link #isOverThreshold()} reports {@code true}. */
+    private int overThreshold;
+
+    /** Stored values keyed by position; iteration is in ascending position order. */
+    private Map<Integer, Byte> sparseRegister = new TreeMap<>();
+
+    /**
+     * @param overThreshold number of entries that {@link #isOverThreshold()} compares against
+     */
+    public SparseRegister(int overThreshold) {
+        this.overThreshold = overThreshold;
+    }
+
+    /**
+     * Copies every stored entry into a newly created {@code DenseRegister}.
+     *
+     * @param p value passed to the {@code DenseRegister} constructor
+     * @return the new dense register holding the same position/value pairs
+     */
+    public DenseRegister toDense(int p) {
+        final DenseRegister dense = new DenseRegister(p);
+        for (Map.Entry<Integer, Byte> stored : sparseRegister.entrySet()) {
+            dense.set(stored.getKey(), stored.getValue());
+        }
+        return dense;
+    }
+
+    /** Stores {@code value} at {@code pos}, replacing any previous value. */
+    @Override
+    public void set(int pos, byte value) {
+        sparseRegister.put(pos, value);
+    }
+
+    /** Returns the value stored at {@code pos}, or {@code null} if none was set. */
+    @Override
+    public Byte get(int pos) {
+        return sparseRegister.get(pos);
+    }
+
+    /**
+     * Merges another sparse register into this one, keeping the larger value for every
+     * position present in both.
+     *
+     * @param another must be a {@code SparseRegister}; this is checked by assertion and then cast
+     */
+    @Override
+    public void merge(Register another) {
+        assert another instanceof SparseRegister;
+        final Map<Integer, Byte> incoming = ((SparseRegister) another).sparseRegister;
+        for (Map.Entry<Integer, Byte> candidate : incoming.entrySet()) {
+            final Integer pos = candidate.getKey();
+            final Byte current = sparseRegister.get(pos);
+            // Skip when this register already holds a value at least as large.
+            if (current != null && candidate.getValue() <= current) {
+                continue;
+            }
+            sparseRegister.put(pos, candidate.getValue());
+        }
+    }
+
+    /** Removes all stored entries. */
+    @Override
+    public void clear() {
+        sparseRegister.clear();
+    }
+
+    /** Returns the number of stored entries. */
+    @Override
+    public int getSize() {
+        return sparseRegister.size();
+    }
+
+    /**
+     * Computes a hash from every stored position and value, visited in ascending
+     * position order.
+     */
+    @Override
+    public int getHashCode() {
+        int hash = 1;
+        for (Map.Entry<Integer, Byte> stored : sparseRegister.entrySet()) {
+            hash = 31 * hash + stored.getKey();
+            hash = 31 * hash + stored.getValue();
+        }
+        return hash;
+    }
+
+    /** Returns {@code true} when the number of stored entries exceeds the configured threshold. */
+    public boolean isOverThreshold() {
+        return this.sparseRegister.size() > overThreshold;
+    }
+
+    /** Returns an unmodifiable collection view of the stored position/value entries. */
+    public Collection<Map.Entry<Integer, Byte>> getAllValue() {
+        return Collections.unmodifiableCollection(sparseRegister.entrySet());
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/kylin/blob/f05404d5/core-metadata/src/test/java/org/apache/kylin/measure/AggregatorMemEstimateTest.java
----------------------------------------------------------------------
diff --git a/core-metadata/src/test/java/org/apache/kylin/measure/AggregatorMemEstimateTest.java b/core-metadata/src/test/java/org/apache/kylin/measure/AggregatorMemEstimateTest.java
index 3adec73..103e721 100644
--- a/core-metadata/src/test/java/org/apache/kylin/measure/AggregatorMemEstimateTest.java
+++ b/core-metadata/src/test/java/org/apache/kylin/measure/AggregatorMemEstimateTest.java
@@ -26,7 +26,7 @@ import org.apache.kylin.measure.bitmap.BitmapAggregator;
import org.apache.kylin.measure.bitmap.BitmapCounter;
import org.apache.kylin.measure.extendedcolumn.ExtendedColumnMeasureType;
import org.apache.kylin.measure.hllc.HLLCAggregator;
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounter;
+import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterNew;
import org.apache.kylin.metadata.datatype.DataType;
import org.apache.kylin.metadata.datatype.DoubleMutable;
import org.apache.kylin.metadata.datatype.LongMutable;
@@ -94,7 +94,7 @@ public class AggregatorMemEstimateTest extends LocalFileMetadataTestCase {
@Test
public void testAggregatorEstimate() {
HLLCAggregator hllcAggregator = new HLLCAggregator(14);
- hllcAggregator.aggregate(new HyperLogLogPlusCounter(14));
+ hllcAggregator.aggregate(new HyperLogLogPlusCounterNew(14));
BitmapAggregator bitmapAggregator = new BitmapAggregator();
BitmapCounter bitmapCounter = new BitmapCounter();
http://git-wip-us.apache.org/repos/asf/kylin/blob/f05404d5/core-metadata/src/test/java/org/apache/kylin/measure/hll/HyperLogLogCounterOldTest.java
----------------------------------------------------------------------
diff --git a/core-metadata/src/test/java/org/apache/kylin/measure/hll/HyperLogLogCounterOldTest.java b/core-metadata/src/test/java/org/apache/kylin/measure/hll/HyperLogLogCounterOldTest.java
new file mode 100644
index 0000000..5d17fea
--- /dev/null
+++ b/core-metadata/src/test/java/org/apache/kylin/measure/hll/HyperLogLogCounterOldTest.java
@@ -0,0 +1,265 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package org.apache.kylin.measure.hll;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.HashSet;
+import java.util.Random;
+import java.util.Set;
+
+import org.apache.kylin.common.util.Bytes;
+import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterOld;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Tests for HyperLogLogPlusCounterOld: count-estimate accuracy, merging, and write/read
+ * round trips of the registers through a ByteBuffer.
+ *
+ * The random generators are seeded with fixed values and shared by the helper methods, so
+ * the generated data depends on the order in which values are drawn from them.
+ *
+ * @author yangli9
+ *
+ */
+public class HyperLogLogCounterOldTest {
+
+ ByteBuffer buf = ByteBuffer.allocate(1024 * 1024); // scratch buffer reused by every serialization check
+ Random rand1 = new Random(1);
+ Random rand2 = new Random(2); // not referenced anywhere in this class
+ Random rand3 = new Random(3);
+ int errorCount1 = 0; // merge rounds whose error exceeded 1x the counter's error rate
+ int errorCount2 = 0; // merge rounds whose error exceeded 2x the counter's error rate
+ int errorCount3 = 0; // merge rounds whose error exceeded 3x the counter's error rate
+
+ @Test
+ public void testOneAdd() throws IOException { // merges 1,000,000 single-value counters into one
+ HyperLogLogPlusCounterOld hllc = new HyperLogLogPlusCounterOld(14);
+ HyperLogLogPlusCounterOld one = new HyperLogLogPlusCounterOld(14);
+ for (int i = 0; i < 1000000; i++) {
+ one.clear();
+ one.add(rand1.nextInt());
+ hllc.merge(one);
+ }
+ assertTrue(hllc.getCountEstimate() > 1000000 * 0.9); // estimate must exceed 90% of the number of values added
+ }
+
+ @Test
+ public void testPeekLength() throws IOException { // after every add: peekLength must match the bytes written, and a read-back copy must equal the source
+ HyperLogLogPlusCounterOld hllc = new HyperLogLogPlusCounterOld(10);
+ HyperLogLogPlusCounterOld copy = new HyperLogLogPlusCounterOld(10);
+ byte[] value = new byte[10];
+ for (int i = 0; i < 200000; i++) {
+ rand1.nextBytes(value);
+ hllc.add(value);
+
+ buf.clear();
+ hllc.writeRegisters(buf);
+
+ int len = buf.position(); // number of bytes writeRegisters produced
+ buf.position(0);
+ assertEquals(len, hllc.peekLength(buf));
+
+ copy.readRegisters(buf);
+ assertEquals(len, buf.position()); // readRegisters must consume exactly what was written
+ assertEquals(hllc, copy);
+ }
+ buf.clear();
+ }
+
+ private Set<String> generateTestData(int n) { // collects the distinct samples from n calls to generateSampleData
+ Set<String> testData = new HashSet<String>();
+ for (int i = 0; i < n; i++) {
+ String[] samples = generateSampleData();
+ for (String sample : samples) {
+ testData.add(sample);
+ }
+ }
+ return testData;
+ }
+
+ // simulate the visit (=visitor+id)
+ private String[] generateSampleData() { // returns 1 to 9 strings sharing one 19-digit header, each followed by "-" and 10 digits
+
+ StringBuilder buf = new StringBuilder();
+ for (int i = 0; i < 19; i++) {
+ buf.append(Math.abs(rand1.nextInt()) % 10); // NOTE(review): Math.abs(Integer.MIN_VALUE) is negative, so this could append a negative number
+ }
+ String header = buf.toString();
+
+ int size = Math.abs(rand3.nextInt()) % 9 + 1;
+ String[] samples = new String[size];
+ for (int k = 0; k < size; k++) {
+ buf = new StringBuilder(header);
+ buf.append("-");
+ for (int i = 0; i < 10; i++) {
+ buf.append(Math.abs(rand3.nextInt()) % 10);
+ }
+ samples[k] = buf.toString();
+ }
+
+ return samples;
+ }
+
+ @Test
+ public void countTest() throws IOException { // runs count() for n = 10, 100, 1000, 10000, 100000
+ int n = 10;
+ for (int i = 0; i < 5; i++) {
+ count(n);
+ n *= 10;
+ }
+ }
+
+ private void count(int n) throws IOException { // adds generated data to a counter and checks the estimate against the true distinct count
+ Set<String> testSet = generateTestData(n);
+
+ HyperLogLogPlusCounterOld hllc = newHLLC();
+ for (String testData : testSet) {
+ hllc.add(Bytes.toBytes(testData));
+ }
+ long estimate = hllc.getCountEstimate();
+ double errorRate = hllc.getErrorRate();
+ double actualError = (double) Math.abs(testSet.size() - estimate) / testSet.size();
+ System.out.println(estimate);
+ System.out.println(testSet.size());
+ System.out.println(errorRate);
+ System.out.println("=" + actualError);
+ Assert.assertTrue(actualError < errorRate * 3.0); // relative error must stay below 3x the counter's reported error rate
+
+ checkSerialize(hllc);
+ }
+
+ private void checkSerialize(HyperLogLogPlusCounterOld hllc) throws IOException { // writes the registers and reads them back into the same counter; the estimate must not change
+ long estimate = hllc.getCountEstimate();
+ buf.clear();
+ hllc.writeRegisters(buf);
+ buf.flip();
+ hllc.readRegisters(buf);
+ Assert.assertEquals(estimate, hllc.getCountEstimate());
+ }
+
+ @Test
+ public void mergeTest() throws IOException { // runs 100 merge rounds and bounds how often the error exceeds 1x, 2x and 3x the error rate
+ double error = 0;
+ int n = 100;
+ for (int i = 0; i < n; i++) {
+ double e = merge(i);
+ error += e;
+ }
+ System.out.println("Total average error is " + error / n);
+
+ System.out.println(" errorRateCount1 is " + errorCount1 + "!");
+ System.out.println(" errorRateCount2 is " + errorCount2 + "!");
+ System.out.println(" errorRateCount3 is " + errorCount3 + "!");
+
+ Assert.assertTrue(errorCount1 <= n * 0.30);
+ Assert.assertTrue(errorCount2 <= n * 0.05);
+ Assert.assertTrue(errorCount3 <= n * 0.02);
+ }
+
+ private double merge(int round) throws IOException { // fills 20 counters, merges their serialized copies, and returns the merged counter's relative error
+ int ln = 20; // number of counters merged per round
+ int dn = 100 * (round + 1); // generateSampleData calls per counter; grows with the round number
+ Set<String> testSet = new HashSet<String>();
+ HyperLogLogPlusCounterOld[] hllcs = new HyperLogLogPlusCounterOld[ln];
+ for (int i = 0; i < ln; i++) {
+ hllcs[i] = newHLLC();
+ for (int k = 0; k < dn; k++) {
+ String[] samples = generateSampleData();
+ for (String data : samples) {
+ testSet.add(data);
+ hllcs[i].add(Bytes.toBytes(data));
+ }
+ }
+ }
+ HyperLogLogPlusCounterOld mergeHllc = newHLLC();
+ for (HyperLogLogPlusCounterOld hllc : hllcs) {
+ mergeHllc.merge(serDes(hllc));
+ }
+
+ double errorRate = mergeHllc.getErrorRate();
+ long estimate = mergeHllc.getCountEstimate();
+ double actualError = Math.abs((double) (testSet.size() - estimate) / testSet.size());
+
+ System.out.println(testSet.size() + "-" + estimate + " ~ " + actualError);
+ Assert.assertTrue(actualError < 0.1); // hard limit of 10% relative error for every round
+
+ if (actualError > errorRate) {
+ errorCount1++;
+ }
+ if (actualError > 2 * errorRate) {
+ errorCount2++;
+ }
+ if (actualError > 3 * errorRate) {
+ errorCount3++;
+ }
+
+ return actualError;
+ }
+
+ private HyperLogLogPlusCounterOld serDes(HyperLogLogPlusCounterOld hllc) throws IOException { // returns a copy rebuilt from the written registers, checking that both estimates agree
+ buf.clear();
+ hllc.writeRegisters(buf);
+ buf.flip();
+ HyperLogLogPlusCounterOld copy = new HyperLogLogPlusCounterOld(hllc.getPrecision());
+ copy.readRegisters(buf);
+ Assert.assertEquals(copy.getCountEstimate(), hllc.getCountEstimate());
+ return copy;
+ }
+
+ @Test
+ public void testPerformance() throws IOException { // times repeated merge plus serialization round trip; makes no assertion on the duration
+ int N = 3; // reduce N HLLC into one
+ int M = 1000; // for M times, use 100000 for real perf test
+
+ HyperLogLogPlusCounterOld samples[] = new HyperLogLogPlusCounterOld[N];
+ for (int i = 0; i < N; i++) {
+ samples[i] = newHLLC();
+ for (String str : generateTestData(10000))
+ samples[i].add(str);
+ }
+
+ System.out.println("Perf test running ... ");
+ long start = System.currentTimeMillis();
+ HyperLogLogPlusCounterOld sum = newHLLC();
+ for (int i = 0; i < M; i++) {
+ sum.clear();
+ for (int j = 0; j < N; j++) {
+ sum.merge(samples[j]);
+ checkSerialize(sum);
+ }
+ }
+ long duration = System.currentTimeMillis() - start;
+ System.out.println("Perf test result: " + duration / 1000 + " seconds"); // integer division: prints whole seconds only
+ }
+
+ @Test
+ public void testEquivalence() { // adding a slice of a larger array should count the same as adding an array holding just those bytes
+ byte[] a = new byte[] { 0, 3, 4, 42, 2, 2 };
+ byte[] b = new byte[] { 3, 4, 42 };
+ HyperLogLogPlusCounterOld ha = new HyperLogLogPlusCounterOld();
+ HyperLogLogPlusCounterOld hb = new HyperLogLogPlusCounterOld();
+ ha.add(a, 1, 3); // presumably (array, offset, length), i.e. bytes {3, 4, 42} - confirm against the add signature
+ hb.add(b);
+
+ Assert.assertTrue(ha.getCountEstimate() == hb.getCountEstimate());
+ }
+
+ private HyperLogLogPlusCounterOld newHLLC() { // counter with precision 16, used by the count, merge and performance tests
+ return new HyperLogLogPlusCounterOld(16);
+ }
+}
[3/5] kylin git commit: KYLIN-1832 code review
Posted by li...@apache.org.
http://git-wip-us.apache.org/repos/asf/kylin/blob/e6e330a8/engine-spark/src/main/java/org/apache/kylin/engine/spark/SparkCubing.java
----------------------------------------------------------------------
diff --git a/engine-spark/src/main/java/org/apache/kylin/engine/spark/SparkCubing.java b/engine-spark/src/main/java/org/apache/kylin/engine/spark/SparkCubing.java
index 76212c8..6e894dd 100644
--- a/engine-spark/src/main/java/org/apache/kylin/engine/spark/SparkCubing.java
+++ b/engine-spark/src/main/java/org/apache/kylin/engine/spark/SparkCubing.java
@@ -83,7 +83,7 @@ import org.apache.kylin.engine.spark.cube.DefaultTupleConverter;
import org.apache.kylin.engine.spark.util.IteratorUtils;
import org.apache.kylin.measure.BufferedMeasureCodec;
import org.apache.kylin.measure.MeasureAggregators;
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterNew;
+import org.apache.kylin.measure.hllc.HLLCounter;
import org.apache.kylin.metadata.model.FunctionDesc;
import org.apache.kylin.metadata.model.IJoinedFlatTableDesc;
import org.apache.kylin.metadata.model.MeasureDesc;
@@ -241,15 +241,15 @@ public class SparkCubing extends AbstractApplication {
}
}
- private Map<Long, HyperLogLogPlusCounterNew> sampling(final JavaRDD<List<String>> rowJavaRDD, final String cubeName, String segmentId) throws Exception {
+ private Map<Long, HLLCounter> sampling(final JavaRDD<List<String>> rowJavaRDD, final String cubeName, String segmentId) throws Exception {
CubeInstance cubeInstance = CubeManager.getInstance(KylinConfig.getInstanceFromEnv()).reloadCubeLocal(cubeName);
CubeSegment cubeSegment = cubeInstance.getSegmentById(segmentId);
CubeDesc cubeDesc = cubeInstance.getDescriptor();
CuboidScheduler cuboidScheduler = new CuboidScheduler(cubeDesc);
List<Long> allCuboidIds = cuboidScheduler.getAllCuboidIds();
- final HashMap<Long, HyperLogLogPlusCounterNew> zeroValue = Maps.newHashMap();
+ final HashMap<Long, HLLCounter> zeroValue = Maps.newHashMap();
for (Long id : allCuboidIds) {
- zeroValue.put(id, new HyperLogLogPlusCounterNew(cubeDesc.getConfig().getCubeStatsHLLPrecision()));
+ zeroValue.put(id, new HLLCounter(cubeDesc.getConfig().getCubeStatsHLLPrecision()));
}
CubeJoinedFlatTableEnrich flatDesc = new CubeJoinedFlatTableEnrich(EngineFactory.getJoinedFlatTableDesc(cubeSegment), cubeDesc);
@@ -278,12 +278,12 @@ public class SparkCubing extends AbstractApplication {
row_hashcodes[i] = new ByteArray();
}
- final HashMap<Long, HyperLogLogPlusCounterNew> samplingResult = rowJavaRDD.aggregate(zeroValue, new Function2<HashMap<Long, HyperLogLogPlusCounterNew>, List<String>, HashMap<Long, HyperLogLogPlusCounterNew>>() {
+ final HashMap<Long, HLLCounter> samplingResult = rowJavaRDD.aggregate(zeroValue, new Function2<HashMap<Long, HLLCounter>, List<String>, HashMap<Long, HLLCounter>>() {
final HashFunction hashFunction = Hashing.murmur3_128();
@Override
- public HashMap<Long, HyperLogLogPlusCounterNew> call(HashMap<Long, HyperLogLogPlusCounterNew> v1, List<String> v2) throws Exception {
+ public HashMap<Long, HLLCounter> call(HashMap<Long, HLLCounter> v1, List<String> v2) throws Exception {
for (int i = 0; i < nRowKey; i++) {
Hasher hc = hashFunction.newHasher();
String colValue = v2.get(rowKeyColumnIndexes[i]);
@@ -296,7 +296,7 @@ public class SparkCubing extends AbstractApplication {
for (Map.Entry<Long, Integer[]> entry : allCuboidsBitSet.entrySet()) {
Hasher hc = hashFunction.newHasher();
- HyperLogLogPlusCounterNew counter = v1.get(entry.getKey());
+ HLLCounter counter = v1.get(entry.getKey());
final Integer[] cuboidBitSet = entry.getValue();
for (int position = 0; position < cuboidBitSet.length; position++) {
hc.putBytes(row_hashcodes[cuboidBitSet[position]].array());
@@ -305,14 +305,14 @@ public class SparkCubing extends AbstractApplication {
}
return v1;
}
- }, new Function2<HashMap<Long, HyperLogLogPlusCounterNew>, HashMap<Long, HyperLogLogPlusCounterNew>, HashMap<Long, HyperLogLogPlusCounterNew>>() {
+ }, new Function2<HashMap<Long, HLLCounter>, HashMap<Long, HLLCounter>, HashMap<Long, HLLCounter>>() {
@Override
- public HashMap<Long, HyperLogLogPlusCounterNew> call(HashMap<Long, HyperLogLogPlusCounterNew> v1, HashMap<Long, HyperLogLogPlusCounterNew> v2) throws Exception {
+ public HashMap<Long, HLLCounter> call(HashMap<Long, HLLCounter> v1, HashMap<Long, HLLCounter> v2) throws Exception {
Preconditions.checkArgument(v1.size() == v2.size());
Preconditions.checkArgument(v1.size() > 0);
- for (Map.Entry<Long, HyperLogLogPlusCounterNew> entry : v1.entrySet()) {
- final HyperLogLogPlusCounterNew counter1 = entry.getValue();
- final HyperLogLogPlusCounterNew counter2 = v2.get(entry.getKey());
+ for (Map.Entry<Long, HLLCounter> entry : v1.entrySet()) {
+ final HLLCounter counter1 = entry.getValue();
+ final HLLCounter counter2 = v2.get(entry.getKey());
counter1.merge(Preconditions.checkNotNull(counter2, "counter cannot be null"));
}
return v1;
@@ -470,7 +470,7 @@ public class SparkCubing extends AbstractApplication {
ClassUtil.addClasspath(confPath);
}
- private byte[][] createHTable(String cubeName, String segmentId, Map<Long, HyperLogLogPlusCounterNew> samplingResult) throws Exception {
+ private byte[][] createHTable(String cubeName, String segmentId, Map<Long, HLLCounter> samplingResult) throws Exception {
final KylinConfig kylinConfig = KylinConfig.getInstanceFromEnv();
final CubeInstance cubeInstance = CubeManager.getInstance(kylinConfig).getCube(cubeName);
final CubeSegment cubeSegment = cubeInstance.getSegmentById(segmentId);
@@ -614,7 +614,7 @@ public class SparkCubing extends AbstractApplication {
}
});
- final Map<Long, HyperLogLogPlusCounterNew> samplingResult = sampling(rowJavaRDD, cubeName, segmentId);
+ final Map<Long, HLLCounter> samplingResult = sampling(rowJavaRDD, cubeName, segmentId);
final byte[][] splitKeys = createHTable(cubeName, segmentId, samplingResult);
final String hfile = build(rowJavaRDD, cubeName, segmentId, splitKeys);
http://git-wip-us.apache.org/repos/asf/kylin/blob/e6e330a8/source-hive/src/main/java/org/apache/kylin/source/hive/cardinality/ColumnCardinalityMapper.java
----------------------------------------------------------------------
diff --git a/source-hive/src/main/java/org/apache/kylin/source/hive/cardinality/ColumnCardinalityMapper.java b/source-hive/src/main/java/org/apache/kylin/source/hive/cardinality/ColumnCardinalityMapper.java
index 230249f..f046f78 100644
--- a/source-hive/src/main/java/org/apache/kylin/source/hive/cardinality/ColumnCardinalityMapper.java
+++ b/source-hive/src/main/java/org/apache/kylin/source/hive/cardinality/ColumnCardinalityMapper.java
@@ -35,7 +35,7 @@ import org.apache.kylin.engine.mr.MRUtil;
import org.apache.kylin.engine.mr.common.AbstractHadoopJob;
import org.apache.kylin.engine.mr.common.BatchConstants;
import org.apache.kylin.measure.BufferedMeasureCodec;
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterNew;
+import org.apache.kylin.measure.hllc.HLLCounter;
import org.apache.kylin.metadata.MetadataManager;
import org.apache.kylin.metadata.model.ColumnDesc;
import org.apache.kylin.metadata.model.TableDesc;
@@ -46,7 +46,7 @@ import org.apache.kylin.metadata.model.TableDesc;
*/
public class ColumnCardinalityMapper<T> extends KylinMapper<T, Object, IntWritable, BytesWritable> {
- private Map<Integer, HyperLogLogPlusCounterNew> hllcMap = new HashMap<Integer, HyperLogLogPlusCounterNew>();
+ private Map<Integer, HLLCounter> hllcMap = new HashMap<Integer, HLLCounter>();
public static final String DEFAULT_DELIM = ",";
private int counter = 0;
@@ -87,9 +87,9 @@ public class ColumnCardinalityMapper<T> extends KylinMapper<T, Object, IntWritab
counter++;
}
- private HyperLogLogPlusCounterNew getHllc(Integer key) {
+ private HLLCounter getHllc(Integer key) {
if (!hllcMap.containsKey(key)) {
- hllcMap.put(key, new HyperLogLogPlusCounterNew());
+ hllcMap.put(key, new HLLCounter());
}
return hllcMap.get(key);
}
@@ -100,7 +100,7 @@ public class ColumnCardinalityMapper<T> extends KylinMapper<T, Object, IntWritab
ByteBuffer buf = ByteBuffer.allocate(BufferedMeasureCodec.DEFAULT_BUFFER_SIZE);
while (it.hasNext()) {
int key = it.next();
- HyperLogLogPlusCounterNew hllc = hllcMap.get(key);
+ HLLCounter hllc = hllcMap.get(key);
buf.clear();
hllc.writeRegisters(buf);
buf.flip();
http://git-wip-us.apache.org/repos/asf/kylin/blob/e6e330a8/source-hive/src/main/java/org/apache/kylin/source/hive/cardinality/ColumnCardinalityReducer.java
----------------------------------------------------------------------
diff --git a/source-hive/src/main/java/org/apache/kylin/source/hive/cardinality/ColumnCardinalityReducer.java b/source-hive/src/main/java/org/apache/kylin/source/hive/cardinality/ColumnCardinalityReducer.java
index 32cc6d9..0648960 100644
--- a/source-hive/src/main/java/org/apache/kylin/source/hive/cardinality/ColumnCardinalityReducer.java
+++ b/source-hive/src/main/java/org/apache/kylin/source/hive/cardinality/ColumnCardinalityReducer.java
@@ -32,7 +32,7 @@ import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.kylin.engine.mr.KylinReducer;
import org.apache.kylin.measure.BufferedMeasureCodec;
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterNew;
+import org.apache.kylin.measure.hllc.HLLCounter;
/**
* @author Jack
@@ -41,7 +41,7 @@ import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterNew;
public class ColumnCardinalityReducer extends KylinReducer<IntWritable, BytesWritable, IntWritable, LongWritable> {
public static final int ONE = 1;
- private Map<Integer, HyperLogLogPlusCounterNew> hllcMap = new HashMap<Integer, HyperLogLogPlusCounterNew>();
+ private Map<Integer, HLLCounter> hllcMap = new HashMap<Integer, HLLCounter>();
@Override
protected void setup(Context context) throws IOException {
@@ -53,16 +53,16 @@ public class ColumnCardinalityReducer extends KylinReducer<IntWritable, BytesWri
int skey = key.get();
for (BytesWritable v : values) {
ByteBuffer buffer = ByteBuffer.wrap(v.getBytes());
- HyperLogLogPlusCounterNew hll = new HyperLogLogPlusCounterNew();
+ HLLCounter hll = new HLLCounter();
hll.readRegisters(buffer);
getHllc(skey).merge(hll);
hll.clear();
}
}
- private HyperLogLogPlusCounterNew getHllc(Integer key) {
+ private HLLCounter getHllc(Integer key) {
if (!hllcMap.containsKey(key)) {
- hllcMap.put(key, new HyperLogLogPlusCounterNew());
+ hllcMap.put(key, new HLLCounter());
}
return hllcMap.get(key);
}
@@ -78,7 +78,7 @@ public class ColumnCardinalityReducer extends KylinReducer<IntWritable, BytesWri
it = keys.iterator();
while (it.hasNext()) {
int key = it.next();
- HyperLogLogPlusCounterNew hllc = hllcMap.get(key);
+ HLLCounter hllc = hllcMap.get(key);
ByteBuffer buf = ByteBuffer.allocate(BufferedMeasureCodec.DEFAULT_BUFFER_SIZE);
buf.clear();
hllc.writeRegisters(buf);
http://git-wip-us.apache.org/repos/asf/kylin/blob/e6e330a8/source-hive/src/test/java/org/apache/kylin/source/hive/cardinality/ColumnCardinalityReducerTest.java
----------------------------------------------------------------------
diff --git a/source-hive/src/test/java/org/apache/kylin/source/hive/cardinality/ColumnCardinalityReducerTest.java b/source-hive/src/test/java/org/apache/kylin/source/hive/cardinality/ColumnCardinalityReducerTest.java
index 410543a..c32e76d 100644
--- a/source-hive/src/test/java/org/apache/kylin/source/hive/cardinality/ColumnCardinalityReducerTest.java
+++ b/source-hive/src/test/java/org/apache/kylin/source/hive/cardinality/ColumnCardinalityReducerTest.java
@@ -35,7 +35,7 @@ import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
import org.apache.hadoop.mrunit.types.Pair;
import org.apache.kylin.common.util.Bytes;
import org.apache.kylin.measure.BufferedMeasureCodec;
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterNew;
+import org.apache.kylin.measure.hllc.HLLCounter;
import org.junit.Before;
import org.junit.Test;
@@ -57,7 +57,7 @@ public class ColumnCardinalityReducerTest {
}
private byte[] getBytes(String str) throws IOException {
- HyperLogLogPlusCounterNew hllc = new HyperLogLogPlusCounterNew();
+ HLLCounter hllc = new HLLCounter();
StringTokenizer tokenizer = new StringTokenizer(str, ColumnCardinalityMapper.DEFAULT_DELIM);
int i = 0;
while (tokenizer.hasMoreTokens()) {
[5/5] kylin git commit: KYLIN-1832 code review
Posted by li...@apache.org.
KYLIN-1832 code review
Project: http://git-wip-us.apache.org/repos/asf/kylin/repo
Commit: http://git-wip-us.apache.org/repos/asf/kylin/commit/e6e330a8
Tree: http://git-wip-us.apache.org/repos/asf/kylin/tree/e6e330a8
Diff: http://git-wip-us.apache.org/repos/asf/kylin/diff/e6e330a8
Branch: refs/heads/master
Commit: e6e330a8bd47f1d2dd5fd6f68b510c3cf0be0287
Parents: f05404d
Author: Li Yang <li...@apache.org>
Authored: Wed Dec 14 15:29:56 2016 +0800
Committer: Li Yang <li...@apache.org>
Committed: Wed Dec 14 15:29:56 2016 +0800
----------------------------------------------------------------------
.../org/apache/kylin/cube/util/CubingUtils.java | 12 +-
.../apache/kylin/gridtable/UnitTestSupport.java | 22 +-
.../benchmark/GTScannerBenchmark2.java | 4 +-
.../gridtable/AggregationCacheMemSizeTest.java | 4 +-
.../metadata/measure/MeasureCodecTest.java | 4 +-
.../org/apache/kylin/measure/MeasureType.java | 2 +-
.../kylin/measure/MeasureTypeFactory.java | 2 +-
.../kylin/measure/hllc/DenseRegister.java | 26 +-
.../kylin/measure/hllc/HLLCAggregator.java | 10 +-
.../kylin/measure/hllc/HLLCMeasureType.java | 20 +-
.../kylin/measure/hllc/HLLCSerializer.java | 16 +-
.../apache/kylin/measure/hllc/HLLCounter.java | 377 ++++++++++++++++++
.../kylin/measure/hllc/HLLCounterOld.java | 393 +++++++++++++++++++
.../measure/hllc/HLLDistinctCountAggFunc.java | 22 +-
.../measure/hllc/HyperLogLogPlusCounterNew.java | 388 ------------------
.../measure/hllc/HyperLogLogPlusCounterOld.java | 392 ------------------
.../org/apache/kylin/measure/hllc/Register.java | 4 +-
.../kylin/measure/hllc/SparseRegister.java | 38 +-
.../measure/AggregatorMemEstimateTest.java | 4 +-
.../measure/hll/HyperLogLogCounterOldTest.java | 265 -------------
.../measure/hll2/HyperLogLogCounterNewTest.java | 301 --------------
.../hll2/NewHyperLogLogBenchmarkTest.java | 288 --------------
.../kylin/measure/hllc/HLLCounterOldTest.java | 266 +++++++++++++
.../kylin/measure/hllc/HLLCounterTest.java | 316 +++++++++++++++
.../hllc/NewHyperLogLogBenchmarkTest.java | 291 ++++++++++++++
.../kylin/engine/mr/common/CubeStatsReader.java | 12 +-
.../kylin/engine/mr/common/CubeStatsWriter.java | 6 +-
.../mr/steps/FactDistinctColumnsReducer.java | 8 +-
.../mr/steps/FactDistinctHiveColumnsMapper.java | 10 +-
.../engine/mr/steps/MergeStatisticsStep.java | 6 +-
.../kylin/engine/mr/steps/CubeSamplingTest.java | 8 +-
.../steps/FactDistinctColumnsReducerTest.java | 4 +-
.../apache/kylin/engine/spark/SparkCubing.java | 28 +-
.../cardinality/ColumnCardinalityMapper.java | 10 +-
.../cardinality/ColumnCardinalityReducer.java | 12 +-
.../ColumnCardinalityReducerTest.java | 4 +-
36 files changed, 1802 insertions(+), 1773 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/kylin/blob/e6e330a8/core-cube/src/main/java/org/apache/kylin/cube/util/CubingUtils.java
----------------------------------------------------------------------
diff --git a/core-cube/src/main/java/org/apache/kylin/cube/util/CubingUtils.java b/core-cube/src/main/java/org/apache/kylin/cube/util/CubingUtils.java
index 35139a4..5e63f94 100644
--- a/core-cube/src/main/java/org/apache/kylin/cube/util/CubingUtils.java
+++ b/core-cube/src/main/java/org/apache/kylin/cube/util/CubingUtils.java
@@ -38,7 +38,7 @@ import org.apache.kylin.dict.DictionaryGenerator;
import org.apache.kylin.dict.DictionaryInfo;
import org.apache.kylin.dict.DictionaryManager;
import org.apache.kylin.dict.IterableDictionaryValueEnumerator;
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterNew;
+import org.apache.kylin.measure.hllc.HLLCounter;
import org.apache.kylin.metadata.model.IJoinedFlatTableDesc;
import org.apache.kylin.metadata.model.TblColRef;
import org.apache.kylin.source.ReadableTable;
@@ -59,7 +59,7 @@ public class CubingUtils {
private static Logger logger = LoggerFactory.getLogger(CubingUtils.class);
- public static Map<Long, HyperLogLogPlusCounterNew> sampling(CubeDesc cubeDesc, IJoinedFlatTableDesc flatDescIn, Iterable<List<String>> streams) {
+ public static Map<Long, HLLCounter> sampling(CubeDesc cubeDesc, IJoinedFlatTableDesc flatDescIn, Iterable<List<String>> streams) {
final CubeJoinedFlatTableEnrich flatDesc = new CubeJoinedFlatTableEnrich(flatDescIn, cubeDesc);
final int rowkeyLength = cubeDesc.getRowkey().getRowKeyColumns().length;
final List<Long> allCuboidIds = new CuboidScheduler(cubeDesc).getAllCuboidIds();
@@ -84,9 +84,9 @@ public class CubingUtils {
return result;
}
});
- final Map<Long, HyperLogLogPlusCounterNew> result = Maps.newHashMapWithExpectedSize(allCuboidIds.size());
+ final Map<Long, HLLCounter> result = Maps.newHashMapWithExpectedSize(allCuboidIds.size());
for (Long cuboidId : allCuboidIds) {
- result.put(cuboidId, new HyperLogLogPlusCounterNew(cubeDesc.getConfig().getCubeStatsHLLPrecision()));
+ result.put(cuboidId, new HLLCounter(cubeDesc.getConfig().getCubeStatsHLLPrecision()));
Integer[] cuboidBitSet = new Integer[Long.bitCount(cuboidId)];
long mask = Long.highestOneBit(baseCuboidId);
@@ -118,9 +118,9 @@ public class CubingUtils {
}
}
- for (Map.Entry<Long, HyperLogLogPlusCounterNew> longHyperLogLogPlusCounterNewEntry : result.entrySet()) {
+ for (Map.Entry<Long, HLLCounter> longHyperLogLogPlusCounterNewEntry : result.entrySet()) {
Long cuboidId = longHyperLogLogPlusCounterNewEntry.getKey();
- HyperLogLogPlusCounterNew counter = longHyperLogLogPlusCounterNewEntry.getValue();
+ HLLCounter counter = longHyperLogLogPlusCounterNewEntry.getValue();
Hasher hc = hf.newHasher();
final Integer[] cuboidBitSet = allCuboidsBitSet.get(cuboidId);
for (int position = 0; position < cuboidBitSet.length; position++) {
http://git-wip-us.apache.org/repos/asf/kylin/blob/e6e330a8/core-cube/src/main/java/org/apache/kylin/gridtable/UnitTestSupport.java
----------------------------------------------------------------------
diff --git a/core-cube/src/main/java/org/apache/kylin/gridtable/UnitTestSupport.java b/core-cube/src/main/java/org/apache/kylin/gridtable/UnitTestSupport.java
index 6cbf237..b8d116c 100644
--- a/core-cube/src/main/java/org/apache/kylin/gridtable/UnitTestSupport.java
+++ b/core-cube/src/main/java/org/apache/kylin/gridtable/UnitTestSupport.java
@@ -26,7 +26,7 @@ import java.util.List;
import org.apache.kylin.common.util.DateFormat;
import org.apache.kylin.common.util.ImmutableBitSet;
import org.apache.kylin.gridtable.GTInfo.Builder;
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterNew;
+import org.apache.kylin.measure.hllc.HLLCounter;
import org.apache.kylin.metadata.datatype.DataType;
import org.apache.kylin.metadata.datatype.LongMutable;
@@ -106,16 +106,16 @@ public class UnitTestSupport {
String d_01_15 = datePlus("2015-01-15", i * 4);
String d_01_16 = datePlus("2015-01-16", i * 4);
String d_01_17 = datePlus("2015-01-17", i * 4);
- result.add(newRec(info, d_01_14, "Yang", "Food", new LongMutable(10), new BigDecimal("10.5"), new HyperLogLogPlusCounterNew(14)));
- result.add(newRec(info, d_01_14, "Luke", "Food", new LongMutable(10), new BigDecimal("10.5"), new HyperLogLogPlusCounterNew(14)));
- result.add(newRec(info, d_01_15, "Xu", "Food", new LongMutable(10), new BigDecimal("10.5"), new HyperLogLogPlusCounterNew(14)));
- result.add(newRec(info, d_01_15, "Dong", "Food", new LongMutable(10), new BigDecimal("10.5"), new HyperLogLogPlusCounterNew(14)));
- result.add(newRec(info, d_01_15, "Jason", "Food", new LongMutable(10), new BigDecimal("10.5"), new HyperLogLogPlusCounterNew(14)));
- result.add(newRec(info, d_01_16, "Mahone", "Food", new LongMutable(10), new BigDecimal("10.5"), new HyperLogLogPlusCounterNew(14)));
- result.add(newRec(info, d_01_16, "Shaofeng", "Food", new LongMutable(10), new BigDecimal("10.5"), new HyperLogLogPlusCounterNew(14)));
- result.add(newRec(info, d_01_16, "Qianhao", "Food", new LongMutable(10), new BigDecimal("10.5"), new HyperLogLogPlusCounterNew(14)));
- result.add(newRec(info, d_01_16, "George", "Food", new LongMutable(10), new BigDecimal("10.5"), new HyperLogLogPlusCounterNew(14)));
- result.add(newRec(info, d_01_17, "Kejia", "Food", new LongMutable(10), new BigDecimal("10.5"), new HyperLogLogPlusCounterNew(14)));
+ result.add(newRec(info, d_01_14, "Yang", "Food", new LongMutable(10), new BigDecimal("10.5"), new HLLCounter(14)));
+ result.add(newRec(info, d_01_14, "Luke", "Food", new LongMutable(10), new BigDecimal("10.5"), new HLLCounter(14)));
+ result.add(newRec(info, d_01_15, "Xu", "Food", new LongMutable(10), new BigDecimal("10.5"), new HLLCounter(14)));
+ result.add(newRec(info, d_01_15, "Dong", "Food", new LongMutable(10), new BigDecimal("10.5"), new HLLCounter(14)));
+ result.add(newRec(info, d_01_15, "Jason", "Food", new LongMutable(10), new BigDecimal("10.5"), new HLLCounter(14)));
+ result.add(newRec(info, d_01_16, "Mahone", "Food", new LongMutable(10), new BigDecimal("10.5"), new HLLCounter(14)));
+ result.add(newRec(info, d_01_16, "Shaofeng", "Food", new LongMutable(10), new BigDecimal("10.5"), new HLLCounter(14)));
+ result.add(newRec(info, d_01_16, "Qianhao", "Food", new LongMutable(10), new BigDecimal("10.5"), new HLLCounter(14)));
+ result.add(newRec(info, d_01_16, "George", "Food", new LongMutable(10), new BigDecimal("10.5"), new HLLCounter(14)));
+ result.add(newRec(info, d_01_17, "Kejia", "Food", new LongMutable(10), new BigDecimal("10.5"), new HLLCounter(14)));
}
return result;
}
http://git-wip-us.apache.org/repos/asf/kylin/blob/e6e330a8/core-cube/src/main/java/org/apache/kylin/gridtable/benchmark/GTScannerBenchmark2.java
----------------------------------------------------------------------
diff --git a/core-cube/src/main/java/org/apache/kylin/gridtable/benchmark/GTScannerBenchmark2.java b/core-cube/src/main/java/org/apache/kylin/gridtable/benchmark/GTScannerBenchmark2.java
index f80bd24..85d8c37 100644
--- a/core-cube/src/main/java/org/apache/kylin/gridtable/benchmark/GTScannerBenchmark2.java
+++ b/core-cube/src/main/java/org/apache/kylin/gridtable/benchmark/GTScannerBenchmark2.java
@@ -34,7 +34,7 @@ import org.apache.kylin.gridtable.GTScanRequest;
import org.apache.kylin.gridtable.GTScanRequestBuilder;
import org.apache.kylin.gridtable.IGTScanner;
import org.apache.kylin.gridtable.benchmark.SortedGTRecordGenerator.Randomizer;
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterNew;
+import org.apache.kylin.measure.hllc.HLLCounter;
import org.apache.kylin.metadata.datatype.DataType;
import org.apache.kylin.metadata.filter.ColumnTupleFilter;
import org.apache.kylin.metadata.filter.CompareTupleFilter;
@@ -80,7 +80,7 @@ public class GTScannerBenchmark2 {
gen.addDimension(100, 4, null);
gen.addMeasure(8);
gen.addMeasure(8, new Randomizer() {
- HyperLogLogPlusCounterNew hllc = new HyperLogLogPlusCounterNew(12);
+ HLLCounter hllc = new HLLCounter(12);
@Override
public int fillRandom(Random rand, byte[] array, int offset) {
http://git-wip-us.apache.org/repos/asf/kylin/blob/e6e330a8/core-cube/src/test/java/org/apache/kylin/gridtable/AggregationCacheMemSizeTest.java
----------------------------------------------------------------------
diff --git a/core-cube/src/test/java/org/apache/kylin/gridtable/AggregationCacheMemSizeTest.java b/core-cube/src/test/java/org/apache/kylin/gridtable/AggregationCacheMemSizeTest.java
index 66a6b51..8ffe055 100644
--- a/core-cube/src/test/java/org/apache/kylin/gridtable/AggregationCacheMemSizeTest.java
+++ b/core-cube/src/test/java/org/apache/kylin/gridtable/AggregationCacheMemSizeTest.java
@@ -26,7 +26,7 @@ import org.apache.kylin.measure.basic.LongSumAggregator;
import org.apache.kylin.measure.bitmap.BitmapAggregator;
import org.apache.kylin.measure.bitmap.BitmapCounter;
import org.apache.kylin.measure.hllc.HLLCAggregator;
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterNew;
+import org.apache.kylin.measure.hllc.HLLCounter;
import org.apache.kylin.metadata.datatype.DoubleMutable;
import org.apache.kylin.metadata.datatype.LongMutable;
import org.github.jamm.MemoryMeter;
@@ -105,7 +105,7 @@ public class AggregationCacheMemSizeTest {
private HLLCAggregator createHLLCAggr() {
HLLCAggregator hllcAggregator = new HLLCAggregator(14);
- hllcAggregator.aggregate(new HyperLogLogPlusCounterNew(14));
+ hllcAggregator.aggregate(new HLLCounter(14));
return hllcAggregator;
}
http://git-wip-us.apache.org/repos/asf/kylin/blob/e6e330a8/core-cube/src/test/java/org/apache/kylin/metadata/measure/MeasureCodecTest.java
----------------------------------------------------------------------
diff --git a/core-cube/src/test/java/org/apache/kylin/metadata/measure/MeasureCodecTest.java b/core-cube/src/test/java/org/apache/kylin/metadata/measure/MeasureCodecTest.java
index cd1aa96..0f3f3a9 100644
--- a/core-cube/src/test/java/org/apache/kylin/metadata/measure/MeasureCodecTest.java
+++ b/core-cube/src/test/java/org/apache/kylin/metadata/measure/MeasureCodecTest.java
@@ -26,7 +26,7 @@ import java.nio.ByteBuffer;
import org.apache.kylin.common.util.LocalFileMetadataTestCase;
import org.apache.kylin.measure.BufferedMeasureCodec;
import org.apache.kylin.measure.bitmap.BitmapCounter;
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterNew;
+import org.apache.kylin.measure.hllc.HLLCounter;
import org.apache.kylin.metadata.datatype.DoubleMutable;
import org.apache.kylin.metadata.datatype.LongMutable;
import org.apache.kylin.metadata.model.FunctionDesc;
@@ -57,7 +57,7 @@ public class MeasureCodecTest extends LocalFileMetadataTestCase {
DoubleMutable d = new DoubleMutable(1.0);
LongMutable l = new LongMutable(2);
BigDecimal b = new BigDecimal("333.1234");
- HyperLogLogPlusCounterNew hllc = new HyperLogLogPlusCounterNew(16);
+ HLLCounter hllc = new HLLCounter(16);
hllc.add("1234567");
hllc.add("abcdefg");
BitmapCounter bitmap = new BitmapCounter();
http://git-wip-us.apache.org/repos/asf/kylin/blob/e6e330a8/core-metadata/src/main/java/org/apache/kylin/measure/MeasureType.java
----------------------------------------------------------------------
diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/MeasureType.java b/core-metadata/src/main/java/org/apache/kylin/measure/MeasureType.java
index 031636e..89ff382 100644
--- a/core-metadata/src/main/java/org/apache/kylin/measure/MeasureType.java
+++ b/core-metadata/src/main/java/org/apache/kylin/measure/MeasureType.java
@@ -36,7 +36,7 @@ import org.apache.kylin.metadata.tuple.TupleInfo;
* MeasureType captures how a kind of aggregation is defined, how it is calculated
* during cube build, and how it is involved in query and storage scan.
*
- * @param <T> the Java type of aggregation data object, e.g. HyperLogLogPlusCounterOld
+ * @param <T> the Java type of aggregation data object, e.g. HLLCounter
*/
abstract public class MeasureType<T> {
http://git-wip-us.apache.org/repos/asf/kylin/blob/e6e330a8/core-metadata/src/main/java/org/apache/kylin/measure/MeasureTypeFactory.java
----------------------------------------------------------------------
diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/MeasureTypeFactory.java b/core-metadata/src/main/java/org/apache/kylin/measure/MeasureTypeFactory.java
index d94dec9..694459b 100644
--- a/core-metadata/src/main/java/org/apache/kylin/measure/MeasureTypeFactory.java
+++ b/core-metadata/src/main/java/org/apache/kylin/measure/MeasureTypeFactory.java
@@ -62,7 +62,7 @@ import com.google.common.collect.Maps;
}
</pre>
*
- * @param <T> the Java type of aggregation data object, e.g. HyperLogLogPlusCounterOld
+ * @param <T> the Java type of aggregation data object, e.g. HLLCounter
*/
abstract public class MeasureTypeFactory<T> {
http://git-wip-us.apache.org/repos/asf/kylin/blob/e6e330a8/core-metadata/src/main/java/org/apache/kylin/measure/hllc/DenseRegister.java
----------------------------------------------------------------------
diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/DenseRegister.java b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/DenseRegister.java
index 26ee6ab..c5814aa 100644
--- a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/DenseRegister.java
+++ b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/DenseRegister.java
@@ -25,7 +25,6 @@ import java.util.Map;
* Created by xiefan on 16-12-9.
*/
public class DenseRegister implements Register {
- private int p;
private int m;
@@ -41,7 +40,7 @@ public class DenseRegister implements Register {
}
@Override
- public Byte get(int pos) {
+ public byte get(int pos) {
return register[pos];
}
@@ -80,11 +79,28 @@ public class DenseRegister implements Register {
}
@Override
- public int getHashCode() {
- return Arrays.hashCode(register);
+ public int hashCode() {
+ final int prime = 31;
+ int result = 1;
+ result = prime * result + Arrays.hashCode(register);
+ return result;
}
- public byte[] getRawRegister() {
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj)
+ return true;
+ if (obj == null)
+ return false;
+ if (getClass() != obj.getClass())
+ return false;
+ DenseRegister other = (DenseRegister) obj;
+ if (!Arrays.equals(register, other.register))
+ return false;
+ return true;
+ }
+
+ byte[] getRawRegister() {
return this.register;
}
http://git-wip-us.apache.org/repos/asf/kylin/blob/e6e330a8/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCAggregator.java
----------------------------------------------------------------------
diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCAggregator.java b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCAggregator.java
index ca73285..5966c04 100644
--- a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCAggregator.java
+++ b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCAggregator.java
@@ -23,10 +23,10 @@ import org.apache.kylin.measure.MeasureAggregator;
/**
*/
@SuppressWarnings("serial")
-public class HLLCAggregator extends MeasureAggregator<HyperLogLogPlusCounterNew> {
+public class HLLCAggregator extends MeasureAggregator<HLLCounter> {
final int precision;
- HyperLogLogPlusCounterNew sum = null;
+ HLLCounter sum = null;
public HLLCAggregator(int precision) {
this.precision = precision;
@@ -38,15 +38,15 @@ public class HLLCAggregator extends MeasureAggregator<HyperLogLogPlusCounterNew>
}
@Override
- public void aggregate(HyperLogLogPlusCounterNew value) {
+ public void aggregate(HLLCounter value) {
if (sum == null)
- sum = new HyperLogLogPlusCounterNew(value);
+ sum = new HLLCounter(value);
else
sum.merge(value);
}
@Override
- public HyperLogLogPlusCounterNew getState() {
+ public HLLCounter getState() {
return sum;
}
http://git-wip-us.apache.org/repos/asf/kylin/blob/e6e330a8/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCMeasureType.java
----------------------------------------------------------------------
diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCMeasureType.java b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCMeasureType.java
index 481fa4e..9601653 100644
--- a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCMeasureType.java
+++ b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCMeasureType.java
@@ -33,15 +33,15 @@ import org.apache.kylin.metadata.model.TblColRef;
import com.google.common.collect.ImmutableMap;
-public class HLLCMeasureType extends MeasureType<HyperLogLogPlusCounterNew> {
+public class HLLCMeasureType extends MeasureType<HLLCounter> {
public static final String FUNC_COUNT_DISTINCT = FunctionDesc.FUNC_COUNT_DISTINCT;
public static final String DATATYPE_HLLC = "hllc";
- public static class Factory extends MeasureTypeFactory<HyperLogLogPlusCounterNew> {
+ public static class Factory extends MeasureTypeFactory<HLLCounter> {
@Override
- public MeasureType<HyperLogLogPlusCounterNew> createMeasureType(String funcName, DataType dataType) {
+ public MeasureType<HLLCounter> createMeasureType(String funcName, DataType dataType) {
return new HLLCMeasureType(funcName, dataType);
}
@@ -56,7 +56,7 @@ public class HLLCMeasureType extends MeasureType<HyperLogLogPlusCounterNew> {
}
@Override
- public Class<? extends DataTypeSerializer<HyperLogLogPlusCounterNew>> getAggrDataTypeSerializer() {
+ public Class<? extends DataTypeSerializer<HLLCounter>> getAggrDataTypeSerializer() {
return HLLCSerializer.class;
}
}
@@ -91,13 +91,13 @@ public class HLLCMeasureType extends MeasureType<HyperLogLogPlusCounterNew> {
}
@Override
- public MeasureIngester<HyperLogLogPlusCounterNew> newIngester() {
- return new MeasureIngester<HyperLogLogPlusCounterNew>() {
- HyperLogLogPlusCounterNew current = new HyperLogLogPlusCounterNew(dataType.getPrecision());
+ public MeasureIngester<HLLCounter> newIngester() {
+ return new MeasureIngester<HLLCounter>() {
+ HLLCounter current = new HLLCounter(dataType.getPrecision());
@Override
- public HyperLogLogPlusCounterNew valueOf(String[] values, MeasureDesc measureDesc, Map<TblColRef, Dictionary<String>> dictionaryMap) {
- HyperLogLogPlusCounterNew hllc = current;
+ public HLLCounter valueOf(String[] values, MeasureDesc measureDesc, Map<TblColRef, Dictionary<String>> dictionaryMap) {
+ HLLCounter hllc = current;
hllc.clear();
for (String v : values) {
if (v != null)
@@ -109,7 +109,7 @@ public class HLLCMeasureType extends MeasureType<HyperLogLogPlusCounterNew> {
}
@Override
- public MeasureAggregator<HyperLogLogPlusCounterNew> newAggregator() {
+ public MeasureAggregator<HLLCounter> newAggregator() {
return new HLLCAggregator(dataType.getPrecision());
}
http://git-wip-us.apache.org/repos/asf/kylin/blob/e6e330a8/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCSerializer.java
----------------------------------------------------------------------
diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCSerializer.java b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCSerializer.java
index 1d01abc..e0992c7 100644
--- a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCSerializer.java
+++ b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCSerializer.java
@@ -28,10 +28,10 @@ import org.apache.kylin.metadata.datatype.DataTypeSerializer;
* @author yangli9
*
*/
-public class HLLCSerializer extends DataTypeSerializer<HyperLogLogPlusCounterNew> {
+public class HLLCSerializer extends DataTypeSerializer<HLLCounter> {
// be thread-safe and avoid repeated obj creation
- private ThreadLocal<HyperLogLogPlusCounterNew> current = new ThreadLocal<HyperLogLogPlusCounterNew>();
+ private ThreadLocal<HLLCounter> current = new ThreadLocal<HLLCounter>();
private int precision;
@@ -40,7 +40,7 @@ public class HLLCSerializer extends DataTypeSerializer<HyperLogLogPlusCounterNew
}
@Override
- public void serialize(HyperLogLogPlusCounterNew value, ByteBuffer out) {
+ public void serialize(HLLCounter value, ByteBuffer out) {
try {
value.writeRegisters(out);
} catch (IOException e) {
@@ -48,18 +48,18 @@ public class HLLCSerializer extends DataTypeSerializer<HyperLogLogPlusCounterNew
}
}
- private HyperLogLogPlusCounterNew current() {
- HyperLogLogPlusCounterNew hllc = current.get();
+ private HLLCounter current() {
+ HLLCounter hllc = current.get();
if (hllc == null) {
- hllc = new HyperLogLogPlusCounterNew(precision);
+ hllc = new HLLCounter(precision);
current.set(hllc);
}
return hllc;
}
@Override
- public HyperLogLogPlusCounterNew deserialize(ByteBuffer in) {
- HyperLogLogPlusCounterNew hllc = current();
+ public HLLCounter deserialize(ByteBuffer in) {
+ HLLCounter hllc = current();
try {
hllc.readRegisters(in);
} catch (IOException e) {
http://git-wip-us.apache.org/repos/asf/kylin/blob/e6e330a8/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCounter.java
----------------------------------------------------------------------
diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCounter.java b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCounter.java
new file mode 100644
index 0000000..22b5e55
--- /dev/null
+++ b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCounter.java
@@ -0,0 +1,377 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package org.apache.kylin.measure.hllc;
+
+import com.google.common.hash.HashFunction;
+import com.google.common.hash.Hashing;
+import org.apache.kylin.common.util.BytesUtil;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
+import java.util.Collection;
+import java.util.Map;
+
+/**
+ * HyperLogLog counter with {@code m = 2^p} one-byte registers.
+ *
+ * All public constructors start with a {@link SparseRegister}; it is replaced by a
+ * {@link DenseRegister} as soon as the number of stored entries exceeds
+ * {@code OVERFLOW_FACTOR * m}. Values are hashed with the configured
+ * {@link HashFunction} (murmur3_128 by default) before being added.
+ */
+@SuppressWarnings("serial")
+public class HLLCounter implements Serializable, Comparable<HLLCounter> {
+
+    // fraction of m above which a sparse register is turned into a dense one;
+    // not final for test purpose
+    static double OVERFLOW_FACTOR = 0.01;
+
+    // precision, i.e. number of hash bits used to pick a register
+    private int p;
+
+    // number of registers, always 1 << p
+    private int m;
+
+    private HashFunction hashFunc = Hashing.murmur3_128();
+
+    private Register register;
+
+    /** Creates a counter of precision 10 using murmur3_128. */
+    public HLLCounter() {
+        this(10, RegisterType.SPARSE, Hashing.murmur3_128());
+    }
+
+    /** Creates a counter of precision {@code p} using murmur3_128. */
+    public HLLCounter(int p) {
+        this(p, RegisterType.SPARSE, Hashing.murmur3_128());
+    }
+
+    /** Creates a counter of precision {@code p} using the given hash function. */
+    public HLLCounter(int p, HashFunction hashFunc) {
+        this(p, RegisterType.SPARSE, hashFunc);
+    }
+
+    /** Copy constructor: a fresh counter with the same precision and hash function, merged with {@code another}. */
+    public HLLCounter(HLLCounter another) {
+        this(another.p, another.hashFunc);
+        merge(another);
+    }
+
+    HLLCounter(int p, RegisterType type) {
+        this(p, type, Hashing.murmur3_128());
+    }
+
+    HLLCounter(int p, RegisterType type, HashFunction hashFunc) {
+        this.p = p;
+        this.m = 1 << p;//(int) Math.pow(2, p);
+        this.hashFunc = hashFunc;
+        if (type == RegisterType.SPARSE) {
+            this.register = new SparseRegister();
+        } else {
+            this.register = new DenseRegister(p);
+        }
+    }
+
+    /** Tells whether a register holding {@code size} entries should use the dense layout. */
+    private boolean isDense(int size) {
+        double over = OVERFLOW_FACTOR * m;
+        return size > (int) over;
+    }
+
+    public void add(int value) {
+        add(hashFunc.hashInt(value).asLong());
+    }
+
+    public void add(String value) {
+        // NOTE(review): the platform default charset makes the hash of non-ASCII strings
+        // depend on the JVM configuration; kept as is because switching charset would
+        // change the hashes of values added so far -- confirm before changing.
+        add(hashFunc.hashString(value, Charset.defaultCharset()).asLong());
+    }
+
+    public void add(byte[] value) {
+        add(hashFunc.hashBytes(value).asLong());
+    }
+
+    public void add(byte[] value, int offset, int length) {
+        add(hashFunc.hashBytes(value, offset, length).asLong());
+    }
+
+    /**
+     * Records a 64-bit hash: the low {@code p} bits select the register, and the register
+     * keeps the largest "leading zeros + 1" seen among the remaining high bits.
+     */
+    protected void add(long hash) {
+        int bucketMask = m - 1;
+        int bucket = (int) (hash & bucketMask);
+        // OR-ing the mask in guarantees a non-zero argument, so the result is at most 64 - p + 1
+        int firstOnePos = Long.numberOfLeadingZeros(hash | bucketMask) + 1;
+        // Register.get() returns a primitive byte, so no boxing and no null check is needed
+        byte current = register.get(bucket);
+        if ((byte) firstOnePos > current) {
+            register.set(bucket, (byte) firstOnePos);
+        }
+        toDenseIfNeeded();
+    }
+
+    /** Swaps a sparse register for a dense one once it holds too many entries. */
+    private void toDenseIfNeeded() {
+        if (register instanceof SparseRegister) {
+            if (isDense(register.getSize())) {
+                register = ((SparseRegister) register).toDense(p);
+            }
+        }
+    }
+
+    /**
+     * Merges the registers of {@code another} into this counter.
+     *
+     * Both counters are expected to share precision and hash function; this is only
+     * verified by {@code assert}, i.e. when the JVM runs with assertions enabled.
+     */
+    public void merge(HLLCounter another) {
+        assert this.p == another.p;
+        assert this.hashFunc == another.hashFunc;
+        // when the other side is dense, go dense first so the merge target is dense as well
+        if (register instanceof SparseRegister && another.register instanceof DenseRegister) {
+            register = ((SparseRegister) register).toDense(p);
+        }
+        register.merge(another.register);
+        // no-op unless this register is still sparse after the merge
+        toDenseIfNeeded();
+    }
+
+    public long getCountEstimate() {
+        return new HLLCSnapshot(this).getCountEstimate();
+    }
+
+    public int getPrecision() {
+        return this.p;
+    }
+
+    public double getErrorRate() {
+        return 1.04 / Math.sqrt(m);
+    }
+
+    @Override
+    public String toString() {
+        return String.valueOf(getCountEstimate());
+    }
+
+    // ============================================================================
+
+    // a memory efficient snapshot of HLL registers which can yield count estimate later
+    public static class HLLCSnapshot {
+        byte p;
+        double registerSum;
+        int zeroBuckets;
+
+        /** Captures the sum of {@code 2^-register} and the number of empty registers of {@code hllc}. */
+        public HLLCSnapshot(HLLCounter hllc) {
+            p = (byte) hllc.p;
+            registerSum = 0;
+            zeroBuckets = 0;
+            Register register = hllc.getRegister();
+            DenseRegister dr;
+            if (register instanceof SparseRegister) {
+                // work on a dense copy; the counter's own register is left untouched
+                dr = ((SparseRegister) register).toDense(p);
+            } else {
+                dr = (DenseRegister) register;
+            }
+            byte[] registers = dr.getRawRegister();
+            for (int i = 0; i < hllc.m; i++) {
+                if (registers[i] == 0) {
+                    // an empty register contributes 2^0 = 1
+                    registerSum++;
+                    zeroBuckets++;
+                } else {
+                    registerSum += 1.0 / (1L << registers[i]);
+                }
+            }
+        }
+
+        /** Returns the rounded cardinality estimate derived from the captured register statistics. */
+        public long getCountEstimate() {
+            int m = 1 << p;
+            double alpha = 0.7213 / (1 + 1.079 / m);
+            double estimate = alpha * m * m / registerSum;
+
+            // small cardinality adjustment
+            if (zeroBuckets >= m * 0.07) { // (reference presto's HLL impl)
+                estimate = m * Math.log(m * 1.0 / zeroBuckets);
+            } else if (HyperLogLogPlusTable.isBiasCorrection(m, estimate)) {
+                estimate = HyperLogLogPlusTable.biasCorrection(p, estimate);
+            }
+
+            return Math.round(estimate);
+        }
+    }
+
+    public static void main(String[] args) throws IOException {
+        dumpErrorRates();
+    }
+
+    /** Prints, for p = 10..18, the register count and the 1x/2x/3x error rate in percent. */
+    static void dumpErrorRates() {
+        for (int p = 10; p <= 18; p++) {
+            double rate = new HLLCounter(p, RegisterType.SPARSE).getErrorRate();
+            double er = Math.round(rate * 10000) / 100D;
+            double er2 = Math.round(rate * 2 * 10000) / 100D;
+            double er3 = Math.round(rate * 3 * 10000) / 100D;
+            long size = 1L << p;
+            System.out.println("HLLC" + p + ",\t" + size + " bytes,\t68% err<" + er + "%" + ",\t95% err<" + er2 + "%" + ",\t99.7% err<" + er3 + "%");
+        }
+    }
+
+    public Register getRegister() {
+        return register;
+    }
+
+    /** Clears the current register in place; the register type (sparse or dense) is kept. */
+    public void clear() {
+        register.clear();
+    }
+
+    // ============================================================================
+
+    /**
+     * Serializes the registers into {@code out}: one scheme byte, followed either by a
+     * vint entry count plus (index, value) pairs (scheme 0, "map") or by the raw
+     * {@code m} register bytes (scheme 1, "array").
+     */
+    public void writeRegisters(final ByteBuffer out) throws IOException {
+
+        final int indexLen = getRegisterIndexSize();
+        final int size = register.getSize();
+        final boolean sparse = register instanceof SparseRegister;
+
+        // decide output scheme -- map ((indexLen + 1) * size bytes) or array (2^p bytes)
+        final byte scheme = (sparse || 5 + (indexLen + 1) * size < m) ? (byte) 0 : (byte) 1;
+        out.put(scheme);
+
+        if (scheme == 1) { // array scheme, only reachable with a dense register
+            out.put(((DenseRegister) register).getRawRegister());
+            return;
+        }
+
+        // map scheme
+        BytesUtil.writeVInt(size, out);
+        if (sparse) {
+            Collection<Map.Entry<Integer, Byte>> allValue = ((SparseRegister) register).getAllValue();
+            for (Map.Entry<Integer, Byte> entry : allValue) {
+                writeUnsigned(entry.getKey(), indexLen, out);
+                out.put(entry.getValue());
+            }
+        } else {
+            // dense register with few entries: emit only the non-zero ones
+            byte[] registers = ((DenseRegister) register).getRawRegister();
+            for (int i = 0; i < m; i++) {
+                if (registers[i] > 0) {
+                    writeUnsigned(i, indexLen, out);
+                    out.put(registers[i]);
+                }
+            }
+        }
+    }
+
+    /**
+     * Replaces the content of this counter with the registers read from {@code in},
+     * as written by {@link #writeRegisters(ByteBuffer)}.
+     *
+     * @throws IllegalArgumentException if the entry count or a register index does not fit into m
+     * @throws IllegalStateException if the scheme byte is neither 0 nor 1
+     */
+    public void readRegisters(ByteBuffer in) throws IOException {
+        byte scheme = in.get();
+        if (scheme == 0) { // map scheme
+            clear();
+            int size = BytesUtil.readVInt(in);
+            if (size > m)
+                throw new IllegalArgumentException("register size (" + size + ") cannot be larger than m (" + m + ")");
+            if (isDense(size)) {
+                register = new DenseRegister(p);
+            } else {
+                register = new SparseRegister();//default is sparse
+            }
+            int indexLen = getRegisterIndexSize();
+            for (int i = 0; i < size; i++) {
+                int key = readUnsigned(in, indexLen);
+                // indexLen bytes can encode more than m values; reject corrupt indexes explicitly
+                if (key >= m)
+                    throw new IllegalArgumentException("register index (" + key + ") must be smaller than m (" + m + ")");
+                register.set(key, in.get());
+            }
+        } else if (scheme == 1) { // array scheme
+            if (register instanceof SparseRegister) {
+                register = new DenseRegister(p);
+            }
+            // fills the dense register's backing array directly
+            in.get(((DenseRegister) register).getRawRegister());
+        } else
+            throw new IllegalStateException("unknown register scheme: " + scheme);
+    }
+
+    /**
+     * Returns the number of bytes the serialized counter at the current position of
+     * {@code in} occupies. The buffer position is restored before returning.
+     */
+    public int peekLength(ByteBuffer in) {
+        int mark = in.position();
+        int len;
+        byte scheme = in.get();
+        if (scheme == 0) { // map scheme
+            int size = BytesUtil.readVInt(in);
+            int indexLen = getRegisterIndexSize();
+            len = in.position() - mark + (indexLen + 1) * size;
+        } else {
+            len = in.position() - mark + m;
+        }
+
+        in.position(mark);
+        return len;
+    }
+
+    /** Size of the array scheme: one scheme byte plus m register bytes. */
+    public int maxLength() {
+        return 1 + m;
+    }
+
+    /** Number of bytes used to encode a register index in the map scheme. */
+    private int getRegisterIndexSize() {
+        return (p - 1) / 8 + 1; // 2 when p=16, 3 when p=17
+    }
+
+    @Override
+    public int hashCode() {
+        final int prime = 31;
+        int result = 1;
+        result = prime * result + ((hashFunc == null) ? 0 : hashFunc.hashCode());
+        result = prime * result + p;
+        result = prime * result + register.hashCode();
+        return result;
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+        if (this == obj)
+            return true;
+        if (obj == null)
+            return false;
+        if (getClass() != obj.getClass())
+            return false;
+        HLLCounter other = (HLLCounter) obj;
+        // null-safe, to stay consistent with hashCode() which tolerates a null hashFunc
+        if (hashFunc == null ? other.hashFunc != null : !hashFunc.equals(other.hashFunc))
+            return false;
+        if (p != other.p)
+            return false;
+        return register.equals(other.register);
+    }
+
+    /**
+     * Orders counters by their count estimate; {@code null} sorts first. Two counters
+     * with the same estimate compare as 0 even if {@link #equals(Object)} is false.
+     */
+    @Override
+    public int compareTo(HLLCounter o) {
+        if (o == null)
+            return 1;
+
+        long e1 = this.getCountEstimate();
+        long e2 = o.getCountEstimate();
+
+        return e1 < e2 ? -1 : (e1 > e2 ? 1 : 0);
+    }
+
+    /** Writes the lowest {@code size} bytes of {@code num} to {@code out}, least significant byte first. */
+    public static void writeUnsigned(int num, int size, ByteBuffer out) {
+        for (int i = 0; i < size; i++) {
+            out.put((byte) num);
+            num >>>= 8;
+        }
+    }
+
+    /** Reads {@code size} bytes from {@code in}, least significant byte first, into an int. */
+    public static int readUnsigned(ByteBuffer in, int size) {
+        int integer = 0;
+        int mask = 0xff;
+        int shift = 0;
+        for (int i = 0; i < size; i++) {
+            // the mask strips the sign extension of the byte after it has been shifted in place
+            integer |= (in.get() << shift) & mask;
+            mask = mask << 8;
+            shift += 8;
+        }
+        return integer;
+    }
+
+    public RegisterType getRegisterType() {
+        if (register instanceof SparseRegister)
+            return RegisterType.SPARSE;
+        else
+            return RegisterType.DENSE;
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/kylin/blob/e6e330a8/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCounterOld.java
----------------------------------------------------------------------
diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCounterOld.java b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCounterOld.java
new file mode 100644
index 0000000..5cbdd43
--- /dev/null
+++ b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLCounterOld.java
@@ -0,0 +1,393 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package org.apache.kylin.measure.hllc;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
+import java.util.Arrays;
+
+import org.apache.kylin.common.util.BytesUtil;
+
+import com.google.common.hash.HashFunction;
+import com.google.common.hash.Hashing;
+
+/**
+ * Deprecated, use HLLCounter instead.
+ *
+ * About compression: tests on HLLC data show
+ *
+ * - LZF compression ratio is around 65%-80%, fast
+ * - GZIP compression ratio is around 41%-46%, very slow
+ */
+@Deprecated
+@SuppressWarnings("serial")
+public class HLLCounterOld implements Serializable, Comparable<HLLCounterOld> {
+
+ private final int p;
+ private final int m;
+ private final HashFunction hashFunc;
+ byte[] registers;
+ int singleBucket;
+
+ public HLLCounterOld() {
+ this(10);
+ }
+
+ public HLLCounterOld(int p) {
+ this(p, Hashing.murmur3_128());
+ }
+
+ public HLLCounterOld(HLLCounterOld another) {
+ this(another.p, another.hashFunc);
+ merge(another);
+ }
+
+ /** The larger p is, the more storage (2^p bytes), the better accuracy */
+ private HLLCounterOld(int p, HashFunction hashFunc) {
+ this.p = p;
+ this.m = 1 << p;//(int) Math.pow(2, p);
+ this.hashFunc = hashFunc;
+ this.registers = new byte[m];
+ this.singleBucket = -1;
+ }
+
+ public void clear() {
+ byte zero = (byte) 0;
+ if (singleBucket == -1) {
+ //nothing
+ } else if (singleBucket >= 0) {
+ registers[singleBucket] = 0;
+ } else {
+ Arrays.fill(registers, zero);
+ }
+ singleBucket = -1;
+ }
+
+ public void add(int value) {
+ add(hashFunc.hashInt(value).asLong());
+ }
+
+ public void add(String value) {
+ add(hashFunc.hashString(value, Charset.defaultCharset()).asLong());
+ }
+
+ public void add(byte[] value) {
+ add(hashFunc.hashBytes(value).asLong());
+ }
+
+ public void add(byte[] value, int offset, int length) {
+ add(hashFunc.hashBytes(value, offset, length).asLong());
+ }
+
+ protected void add(long hash) {
+ int bucketMask = m - 1;
+ int bucket = (int) (hash & bucketMask);
+ int firstOnePos = Long.numberOfLeadingZeros(hash | bucketMask) + 1;
+
+ if (firstOnePos > registers[bucket])
+ registers[bucket] = (byte) firstOnePos;
+
+ if (singleBucket == -1)
+ singleBucket = bucket;
+ else
+ singleBucket = Integer.MIN_VALUE;
+ }
+
+ public void merge(HLLCounterOld another) {
+ assert this.p == another.p;
+ assert this.hashFunc == another.hashFunc;
+
+ // quick path for single value HLLC
+ if (another.singleBucket == -1) {
+ return;
+ } else if (another.singleBucket >= 0) {
+ int b = another.singleBucket;
+ if (registers[b] < another.registers[b])
+ registers[b] = another.registers[b];
+ } else {
+ // normal path
+ for (int i = 0; i < m; i++) {
+ if (registers[i] < another.registers[i])
+ registers[i] = another.registers[i];
+ }
+ }
+ singleBucket = Integer.MIN_VALUE;
+ }
+
+ public long getCountEstimate() {
+ return new HLLCSnapshot(this).getCountEstimate();
+ }
+
+ public int getPrecision() {
+ return this.p;
+ }
+
+ public double getErrorRate() {
+ return 1.04 / Math.sqrt(m);
+ }
+
+ private int size() {
+ if (singleBucket == -1) {
+ return 0;
+ } else if (singleBucket >= 0) {
+ return 1;
+ } else {
+ int size = 0;
+ for (int i = 0; i < m; i++) {
+ if (registers[i] > 0)
+ size++;
+ }
+ return size;
+ }
+ }
+
+ @Override
+ public String toString() {
+ return "" + getCountEstimate();
+ }
+
+ // ============================================================================
+
+ // a memory efficient snapshot of HLL registers which can yield count
+ // estimate later
+ public static class HLLCSnapshot {
+ byte p;
+ double registerSum;
+ int zeroBuckets;
+
+ public HLLCSnapshot(HLLCounterOld hllc) {
+ p = (byte) hllc.p;
+ registerSum = 0;
+ zeroBuckets = 0;
+
+ byte[] registers = hllc.registers;
+ for (int i = 0; i < hllc.m; i++) {
+ if (registers[i] == 0) {
+ registerSum++;
+ zeroBuckets++;
+ } else {
+ registerSum += 1.0 / (1L << registers[i]);
+ }
+ }
+ }
+
+ public long getCountEstimate() {
+ int m = 1 << p;
+ double alpha = 0.7213 / (1 + 1.079 / m);
+ double estimate = alpha * m * m / registerSum;
+
+ // small cardinality adjustment
+ if (zeroBuckets >= m * 0.07) { // (reference presto's HLL impl)
+ estimate = m * Math.log(m * 1.0 / zeroBuckets);
+ } else if (HyperLogLogPlusTable.isBiasCorrection(m, estimate)) {
+ estimate = HyperLogLogPlusTable.biasCorrection(p, estimate);
+ }
+
+ return Math.round(estimate);
+ }
+ }
+
+ // ============================================================================
+
+ public void writeRegisters(final ByteBuffer out) throws IOException {
+
+ final int indexLen = getRegisterIndexSize();
+ int size = size();
+
+ // decide output scheme -- map ((indexLen + 1) bytes per non-zero register) or array (2^p bytes)
+ byte scheme;
+ if (5 + (indexLen + 1) * size < m) // 5 is max len of vint
+ scheme = 0; // map
+ else
+ scheme = 1; // array
+ out.put(scheme);
+
+ if (scheme == 0) { // map scheme
+ BytesUtil.writeVInt(size, out);
+ if (singleBucket == -1) {
+ // no non-zero register
+ } else if (singleBucket >= 0) {
+ writeUnsigned(singleBucket, indexLen, out);
+ out.put(registers[singleBucket]);
+ } else {
+ for (int i = 0; i < m; i++) {
+ if (registers[i] > 0) {
+ writeUnsigned(i, indexLen, out);
+ out.put(registers[i]);
+ }
+ }
+ }
+ } else if (scheme == 1) { // array scheme
+ out.put(registers);
+ } else
+ throw new IllegalStateException();
+ }
+
+ public void readRegisters(ByteBuffer in) throws IOException {
+ byte scheme = in.get();
+
+ if (scheme == 0) { // map scheme
+ clear();
+ int size = BytesUtil.readVInt(in);
+ if (size > m)
+ throw new IllegalArgumentException("register size (" + size + ") cannot be larger than m (" + m + ")");
+ int indexLen = getRegisterIndexSize();
+ int key = 0;
+ for (int i = 0; i < size; i++) {
+ key = readUnsigned(in, indexLen);
+ registers[key] = in.get();
+ }
+
+ if (size == 0)
+ singleBucket = -1;
+ else if (size == 1)
+ singleBucket = key;
+ else
+ singleBucket = Integer.MIN_VALUE;
+
+ } else if (scheme == 1) { // array scheme
+ in.get(registers);
+ singleBucket = Integer.MIN_VALUE;
+ } else
+ throw new IllegalStateException();
+ }
+
+ public int peekLength(ByteBuffer in) {
+ int mark = in.position();
+ int len;
+
+ byte scheme = in.get();
+ if (scheme == 0) { // map scheme
+ int size = BytesUtil.readVInt(in);
+ int indexLen = getRegisterIndexSize();
+ len = in.position() - mark + (indexLen + 1) * size;
+ } else {
+ len = in.position() - mark + m;
+ }
+
+ in.position(mark);
+ return len;
+ }
+
+ public int maxLength() {
+ return 1 + m;
+ }
+
+ /*public void writeRegistersArray(final ByteBuffer out) {
+ out.put(this.registers);
+ }
+
+ public void readRegistersArray(ByteBuffer in) {
+ in.get(registers, 0, m);
+ singleBucket = Integer.MIN_VALUE;
+ }*/
+
+ private int getRegisterIndexSize() {
+ return (p - 1) / 8 + 1; // 2 when p=16, 3 when p=17
+ }
+
+ @Override
+ public int hashCode() {
+ final int prime = 31;
+ int result = 1;
+ result = prime * result + ((hashFunc == null) ? 0 : hashFunc.hashCode());
+ result = prime * result + p;
+ result = prime * result + Arrays.hashCode(registers);
+ return result;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj)
+ return true;
+ if (obj == null)
+ return false;
+ if (getClass() != obj.getClass())
+ return false;
+ HLLCounterOld other = (HLLCounterOld) obj;
+ if (hashFunc == null) {
+ if (other.hashFunc != null)
+ return false;
+ } else if (!hashFunc.equals(other.hashFunc))
+ return false;
+ if (p != other.p)
+ return false;
+ if (!Arrays.equals(registers, other.registers))
+ return false;
+ return true;
+ }
+
+ @Override
+ public int compareTo(HLLCounterOld o) {
+ if (o == null)
+ return 1;
+
+ long e1 = this.getCountEstimate();
+ long e2 = o.getCountEstimate();
+
+ if (e1 == e2)
+ return 0;
+ else if (e1 > e2)
+ return 1;
+ else
+ return -1;
+ }
+
+ public static void main(String[] args) throws IOException {
+ dumpErrorRates();
+ }
+
+ static void dumpErrorRates() {
+ for (int p = 10; p <= 18; p++) {
+ double rate = new HLLCounterOld(p).getErrorRate();
+ double er = Math.round(rate * 10000) / 100D;
+ double er2 = Math.round(rate * 2 * 10000) / 100D;
+ double er3 = Math.round(rate * 3 * 10000) / 100D;
+ long size = Math.round(Math.pow(2, p));
+ System.out.println("HLLC" + p + ",\t" + size + " bytes,\t68% err<" + er + "%" + ",\t95% err<" + er2 + "%" + ",\t99.7% err<" + er3 + "%");
+ }
+ }
+
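+ /**
+ * Writes the lowest {@code size} bytes of {@code num} to {@code out}, least significant byte first.
+ * @param num the value to write; it is shifted right by 8 bits after each byte is emitted
+ * @param size the number of bytes to write
+ * @param out the buffer that receives the bytes
+ */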
+ public static void writeUnsigned(int num, int size, ByteBuffer out) {
+ for (int i = 0; i < size; i++) {
+ out.put((byte) num);
+ num >>>= 8;
+ }
+ }
+
+ public static int readUnsigned(ByteBuffer in, int size) {
+ int integer = 0;
+ int mask = 0xff;
+ int shift = 0;
+ for (int i = 0; i < size; i++) {
+ integer |= (in.get() << shift) & mask;
+ mask = mask << 8;
+ shift += 8;
+ }
+ return integer;
+ }
+}
http://git-wip-us.apache.org/repos/asf/kylin/blob/e6e330a8/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLDistinctCountAggFunc.java
----------------------------------------------------------------------
diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLDistinctCountAggFunc.java b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLDistinctCountAggFunc.java
index a72ad09..438a33f 100644
--- a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLDistinctCountAggFunc.java
+++ b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HLLDistinctCountAggFunc.java
@@ -31,21 +31,21 @@ public class HLLDistinctCountAggFunc {
private static final Logger logger = LoggerFactory.getLogger(HLLDistinctCountAggFunc.class);
- public static HyperLogLogPlusCounterNew init() {
+ public static HLLCounter init() {
return null;
}
- public static HyperLogLogPlusCounterNew initAdd(Object v) {
+ public static HLLCounter initAdd(Object v) {
if (v instanceof Long) { // holistic case
long l = (Long) v;
return new FixedValueHLLCMockup(l);
} else {
- HyperLogLogPlusCounterNew c = (HyperLogLogPlusCounterNew) v;
- return new HyperLogLogPlusCounterNew(c);
+ HLLCounter c = (HLLCounter) v;
+ return new HLLCounter(c);
}
}
- public static HyperLogLogPlusCounterNew add(HyperLogLogPlusCounterNew counter, Object v) {
+ public static HLLCounter add(HLLCounter counter, Object v) {
if (v instanceof Long) { // holistic case
long l = (Long) v;
if (counter == null) {
@@ -58,9 +58,9 @@ public class HLLDistinctCountAggFunc {
return counter;
}
} else {
- HyperLogLogPlusCounterNew c = (HyperLogLogPlusCounterNew) v;
+ HLLCounter c = (HLLCounter) v;
if (counter == null) {
- return new HyperLogLogPlusCounterNew(c);
+ return new HLLCounter(c);
} else {
counter.merge(c);
return counter;
@@ -68,16 +68,16 @@ public class HLLDistinctCountAggFunc {
}
}
- public static HyperLogLogPlusCounterNew merge(HyperLogLogPlusCounterNew counter0, Object counter1) {
+ public static HLLCounter merge(HLLCounter counter0, Object counter1) {
return add(counter0, counter1);
}
- public static long result(HyperLogLogPlusCounterNew counter) {
+ public static long result(HLLCounter counter) {
return counter == null ? 0L : counter.getCountEstimate();
}
@SuppressWarnings("serial")
- private static class FixedValueHLLCMockup extends HyperLogLogPlusCounterNew {
+ private static class FixedValueHLLCMockup extends HLLCounter {
private Long value = null;
@@ -107,7 +107,7 @@ public class HLLDistinctCountAggFunc {
}
@Override
- public void merge(HyperLogLogPlusCounterNew another) {
+ public void merge(HLLCounter another) {
throw new UnsupportedOperationException();
}
http://git-wip-us.apache.org/repos/asf/kylin/blob/e6e330a8/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HyperLogLogPlusCounterNew.java
----------------------------------------------------------------------
diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HyperLogLogPlusCounterNew.java b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HyperLogLogPlusCounterNew.java
deleted file mode 100644
index d7329f6..0000000
--- a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HyperLogLogPlusCounterNew.java
+++ /dev/null
@@ -1,388 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
-*/
-
-package org.apache.kylin.measure.hllc;
-
-import com.google.common.hash.HashFunction;
-import com.google.common.hash.Hashing;
-import org.apache.kylin.common.util.BytesUtil;
-
-import java.io.IOException;
-import java.io.Serializable;
-import java.nio.ByteBuffer;
-import java.nio.charset.Charset;
-import java.util.Collection;
-import java.util.Map;
-
-@SuppressWarnings("serial")
-public class HyperLogLogPlusCounterNew implements Serializable, Comparable<HyperLogLogPlusCounterNew> {
-
- private int p;
-
- private int m;
-
- private HashFunction hashFunc = Hashing.murmur3_128();
-
- private Register register;
-
- public static double overflowFactor = 0.01;
-
- public HyperLogLogPlusCounterNew(int p, RegisterType type, HashFunction hashFunc) {
- this.p = p;
- this.m = 1 << p;//(int) Math.pow(2, p);
- this.hashFunc = hashFunc;
- if (type == RegisterType.SPARSE) {
- double over = overflowFactor * m;
- this.register = new SparseRegister((int) over);
- } else {
- this.register = new DenseRegister(p);
- }
- }
-
- public HyperLogLogPlusCounterNew() {
- this(10, RegisterType.SPARSE, Hashing.murmur3_128());
- }
-
- public HyperLogLogPlusCounterNew(int p) {
- this(p, RegisterType.SPARSE, Hashing.murmur3_128());
- }
-
- public HyperLogLogPlusCounterNew(int p, RegisterType type) {
- this(p, type, Hashing.murmur3_128());
- }
-
- public HyperLogLogPlusCounterNew(int p, HashFunction hashFunc) {
- this(p, RegisterType.SPARSE, hashFunc);
- }
-
- public HyperLogLogPlusCounterNew(HyperLogLogPlusCounterNew another) {
- this(another.p, another.hashFunc);
- merge(another);
- }
-
- public void add(int value) {
- add(hashFunc.hashInt(value).asLong());
- }
-
- public void add(String value) {
- add(hashFunc.hashString(value, Charset.defaultCharset()).asLong());
- }
-
- public void add(byte[] value) {
- add(hashFunc.hashBytes(value).asLong());
- }
-
- public void add(byte[] value, int offset, int length) {
- add(hashFunc.hashBytes(value, offset, length).asLong());
- }
-
- protected void add(long hash) {
- int bucketMask = m - 1;
- int bucket = (int) (hash & bucketMask);
- int firstOnePos = Long.numberOfLeadingZeros(hash | bucketMask) + 1;
- Byte b = register.get(bucket);
- if (b == null || (byte) firstOnePos > b) {
- register.set(bucket, (byte) firstOnePos);
- }
- if (register instanceof SparseRegister) {
- if (((SparseRegister) register).isOverThreshold()) {
- register = ((SparseRegister) register).toDense(p);
- }
- }
- }
-
- public void merge(HyperLogLogPlusCounterNew another) {
- assert this.p == another.p;
- assert this.hashFunc == another.hashFunc;
- if (register instanceof SparseRegister && another.register instanceof SparseRegister) {
- register.merge(another.register);
- if (((SparseRegister) register).isOverThreshold()) {
- register = ((SparseRegister) register).toDense(p);
- }
- } else if (register instanceof SparseRegister && another.register instanceof DenseRegister) {
- register = ((SparseRegister) register).toDense(p);
- register.merge(another.register);
- } else {
- register.merge(another.register);
- }
- }
-
- public long getCountEstimate() {
- return new HLLCSnapshot(this).getCountEstimate();
- }
-
- public int getPrecision() {
- return this.p;
- }
-
- public double getErrorRate() {
- return 1.04 / Math.sqrt(m);
- }
-
- @Override
- public String toString() {
- return "" + getCountEstimate();
- }
-
- // ============================================================================
-
- // a memory efficient snapshot of HLL registers which can yield count
- // estimate later
- public static class HLLCSnapshot {
- byte p;
- double registerSum;
- int zeroBuckets;
-
- public HLLCSnapshot(HyperLogLogPlusCounterNew hllc) {
- p = (byte) hllc.p;
- registerSum = 0;
- zeroBuckets = 0;
- Register register = hllc.getRegister();
- DenseRegister dr;
- if (register instanceof SparseRegister) {
- dr = ((SparseRegister) register).toDense(p);
- } else {
- dr = (DenseRegister) register;
- }
- byte[] registers = dr.getRawRegister();
- for (int i = 0; i < hllc.m; i++) {
- if (registers[i] == 0) {
- registerSum++;
- zeroBuckets++;
- } else {
- registerSum += 1.0 / (1L << registers[i]);
- }
- }
- }
-
- public long getCountEstimate() {
- int m = 1 << p;
- double alpha = 0.7213 / (1 + 1.079 / m);
- double estimate = alpha * m * m / registerSum;
-
- // small cardinality adjustment
- if (zeroBuckets >= m * 0.07) { // (reference presto's HLL impl)
- estimate = m * Math.log(m * 1.0 / zeroBuckets);
- } else if (HyperLogLogPlusTable.isBiasCorrection(m, estimate)) {
- estimate = HyperLogLogPlusTable.biasCorrection(p, estimate);
- }
-
- return Math.round(estimate);
- }
- }
-
- public static void main(String[] args) throws IOException {
- dumpErrorRates();
- }
-
- static void dumpErrorRates() {
- for (int p = 10; p <= 18; p++) {
- double rate = new HyperLogLogPlusCounterNew(p, RegisterType.SPARSE).getErrorRate();
- double er = Math.round(rate * 10000) / 100D;
- double er2 = Math.round(rate * 2 * 10000) / 100D;
- double er3 = Math.round(rate * 3 * 10000) / 100D;
- long size = Math.round(Math.pow(2, p));
- System.out.println("HLLC" + p + ",\t" + size + " bytes,\t68% err<" + er + "%" + ",\t95% err<" + er2 + "%" + ",\t99.7% err<" + er3 + "%");
- }
- }
-
- public Register getRegister() {
- return register;
- }
-
- public void clear() {
- register.clear();
- }
-
- public RegisterType getRegisterType() {
- if (register instanceof SparseRegister)
- return RegisterType.SPARSE;
- else
- return RegisterType.DENSE;
- }
-
- // ============================================================================
-
- public void writeRegisters(final ByteBuffer out) throws IOException {
-
- final int indexLen = getRegisterIndexSize();
- int size = size();
-
- // decide output scheme -- map (3*size bytes) or array (2^p bytes)
- byte scheme;
- //byte type;
- if (register instanceof SparseRegister || 5 + (indexLen + 1) * size < m) {
- scheme = 0; //map
- } else {
- scheme = 1; // array
- }
- out.put(scheme);
- if (scheme == 0) { // map scheme
- BytesUtil.writeVInt(size, out);
- if (register instanceof SparseRegister) { //sparse\u3000register
- Collection<Map.Entry<Integer, Byte>> allValue = ((SparseRegister) register).getAllValue();
- for (Map.Entry<Integer, Byte> entry : allValue) {
- writeUnsigned(entry.getKey(), indexLen, out);
- out.put(entry.getValue());
- }
- } else { //dense register
- byte[] registers = ((DenseRegister) register).getRawRegister();
- for (int i = 0; i < m; i++) {
- if (registers[i] > 0) {
- writeUnsigned(i, indexLen, out);
- out.put(registers[i]);
- }
- }
- }
- } else if (scheme == 1) { // array scheme
- out.put(((DenseRegister) register).getRawRegister());
- } else
- throw new IllegalStateException();
- }
-
- public void readRegisters(ByteBuffer in) throws IOException {
- byte scheme = in.get();
- if (scheme == 0) { // map scheme
- clear();
- int size = BytesUtil.readVInt(in);
- if (size > m)
- throw new IllegalArgumentException("register size (" + size + ") cannot be larger than m (" + m + ")");
- double over = overflowFactor * m;
- if (size > (int) over) {
- this.register = new DenseRegister(p);
- } else {
- this.register = new SparseRegister((int) over);//default is sparse
- }
- int indexLen = getRegisterIndexSize();
- int key = 0;
- for (int i = 0; i < size; i++) {
- key = readUnsigned(in, indexLen);
- register.set(key, in.get());
- }
- } else if (scheme == 1) { // array scheme
- this.register = new DenseRegister(p);
- for (int i = 0; i < m; i++) {
- register.set(i, in.get());
- }
- } else
- throw new IllegalStateException();
- }
-
- public int peekLength(ByteBuffer in) {
- int mark = in.position();
- int len;
- byte scheme = in.get();
- if (scheme == 0) { // map scheme
- int size = BytesUtil.readVInt(in);
- int indexLen = getRegisterIndexSize();
- len = in.position() - mark + (indexLen + 1) * size;
- } else {
- len = in.position() - mark + m;
- }
-
- in.position(mark);
- return len;
- }
-
- public int maxLength() {
- return 1 + m;
- }
-
- private int getRegisterIndexSize() {
- return (p - 1) / 8 + 1; // 2 when p=16, 3 when p=17
- }
-
- @Override
- public int hashCode() {
- final int prime = 31;
- int result = 1;
- result = prime * result + ((hashFunc == null) ? 0 : hashFunc.hashCode());
- result = prime * result + p;
- result = prime * result + register.getHashCode();
- return result;
- }
-
- @Override
- public boolean equals(Object obj) {
- if (this == obj)
- return true;
- if (obj == null)
- return false;
- if (getClass() != obj.getClass())
- return false;
- HyperLogLogPlusCounterNew other = (HyperLogLogPlusCounterNew) obj;
- if (hashFunc == null) {
- if (other.hashFunc != null)
- return false;
- } else if (!hashFunc.equals(other.hashFunc))
- return false;
- if (p != other.p)
- return false;
- if (this.getRegisterType() != other.getRegisterType())
- return false;
- if (register.getHashCode() != other.register.getHashCode())
- return false;
- return true;
- }
-
- @Override
- public int compareTo(HyperLogLogPlusCounterNew o) {
- if (o == null)
- return 1;
-
- long e1 = this.getCountEstimate();
- long e2 = o.getCountEstimate();
-
- if (e1 == e2)
- return 0;
- else if (e1 > e2)
- return 1;
- else
- return -1;
- }
-
- /**
- *
- * @param num
- * @param size
- * @param out
- */
- public static void writeUnsigned(int num, int size, ByteBuffer out) {
- for (int i = 0; i < size; i++) {
- out.put((byte) num);
- num >>>= 8;
- }
- }
-
- public static int readUnsigned(ByteBuffer in, int size) {
- int integer = 0;
- int mask = 0xff;
- int shift = 0;
- for (int i = 0; i < size; i++) {
- integer |= (in.get() << shift) & mask;
- mask = mask << 8;
- shift += 8;
- }
- return integer;
- }
-
- private int size() {
- return register.getSize();
- }
-
-}
http://git-wip-us.apache.org/repos/asf/kylin/blob/e6e330a8/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HyperLogLogPlusCounterOld.java
----------------------------------------------------------------------
diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HyperLogLogPlusCounterOld.java b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HyperLogLogPlusCounterOld.java
deleted file mode 100644
index cb5533e..0000000
--- a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/HyperLogLogPlusCounterOld.java
+++ /dev/null
@@ -1,392 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
-*/
-
-package org.apache.kylin.measure.hllc;
-
-import java.io.IOException;
-import java.io.Serializable;
-import java.nio.ByteBuffer;
-import java.nio.charset.Charset;
-import java.util.Arrays;
-
-import org.apache.kylin.common.util.BytesUtil;
-
-import com.google.common.hash.HashFunction;
-import com.google.common.hash.Hashing;
-
-/**
- * About compression, test on HLLC data shows
- *
- * - LZF compression ratio is around 65%-80%, fast
- * - GZIP compression ratio is around 41%-46%, very slow
- *
- * @author yangli9
- */
-@SuppressWarnings("serial")
-public class HyperLogLogPlusCounterOld implements Serializable, Comparable<HyperLogLogPlusCounterOld> {
-
- private final int p;
- private final int m;
- private final HashFunction hashFunc;
- byte[] registers;
- int singleBucket;
-
- public HyperLogLogPlusCounterOld() {
- this(10);
- }
-
- public HyperLogLogPlusCounterOld(int p) {
- this(p, Hashing.murmur3_128());
- }
-
- public HyperLogLogPlusCounterOld(HyperLogLogPlusCounterOld another) {
- this(another.p, another.hashFunc);
- merge(another);
- }
-
- /** The larger p is, the more storage (2^p bytes), the better accuracy */
- private HyperLogLogPlusCounterOld(int p, HashFunction hashFunc) {
- this.p = p;
- this.m = 1 << p;//(int) Math.pow(2, p);
- this.hashFunc = hashFunc;
- this.registers = new byte[m];
- this.singleBucket = -1;
- }
-
- public void clear() {
- byte zero = (byte) 0;
- if (singleBucket == -1) {
- //nothing
- } else if (singleBucket >= 0) {
- registers[singleBucket] = 0;
- } else {
- Arrays.fill(registers, zero);
- }
- singleBucket = -1;
- }
-
- public void add(int value) {
- add(hashFunc.hashInt(value).asLong());
- }
-
- public void add(String value) {
- add(hashFunc.hashString(value, Charset.defaultCharset()).asLong());
- }
-
- public void add(byte[] value) {
- add(hashFunc.hashBytes(value).asLong());
- }
-
- public void add(byte[] value, int offset, int length) {
- add(hashFunc.hashBytes(value, offset, length).asLong());
- }
-
- protected void add(long hash) {
- int bucketMask = m - 1;
- int bucket = (int) (hash & bucketMask);
- int firstOnePos = Long.numberOfLeadingZeros(hash | bucketMask) + 1;
-
- if (firstOnePos > registers[bucket])
- registers[bucket] = (byte) firstOnePos;
-
- if (singleBucket == -1)
- singleBucket = bucket;
- else
- singleBucket = Integer.MIN_VALUE;
- }
-
- public void merge(HyperLogLogPlusCounterOld another) {
- assert this.p == another.p;
- assert this.hashFunc == another.hashFunc;
-
- // quick path for single value HLLC
- if (another.singleBucket == -1) {
- return;
- } else if (another.singleBucket >= 0) {
- int b = another.singleBucket;
- if (registers[b] < another.registers[b])
- registers[b] = another.registers[b];
- } else {
- // normal path
- for (int i = 0; i < m; i++) {
- if (registers[i] < another.registers[i])
- registers[i] = another.registers[i];
- }
- }
- singleBucket = Integer.MIN_VALUE;
- }
-
- public long getCountEstimate() {
- return new HLLCSnapshot(this).getCountEstimate();
- }
-
- public int getPrecision() {
- return this.p;
- }
-
- public double getErrorRate() {
- return 1.04 / Math.sqrt(m);
- }
-
- private int size() {
- if (singleBucket == -1) {
- return 0;
- } else if (singleBucket >= 0) {
- return 1;
- } else {
- int size = 0;
- for (int i = 0; i < m; i++) {
- if (registers[i] > 0)
- size++;
- }
- return size;
- }
- }
-
- @Override
- public String toString() {
- return "" + getCountEstimate();
- }
-
- // ============================================================================
-
- // a memory efficient snapshot of HLL registers which can yield count
- // estimate later
- public static class HLLCSnapshot {
- byte p;
- double registerSum;
- int zeroBuckets;
-
- public HLLCSnapshot(HyperLogLogPlusCounterOld hllc) {
- p = (byte) hllc.p;
- registerSum = 0;
- zeroBuckets = 0;
-
- byte[] registers = hllc.registers;
- for (int i = 0; i < hllc.m; i++) {
- if (registers[i] == 0) {
- registerSum++;
- zeroBuckets++;
- } else {
- registerSum += 1.0 / (1L << registers[i]);
- }
- }
- }
-
- public long getCountEstimate() {
- int m = 1 << p;
- double alpha = 0.7213 / (1 + 1.079 / m);
- double estimate = alpha * m * m / registerSum;
-
- // small cardinality adjustment
- if (zeroBuckets >= m * 0.07) { // (reference presto's HLL impl)
- estimate = m * Math.log(m * 1.0 / zeroBuckets);
- } else if (HyperLogLogPlusTable.isBiasCorrection(m, estimate)) {
- estimate = HyperLogLogPlusTable.biasCorrection(p, estimate);
- }
-
- return Math.round(estimate);
- }
- }
-
- // ============================================================================
-
- public void writeRegisters(final ByteBuffer out) throws IOException {
-
- final int indexLen = getRegisterIndexSize();
- int size = size();
-
- // decide output scheme -- map (3*size bytes) or array (2^p bytes)
- byte scheme;
- if (5 + (indexLen + 1) * size < m) // 5 is max len of vint
- scheme = 0; // map
- else
- scheme = 1; // array
- out.put(scheme);
-
- if (scheme == 0) { // map scheme
- BytesUtil.writeVInt(size, out);
- if (singleBucket == -1) {
- // no non-zero register
- } else if (singleBucket >= 0) {
- writeUnsigned(singleBucket, indexLen, out);
- out.put(registers[singleBucket]);
- } else {
- for (int i = 0; i < m; i++) {
- if (registers[i] > 0) {
- writeUnsigned(i, indexLen, out);
- out.put(registers[i]);
- }
- }
- }
- } else if (scheme == 1) { // array scheme
- out.put(registers);
- } else
- throw new IllegalStateException();
- }
-
- public void readRegisters(ByteBuffer in) throws IOException {
- byte scheme = in.get();
-
- if (scheme == 0) { // map scheme
- clear();
- int size = BytesUtil.readVInt(in);
- if (size > m)
- throw new IllegalArgumentException("register size (" + size + ") cannot be larger than m (" + m + ")");
- int indexLen = getRegisterIndexSize();
- int key = 0;
- for (int i = 0; i < size; i++) {
- key = readUnsigned(in, indexLen);
- registers[key] = in.get();
- }
-
- if (size == 0)
- singleBucket = -1;
- else if (size == 1)
- singleBucket = key;
- else
- singleBucket = Integer.MIN_VALUE;
-
- } else if (scheme == 1) { // array scheme
- in.get(registers);
- singleBucket = Integer.MIN_VALUE;
- } else
- throw new IllegalStateException();
- }
-
- public int peekLength(ByteBuffer in) {
- int mark = in.position();
- int len;
-
- byte scheme = in.get();
- if (scheme == 0) { // map scheme
- int size = BytesUtil.readVInt(in);
- int indexLen = getRegisterIndexSize();
- len = in.position() - mark + (indexLen + 1) * size;
- } else {
- len = in.position() - mark + m;
- }
-
- in.position(mark);
- return len;
- }
-
- public int maxLength() {
- return 1 + m;
- }
-
- /*public void writeRegistersArray(final ByteBuffer out) {
- out.put(this.registers);
- }
-
- public void readRegistersArray(ByteBuffer in) {
- in.get(registers, 0, m);
- singleBucket = Integer.MIN_VALUE;
- }*/
-
- private int getRegisterIndexSize() {
- return (p - 1) / 8 + 1; // 2 when p=16, 3 when p=17
- }
-
- @Override
- public int hashCode() {
- final int prime = 31;
- int result = 1;
- result = prime * result + ((hashFunc == null) ? 0 : hashFunc.hashCode());
- result = prime * result + p;
- result = prime * result + Arrays.hashCode(registers);
- return result;
- }
-
- @Override
- public boolean equals(Object obj) {
- if (this == obj)
- return true;
- if (obj == null)
- return false;
- if (getClass() != obj.getClass())
- return false;
- HyperLogLogPlusCounterOld other = (HyperLogLogPlusCounterOld) obj;
- if (hashFunc == null) {
- if (other.hashFunc != null)
- return false;
- } else if (!hashFunc.equals(other.hashFunc))
- return false;
- if (p != other.p)
- return false;
- if (!Arrays.equals(registers, other.registers))
- return false;
- return true;
- }
-
- @Override
- public int compareTo(HyperLogLogPlusCounterOld o) {
- if (o == null)
- return 1;
-
- long e1 = this.getCountEstimate();
- long e2 = o.getCountEstimate();
-
- if (e1 == e2)
- return 0;
- else if (e1 > e2)
- return 1;
- else
- return -1;
- }
-
- public static void main(String[] args) throws IOException {
- dumpErrorRates();
- }
-
- static void dumpErrorRates() {
- for (int p = 10; p <= 18; p++) {
- double rate = new HyperLogLogPlusCounterOld(p).getErrorRate();
- double er = Math.round(rate * 10000) / 100D;
- double er2 = Math.round(rate * 2 * 10000) / 100D;
- double er3 = Math.round(rate * 3 * 10000) / 100D;
- long size = Math.round(Math.pow(2, p));
- System.out.println("HLLC" + p + ",\t" + size + " bytes,\t68% err<" + er + "%" + ",\t95% err<" + er2 + "%" + ",\t99.7% err<" + er3 + "%");
- }
- }
-
- /**
- *
- * @param num
- * @param size
- * @param out
- */
- public static void writeUnsigned(int num, int size, ByteBuffer out) {
- for (int i = 0; i < size; i++) {
- out.put((byte) num);
- num >>>= 8;
- }
- }
-
- public static int readUnsigned(ByteBuffer in, int size) {
- int integer = 0;
- int mask = 0xff;
- int shift = 0;
- for (int i = 0; i < size; i++) {
- integer |= (in.get() << shift) & mask;
- mask = mask << 8;
- shift += 8;
- }
- return integer;
- }
-}
http://git-wip-us.apache.org/repos/asf/kylin/blob/e6e330a8/core-metadata/src/main/java/org/apache/kylin/measure/hllc/Register.java
----------------------------------------------------------------------
diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/Register.java b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/Register.java
index 79c4bba..a6ef94f 100644
--- a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/Register.java
+++ b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/Register.java
@@ -24,7 +24,7 @@ public interface Register {
void set(int pos, byte value);
- Byte get(int pos);
+ byte get(int pos);
void merge(Register another);
@@ -32,6 +32,4 @@ public interface Register {
int getSize();
- int getHashCode();
-
}
http://git-wip-us.apache.org/repos/asf/kylin/blob/e6e330a8/core-metadata/src/main/java/org/apache/kylin/measure/hllc/SparseRegister.java
----------------------------------------------------------------------
diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/SparseRegister.java b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/SparseRegister.java
index d241e81..d6bb024 100644
--- a/core-metadata/src/main/java/org/apache/kylin/measure/hllc/SparseRegister.java
+++ b/core-metadata/src/main/java/org/apache/kylin/measure/hllc/SparseRegister.java
@@ -27,12 +27,9 @@ import java.util.TreeMap;
*/
public class SparseRegister implements Register {
- private int overThreshold;
-
private Map<Integer, Byte> sparseRegister = new TreeMap<>();
- public SparseRegister(int overThreshold) {
- this.overThreshold = overThreshold;
+ public SparseRegister() {
}
public DenseRegister toDense(int p) {
@@ -49,8 +46,9 @@ public class SparseRegister implements Register {
}
@Override
- public Byte get(int pos) {
- return sparseRegister.get(pos);
+ public byte get(int pos) {
+ Byte b = sparseRegister.get(pos);
+ return b == null ? 0 : b;
}
@Override
@@ -58,8 +56,8 @@ public class SparseRegister implements Register {
assert another instanceof SparseRegister;
SparseRegister sr = (SparseRegister) another;
for (Map.Entry<Integer, Byte> entry : sr.sparseRegister.entrySet()) {
- Byte v = sparseRegister.get(entry.getKey());
- if (v == null || entry.getValue() > v)
+ byte v = get(entry.getKey());
+ if (entry.getValue() > v)
sparseRegister.put(entry.getKey(), entry.getValue());
}
}
@@ -75,20 +73,28 @@ public class SparseRegister implements Register {
}
@Override
- public int getHashCode() {
+ public int hashCode() {
final int prime = 31;
int result = 1;
- for (Map.Entry<Integer, Byte> entry : sparseRegister.entrySet()) {
- result = prime * result + entry.getKey();
- result = prime * result + entry.getValue();
- }
+ result = prime * result + ((sparseRegister == null) ? 0 : sparseRegister.hashCode());
return result;
}
- public boolean isOverThreshold() {
- if (this.sparseRegister.size() > overThreshold)
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj)
return true;
- return false;
+ if (obj == null)
+ return false;
+ if (getClass() != obj.getClass())
+ return false;
+ SparseRegister other = (SparseRegister) obj;
+ if (sparseRegister == null) {
+ if (other.sparseRegister != null)
+ return false;
+ } else if (!sparseRegister.equals(other.sparseRegister))
+ return false;
+ return true;
}
public Collection<Map.Entry<Integer, Byte>> getAllValue() {
http://git-wip-us.apache.org/repos/asf/kylin/blob/e6e330a8/core-metadata/src/test/java/org/apache/kylin/measure/AggregatorMemEstimateTest.java
----------------------------------------------------------------------
diff --git a/core-metadata/src/test/java/org/apache/kylin/measure/AggregatorMemEstimateTest.java b/core-metadata/src/test/java/org/apache/kylin/measure/AggregatorMemEstimateTest.java
index 103e721..0f22610 100644
--- a/core-metadata/src/test/java/org/apache/kylin/measure/AggregatorMemEstimateTest.java
+++ b/core-metadata/src/test/java/org/apache/kylin/measure/AggregatorMemEstimateTest.java
@@ -26,7 +26,7 @@ import org.apache.kylin.measure.bitmap.BitmapAggregator;
import org.apache.kylin.measure.bitmap.BitmapCounter;
import org.apache.kylin.measure.extendedcolumn.ExtendedColumnMeasureType;
import org.apache.kylin.measure.hllc.HLLCAggregator;
-import org.apache.kylin.measure.hllc.HyperLogLogPlusCounterNew;
+import org.apache.kylin.measure.hllc.HLLCounter;
import org.apache.kylin.metadata.datatype.DataType;
import org.apache.kylin.metadata.datatype.DoubleMutable;
import org.apache.kylin.metadata.datatype.LongMutable;
@@ -94,7 +94,7 @@ public class AggregatorMemEstimateTest extends LocalFileMetadataTestCase {
@Test
public void testAggregatorEstimate() {
HLLCAggregator hllcAggregator = new HLLCAggregator(14);
- hllcAggregator.aggregate(new HyperLogLogPlusCounterNew(14));
+ hllcAggregator.aggregate(new HLLCounter(14));
BitmapAggregator bitmapAggregator = new BitmapAggregator();
BitmapCounter bitmapCounter = new BitmapCounter();