You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kylin.apache.org by li...@apache.org on 2017/04/18 08:20:42 UTC
kylin git commit: KYLIN-2552 Fix backward compatibility (Fan Xie)
Repository: kylin
Updated Branches:
refs/heads/2.0.x bcfab74a9 -> 8f59aa89b
KYLIN-2552 Fix backward compatibility (Fan Xie)
Project: http://git-wip-us.apache.org/repos/asf/kylin/repo
Commit: http://git-wip-us.apache.org/repos/asf/kylin/commit/8f59aa89
Tree: http://git-wip-us.apache.org/repos/asf/kylin/tree/8f59aa89
Diff: http://git-wip-us.apache.org/repos/asf/kylin/diff/8f59aa89
Branch: refs/heads/2.0.x
Commit: 8f59aa89bbaa07a4ca5aec5406d60f5d70318af3
Parents: bcfab74
Author: Hongbin Ma <ma...@apache.org>
Authored: Tue Apr 18 16:20:24 2017 +0800
Committer: Hongbin Ma <ma...@apache.org>
Committed: Tue Apr 18 16:20:31 2017 +0800
----------------------------------------------------------------------
.../mr/steps/FactDistinctColumnsMapper.java | 55 ++++++++++++++++++--
1 file changed, 51 insertions(+), 4 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/kylin/blob/8f59aa89/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/FactDistinctColumnsMapper.java
----------------------------------------------------------------------
diff --git a/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/FactDistinctColumnsMapper.java b/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/FactDistinctColumnsMapper.java
index e6cea2b..d36ae18 100644
--- a/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/FactDistinctColumnsMapper.java
+++ b/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/FactDistinctColumnsMapper.java
@@ -24,6 +24,8 @@ import java.util.Collection;
import java.util.List;
import org.apache.hadoop.io.Text;
+import org.apache.kylin.common.KylinVersion;
+import org.apache.kylin.common.util.ByteArray;
import org.apache.kylin.common.util.Bytes;
import org.apache.kylin.common.util.StringUtil;
import org.apache.kylin.cube.cuboid.CuboidScheduler;
@@ -64,6 +66,7 @@ public class FactDistinctColumnsMapper<KEYIN> extends FactDistinctColumnsMapperB
private int samplingPercentage;
//private ByteArray[] row_hashcodes = null;
private long[] rowHashCodesLong = null;
+ private ByteArray[] row_hashcodes = null;
private ByteBuffer tmpbuf;
private static final Text EMPTY_TEXT = new Text();
public static final byte MARK_FOR_PARTITION_COL = (byte) 0xFE;
@@ -74,6 +77,9 @@ public class FactDistinctColumnsMapper<KEYIN> extends FactDistinctColumnsMapperB
private SelfDefineSortableKey sortableKey = new SelfDefineSortableKey();
+ //about details of the new algorithm, please see KYLIN-2518
+ private boolean isUsePutRowKeyToHllNewAlgorithm;
+
@Override
protected void setup(Context context) throws IOException {
super.setup(context);
@@ -96,8 +102,6 @@ public class FactDistinctColumnsMapper<KEYIN> extends FactDistinctColumnsMapperB
allCuboidsHLL[i] = new HLLCounter(cubeDesc.getConfig().getCubeStatsHLLPrecision(), RegisterType.DENSE);
}
- hf = Hashing.murmur3_128();
- rowHashCodesLong = new long[nRowKey];
TblColRef partitionColRef = cubeDesc.getModel().getPartitionDesc().getPartitionDateColumnRef();
if (partitionColRef != null) {
@@ -111,7 +115,23 @@ public class FactDistinctColumnsMapper<KEYIN> extends FactDistinctColumnsMapperB
} else {
needFetchPartitionCol = true;
}
+ //for KYLIN-2518 backward compatibility
+ if (KylinVersion.isBefore200(cubeDesc.getVersion())) {
+ isUsePutRowKeyToHllNewAlgorithm = false;
+ row_hashcodes = new ByteArray[nRowKey];
+ for (int i = 0; i < nRowKey; i++) {
+ row_hashcodes[i] = new ByteArray();
+ }
+ hf = Hashing.murmur3_32();
+ logger.info("Found KylinVersion : {}. Use old algorithm for cuboid sampling.", cubeDesc.getVersion());
+ } else {
+ isUsePutRowKeyToHllNewAlgorithm = true;
+ rowHashCodesLong = new long[nRowKey];
+ hf = Hashing.murmur3_128();
+ logger.info("Found KylinVersion : {}. Use new algorithm for cuboid sampling. About the details of the new algorithm, please refer to KYLIN-2518", cubeDesc.getVersion());
+ }
}
+
}
private void addCuboidBitSet(long cuboidId, List<Integer[]> allCuboidsBitSet, List<Long> allCuboids) {
@@ -176,7 +196,11 @@ public class FactDistinctColumnsMapper<KEYIN> extends FactDistinctColumnsMapperB
if (collectStatistics) {
if (rowCount % 100 < samplingPercentage) {
- putRowKeyToHLL(row);
+ if (isUsePutRowKeyToHllNewAlgorithm) {
+ putRowKeyToHLLNew(row);
+ } else {
+ putRowKeyToHLLOld(row);
+ }
}
if (needFetchPartitionCol == true) {
@@ -208,7 +232,30 @@ public class FactDistinctColumnsMapper<KEYIN> extends FactDistinctColumnsMapperB
return size;
}
- private void putRowKeyToHLL(String[] row) {
+ private void putRowKeyToHLLOld(String[] row) {
+ //generate hash for each row key column
+ for (int i = 0; i < nRowKey; i++) {
+ Hasher hc = hf.newHasher();
+ String colValue = row[intermediateTableDesc.getRowKeyColumnIndexes()[i]];
+ if (colValue != null) {
+ row_hashcodes[i].set(hc.putString(colValue).hash().asBytes());
+ } else {
+ row_hashcodes[i].set(hc.putInt(0).hash().asBytes());
+ }
+ }
+
+ // user the row key column hash to get a consolidated hash for each cuboid
+ for (int i = 0, n = allCuboidsBitSet.length; i < n; i++) {
+ Hasher hc = hf.newHasher();
+ for (int position = 0; position < allCuboidsBitSet[i].length; position++) {
+ hc.putBytes(row_hashcodes[allCuboidsBitSet[i][position]].array());
+ }
+
+ allCuboidsHLL[i].add(hc.hash().asBytes());
+ }
+ }
+
+ private void putRowKeyToHLLNew(String[] row) {
//generate hash for each row key column
for (int i = 0; i < nRowKey; i++) {
Hasher hc = hf.newHasher();