You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kylin.apache.org by li...@apache.org on 2015/06/11 12:07:59 UTC
[2/2] incubator-kylin git commit: KYLIN-752 Improved IN clause
performance
KYLIN-752 Improved IN clause performance
Project: http://git-wip-us.apache.org/repos/asf/incubator-kylin/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-kylin/commit/aaec6801
Tree: http://git-wip-us.apache.org/repos/asf/incubator-kylin/tree/aaec6801
Diff: http://git-wip-us.apache.org/repos/asf/incubator-kylin/diff/aaec6801
Branch: refs/heads/0.7-staging
Commit: aaec680182712ed4906d1548213688de92cc5538
Parents: eaf6685
Author: Li, Yang <ya...@ebay.com>
Authored: Thu Jun 11 17:55:23 2015 +0800
Committer: Li, Yang <ya...@ebay.com>
Committed: Thu Jun 11 18:07:13 2015 +0800
----------------------------------------------------------------------
.../java/org/apache/kylin/dict/Dictionary.java | 32 +++--
.../kylin/storage/hbase/ColumnValueRange.java | 45 +++++-
.../kylin/storage/hbase/CubeStorageEngine.java | 38 +++---
.../hbase/TupleFilterValueOptimizer.java | 136 -------------------
.../storage/hbase/ColumnValueRangeTest.java | 117 ++++++++++++++++
5 files changed, 196 insertions(+), 172 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-kylin/blob/aaec6801/dictionary/src/main/java/org/apache/kylin/dict/Dictionary.java
----------------------------------------------------------------------
diff --git a/dictionary/src/main/java/org/apache/kylin/dict/Dictionary.java b/dictionary/src/main/java/org/apache/kylin/dict/Dictionary.java
index 6a06ed2..3b5e88d 100644
--- a/dictionary/src/main/java/org/apache/kylin/dict/Dictionary.java
+++ b/dictionary/src/main/java/org/apache/kylin/dict/Dictionary.java
@@ -66,22 +66,24 @@ abstract public class Dictionary<T> implements Writable {
/**
* Convenient form of <code>getIdFromValue(value, 0)</code>
*/
- final public int getIdFromValue(T value) {
+ final public int getIdFromValue(T value) throws IllegalArgumentException {
return getIdFromValue(value, 0);
}
/**
- * Returns the ID integer of given value. In case of not found - if
- * roundingFlag=0, throw IllegalArgumentException; - if roundingFlag<0, the
- * closest smaller ID integer if exist; - if roundingFlag>0, the closest
- * bigger ID integer if exist. The implementation often has cache, thus
- * faster than the byte[] version getIdFromValueBytes()
+ * Returns the ID integer of given value. In case of not found
+ * <p>
+ * - if roundingFlag=0, throw IllegalArgumentException; <br>
+ * - if roundingFlag<0, the closest smaller ID integer if exist; <br>
+ * - if roundingFlag>0, the closest bigger ID integer if exist. <br>
+ * <p>
+ * The implementation often has cache, thus faster than the byte[] version getIdFromValueBytes()
*
* @throws IllegalArgumentException
* if value is not found in dictionary and rounding is off or
* failed
*/
- final public int getIdFromValue(T value, int roundingFlag) {
+ final public int getIdFromValue(T value, int roundingFlag) throws IllegalArgumentException {
if (isNullObjectForm(value))
return nullId();
else
@@ -117,16 +119,16 @@ abstract public class Dictionary<T> implements Writable {
}
/**
- * A lower level API, return ID integer from raw value bytes. In case of not
- * found - if roundingFlag=0, throw IllegalArgumentException; - if
- * roundingFlag<0, the closest smaller ID integer if exist; - if
- * roundingFlag>0, the closest bigger ID integer if exist. Bypassing the
- * cache layer, this could be significantly slower than getIdFromValue(T
- * value).
+ * A lower level API, return ID integer from raw value bytes. In case of not found
+ * <p>
+ * - if roundingFlag=0, throw IllegalArgumentException; <br>
+ * - if roundingFlag<0, the closest smaller ID integer if exist; <br>
+ * - if roundingFlag>0, the closest bigger ID integer if exist. <br>
+ * <p>
+ * Bypassing the cache layer, this could be significantly slower than getIdFromValue(T value).
*
* @throws IllegalArgumentException
- * if value is not found in dictionary and rounding is off or
- * failed
+ * if value is not found in dictionary and rounding is off or failed
*/
final public int getIdFromValueBytes(byte[] value, int offset, int len, int roundingFlag) {
if (isNullByteForm(value, offset, len))
http://git-wip-us.apache.org/repos/asf/incubator-kylin/blob/aaec6801/storage/src/main/java/org/apache/kylin/storage/hbase/ColumnValueRange.java
----------------------------------------------------------------------
diff --git a/storage/src/main/java/org/apache/kylin/storage/hbase/ColumnValueRange.java b/storage/src/main/java/org/apache/kylin/storage/hbase/ColumnValueRange.java
index 831bf3d..d5224c5 100644
--- a/storage/src/main/java/org/apache/kylin/storage/hbase/ColumnValueRange.java
+++ b/storage/src/main/java/org/apache/kylin/storage/hbase/ColumnValueRange.java
@@ -20,17 +20,17 @@ package org.apache.kylin.storage.hbase;
import java.util.Collection;
import java.util.HashSet;
+import java.util.Iterator;
import java.util.Set;
import com.google.common.collect.Sets;
+
import org.apache.kylin.cube.kv.RowKeyColumnOrder;
+import org.apache.kylin.dict.Dictionary;
import org.apache.kylin.metadata.model.TblColRef;
import org.apache.kylin.metadata.filter.TupleFilter.FilterOperatorEnum;
/**
- *
- * @author xjiang
- *
*/
public class ColumnValueRange {
private TblColRef column;
@@ -163,6 +163,44 @@ public class ColumnValueRange {
return (beginValue == null || order.compare(beginValue, v) <= 0) && (endValue == null || order.compare(v, endValue) <= 0);
}
+ // remove invalid EQ/IN values and round start/end according to dictionary
+ public void preEvaluateWithDict(Dictionary<String> dict) {
+ if (dict == null)
+ return;
+
+ if (equalValues != null) {
+ Iterator<String> it = equalValues.iterator();
+ while (it.hasNext()) {
+ String v = it.next();
+ try {
+ dict.getIdFromValue(v);
+ } catch (IllegalArgumentException e) {
+ // value not in dictionary
+ it.remove();
+ }
+ }
+ refreshBeginEndFromEquals();
+ }
+
+ if (beginValue != null) {
+ try {
+ beginValue = dict.getValueFromId(dict.getIdFromValue(beginValue, -1));
+ } catch (IllegalArgumentException e) {
+ // value is less than the smallest in dictionary
+ beginValue = null;
+ }
+ }
+
+ if (endValue != null) {
+ try {
+ endValue = dict.getValueFromId(dict.getIdFromValue(endValue, 1));
+ } catch (IllegalArgumentException e) {
+ // value is greater than the biggest in dictionary
+ endValue = null;
+ }
+ }
+ }
+
public String toString() {
if (equalValues == null) {
return column.getName() + " between " + beginValue + " and " + endValue;
@@ -170,4 +208,5 @@ public class ColumnValueRange {
return column.getName() + " in " + equalValues;
}
}
+
}
http://git-wip-us.apache.org/repos/asf/incubator-kylin/blob/aaec6801/storage/src/main/java/org/apache/kylin/storage/hbase/CubeStorageEngine.java
----------------------------------------------------------------------
diff --git a/storage/src/main/java/org/apache/kylin/storage/hbase/CubeStorageEngine.java b/storage/src/main/java/org/apache/kylin/storage/hbase/CubeStorageEngine.java
index 06e1cd2..9b11756 100644
--- a/storage/src/main/java/org/apache/kylin/storage/hbase/CubeStorageEngine.java
+++ b/storage/src/main/java/org/apache/kylin/storage/hbase/CubeStorageEngine.java
@@ -26,7 +26,6 @@ import org.apache.kylin.metadata.model.SegmentStatusEnum;
import org.apache.kylin.metadata.model.TblColRef;
import org.apache.kylin.metadata.realization.SQLDigest;
import org.apache.kylin.storage.hbase.coprocessor.observer.ObserverEnabler;
-
import org.apache.hadoop.hbase.client.HConnection;
import org.apache.kylin.common.util.Bytes;
import org.apache.kylin.common.util.Pair;
@@ -37,6 +36,7 @@ import org.slf4j.LoggerFactory;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
+
import org.apache.kylin.common.persistence.HBaseConnection;
import org.apache.kylin.cube.CubeInstance;
import org.apache.kylin.cube.CubeManager;
@@ -47,6 +47,7 @@ import org.apache.kylin.cube.model.CubeDesc;
import org.apache.kylin.cube.model.HBaseColumnDesc;
import org.apache.kylin.cube.model.HBaseMappingDesc;
import org.apache.kylin.cube.model.CubeDesc.DeriveInfo;
+import org.apache.kylin.dict.Dictionary;
import org.apache.kylin.dict.lookup.LookupStringTable;
import org.apache.kylin.storage.IStorageEngine;
import org.apache.kylin.metadata.filter.ColumnTupleFilter;
@@ -384,8 +385,7 @@ public class CubeStorageEngine implements IStorageEngine {
// build row key range for each cube segment
for (CubeSegment cubeSeg : cubeInstance.getSegments(SegmentStatusEnum.READY)) {
- // consider derived (lookup snapshot), filter on dimension may
- // differ per segment
+ // consider derived (lookup snapshot), filter on dimension may differ per segment
List<Collection<ColumnValueRange>> orAndDimRanges = translateToOrAndDimRanges(flatFilter, cubeSeg);
if (orAndDimRanges == null) { // has conflict
continue;
@@ -421,8 +421,7 @@ public class CubeStorageEngine implements IStorageEngine {
}
Collection<ColumnValueRange> andRanges = translateToAndDimRanges(andFilter.getChildren(), cubeSegment);
- // ignore the empty-AND
- if (andRanges != null && !andRanges.isEmpty()) {
+ if (andRanges != null) {
result.add(andRanges);
}
}
@@ -458,9 +457,10 @@ public class CubeStorageEngine implements IStorageEngine {
return orAndRanges;
}
+ // return empty collection to mean true; return null to mean false
+ @SuppressWarnings("unchecked")
private Collection<ColumnValueRange> translateToAndDimRanges(List<? extends TupleFilter> andFilters, CubeSegment cubeSegment) {
Map<TblColRef, ColumnValueRange> rangeMap = new HashMap<TblColRef, ColumnValueRange>();
- boolean isEmptyAnd = false;
for (TupleFilter filter : andFilters) {
if ((filter instanceof CompareTupleFilter) == false) {
continue;
@@ -471,20 +471,22 @@ public class CubeStorageEngine implements IStorageEngine {
continue;
}
- // optimize the values of tuple filter
- Collection<String> newValues = TupleFilterValueOptimizer.doOptimization(cubeSegment, comp.getColumn(), comp);
- // in case the current filter is an empty-AND, do not generate ColumnValueRange
- // and also set isEmptyAnd to true so that an empty AND-list will be returned
- if (TupleFilterValueOptimizer.isEmptyAnd(comp, newValues)) {
- isEmptyAnd = true;
- break;
- }
-
- ColumnValueRange range = new ColumnValueRange(comp.getColumn(), newValues, comp.getOperator());
+ ColumnValueRange range = new ColumnValueRange(comp.getColumn(), comp.getValues(), comp.getOperator());
andMerge(range, rangeMap);
-
}
- return isEmptyAnd ? Collections.<ColumnValueRange> emptyList() : rangeMap.values();
+
+ // a little pre-evaluation to remove invalid EQ/IN values and round start/end according to dictionary
+ Iterator<ColumnValueRange> it = rangeMap.values().iterator();
+ while (it.hasNext()) {
+ ColumnValueRange range = it.next();
+ range.preEvaluateWithDict((Dictionary<String>) cubeSegment.getDictionary(range.getColumn()));
+ if (range.satisfyAll())
+ it.remove();
+ else if (range.satisfyNone())
+ return null;
+ }
+
+ return rangeMap.values();
}
private void andMerge(ColumnValueRange range, Map<TblColRef, ColumnValueRange> rangeMap) {
http://git-wip-us.apache.org/repos/asf/incubator-kylin/blob/aaec6801/storage/src/main/java/org/apache/kylin/storage/hbase/TupleFilterValueOptimizer.java
----------------------------------------------------------------------
diff --git a/storage/src/main/java/org/apache/kylin/storage/hbase/TupleFilterValueOptimizer.java b/storage/src/main/java/org/apache/kylin/storage/hbase/TupleFilterValueOptimizer.java
deleted file mode 100644
index 49d3143..0000000
--- a/storage/src/main/java/org/apache/kylin/storage/hbase/TupleFilterValueOptimizer.java
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
-*/
-
-package org.apache.kylin.storage.hbase;
-
-import org.apache.kylin.cube.CubeSegment;
-import org.apache.kylin.cube.kv.RowKeyColumnIO;
-import org.apache.kylin.dict.Dictionary;
-import org.apache.kylin.metadata.model.TblColRef;
-import org.apache.kylin.metadata.filter.CompareTupleFilter;
-import org.apache.kylin.metadata.filter.TupleFilter;
-
-import java.util.Collection;
-import java.util.LinkedList;
-
-/**
- * @author Huang, Hua
- */
-public class TupleFilterValueOptimizer {
-
- private static boolean isInDictionary(Dictionary<String> dict, String value) {
- boolean inFlag = true;
- try {
- int id = dict.getIdFromValue(value, 0);
- } catch (IllegalArgumentException ex) {
- inFlag = false;
- }
-
- return inFlag;
- }
-
- private static Collection<String> removeNonDictionaryValues(CubeSegment cubeSegment, TblColRef column, Collection<String> values) {
- RowKeyColumnIO rowKeyColumnIO = new RowKeyColumnIO(cubeSegment);
-
- Dictionary<String> dict = rowKeyColumnIO.getDictionary(column);
- // in case that dict is null, just return values
- if (dict == null) return values;
-
- Collection<String> newValues = new LinkedList<String>();
- for (String value : values) {
- if (isInDictionary(dict, value)) newValues.add(value);
- }
-
- return newValues;
- }
-
- private static Collection<String> roundDictionaryValues(CubeSegment cubeSegment, TblColRef column, Collection<String> values, int roundingFlag) {
- RowKeyColumnIO rowKeyColumnIO = new RowKeyColumnIO(cubeSegment);
-
- Dictionary<String> dict = rowKeyColumnIO.getDictionary(column);
- // in case that dict is null, just return values
- if (dict == null) return values;
-
- Collection<String> newValues = new LinkedList<String>();
- for (String value : values) {
- if (isInDictionary(dict, value)) {
- newValues.add(value);
- }
- else {
- try {
- int id = dict.getIdFromValue(value, roundingFlag);
- String newValue = dict.getValueFromId(id);
- newValues.add(newValue);
- } catch (IllegalArgumentException ex) {
- }
- }
- }
-
- return newValues;
- }
-
- private static Collection<String> optimizeCompareTupleFilter(CubeSegment cubeSegment, TblColRef column, CompareTupleFilter comp) {
- Collection<String> newValues = comp.getValues();
- switch (comp.getOperator()) {
- case EQ:
- case IN:
- newValues = removeNonDictionaryValues(cubeSegment, column, comp.getValues());
- break;
- case LT:
- case LTE:
- newValues = roundDictionaryValues(cubeSegment, column, comp.getValues(), -1);
- break;
- case GT:
- case GTE:
- newValues = roundDictionaryValues(cubeSegment, column, comp.getValues(), 1);
- break;
- default:
- break;
- }
-
- return newValues;
- }
-
- public static Collection<String> doOptimization(CubeSegment cubeSegment, TblColRef column, TupleFilter filter) {
- if (filter instanceof CompareTupleFilter) {
- return optimizeCompareTupleFilter(cubeSegment, column, (CompareTupleFilter)filter);
- }
-
- return filter.getValues();
- }
-
- public static boolean isEmptyAnd(TupleFilter filter, Collection<String> values) {
- boolean isEmptyAnd = false;
- switch (filter.getOperator()) {
- case EQ:
- case IN:
- case LT:
- case LTE:
- case GT:
- case GTE:
- if (values == null || values.isEmpty()) {
- isEmptyAnd = true;
- }
- break;
- default:
- break;
- }
-
- return isEmptyAnd;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-kylin/blob/aaec6801/storage/src/test/java/org/apache/kylin/storage/hbase/ColumnValueRangeTest.java
----------------------------------------------------------------------
diff --git a/storage/src/test/java/org/apache/kylin/storage/hbase/ColumnValueRangeTest.java b/storage/src/test/java/org/apache/kylin/storage/hbase/ColumnValueRangeTest.java
new file mode 100644
index 0000000..47b81d3
--- /dev/null
+++ b/storage/src/test/java/org/apache/kylin/storage/hbase/ColumnValueRangeTest.java
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package org.apache.kylin.storage.hbase;
+
+import static org.junit.Assert.*;
+
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.kylin.dict.Dictionary;
+import org.apache.kylin.dict.StringBytesConverter;
+import org.apache.kylin.dict.TrieDictionaryBuilder;
+import org.apache.kylin.metadata.filter.TupleFilter.FilterOperatorEnum;
+import org.apache.kylin.metadata.model.ColumnDesc;
+import org.apache.kylin.metadata.model.TableDesc;
+import org.apache.kylin.metadata.model.TblColRef;
+import org.junit.Test;
+
+public class ColumnValueRangeTest {
+
+ @Test
+ public void testPreEvaluateWithDict() {
+ TblColRef col = mockupTblColRef();
+ Dictionary<String> dict = mockupDictionary(col, "CN", "US");
+
+ ColumnValueRange r1 = new ColumnValueRange(col, set("CN", "US", "Other"), FilterOperatorEnum.EQ);
+ r1.preEvaluateWithDict(dict);
+ assertEquals(set("CN", "US"), r1.getEqualValues());
+
+ {
+ ColumnValueRange r2 = new ColumnValueRange(col, set("CN"), FilterOperatorEnum.LT);
+ r2.preEvaluateWithDict(dict);
+ assertEquals(null, r2.getBeginValue());
+ assertEquals("CN", r2.getEndValue());
+
+ ColumnValueRange r3 = new ColumnValueRange(col, set("Other"), FilterOperatorEnum.LT);
+ r3.preEvaluateWithDict(dict);
+ assertEquals(null, r3.getBeginValue());
+ assertEquals("US", r3.getEndValue());
+
+ ColumnValueRange r4 = new ColumnValueRange(col, set("UT"), FilterOperatorEnum.LT);
+ r4.preEvaluateWithDict(dict);
+ assertEquals(null, r4.getBeginValue());
+ assertEquals(null, r4.getEndValue());
+ }
+
+ {
+ ColumnValueRange r2 = new ColumnValueRange(col, set("CN"), FilterOperatorEnum.GTE);
+ r2.preEvaluateWithDict(dict);
+ assertEquals("CN", r2.getBeginValue());
+ assertEquals(null, r2.getEndValue());
+
+ ColumnValueRange r3 = new ColumnValueRange(col, set("Other"), FilterOperatorEnum.GTE);
+ r3.preEvaluateWithDict(dict);
+ assertEquals("CN", r3.getBeginValue());
+ assertEquals(null, r3.getEndValue());
+
+ ColumnValueRange r4 = new ColumnValueRange(col, set("CI"), FilterOperatorEnum.GTE);
+ r4.preEvaluateWithDict(dict);
+ assertEquals(null, r4.getBeginValue());
+ assertEquals(null, r4.getEndValue());
+ }
+ }
+
+ public static Dictionary<String> mockupDictionary(TblColRef col, String... values) {
+ TrieDictionaryBuilder<String> builder = new TrieDictionaryBuilder<String>(new StringBytesConverter());
+ for (String v : values) {
+ builder.addValue(v);
+ }
+ return builder.build(0);
+ }
+
+ private static Set<String> set(String... values) {
+ HashSet<String> list = new HashSet<String>();
+ list.addAll(Arrays.asList(values));
+ return list;
+ }
+
+ public static TblColRef mockupTblColRef() {
+ TableDesc t = mockupTableDesc("table_a");
+ ColumnDesc c = mockupColumnDesc(t, 1, "col_1", "string");
+ return new TblColRef(c);
+ }
+
+ private static TableDesc mockupTableDesc(String tableName) {
+ TableDesc mockup = new TableDesc();
+ mockup.setName(tableName);
+ return mockup;
+ }
+
+ private static ColumnDesc mockupColumnDesc(TableDesc table, int oneBasedColumnIndex, String name, String datatype) {
+ ColumnDesc desc = new ColumnDesc();
+ String id = "" + oneBasedColumnIndex;
+ desc.setId(id);
+ desc.setName(name);
+ desc.setDatatype(datatype);
+ desc.init(table);
+ return desc;
+ }
+}