You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kylin.apache.org by li...@apache.org on 2015/06/11 12:07:59 UTC

[2/2] incubator-kylin git commit: KYLIN-752 Improved IN clause performance

KYLIN-752 Improved IN clause performance


Project: http://git-wip-us.apache.org/repos/asf/incubator-kylin/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-kylin/commit/aaec6801
Tree: http://git-wip-us.apache.org/repos/asf/incubator-kylin/tree/aaec6801
Diff: http://git-wip-us.apache.org/repos/asf/incubator-kylin/diff/aaec6801

Branch: refs/heads/0.7-staging
Commit: aaec680182712ed4906d1548213688de92cc5538
Parents: eaf6685
Author: Li, Yang <ya...@ebay.com>
Authored: Thu Jun 11 17:55:23 2015 +0800
Committer: Li, Yang <ya...@ebay.com>
Committed: Thu Jun 11 18:07:13 2015 +0800

----------------------------------------------------------------------
 .../java/org/apache/kylin/dict/Dictionary.java  |  32 +++--
 .../kylin/storage/hbase/ColumnValueRange.java   |  45 +++++-
 .../kylin/storage/hbase/CubeStorageEngine.java  |  38 +++---
 .../hbase/TupleFilterValueOptimizer.java        | 136 -------------------
 .../storage/hbase/ColumnValueRangeTest.java     | 117 ++++++++++++++++
 5 files changed, 196 insertions(+), 172 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-kylin/blob/aaec6801/dictionary/src/main/java/org/apache/kylin/dict/Dictionary.java
----------------------------------------------------------------------
diff --git a/dictionary/src/main/java/org/apache/kylin/dict/Dictionary.java b/dictionary/src/main/java/org/apache/kylin/dict/Dictionary.java
index 6a06ed2..3b5e88d 100644
--- a/dictionary/src/main/java/org/apache/kylin/dict/Dictionary.java
+++ b/dictionary/src/main/java/org/apache/kylin/dict/Dictionary.java
@@ -66,22 +66,24 @@ abstract public class Dictionary<T> implements Writable {
     /**
      * Convenient form of <code>getIdFromValue(value, 0)</code>
      */
-    final public int getIdFromValue(T value) {
+    final public int getIdFromValue(T value) throws IllegalArgumentException {
         return getIdFromValue(value, 0);
     }
 
     /**
-     * Returns the ID integer of given value. In case of not found - if
-     * roundingFlag=0, throw IllegalArgumentException; - if roundingFlag<0, the
-     * closest smaller ID integer if exist; - if roundingFlag>0, the closest
-     * bigger ID integer if exist. The implementation often has cache, thus
-     * faster than the byte[] version getIdFromValueBytes()
+     * Returns the ID integer of given value. In case of not found
+     * <p>
+     * - if roundingFlag=0, throw IllegalArgumentException; <br>
+     * - if roundingFlag<0, the closest smaller ID integer if exist; <br>
+     * - if roundingFlag>0, the closest bigger ID integer if exist. <br>
+     * <p>
+     * The implementation often has cache, thus faster than the byte[] version getIdFromValueBytes()
      * 
      * @throws IllegalArgumentException
      *             if value is not found in dictionary and rounding is off or
      *             failed
      */
-    final public int getIdFromValue(T value, int roundingFlag) {
+    final public int getIdFromValue(T value, int roundingFlag) throws IllegalArgumentException {
         if (isNullObjectForm(value))
             return nullId();
         else
@@ -117,16 +119,16 @@ abstract public class Dictionary<T> implements Writable {
     }
 
     /**
-     * A lower level API, return ID integer from raw value bytes. In case of not
-     * found - if roundingFlag=0, throw IllegalArgumentException; - if
-     * roundingFlag<0, the closest smaller ID integer if exist; - if
-     * roundingFlag>0, the closest bigger ID integer if exist. Bypassing the
-     * cache layer, this could be significantly slower than getIdFromValue(T
-     * value).
+     * A lower level API, return ID integer from raw value bytes. In case of not found
+     * <p>
+     * - if roundingFlag=0, throw IllegalArgumentException; <br>
+     * - if roundingFlag<0, the closest smaller ID integer if exist; <br> 
+     * - if roundingFlag>0, the closest bigger ID integer if exist. <br>
+     * <p>
+     * Bypassing the cache layer, this could be significantly slower than getIdFromValue(T value).
      * 
      * @throws IllegalArgumentException
-     *             if value is not found in dictionary and rounding is off or
-     *             failed
+     *             if value is not found in dictionary and rounding is off or failed
      */
     final public int getIdFromValueBytes(byte[] value, int offset, int len, int roundingFlag) {
         if (isNullByteForm(value, offset, len))

http://git-wip-us.apache.org/repos/asf/incubator-kylin/blob/aaec6801/storage/src/main/java/org/apache/kylin/storage/hbase/ColumnValueRange.java
----------------------------------------------------------------------
diff --git a/storage/src/main/java/org/apache/kylin/storage/hbase/ColumnValueRange.java b/storage/src/main/java/org/apache/kylin/storage/hbase/ColumnValueRange.java
index 831bf3d..d5224c5 100644
--- a/storage/src/main/java/org/apache/kylin/storage/hbase/ColumnValueRange.java
+++ b/storage/src/main/java/org/apache/kylin/storage/hbase/ColumnValueRange.java
@@ -20,17 +20,17 @@ package org.apache.kylin.storage.hbase;
 
 import java.util.Collection;
 import java.util.HashSet;
+import java.util.Iterator;
 import java.util.Set;
 
 import com.google.common.collect.Sets;
+
 import org.apache.kylin.cube.kv.RowKeyColumnOrder;
+import org.apache.kylin.dict.Dictionary;
 import org.apache.kylin.metadata.model.TblColRef;
 import org.apache.kylin.metadata.filter.TupleFilter.FilterOperatorEnum;
 
 /**
- * 
- * @author xjiang
- * 
  */
 public class ColumnValueRange {
     private TblColRef column;
@@ -163,6 +163,44 @@ public class ColumnValueRange {
         return (beginValue == null || order.compare(beginValue, v) <= 0) && (endValue == null || order.compare(v, endValue) <= 0);
     }
 
+    // remove invalid EQ/IN values and round start/end according to dictionary
+    public void preEvaluateWithDict(Dictionary<String> dict) {
+        if (dict == null)
+            return;
+        
+        if (equalValues != null) {
+            Iterator<String> it = equalValues.iterator();
+            while (it.hasNext()) {
+                String v = it.next();
+                try {
+                    dict.getIdFromValue(v);
+                } catch (IllegalArgumentException e) {
+                    // value not in dictionary
+                    it.remove();
+                }
+            }
+            refreshBeginEndFromEquals();
+        }
+        
+        if (beginValue != null) {
+            try {
+                beginValue = dict.getValueFromId(dict.getIdFromValue(beginValue, -1));
+            } catch (IllegalArgumentException e) {
+                // value is less than the smallest in dictionary
+                beginValue = null;
+            }
+        }
+        
+        if (endValue != null) {
+            try {
+                endValue = dict.getValueFromId(dict.getIdFromValue(endValue, 1));
+            } catch (IllegalArgumentException e) {
+                // value is greater than the biggest in dictionary
+                endValue = null;
+            }
+        }
+    }
+
     public String toString() {
         if (equalValues == null) {
             return column.getName() + " between " + beginValue + " and " + endValue;
@@ -170,4 +208,5 @@ public class ColumnValueRange {
             return column.getName() + " in " + equalValues;
         }
     }
+
 }

http://git-wip-us.apache.org/repos/asf/incubator-kylin/blob/aaec6801/storage/src/main/java/org/apache/kylin/storage/hbase/CubeStorageEngine.java
----------------------------------------------------------------------
diff --git a/storage/src/main/java/org/apache/kylin/storage/hbase/CubeStorageEngine.java b/storage/src/main/java/org/apache/kylin/storage/hbase/CubeStorageEngine.java
index 06e1cd2..9b11756 100644
--- a/storage/src/main/java/org/apache/kylin/storage/hbase/CubeStorageEngine.java
+++ b/storage/src/main/java/org/apache/kylin/storage/hbase/CubeStorageEngine.java
@@ -26,7 +26,6 @@ import org.apache.kylin.metadata.model.SegmentStatusEnum;
 import org.apache.kylin.metadata.model.TblColRef;
 import org.apache.kylin.metadata.realization.SQLDigest;
 import org.apache.kylin.storage.hbase.coprocessor.observer.ObserverEnabler;
-
 import org.apache.hadoop.hbase.client.HConnection;
 import org.apache.kylin.common.util.Bytes;
 import org.apache.kylin.common.util.Pair;
@@ -37,6 +36,7 @@ import org.slf4j.LoggerFactory;
 import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
 import com.google.common.collect.Sets;
+
 import org.apache.kylin.common.persistence.HBaseConnection;
 import org.apache.kylin.cube.CubeInstance;
 import org.apache.kylin.cube.CubeManager;
@@ -47,6 +47,7 @@ import org.apache.kylin.cube.model.CubeDesc;
 import org.apache.kylin.cube.model.HBaseColumnDesc;
 import org.apache.kylin.cube.model.HBaseMappingDesc;
 import org.apache.kylin.cube.model.CubeDesc.DeriveInfo;
+import org.apache.kylin.dict.Dictionary;
 import org.apache.kylin.dict.lookup.LookupStringTable;
 import org.apache.kylin.storage.IStorageEngine;
 import org.apache.kylin.metadata.filter.ColumnTupleFilter;
@@ -384,8 +385,7 @@ public class CubeStorageEngine implements IStorageEngine {
         // build row key range for each cube segment
         for (CubeSegment cubeSeg : cubeInstance.getSegments(SegmentStatusEnum.READY)) {
 
-            // consider derived (lookup snapshot), filter on dimension may
-            // differ per segment
+            // consider derived (lookup snapshot), filter on dimension may differ per segment
             List<Collection<ColumnValueRange>> orAndDimRanges = translateToOrAndDimRanges(flatFilter, cubeSeg);
             if (orAndDimRanges == null) { // has conflict
                 continue;
@@ -421,8 +421,7 @@ public class CubeStorageEngine implements IStorageEngine {
             }
 
             Collection<ColumnValueRange> andRanges = translateToAndDimRanges(andFilter.getChildren(), cubeSegment);
-            // ignore the empty-AND
-            if (andRanges != null && !andRanges.isEmpty()) {
+            if (andRanges != null) {
                 result.add(andRanges);
             }
         }
@@ -458,9 +457,10 @@ public class CubeStorageEngine implements IStorageEngine {
         return orAndRanges;
     }
 
+    // return empty collection to mean true; return null to mean false
+    @SuppressWarnings("unchecked")
     private Collection<ColumnValueRange> translateToAndDimRanges(List<? extends TupleFilter> andFilters, CubeSegment cubeSegment) {
         Map<TblColRef, ColumnValueRange> rangeMap = new HashMap<TblColRef, ColumnValueRange>();
-        boolean isEmptyAnd = false;
         for (TupleFilter filter : andFilters) {
             if ((filter instanceof CompareTupleFilter) == false) {
                 continue;
@@ -471,20 +471,22 @@ public class CubeStorageEngine implements IStorageEngine {
                 continue;
             }
 
-            // optimize the values of tuple filter
-            Collection<String> newValues = TupleFilterValueOptimizer.doOptimization(cubeSegment, comp.getColumn(), comp);
-            // in case the current filter is an empty-AND, do not generate ColumnValueRange
-            // and also set isEmptyAnd to true so that an empty AND-list will be returned
-            if (TupleFilterValueOptimizer.isEmptyAnd(comp, newValues)) {
-                isEmptyAnd = true;
-                break;
-            }
-
-            ColumnValueRange range = new ColumnValueRange(comp.getColumn(), newValues, comp.getOperator());
+            ColumnValueRange range = new ColumnValueRange(comp.getColumn(), comp.getValues(), comp.getOperator());
             andMerge(range, rangeMap);
-
         }
-        return isEmptyAnd ? Collections.<ColumnValueRange> emptyList() : rangeMap.values();
+        
+        // a little pre-evaluation to remove invalid EQ/IN values and round start/end according to dictionary
+        Iterator<ColumnValueRange> it = rangeMap.values().iterator();
+        while (it.hasNext()) {
+            ColumnValueRange range = it.next();
+            range.preEvaluateWithDict((Dictionary<String>) cubeSegment.getDictionary(range.getColumn()));
+            if (range.satisfyAll())
+                it.remove();
+            else if (range.satisfyNone())
+                return null;
+        }
+        
+        return rangeMap.values();
     }
 
     private void andMerge(ColumnValueRange range, Map<TblColRef, ColumnValueRange> rangeMap) {

http://git-wip-us.apache.org/repos/asf/incubator-kylin/blob/aaec6801/storage/src/main/java/org/apache/kylin/storage/hbase/TupleFilterValueOptimizer.java
----------------------------------------------------------------------
diff --git a/storage/src/main/java/org/apache/kylin/storage/hbase/TupleFilterValueOptimizer.java b/storage/src/main/java/org/apache/kylin/storage/hbase/TupleFilterValueOptimizer.java
deleted file mode 100644
index 49d3143..0000000
--- a/storage/src/main/java/org/apache/kylin/storage/hbase/TupleFilterValueOptimizer.java
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
-*/
-
-package org.apache.kylin.storage.hbase;
-
-import org.apache.kylin.cube.CubeSegment;
-import org.apache.kylin.cube.kv.RowKeyColumnIO;
-import org.apache.kylin.dict.Dictionary;
-import org.apache.kylin.metadata.model.TblColRef;
-import org.apache.kylin.metadata.filter.CompareTupleFilter;
-import org.apache.kylin.metadata.filter.TupleFilter;
-
-import java.util.Collection;
-import java.util.LinkedList;
-
-/**
- * @author Huang, Hua
- */
-public class TupleFilterValueOptimizer {
-
-    private static boolean isInDictionary(Dictionary<String> dict, String value) {
-        boolean inFlag = true;
-        try {
-            int id = dict.getIdFromValue(value, 0);
-        } catch (IllegalArgumentException ex) {
-            inFlag = false;
-        }
-
-        return inFlag;
-    }
-
-    private static Collection<String> removeNonDictionaryValues(CubeSegment cubeSegment, TblColRef column, Collection<String> values) {
-        RowKeyColumnIO rowKeyColumnIO = new RowKeyColumnIO(cubeSegment);
-
-        Dictionary<String> dict = rowKeyColumnIO.getDictionary(column);
-        // in case that dict is null, just return values
-        if (dict == null) return values;
-
-        Collection<String> newValues = new LinkedList<String>();
-        for (String value : values) {
-            if (isInDictionary(dict, value)) newValues.add(value);
-        }
-
-        return newValues;
-    }
-
-    private static Collection<String> roundDictionaryValues(CubeSegment cubeSegment, TblColRef column, Collection<String> values, int roundingFlag) {
-        RowKeyColumnIO rowKeyColumnIO = new RowKeyColumnIO(cubeSegment);
-
-        Dictionary<String> dict = rowKeyColumnIO.getDictionary(column);
-        // in case that dict is null, just return values
-        if (dict == null) return values;
-
-        Collection<String> newValues = new LinkedList<String>();
-        for (String value : values) {
-            if (isInDictionary(dict, value)) {
-                newValues.add(value);
-            }
-            else {
-                try {
-                    int id = dict.getIdFromValue(value, roundingFlag);
-                    String newValue = dict.getValueFromId(id);
-                    newValues.add(newValue);
-                } catch (IllegalArgumentException ex) {
-                }
-            }
-        }
-
-        return newValues;
-    }
-
-    private static Collection<String> optimizeCompareTupleFilter(CubeSegment cubeSegment, TblColRef column, CompareTupleFilter comp) {
-        Collection<String> newValues = comp.getValues();
-        switch (comp.getOperator()) {
-            case EQ:
-            case IN:
-                newValues = removeNonDictionaryValues(cubeSegment, column, comp.getValues());
-                break;
-            case LT:
-            case LTE:
-                newValues = roundDictionaryValues(cubeSegment, column, comp.getValues(), -1);
-                break;
-            case GT:
-            case GTE:
-                newValues = roundDictionaryValues(cubeSegment, column, comp.getValues(), 1);
-                break;
-            default:
-                break;
-        }
-
-        return newValues;
-    }
-
-    public static Collection<String> doOptimization(CubeSegment cubeSegment, TblColRef column, TupleFilter filter) {
-        if (filter instanceof CompareTupleFilter) {
-            return optimizeCompareTupleFilter(cubeSegment, column, (CompareTupleFilter)filter);
-        }
-
-        return filter.getValues();
-    }
-
-    public static boolean isEmptyAnd(TupleFilter filter, Collection<String> values) {
-        boolean isEmptyAnd = false;
-        switch (filter.getOperator()) {
-            case EQ:
-            case IN:
-            case LT:
-            case LTE:
-            case GT:
-            case GTE:
-                if (values == null || values.isEmpty()) {
-                    isEmptyAnd = true;
-                }
-                break;
-            default:
-                break;
-        }
-
-        return isEmptyAnd;
-    }
-}

http://git-wip-us.apache.org/repos/asf/incubator-kylin/blob/aaec6801/storage/src/test/java/org/apache/kylin/storage/hbase/ColumnValueRangeTest.java
----------------------------------------------------------------------
diff --git a/storage/src/test/java/org/apache/kylin/storage/hbase/ColumnValueRangeTest.java b/storage/src/test/java/org/apache/kylin/storage/hbase/ColumnValueRangeTest.java
new file mode 100644
index 0000000..47b81d3
--- /dev/null
+++ b/storage/src/test/java/org/apache/kylin/storage/hbase/ColumnValueRangeTest.java
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+package org.apache.kylin.storage.hbase;
+
+import static org.junit.Assert.*;
+
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.kylin.dict.Dictionary;
+import org.apache.kylin.dict.StringBytesConverter;
+import org.apache.kylin.dict.TrieDictionaryBuilder;
+import org.apache.kylin.metadata.filter.TupleFilter.FilterOperatorEnum;
+import org.apache.kylin.metadata.model.ColumnDesc;
+import org.apache.kylin.metadata.model.TableDesc;
+import org.apache.kylin.metadata.model.TblColRef;
+import org.junit.Test;
+
+public class ColumnValueRangeTest {
+
+    @Test
+    public void testPreEvaluateWithDict() {
+        TblColRef col = mockupTblColRef();
+        Dictionary<String> dict = mockupDictionary(col, "CN", "US");
+
+        ColumnValueRange r1 = new ColumnValueRange(col, set("CN", "US", "Other"), FilterOperatorEnum.EQ);
+        r1.preEvaluateWithDict(dict);
+        assertEquals(set("CN", "US"), r1.getEqualValues());
+
+        {
+            ColumnValueRange r2 = new ColumnValueRange(col, set("CN"), FilterOperatorEnum.LT);
+            r2.preEvaluateWithDict(dict);
+            assertEquals(null, r2.getBeginValue());
+            assertEquals("CN", r2.getEndValue());
+
+            ColumnValueRange r3 = new ColumnValueRange(col, set("Other"), FilterOperatorEnum.LT);
+            r3.preEvaluateWithDict(dict);
+            assertEquals(null, r3.getBeginValue());
+            assertEquals("US", r3.getEndValue());
+
+            ColumnValueRange r4 = new ColumnValueRange(col, set("UT"), FilterOperatorEnum.LT);
+            r4.preEvaluateWithDict(dict);
+            assertEquals(null, r4.getBeginValue());
+            assertEquals(null, r4.getEndValue());
+        }
+
+        {
+            ColumnValueRange r2 = new ColumnValueRange(col, set("CN"), FilterOperatorEnum.GTE);
+            r2.preEvaluateWithDict(dict);
+            assertEquals("CN", r2.getBeginValue());
+            assertEquals(null, r2.getEndValue());
+            
+            ColumnValueRange r3 = new ColumnValueRange(col, set("Other"), FilterOperatorEnum.GTE);
+            r3.preEvaluateWithDict(dict);
+            assertEquals("CN", r3.getBeginValue());
+            assertEquals(null, r3.getEndValue());
+            
+            ColumnValueRange r4 = new ColumnValueRange(col, set("CI"), FilterOperatorEnum.GTE);
+            r4.preEvaluateWithDict(dict);
+            assertEquals(null, r4.getBeginValue());
+            assertEquals(null, r4.getEndValue());
+        }
+    }
+
+    public static Dictionary<String> mockupDictionary(TblColRef col, String... values) {
+        TrieDictionaryBuilder<String> builder = new TrieDictionaryBuilder<String>(new StringBytesConverter());
+        for (String v : values) {
+            builder.addValue(v);
+        }
+        return builder.build(0);
+    }
+
+    private static Set<String> set(String... values) {
+        HashSet<String> list = new HashSet<String>();
+        list.addAll(Arrays.asList(values));
+        return list;
+    }
+
+    public static TblColRef mockupTblColRef() {
+        TableDesc t = mockupTableDesc("table_a");
+        ColumnDesc c = mockupColumnDesc(t, 1, "col_1", "string");
+        return new TblColRef(c);
+    }
+
+    private static TableDesc mockupTableDesc(String tableName) {
+        TableDesc mockup = new TableDesc();
+        mockup.setName(tableName);
+        return mockup;
+    }
+
+    private static ColumnDesc mockupColumnDesc(TableDesc table, int oneBasedColumnIndex, String name, String datatype) {
+        ColumnDesc desc = new ColumnDesc();
+        String id = "" + oneBasedColumnIndex;
+        desc.setId(id);
+        desc.setName(name);
+        desc.setDatatype(datatype);
+        desc.init(table);
+        return desc;
+    }
+}