You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kylin.apache.org by lu...@apache.org on 2015/09/06 09:59:51 UTC

[27/50] [abbrv] incubator-kylin git commit: KYLIN-740 fix many IN clause performance issue

KYLIN-740 fix many IN clause performance issue


Project: http://git-wip-us.apache.org/repos/asf/incubator-kylin/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-kylin/commit/92b111c6
Tree: http://git-wip-us.apache.org/repos/asf/incubator-kylin/tree/92b111c6
Diff: http://git-wip-us.apache.org/repos/asf/incubator-kylin/diff/92b111c6

Branch: refs/heads/0.7
Commit: 92b111c61c05e70920a505036a51c1112fb861a5
Parents: ec41bf0
Author: honma <ho...@ebay.com>
Authored: Wed Aug 26 15:30:13 2015 +0800
Committer: Luke Han <lu...@apache.org>
Committed: Sun Sep 6 14:37:58 2015 +0800

----------------------------------------------------------------------
 .../kylin/storage/hbase/CubeStorageEngine.java  | 17 +++++++++++--
 .../storage/hbase/FuzzyValueCombination.java    | 26 +++++++++-----------
 .../kylin/storage/hbase/HBaseKeyRange.java      |  2 +-
 .../hbase/FuzzyValueCombinationTest.java        |  2 +-
 4 files changed, 28 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-kylin/blob/92b111c6/storage/src/main/java/org/apache/kylin/storage/hbase/CubeStorageEngine.java
----------------------------------------------------------------------
diff --git a/storage/src/main/java/org/apache/kylin/storage/hbase/CubeStorageEngine.java b/storage/src/main/java/org/apache/kylin/storage/hbase/CubeStorageEngine.java
index 5fb6f0c..8eb7bcb 100644
--- a/storage/src/main/java/org/apache/kylin/storage/hbase/CubeStorageEngine.java
+++ b/storage/src/main/java/org/apache/kylin/storage/hbase/CubeStorageEngine.java
@@ -22,6 +22,7 @@ import java.util.ArrayList;
 import java.util.BitSet;
 import java.util.Collection;
 import java.util.Collections;
+import java.util.Comparator;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
@@ -29,6 +30,7 @@ import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.TreeSet;
 
 import org.apache.hadoop.hbase.client.HConnection;
 import org.apache.kylin.common.persistence.HBaseConnection;
@@ -557,8 +559,19 @@ public class CubeStorageEngine implements IStorageEngine {
             byte[] stopKey = keyRange.getStopKey();
             long partitionColumnStartDate = Long.MAX_VALUE;
             long partitionColumnEndDate = 0;
-            List<Pair<byte[], byte[]>> newFuzzyKeys = new ArrayList<Pair<byte[], byte[]>>(mergeSize);
+
             List<Collection<ColumnValueRange>> newFlatOrAndFilter = Lists.newLinkedList();
+            TreeSet<Pair<byte[], byte[]>> newFuzzyKeys = new TreeSet<>(new Comparator<Pair<byte[], byte[]>>() {
+                @Override
+                public int compare(Pair<byte[], byte[]> o1, Pair<byte[], byte[]> o2) {
+                    int partialResult = Bytes.compareTo(o1.getFirst(), o2.getFirst());
+                    if (partialResult != 0) {
+                        return partialResult;
+                    } else {
+                        return Bytes.compareTo(o1.getSecond(), o2.getSecond());
+                    }
+                }
+            });
 
             boolean hasNonFuzzyRange = false;
             for (int k = from; k <= to; k++) {
@@ -584,7 +597,7 @@ public class CubeStorageEngine implements IStorageEngine {
 
             partitionColumnStartDate = (partitionColumnStartDate == Long.MAX_VALUE) ? 0 : partitionColumnStartDate;
             partitionColumnEndDate = (partitionColumnEndDate == 0) ? Long.MAX_VALUE : partitionColumnEndDate;
-            keyRange = new HBaseKeyRange(cubeSegment, cuboid, startKey, stopKey, newFuzzyKeys, newFlatOrAndFilter, partitionColumnStartDate, partitionColumnEndDate);
+            keyRange = new HBaseKeyRange(cubeSegment, cuboid, startKey, stopKey, Lists.newArrayList(newFuzzyKeys), newFlatOrAndFilter, partitionColumnStartDate, partitionColumnEndDate);
         }
         return keyRange;
     }

http://git-wip-us.apache.org/repos/asf/incubator-kylin/blob/92b111c6/storage/src/main/java/org/apache/kylin/storage/hbase/FuzzyValueCombination.java
----------------------------------------------------------------------
diff --git a/storage/src/main/java/org/apache/kylin/storage/hbase/FuzzyValueCombination.java b/storage/src/main/java/org/apache/kylin/storage/hbase/FuzzyValueCombination.java
index d0208bd..616a232 100644
--- a/storage/src/main/java/org/apache/kylin/storage/hbase/FuzzyValueCombination.java
+++ b/storage/src/main/java/org/apache/kylin/storage/hbase/FuzzyValueCombination.java
@@ -52,8 +52,14 @@ public class FuzzyValueCombination {
 
     public static List<Map<TblColRef, String>> calculate(Map<TblColRef, Set<String>> fuzzyValues, long cap) {
         Dim[] dims = toDims(fuzzyValues);
-        capDims(dims, cap);
-        return combination(dims);
+        // If a query has many IN clauses and each IN clause has many values, it can easily generate
+        // thousands of fuzzy keys. With that many fuzzy keys, scan performance becomes bottlenecked
+        // on them, so we simply abandon all fuzzy keys in this case.
+        if (exceedCap(dims, cap)) {
+            return Lists.newArrayList();
+        } else {
+            return combination(dims);
+        }
     }
 
     @SuppressWarnings("unchecked")
@@ -115,21 +121,11 @@ public class FuzzyValueCombination {
         return dims;
     }
 
-    private static void capDims(Dim[] dims, long cap) {
-        Arrays.sort(dims, new Comparator<Dim>() {
-            @Override
-            public int compare(Dim o1, Dim o2) {
-                return -(o1.values.size() - o2.values.size());
-            }
-        });
-
-        for (Dim dim : dims) {
-            if (combCount(dims) < cap)
-                break;
-            dim.values = Collections.emptySet();
-        }
+    private static boolean exceedCap(Dim[] dims, long cap) {
+        return combCount(dims) > cap;
     }
 
+
     private static long combCount(Dim[] dims) {
         long count = 1;
         for (Dim dim : dims) {

http://git-wip-us.apache.org/repos/asf/incubator-kylin/blob/92b111c6/storage/src/main/java/org/apache/kylin/storage/hbase/HBaseKeyRange.java
----------------------------------------------------------------------
diff --git a/storage/src/main/java/org/apache/kylin/storage/hbase/HBaseKeyRange.java b/storage/src/main/java/org/apache/kylin/storage/hbase/HBaseKeyRange.java
index 1db277d..cdfafaf 100644
--- a/storage/src/main/java/org/apache/kylin/storage/hbase/HBaseKeyRange.java
+++ b/storage/src/main/java/org/apache/kylin/storage/hbase/HBaseKeyRange.java
@@ -52,7 +52,7 @@ public class HBaseKeyRange implements Comparable<HBaseKeyRange> {
 
     private static final Logger logger = LoggerFactory.getLogger(HBaseKeyRange.class);
 
-    private static final int FUZZY_VALUE_CAP = 20;
+    private static final int FUZZY_VALUE_CAP = 100;
     private static final byte[] ZERO_TAIL_BYTES = new byte[] { 0 };
 
     private final CubeSegment cubeSeg;

http://git-wip-us.apache.org/repos/asf/incubator-kylin/blob/92b111c6/storage/src/test/java/org/apache/kylin/storage/hbase/FuzzyValueCombinationTest.java
----------------------------------------------------------------------
diff --git a/storage/src/test/java/org/apache/kylin/storage/hbase/FuzzyValueCombinationTest.java b/storage/src/test/java/org/apache/kylin/storage/hbase/FuzzyValueCombinationTest.java
index 46f6431..e803116 100644
--- a/storage/src/test/java/org/apache/kylin/storage/hbase/FuzzyValueCombinationTest.java
+++ b/storage/src/test/java/org/apache/kylin/storage/hbase/FuzzyValueCombinationTest.java
@@ -100,7 +100,7 @@ public class FuzzyValueCombinationTest {
         for (Map<TblColRef, String> item : result) {
             System.out.println(item);
         }
-        assertEquals(9, result.size());
+        assertEquals(0, result.size());
     }
 
     private static TblColRef col(int i, TableDesc t) {