You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kylin.apache.org by li...@apache.org on 2016/11/17 03:27:13 UTC

kylin git commit: KYLIN-1851 Add unit test of unsorted data and modify TrieDictionaryForest a little

Repository: kylin
Updated Branches:
  refs/heads/master 4fcaa8ac3 -> 3b1850eae


KYLIN-1851 Add unit test of unsorted data and modify TrieDictionaryForest a little

Signed-off-by: Li Yang <li...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kylin/repo
Commit: http://git-wip-us.apache.org/repos/asf/kylin/commit/3b1850ea
Tree: http://git-wip-us.apache.org/repos/asf/kylin/tree/3b1850ea
Diff: http://git-wip-us.apache.org/repos/asf/kylin/diff/3b1850ea

Branch: refs/heads/master
Commit: 3b1850eae6ff9ec2ea5cd2d03fb1fa4e05aaafed
Parents: 4fcaa8a
Author: xiefan46 <95...@qq.com>
Authored: Thu Nov 17 10:22:28 2016 +0800
Committer: Li Yang <li...@apache.org>
Committed: Thu Nov 17 11:26:45 2016 +0800

----------------------------------------------------------------------
 .../apache/kylin/dict/TrieDictionaryForest.java | 59 +++++++++++++-------
 .../kylin/dict/TrieDictionaryForestTest.java    | 43 ++++++++++++++
 2 files changed, 82 insertions(+), 20 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kylin/blob/3b1850ea/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java
----------------------------------------------------------------------
diff --git a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java
index 9a26c4c..eb33a5a 100755
--- a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java
+++ b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java
@@ -53,16 +53,19 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
 
     private int baseId;
 
+    private ArrayList<ByteArray> maxValue;
+
     public TrieDictionaryForest() { // default constructor for Writable interface
     }
 
     public TrieDictionaryForest(ArrayList<TrieDictionary<T>> trees, ArrayList<ByteArray> valueDivide, //
-            ArrayList<Integer> accuOffset, BytesConverter<T> bytesConverter, int baseId) {
+                                ArrayList<Integer> accuOffset, BytesConverter<T> bytesConverter, int baseId) {
         this.trees = trees;
         this.valueDivide = valueDivide;
         this.accuOffset = accuOffset;
         this.bytesConvert = bytesConverter;
         this.baseId = baseId;
+        initMaxValue();
     }
 
     @Override
@@ -116,27 +119,30 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
 
     // id = tree_inner_offset + accumulate_offset + baseId
     protected int _getIdFromValueBytesImpl(byte[] value, int offset, int len, int roundingFlag) throws IllegalArgumentException {
-        ByteArray search = new ByteArray(value, offset, len);
-        int index = findIndexByValue(search);
-        if (index < 0) {
+        int index;
+        if (trees.size() == 1) {
+            index = 0;
+        } else {
+            ByteArray search = new ByteArray(value, offset, len);
+            index = findIndexByValue(search);
+            if (index < 0) {
+                if (roundingFlag > 0) {
+                    return getMinId(); //searching value smaller than the smallest value in dict
+                } else {
+                    throw new IllegalArgumentException("Value '" + Bytes.toString(value, offset, len) + "' (" + Bytes.toStringBinary(value, offset, len) + ") not exists!");
+                }
+            }
+
             if (roundingFlag > 0) {
-                return getMinId(); //searching value smaller than the smallest value in dict
-            } else {
-                throw new IllegalArgumentException("Value '" + Bytes.toString(value, offset, len) + "' (" + Bytes.toStringBinary(value, offset, len) + ") not exists!");
+                ByteArray maxValueOfTree = maxValue.get(index);
+                if (search.compareTo(maxValueOfTree) > 0)
+                    index++;
+                if (index >= trees.size())
+                    throw new IllegalArgumentException("Value '" + Bytes.toString(value, offset, len) + "' (" + Bytes.toStringBinary(value, offset, len) + ") not exists!");
             }
         }
-        int id;
-        if (roundingFlag > 0) {
-            T curTreeMax = trees.get(index).getValueFromId(trees.get(index).getMaxId());
-            byte[] b1 = bytesConvert.convertToBytes(curTreeMax);
-            ByteArray ba1 = new ByteArray(b1, 0, b1.length);
-            if (search.compareTo(ba1) > 0)
-                index++;
-            if (index >= trees.size())
-                throw new IllegalArgumentException("Value '" + Bytes.toString(value, offset, len) + "' (" + Bytes.toStringBinary(value, offset, len) + ") not exists!");
-        }
         TrieDictionary<T> tree = trees.get(index);
-        id = tree.getIdFromValueBytes(value, offset, len, roundingFlag);
+        int id = tree.getIdFromValueBytes(value, offset, len, roundingFlag);
         id = id + accuOffset.get(index);
         id += baseId;
         return id;
@@ -154,7 +160,7 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
 
     @Override
     protected int getValueBytesFromIdImpl(int id, byte[] returnValue, int offset) throws IllegalArgumentException {
-        int index = findIndexById(id);
+        int index = (trees.size() == 1) ? 0 : findIndexById(id);
         int treeInnerOffset = getTreeInnerOffset(id, index);
         TrieDictionary<T> tree = trees.get(index);
         int size = tree.getValueBytesFromIdImpl(treeInnerOffset, returnValue, offset);
@@ -163,7 +169,7 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
 
     @Override
     protected byte[] getValueBytesFromIdImpl(int id) throws IllegalArgumentException {
-        int index = findIndexById(id);
+        int index = (trees.size() == 1) ? 0 : findIndexById(id);
         int treeInnerOffset = getTreeInnerOffset(id, index);
         TrieDictionary<T> tree = trees.get(index);
         byte[] result = tree.getValueBytesFromId(treeInnerOffset);
@@ -264,6 +270,7 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
                 dict.readFields(in);
                 trees.add(dict);
             }
+            initMaxValue();
         } catch (Exception e) {
             if (e instanceof RuntimeException)
                 throw (RuntimeException) e;
@@ -363,6 +370,18 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
         }
     }
 
+    private void initMaxValue() throws IllegalStateException {
+        this.maxValue = new ArrayList<>();
+        if (this.trees == null || trees.isEmpty())
+            throw new IllegalStateException("Trees not init yet. Could not init max value of each tree");
+        for (int i = 0; i < trees.size(); i++) {
+            T curTreeMax = trees.get(i).getValueFromId(trees.get(i).getMaxId());
+            byte[] b1 = bytesConvert.convertToBytes(curTreeMax);
+            ByteArray ba1 = new ByteArray(b1, 0, b1.length);
+            this.maxValue.add(ba1);
+        }
+    }
+
     public static void main(String[] args) {
         ArrayList<String> list = new ArrayList<>();
         list.add("\u4e00");

http://git-wip-us.apache.org/repos/asf/kylin/blob/3b1850ea/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryForestTest.java
----------------------------------------------------------------------
diff --git a/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryForestTest.java b/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryForestTest.java
index 3def7e0..07511d1 100755
--- a/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryForestTest.java
+++ b/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryForestTest.java
@@ -38,6 +38,7 @@ import static org.junit.Assert.assertEquals;
 public class TrieDictionaryForestTest {
 
 
+
     @Test
     public void testBasicFound() {
         ArrayList<String> strs = new ArrayList<String>();
@@ -432,6 +433,48 @@ public class TrieDictionaryForestTest {
         }
     }
 
+    @Test
+    public void testUnsortedData(){
+        ArrayList<String> strs = new ArrayList<>();
+        Iterator<String> it = new RandomStrings(10000).iterator();
+        int totalSize = 0;
+        final StringBytesConverter converter = new StringBytesConverter();
+        while (it.hasNext()) {
+            String str = it.next();
+            byte[] data = converter.convertToBytes(str);
+            if (data != null) {
+                totalSize += data.length;
+            }
+            strs.add(str);
+        }
+        Collections.shuffle(strs);
+        int baseId = 20;
+        int maxTreeSize = totalSize / 10;
+        System.out.println("data size:" + totalSize / 1024 + "KB  max tree size:" + maxTreeSize / 1024 + "KB");
+        //test maintain one trie
+        TrieDictionaryForestBuilder<String> builder = new TrieDictionaryForestBuilder<String>(converter);
+        builder.setMaxTrieTreeSize(maxTreeSize);
+        for(String str : strs){
+            builder.addValue(str);
+        }
+        TrieDictionaryForest<String> dict = builder.build();
+        assertEquals(1,dict.getTrees().size());
+        //test throws Exception
+        Collections.sort(strs);
+        strs.add("f");
+        strs.add("a");
+        builder = new TrieDictionaryForestBuilder<String>(converter);
+        builder.setMaxTrieTreeSize(maxTreeSize);
+        try {
+            for (String str : strs)
+                builder.addValue(str);
+            dict = builder.build();
+            fail("Input data no sorted and builder have multi trees. Should throw IllegalStateException");
+        }catch (IllegalStateException e){
+            //correct
+        }
+    }
+
     /*
     can not pass cases like 1.7695564055819624E-4
      */