You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kylin.apache.org by li...@apache.org on 2016/11/16 23:58:42 UTC

kylin git commit: KYLIN-1851 code review TrieDictionaryForest and Builder

Repository: kylin
Updated Branches:
  refs/heads/master 350547e6e -> 21a1c5c09


KYLIN-1851 code review TrieDictionaryForest and Builder


Project: http://git-wip-us.apache.org/repos/asf/kylin/repo
Commit: http://git-wip-us.apache.org/repos/asf/kylin/commit/21a1c5c0
Tree: http://git-wip-us.apache.org/repos/asf/kylin/tree/21a1c5c0
Diff: http://git-wip-us.apache.org/repos/asf/kylin/diff/21a1c5c0

Branch: refs/heads/master
Commit: 21a1c5c09e8e40adb4b11aed11ff2878fef18134
Parents: 350547e
Author: Yang Li <li...@apache.org>
Authored: Thu Nov 17 07:57:31 2016 +0800
Committer: Yang Li <li...@apache.org>
Committed: Thu Nov 17 07:57:31 2016 +0800

----------------------------------------------------------------------
 .../apache/kylin/dict/TrieDictionaryForest.java | 153 ++++++++-----------
 .../kylin/dict/TrieDictionaryForestBuilder.java |  74 ++++-----
 2 files changed, 90 insertions(+), 137 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kylin/blob/21a1c5c0/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java
----------------------------------------------------------------------
diff --git a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java
index 38cd0dc..9a26c4c 100755
--- a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java
+++ b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java
@@ -17,7 +17,6 @@
 */
 package org.apache.kylin.dict;
 
-
 import java.io.ByteArrayOutputStream;
 import java.io.DataInput;
 import java.io.DataOutput;
@@ -26,7 +25,6 @@ import java.io.IOException;
 import java.io.PrintStream;
 import java.util.ArrayList;
 import java.util.Collections;
-import java.util.Comparator;
 import java.util.List;
 
 import org.apache.kylin.common.util.ByteArray;
@@ -35,11 +33,10 @@ import org.apache.kylin.common.util.BytesUtil;
 import org.apache.kylin.common.util.ClassUtil;
 import org.apache.kylin.common.util.Dictionary;
 
-
 /**
- * use trie forest to optimize trie dictionary
+ * Use trie forest to optimize trie dictionary
  * <p>
- * the input data must in an increase order(sort by org.apache.kylin.dict.ByteComparator)
+ * The input data should be in an increase order (sort by org.apache.kylin.dict.ByteComparator)
  * <p>
  * Created by xiefan on 16-10-26.
  */
@@ -48,34 +45,19 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
 
     private ArrayList<TrieDictionary<T>> trees;
 
-    //private ArrayList<byte[]> valueDivide; //find tree
-
     private ArrayList<ByteArray> valueDivide;
 
-    private ArrayList<Integer> accuOffset;  //find tree
+    private ArrayList<Integer> accuOffset; //find tree
 
     private BytesConverter<T> bytesConvert;
 
     private int baseId;
 
-    /*public AtomicLong getValueIndexTime = new AtomicLong(0);
-
-    public AtomicLong getValueTime = new AtomicLong(0);
-
-    public AtomicLong binarySearchTime = new AtomicLong(0);
-
-    public AtomicLong copyTime = new AtomicLong(0);
-
-    public AtomicLong getValueIndexTime2 = new AtomicLong(0);
-
-    public AtomicLong getValueTime2 = new AtomicLong(0);*/
-
     public TrieDictionaryForest() { // default constructor for Writable interface
-
     }
 
-    public TrieDictionaryForest(ArrayList<TrieDictionary<T>> trees,
-                                ArrayList<ByteArray> valueDivide, ArrayList<Integer> accuOffset, BytesConverter<T> bytesConverter, int baseId) {
+    public TrieDictionaryForest(ArrayList<TrieDictionary<T>> trees, ArrayList<ByteArray> valueDivide, //
+            ArrayList<Integer> accuOffset, BytesConverter<T> bytesConverter, int baseId) {
         this.trees = trees;
         this.valueDivide = valueDivide;
         this.accuOffset = accuOffset;
@@ -83,16 +65,17 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
         this.baseId = baseId;
     }
 
-
     @Override
     public int getMinId() {
-        if (trees.isEmpty()) return baseId;
+        if (trees.isEmpty())
+            return baseId;
         return trees.get(0).getMinId() + baseId;
     }
 
     @Override
     public int getMaxId() {
-        if (trees.isEmpty()) return baseId - 1;
+        if (trees.isEmpty())
+            return baseId - 1;
         int index = trees.size() - 1;
         int id = accuOffset.get(index) + trees.get(index).getMaxId() + baseId;
         return id;
@@ -100,7 +83,8 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
 
     @Override
     public int getSizeOfId() {
-        if (trees.isEmpty()) return -1;
+        if (trees.isEmpty())
+            return -1;
         int maxOffset = accuOffset.get(accuOffset.size() - 1);
         TrieDictionary<T> lastTree = trees.get(trees.size() - 1);
         int sizeOfId = BytesUtil.sizeForValue(baseId + maxOffset + lastTree.getMaxId() + 1);
@@ -115,15 +99,13 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
         return maxValue;
     }
 
-    //value --> id
+    // value --> id
     @Override
-    protected int getIdFromValueImpl(T value, int roundingFlag)
-            throws IllegalArgumentException {
+    protected int getIdFromValueImpl(T value, int roundingFlag) throws IllegalArgumentException {
         byte[] valueBytes = bytesConvert.convertToBytes(value);
         return getIdFromValueBytesImpl(valueBytes, 0, valueBytes.length, roundingFlag);
     }
 
-
     @Override
     protected int getIdFromValueBytesImpl(byte[] value, int offset, int len, int roundingFlag) throws IllegalArgumentException {
 
@@ -132,13 +114,9 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
         return result;
     }
 
-    //id = tree_inner_offset + accumulate_offset + baseId
-    protected int _getIdFromValueBytesImpl(byte[] value, int offset, int len, int roundingFlag)
-            throws IllegalArgumentException {
-
-        //long startTime = System.currentTimeMillis();
+    // id = tree_inner_offset + accumulate_offset + baseId
+    protected int _getIdFromValueBytesImpl(byte[] value, int offset, int len, int roundingFlag) throws IllegalArgumentException {
         ByteArray search = new ByteArray(value, offset, len);
-        //copyTime.addAndGet(System.currentTimeMillis() - startTime);
         int index = findIndexByValue(search);
         if (index < 0) {
             if (roundingFlag > 0) {
@@ -152,7 +130,6 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
             T curTreeMax = trees.get(index).getValueFromId(trees.get(index).getMaxId());
             byte[] b1 = bytesConvert.convertToBytes(curTreeMax);
             ByteArray ba1 = new ByteArray(b1, 0, b1.length);
-            //ByteArray ba2 = new ByteArray(value, 0, value.length);
             if (search.compareTo(ba1) > 0)
                 index++;
             if (index >= trees.size())
@@ -162,9 +139,6 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
         id = tree.getIdFromValueBytes(value, offset, len, roundingFlag);
         id = id + accuOffset.get(index);
         id += baseId;
-        if (id < 0) {
-            throw new IllegalArgumentException("Value '" + Bytes.toString(value, offset, len) + "' (" + Bytes.toStringBinary(value, offset, len) + ") not exists!");
-        }
         return id;
     }
 
@@ -179,9 +153,7 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
     }
 
     @Override
-    protected int getValueBytesFromIdImpl(int id, byte[] returnValue, int offset)
-            throws IllegalArgumentException {
-        //long startTime = System.currentTimeMillis();
+    protected int getValueBytesFromIdImpl(int id, byte[] returnValue, int offset) throws IllegalArgumentException {
         int index = findIndexById(id);
         int treeInnerOffset = getTreeInnerOffset(id, index);
         TrieDictionary<T> tree = trees.get(index);
@@ -189,20 +161,15 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
         return size;
     }
 
-
     @Override
     protected byte[] getValueBytesFromIdImpl(int id) throws IllegalArgumentException {
-        int index = findIndexById(id); //lower bound
-        if (index < 0) {
-            throw new IllegalArgumentException("Tree Not Found. index < 0");
-        }
+        int index = findIndexById(id);
         int treeInnerOffset = getTreeInnerOffset(id, index);
         TrieDictionary<T> tree = trees.get(index);
         byte[] result = tree.getValueBytesFromId(treeInnerOffset);
         return result;
     }
 
-
     private int getTreeInnerOffset(int id, int index) {
         id -= baseId;
         id = id - accuOffset.get(index);
@@ -259,7 +226,6 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
         out.write(head);
     }
 
-
     private void writeBody(DataOutput out) throws IOException {
         for (int i = 0; i < trees.size(); i++) {
             TrieDictionary<T> tree = trees.get(i);
@@ -307,21 +273,6 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
 
     }
 
-    /*@Override
-    public boolean equals(Object o) {
-        if ((o instanceof TrieDictionaryForest) == false) {
-            logger.info("Equals return false because it's not TrieDictionaryForest");
-            return false;
-        }
-        TrieDictionaryForest that = (TrieDictionaryForest) o;
-        if(this.trees.size() != that.getTrees().size())
-            return false;
-        for(int i=0;i<trees.size();i++){
-            if(!trees.get(i).equals(that.getTrees().get(i))) return false;
-        }
-        return true;
-    }*/
-
     @Override
     public boolean contains(Dictionary other) {
         if (other.getSize() > this.getSize()) {
@@ -337,35 +288,59 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
         return true;
     }
 
-    public List<TrieDictionary<T>> getTrees() {
+    @Override
+    public int hashCode() {
+        final int prime = 31;
+        int result = 1;
+        result = prime * result + baseId;
+        result = prime * result + ((trees == null) ? 0 : trees.hashCode());
+        return result;
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+        if (this == obj)
+            return true;
+        if (obj == null)
+            return false;
+        if (getClass() != obj.getClass())
+            return false;
+        TrieDictionaryForest other = (TrieDictionaryForest) obj;
+        if (baseId != other.baseId)
+            return false;
+        if (trees == null) {
+            if (other.trees != null)
+                return false;
+        } else if (!trees.equals(other.trees))
+            return false;
+        return true;
+    }
+
+    List<TrieDictionary<T>> getTrees() {
         return Collections.unmodifiableList(this.trees);
     }
 
+    BytesConverter<T> getBytesConvert() {
+        return bytesConvert;
+    }
+
     private int findIndexByValue(ByteArray value) {
-        int index = lowerBound(value, new Comparator<ByteArray>() {
-            @Override
-            public int compare(ByteArray o1, ByteArray o2) {
-                return o1.compareTo(o2);
-            }
-        }, this.valueDivide);
+        int index = lowerBound(value, this.valueDivide);
         return index;
     }
 
     private int findIndexById(Integer id) {
         id -= baseId;
-        int index = lowerBound(id, new Comparator<Integer>() {
-            @Override
-            public int compare(Integer o1, Integer o2) {
-                return o1.compareTo(o2);
-            }
-        }, this.accuOffset);
+        int index = lowerBound(id, this.accuOffset);
+        if (index < 0) {
+            throw new IllegalArgumentException("Tree Not Found. index < 0");
+        }
         return index;
     }
 
-
-    private static <K> int lowerBound(K lookfor, Comparator<K> comparator, ArrayList<K> list) {
+    private static <K extends Comparable> int lowerBound(K lookfor, ArrayList<K> list) {
         if (list == null || list.isEmpty())
-            return 0; //return the first tree
+            return -1; // not found
         int left = 0;
         int right = list.size() - 1;
         int mid = 0;
@@ -373,7 +348,7 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
         int comp = 0;
         while (!found && left <= right) {
             mid = left + (right - left) / 2;
-            comp = comparator.compare(lookfor, list.get(mid));
+            comp = lookfor.compareTo(list.get(mid));
             if (comp < 0)
                 right = mid - 1;
             else if (comp > 0)
@@ -384,7 +359,7 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
         if (found) {
             return mid;
         } else {
-            return Math.min(left, right);  //value may be bigger than the right tree
+            return Math.min(left, right); // value may be bigger than the right tree (could return -1)
         }
     }
 
@@ -402,16 +377,8 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
         list.add("paint");
         Collections.sort(list);
         for (String str : list) {
-            System.out.println("found value:" + str + " index:" + lowerBound(str, new Comparator<String>() {
-                @Override
-                public int compare(String o1, String o2) {
-                    return o1.compareTo(o2);
-                }
-            }, list));
+            System.out.println("found value:" + str + " index:" + lowerBound(str, list));
         }
     }
 
-    public BytesConverter<T> getBytesConvert() {
-        return bytesConvert;
-    }
 }

http://git-wip-us.apache.org/repos/asf/kylin/blob/21a1c5c0/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForestBuilder.java
----------------------------------------------------------------------
diff --git a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForestBuilder.java b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForestBuilder.java
index 1ceac27..8c577cf 100755
--- a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForestBuilder.java
+++ b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForestBuilder.java
@@ -24,12 +24,14 @@ import org.slf4j.LoggerFactory;
 
 import java.util.ArrayList;
 
-
+/**
+ * Build a trie dictionary forest if the input values is ordered, or the forest falls back to a single trie.
+ */
 public class TrieDictionaryForestBuilder<T> {
 
     public static int DEFAULT_MAX_TRIE_TREE_SIZE_MB = 500;
 
-    //public static int MaxTrieTreeSize = 1024;//1k
+    private static final Logger logger = LoggerFactory.getLogger(TrieDictionaryForestBuilder.class);
 
     private BytesConverter<T> bytesConverter;
 
@@ -41,11 +43,9 @@ public class TrieDictionaryForestBuilder<T> {
 
     private ArrayList<ByteArray> valueDivide = new ArrayList<>(); //find tree
 
-    private ArrayList<Integer> accuOffset = new ArrayList<>();  //find tree
-
-    private ByteArray previousValue = null;  //value use for remove duplicate
+    private ArrayList<Integer> accuOffset = new ArrayList<>(); //find tree
 
-    private static final Logger logger = LoggerFactory.getLogger(TrieDictionaryForestBuilder.class);
+    private ByteArray previousValue = null; //value use for remove duplicate
 
     private int baseId;
 
@@ -55,7 +55,6 @@ public class TrieDictionaryForestBuilder<T> {
 
     private boolean isOrdered = true;
 
-
     public TrieDictionaryForestBuilder(BytesConverter<T> bytesConverter) {
         this(bytesConverter, 0);
     }
@@ -68,46 +67,45 @@ public class TrieDictionaryForestBuilder<T> {
         this.bytesConverter = bytesConverter;
         this.trieBuilder = new TrieDictionaryBuilder<T>(bytesConverter);
         this.baseId = baseId;
-        curOffset = 0;
+        this.curOffset = 0;
         this.maxTrieTreeSize = maxTrieTreeSizeMB * 1024 * 1024;
-        logger.info("maxTrieSize is set to:" + maxTrieTreeSize + "B");
     }
 
-
     public void addValue(T value) {
-        if (value == null) return;
+        if (value == null)
+            return;
         byte[] valueBytes = bytesConverter.convertToBytes(value);
         addValue(new ByteArray(valueBytes, 0, valueBytes.length));
     }
 
     public void addValue(byte[] value) {
-        if (value == null) return;
+        if (value == null)
+            return;
         ByteArray array = new ByteArray(value, 0, value.length);
         addValue(array);
     }
 
     public void addValue(ByteArray value) {
-        //System.out.println("value length:"+value.length);
-        if (value == null) return;
-        //logger.info("going to add value:" + new String(value.array()));
-        if (previousValue == null) {
-            previousValue = value;
-        } else {
+        if (value == null)
+            return;
+        if (previousValue != null && isOrdered) {
             int comp = previousValue.compareTo(value);
             if (comp == 0) {
-                //logger.info("find duplicate value:" + new String(value.array()));
                 return; //duplicate value
             }
-            if (comp > 0 && isOrdered) {
-                logger.info("values not in ascending order:" + new String(value.array()));
+            if (comp > 0) {
+                logger.info("values not in ascending order, previous '{}', current '{}'", previousValue, value);
                 isOrdered = false;
-                //System.out.println(".");
+                if (trees.size() > 0) {
+                    throw new IllegalStateException("Invalid input data. Unordered data cannot be split into multi trees");
+                }
             }
         }
-        this.trieBuilder.addValue(value.array());
         previousValue = value;
-        this.curTreeSize += value.length();
-        if (curTreeSize >= this.maxTrieTreeSize) {
+        trieBuilder.addValue(value.array());
+        curTreeSize += value.length();
+        
+        if (curTreeSize >= maxTrieTreeSize && isOrdered) {
             TrieDictionary<T> tree = trieBuilder.build(0);
             addTree(tree);
             reset();
@@ -115,28 +113,17 @@ public class TrieDictionaryForestBuilder<T> {
     }
 
     public TrieDictionaryForest<T> build() {
-        if (curTreeSize != 0) {  //last tree
+        if (curTreeSize != 0) { //last tree
             TrieDictionary<T> tree = trieBuilder.build(0);
             addTree(tree);
             reset();
         }
-        TrieDictionaryForest<T> forest = new TrieDictionaryForest<T>(this.trees,
-                this.valueDivide, this.accuOffset, this.bytesConverter, baseId);
-
-        //log
-        logger.info("tree num:" + forest.getTrees().size());
-        StringBuilder sb = new StringBuilder();
-        for (ByteArray ba : valueDivide) {
-            sb.append(new String(ba.array()) + " ");
-        }
-        logger.info("value divide:" + sb.toString());
-        /*
-        If input values are not in ascending order and tree num>1,TrieDictionaryForest can not work correctly.
-         */
+        TrieDictionaryForest<T> forest = new TrieDictionaryForest<T>(this.trees, this.valueDivide, this.accuOffset, this.bytesConverter, baseId);
+
+        // if input values are not in ascending order and tree num>1,TrieDictionaryForest can not work correctly.
         if (forest.getTrees().size() > 1 && !isOrdered) {
-            throw new IllegalStateException("Invalid input data.Unordered data can not be split into multi trees");
+            throw new IllegalStateException("Invalid input data. Unordered data can not be split into multi trees");
         }
-
         return forest;
     }
 
@@ -144,7 +131,7 @@ public class TrieDictionaryForestBuilder<T> {
         return maxTrieTreeSize;
     }
 
-    public void setMaxTrieTreeSize(int maxTrieTreeSize) {
+    void setMaxTrieTreeSize(int maxTrieTreeSize) {
         this.maxTrieTreeSize = maxTrieTreeSize;
         logger.info("maxTrieSize is set to:" + maxTrieTreeSize + "B");
     }
@@ -156,7 +143,6 @@ public class TrieDictionaryForestBuilder<T> {
         byte[] valueBytes = tree.getValueBytesFromId(minId);
         valueDivide.add(new ByteArray(valueBytes, 0, valueBytes.length));
         curOffset += (tree.getMaxId() + 1);
-        //System.out.println(" curOffset:"+ curOffset);
     }
 
     private void reset() {
@@ -169,7 +155,7 @@ public class TrieDictionaryForestBuilder<T> {
         try {
             config = KylinConfig.getInstanceFromEnv();
         } catch (RuntimeException e) {
-            logger.info("can not get KylinConfig from env.Use default setting:" + DEFAULT_MAX_TRIE_TREE_SIZE_MB + "MB");
+            logger.info("cannot get KylinConfig from env.Use default setting:" + DEFAULT_MAX_TRIE_TREE_SIZE_MB + "MB");
         }
         int maxTrieTreeSizeMB;
         if (config != null) {