You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kylin.apache.org by li...@apache.org on 2016/11/16 23:58:42 UTC
kylin git commit: KYLIN-1851 code review TrieDictionaryForest and
Builder
Repository: kylin
Updated Branches:
refs/heads/master 350547e6e -> 21a1c5c09
KYLIN-1851 code review TrieDictionaryForest and Builder
Project: http://git-wip-us.apache.org/repos/asf/kylin/repo
Commit: http://git-wip-us.apache.org/repos/asf/kylin/commit/21a1c5c0
Tree: http://git-wip-us.apache.org/repos/asf/kylin/tree/21a1c5c0
Diff: http://git-wip-us.apache.org/repos/asf/kylin/diff/21a1c5c0
Branch: refs/heads/master
Commit: 21a1c5c09e8e40adb4b11aed11ff2878fef18134
Parents: 350547e
Author: Yang Li <li...@apache.org>
Authored: Thu Nov 17 07:57:31 2016 +0800
Committer: Yang Li <li...@apache.org>
Committed: Thu Nov 17 07:57:31 2016 +0800
----------------------------------------------------------------------
.../apache/kylin/dict/TrieDictionaryForest.java | 153 ++++++++-----------
.../kylin/dict/TrieDictionaryForestBuilder.java | 74 ++++-----
2 files changed, 90 insertions(+), 137 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/kylin/blob/21a1c5c0/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java
----------------------------------------------------------------------
diff --git a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java
index 38cd0dc..9a26c4c 100755
--- a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java
+++ b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForest.java
@@ -17,7 +17,6 @@
*/
package org.apache.kylin.dict;
-
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataOutput;
@@ -26,7 +25,6 @@ import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Collections;
-import java.util.Comparator;
import java.util.List;
import org.apache.kylin.common.util.ByteArray;
@@ -35,11 +33,10 @@ import org.apache.kylin.common.util.BytesUtil;
import org.apache.kylin.common.util.ClassUtil;
import org.apache.kylin.common.util.Dictionary;
-
/**
- * use trie forest to optimize trie dictionary
+ * Use trie forest to optimize trie dictionary
* <p>
- * the input data must in an increase order(sort by org.apache.kylin.dict.ByteComparator)
+ * The input data should be in an increase order (sort by org.apache.kylin.dict.ByteComparator)
* <p>
* Created by xiefan on 16-10-26.
*/
@@ -48,34 +45,19 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
private ArrayList<TrieDictionary<T>> trees;
- //private ArrayList<byte[]> valueDivide; //find tree
-
private ArrayList<ByteArray> valueDivide;
- private ArrayList<Integer> accuOffset; //find tree
+ private ArrayList<Integer> accuOffset; //find tree
private BytesConverter<T> bytesConvert;
private int baseId;
- /*public AtomicLong getValueIndexTime = new AtomicLong(0);
-
- public AtomicLong getValueTime = new AtomicLong(0);
-
- public AtomicLong binarySearchTime = new AtomicLong(0);
-
- public AtomicLong copyTime = new AtomicLong(0);
-
- public AtomicLong getValueIndexTime2 = new AtomicLong(0);
-
- public AtomicLong getValueTime2 = new AtomicLong(0);*/
-
public TrieDictionaryForest() { // default constructor for Writable interface
-
}
- public TrieDictionaryForest(ArrayList<TrieDictionary<T>> trees,
- ArrayList<ByteArray> valueDivide, ArrayList<Integer> accuOffset, BytesConverter<T> bytesConverter, int baseId) {
+ public TrieDictionaryForest(ArrayList<TrieDictionary<T>> trees, ArrayList<ByteArray> valueDivide, //
+ ArrayList<Integer> accuOffset, BytesConverter<T> bytesConverter, int baseId) {
this.trees = trees;
this.valueDivide = valueDivide;
this.accuOffset = accuOffset;
@@ -83,16 +65,17 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
this.baseId = baseId;
}
-
@Override
public int getMinId() {
- if (trees.isEmpty()) return baseId;
+ if (trees.isEmpty())
+ return baseId;
return trees.get(0).getMinId() + baseId;
}
@Override
public int getMaxId() {
- if (trees.isEmpty()) return baseId - 1;
+ if (trees.isEmpty())
+ return baseId - 1;
int index = trees.size() - 1;
int id = accuOffset.get(index) + trees.get(index).getMaxId() + baseId;
return id;
@@ -100,7 +83,8 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
@Override
public int getSizeOfId() {
- if (trees.isEmpty()) return -1;
+ if (trees.isEmpty())
+ return -1;
int maxOffset = accuOffset.get(accuOffset.size() - 1);
TrieDictionary<T> lastTree = trees.get(trees.size() - 1);
int sizeOfId = BytesUtil.sizeForValue(baseId + maxOffset + lastTree.getMaxId() + 1);
@@ -115,15 +99,13 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
return maxValue;
}
- //value --> id
+ // value --> id
@Override
- protected int getIdFromValueImpl(T value, int roundingFlag)
- throws IllegalArgumentException {
+ protected int getIdFromValueImpl(T value, int roundingFlag) throws IllegalArgumentException {
byte[] valueBytes = bytesConvert.convertToBytes(value);
return getIdFromValueBytesImpl(valueBytes, 0, valueBytes.length, roundingFlag);
}
-
@Override
protected int getIdFromValueBytesImpl(byte[] value, int offset, int len, int roundingFlag) throws IllegalArgumentException {
@@ -132,13 +114,9 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
return result;
}
- //id = tree_inner_offset + accumulate_offset + baseId
- protected int _getIdFromValueBytesImpl(byte[] value, int offset, int len, int roundingFlag)
- throws IllegalArgumentException {
-
- //long startTime = System.currentTimeMillis();
+ // id = tree_inner_offset + accumulate_offset + baseId
+ protected int _getIdFromValueBytesImpl(byte[] value, int offset, int len, int roundingFlag) throws IllegalArgumentException {
ByteArray search = new ByteArray(value, offset, len);
- //copyTime.addAndGet(System.currentTimeMillis() - startTime);
int index = findIndexByValue(search);
if (index < 0) {
if (roundingFlag > 0) {
@@ -152,7 +130,6 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
T curTreeMax = trees.get(index).getValueFromId(trees.get(index).getMaxId());
byte[] b1 = bytesConvert.convertToBytes(curTreeMax);
ByteArray ba1 = new ByteArray(b1, 0, b1.length);
- //ByteArray ba2 = new ByteArray(value, 0, value.length);
if (search.compareTo(ba1) > 0)
index++;
if (index >= trees.size())
@@ -162,9 +139,6 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
id = tree.getIdFromValueBytes(value, offset, len, roundingFlag);
id = id + accuOffset.get(index);
id += baseId;
- if (id < 0) {
- throw new IllegalArgumentException("Value '" + Bytes.toString(value, offset, len) + "' (" + Bytes.toStringBinary(value, offset, len) + ") not exists!");
- }
return id;
}
@@ -179,9 +153,7 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
}
@Override
- protected int getValueBytesFromIdImpl(int id, byte[] returnValue, int offset)
- throws IllegalArgumentException {
- //long startTime = System.currentTimeMillis();
+ protected int getValueBytesFromIdImpl(int id, byte[] returnValue, int offset) throws IllegalArgumentException {
int index = findIndexById(id);
int treeInnerOffset = getTreeInnerOffset(id, index);
TrieDictionary<T> tree = trees.get(index);
@@ -189,20 +161,15 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
return size;
}
-
@Override
protected byte[] getValueBytesFromIdImpl(int id) throws IllegalArgumentException {
- int index = findIndexById(id); //lower bound
- if (index < 0) {
- throw new IllegalArgumentException("Tree Not Found. index < 0");
- }
+ int index = findIndexById(id);
int treeInnerOffset = getTreeInnerOffset(id, index);
TrieDictionary<T> tree = trees.get(index);
byte[] result = tree.getValueBytesFromId(treeInnerOffset);
return result;
}
-
private int getTreeInnerOffset(int id, int index) {
id -= baseId;
id = id - accuOffset.get(index);
@@ -259,7 +226,6 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
out.write(head);
}
-
private void writeBody(DataOutput out) throws IOException {
for (int i = 0; i < trees.size(); i++) {
TrieDictionary<T> tree = trees.get(i);
@@ -307,21 +273,6 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
}
- /*@Override
- public boolean equals(Object o) {
- if ((o instanceof TrieDictionaryForest) == false) {
- logger.info("Equals return false because it's not TrieDictionaryForest");
- return false;
- }
- TrieDictionaryForest that = (TrieDictionaryForest) o;
- if(this.trees.size() != that.getTrees().size())
- return false;
- for(int i=0;i<trees.size();i++){
- if(!trees.get(i).equals(that.getTrees().get(i))) return false;
- }
- return true;
- }*/
-
@Override
public boolean contains(Dictionary other) {
if (other.getSize() > this.getSize()) {
@@ -337,35 +288,59 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
return true;
}
- public List<TrieDictionary<T>> getTrees() {
+ @Override
+ public int hashCode() {
+ final int prime = 31;
+ int result = 1;
+ result = prime * result + baseId;
+ result = prime * result + ((trees == null) ? 0 : trees.hashCode());
+ return result;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj)
+ return true;
+ if (obj == null)
+ return false;
+ if (getClass() != obj.getClass())
+ return false;
+ TrieDictionaryForest other = (TrieDictionaryForest) obj;
+ if (baseId != other.baseId)
+ return false;
+ if (trees == null) {
+ if (other.trees != null)
+ return false;
+ } else if (!trees.equals(other.trees))
+ return false;
+ return true;
+ }
+
+ List<TrieDictionary<T>> getTrees() {
return Collections.unmodifiableList(this.trees);
}
+ BytesConverter<T> getBytesConvert() {
+ return bytesConvert;
+ }
+
private int findIndexByValue(ByteArray value) {
- int index = lowerBound(value, new Comparator<ByteArray>() {
- @Override
- public int compare(ByteArray o1, ByteArray o2) {
- return o1.compareTo(o2);
- }
- }, this.valueDivide);
+ int index = lowerBound(value, this.valueDivide);
return index;
}
private int findIndexById(Integer id) {
id -= baseId;
- int index = lowerBound(id, new Comparator<Integer>() {
- @Override
- public int compare(Integer o1, Integer o2) {
- return o1.compareTo(o2);
- }
- }, this.accuOffset);
+ int index = lowerBound(id, this.accuOffset);
+ if (index < 0) {
+ throw new IllegalArgumentException("Tree Not Found. index < 0");
+ }
return index;
}
-
- private static <K> int lowerBound(K lookfor, Comparator<K> comparator, ArrayList<K> list) {
+ private static <K extends Comparable> int lowerBound(K lookfor, ArrayList<K> list) {
if (list == null || list.isEmpty())
- return 0; //return the first tree
+ return -1; // not found
int left = 0;
int right = list.size() - 1;
int mid = 0;
@@ -373,7 +348,7 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
int comp = 0;
while (!found && left <= right) {
mid = left + (right - left) / 2;
- comp = comparator.compare(lookfor, list.get(mid));
+ comp = lookfor.compareTo(list.get(mid));
if (comp < 0)
right = mid - 1;
else if (comp > 0)
@@ -384,7 +359,7 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
if (found) {
return mid;
} else {
- return Math.min(left, right); //value may be bigger than the right tree
+ return Math.min(left, right); // value may be bigger than the right tree (could return -1)
}
}
@@ -402,16 +377,8 @@ public class TrieDictionaryForest<T> extends Dictionary<T> {
list.add("paint");
Collections.sort(list);
for (String str : list) {
- System.out.println("found value:" + str + " index:" + lowerBound(str, new Comparator<String>() {
- @Override
- public int compare(String o1, String o2) {
- return o1.compareTo(o2);
- }
- }, list));
+ System.out.println("found value:" + str + " index:" + lowerBound(str, list));
}
}
- public BytesConverter<T> getBytesConvert() {
- return bytesConvert;
- }
}
http://git-wip-us.apache.org/repos/asf/kylin/blob/21a1c5c0/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForestBuilder.java
----------------------------------------------------------------------
diff --git a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForestBuilder.java b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForestBuilder.java
index 1ceac27..8c577cf 100755
--- a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForestBuilder.java
+++ b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionaryForestBuilder.java
@@ -24,12 +24,14 @@ import org.slf4j.LoggerFactory;
import java.util.ArrayList;
-
+/**
+ * Build a trie dictionary forest if the input values is ordered, or the forest falls back to a single trie.
+ */
public class TrieDictionaryForestBuilder<T> {
public static int DEFAULT_MAX_TRIE_TREE_SIZE_MB = 500;
- //public static int MaxTrieTreeSize = 1024;//1k
+ private static final Logger logger = LoggerFactory.getLogger(TrieDictionaryForestBuilder.class);
private BytesConverter<T> bytesConverter;
@@ -41,11 +43,9 @@ public class TrieDictionaryForestBuilder<T> {
private ArrayList<ByteArray> valueDivide = new ArrayList<>(); //find tree
- private ArrayList<Integer> accuOffset = new ArrayList<>(); //find tree
-
- private ByteArray previousValue = null; //value use for remove duplicate
+ private ArrayList<Integer> accuOffset = new ArrayList<>(); //find tree
- private static final Logger logger = LoggerFactory.getLogger(TrieDictionaryForestBuilder.class);
+ private ByteArray previousValue = null; //value use for remove duplicate
private int baseId;
@@ -55,7 +55,6 @@ public class TrieDictionaryForestBuilder<T> {
private boolean isOrdered = true;
-
public TrieDictionaryForestBuilder(BytesConverter<T> bytesConverter) {
this(bytesConverter, 0);
}
@@ -68,46 +67,45 @@ public class TrieDictionaryForestBuilder<T> {
this.bytesConverter = bytesConverter;
this.trieBuilder = new TrieDictionaryBuilder<T>(bytesConverter);
this.baseId = baseId;
- curOffset = 0;
+ this.curOffset = 0;
this.maxTrieTreeSize = maxTrieTreeSizeMB * 1024 * 1024;
- logger.info("maxTrieSize is set to:" + maxTrieTreeSize + "B");
}
-
public void addValue(T value) {
- if (value == null) return;
+ if (value == null)
+ return;
byte[] valueBytes = bytesConverter.convertToBytes(value);
addValue(new ByteArray(valueBytes, 0, valueBytes.length));
}
public void addValue(byte[] value) {
- if (value == null) return;
+ if (value == null)
+ return;
ByteArray array = new ByteArray(value, 0, value.length);
addValue(array);
}
public void addValue(ByteArray value) {
- //System.out.println("value length:"+value.length);
- if (value == null) return;
- //logger.info("going to add value:" + new String(value.array()));
- if (previousValue == null) {
- previousValue = value;
- } else {
+ if (value == null)
+ return;
+ if (previousValue != null && isOrdered) {
int comp = previousValue.compareTo(value);
if (comp == 0) {
- //logger.info("find duplicate value:" + new String(value.array()));
return; //duplicate value
}
- if (comp > 0 && isOrdered) {
- logger.info("values not in ascending order:" + new String(value.array()));
+ if (comp > 0) {
+ logger.info("values not in ascending order, previous '{}', current '{}'", previousValue, value);
isOrdered = false;
- //System.out.println(".");
+ if (trees.size() > 0) {
+ throw new IllegalStateException("Invalid input data. Unordered data cannot be split into multi trees");
+ }
}
}
- this.trieBuilder.addValue(value.array());
previousValue = value;
- this.curTreeSize += value.length();
- if (curTreeSize >= this.maxTrieTreeSize) {
+ trieBuilder.addValue(value.array());
+ curTreeSize += value.length();
+
+ if (curTreeSize >= maxTrieTreeSize && isOrdered) {
TrieDictionary<T> tree = trieBuilder.build(0);
addTree(tree);
reset();
@@ -115,28 +113,17 @@ public class TrieDictionaryForestBuilder<T> {
}
public TrieDictionaryForest<T> build() {
- if (curTreeSize != 0) { //last tree
+ if (curTreeSize != 0) { //last tree
TrieDictionary<T> tree = trieBuilder.build(0);
addTree(tree);
reset();
}
- TrieDictionaryForest<T> forest = new TrieDictionaryForest<T>(this.trees,
- this.valueDivide, this.accuOffset, this.bytesConverter, baseId);
-
- //log
- logger.info("tree num:" + forest.getTrees().size());
- StringBuilder sb = new StringBuilder();
- for (ByteArray ba : valueDivide) {
- sb.append(new String(ba.array()) + " ");
- }
- logger.info("value divide:" + sb.toString());
- /*
- If input values are not in ascending order and tree num>1,TrieDictionaryForest can not work correctly.
- */
+ TrieDictionaryForest<T> forest = new TrieDictionaryForest<T>(this.trees, this.valueDivide, this.accuOffset, this.bytesConverter, baseId);
+
+ // if input values are not in ascending order and tree num>1,TrieDictionaryForest can not work correctly.
if (forest.getTrees().size() > 1 && !isOrdered) {
- throw new IllegalStateException("Invalid input data.Unordered data can not be split into multi trees");
+ throw new IllegalStateException("Invalid input data. Unordered data can not be split into multi trees");
}
-
return forest;
}
@@ -144,7 +131,7 @@ public class TrieDictionaryForestBuilder<T> {
return maxTrieTreeSize;
}
- public void setMaxTrieTreeSize(int maxTrieTreeSize) {
+ void setMaxTrieTreeSize(int maxTrieTreeSize) {
this.maxTrieTreeSize = maxTrieTreeSize;
logger.info("maxTrieSize is set to:" + maxTrieTreeSize + "B");
}
@@ -156,7 +143,6 @@ public class TrieDictionaryForestBuilder<T> {
byte[] valueBytes = tree.getValueBytesFromId(minId);
valueDivide.add(new ByteArray(valueBytes, 0, valueBytes.length));
curOffset += (tree.getMaxId() + 1);
- //System.out.println(" curOffset:"+ curOffset);
}
private void reset() {
@@ -169,7 +155,7 @@ public class TrieDictionaryForestBuilder<T> {
try {
config = KylinConfig.getInstanceFromEnv();
} catch (RuntimeException e) {
- logger.info("can not get KylinConfig from env.Use default setting:" + DEFAULT_MAX_TRIE_TREE_SIZE_MB + "MB");
+ logger.info("cannot get KylinConfig from env.Use default setting:" + DEFAULT_MAX_TRIE_TREE_SIZE_MB + "MB");
}
int maxTrieTreeSizeMB;
if (config != null) {