You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kylin.apache.org by sh...@apache.org on 2018/08/20 10:48:08 UTC
[kylin] 02/03: KYLIN-3489 improve the efficiency of enumerating dictionary values by pre-order visiting

This is an automated email from the ASF dual-hosted git repository.

shaofengshi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kylin.git

commit c9337cb4a5e6c4f04c10b7fde1d5e3f6a8ffa25b
Author: Zhong <nj...@apache.org>
AuthorDate: Wed Aug 15 16:04:30 2018 +0800

    KYLIN-3489 improve the efficiency of enumerating dictionary values by pre-order visiting
---
 .../org/apache/kylin/common/util/Dictionary.java   | 11 +++++
 .../java/org/apache/kylin/dict/TrieDictionary.java | 51 ++++++++++++++++++++++
 .../org/apache/kylin/dict/TrieDictionaryTest.java  | 30 +++++++++++++
 3 files changed, 92 insertions(+)

diff --git a/core-common/src/main/java/org/apache/kylin/common/util/Dictionary.java b/core-common/src/main/java/org/apache/kylin/common/util/Dictionary.java
index 7c1e8ab..5c47399 100644
--- a/core-common/src/main/java/org/apache/kylin/common/util/Dictionary.java
+++ b/core-common/src/main/java/org/apache/kylin/common/util/Dictionary.java
@@ -24,11 +24,14 @@ import java.io.IOException;
 import java.io.PrintStream;
 import java.io.Serializable;
 import java.io.UnsupportedEncodingException;
+import java.util.List;
 
 import org.apache.kylin.common.KylinConfig;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.google.common.collect.Lists;
+
 /**
  * A bi-way dictionary that maps from dimension/column values to IDs and vice
  * versa. By storing IDs instead of real values, the size of cube is
@@ -170,6 +173,14 @@ abstract public class Dictionary<T> implements Serializable {
 
     abstract public void dump(PrintStream out);
 
+    public List<T> enumeratorValues() {
+        List<T> ret = Lists.newArrayListWithExpectedSize(getSize());
+        for (int i = getMinId(); i <= getMaxId(); i++) {
+            ret.add(getValueFromId(i));
+        }
+        return ret;
+    }
+
     public int nullId() {
         return NULL_ID[getSizeOfId()];
     }
diff --git a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionary.java b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionary.java
index d531c05..8303bb0 100644
--- a/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionary.java
+++ b/core-dictionary/src/main/java/org/apache/kylin/dict/TrieDictionary.java
@@ -28,6 +28,7 @@ import java.io.ObjectInputStream;
 import java.io.ObjectOutputStream;
 import java.io.PrintStream;
 import java.util.Arrays;
+import java.util.List;
 
 import org.apache.kylin.common.util.Bytes;
 import org.apache.kylin.common.util.BytesUtil;
@@ -36,7 +37,9 @@ import org.apache.kylin.common.util.Dictionary;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
 
 /**
  * A dictionary based on Trie data structure that maps enumerations of byte[] to
@@ -359,6 +362,54 @@ public class TrieDictionary<T> extends CacheDictionary<T> {
     }
 
     @Override
+    public List<T> enumeratorValues() {
+        List<T> result = Lists.newArrayListWithExpectedSize(getSize());
+        byte[] buf = new byte[maxValueLength];
+        visitNode(headSize, buf, 0, result);
+        return result;
+    }
+
+    @VisibleForTesting
+    List<T> enumeratorValuesByParent() {
+        return super.enumeratorValues();
+    }
+
+    /**
+     * Visit the trie tree by pre-order
+     * @param n           -- the offset of current node in trieBytes
+     * @param returnValue -- where return value is written to
+     */
+    private void visitNode(int n, byte[] returnValue, int offset, List<T> result) {
+        int o = offset;
+
+        // write current node value
+        int p = n + firstByteOffset;
+        int len = BytesUtil.readUnsigned(trieBytes, p - 1, 1);
+        System.arraycopy(trieBytes, p, returnValue, o, len);
+        o += len;
+
+        // if the value is ended
+        boolean isEndOfValue = checkFlag(n, BIT_IS_END_OF_VALUE);
+        if (isEndOfValue) {
+            T curNodeValue = bytesConvert.convertFromBytes(returnValue, 0, o);
+            result.add(curNodeValue);
+        }
+
+        // find a child to continue
+        int c = getChildOffset(n);
+        if (c == headSize) // has no children
+            return;
+        while (true) {
+            visitNode(c, returnValue, o, result);
+            if (checkFlag(c, BIT_IS_LAST_CHILD))
+                return;
+            // go to next child
+            p = c + firstByteOffset;
+            c = p + BytesUtil.readUnsigned(trieBytes, p - 1, 1);
+        }
+    }
+
+    @Override
     public void dump(PrintStream out) {
         out.println("Total " + nValues + " values");
         for (int i = 0; i < nValues; i++) {
diff --git a/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryTest.java b/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryTest.java
index 600b072..c873035 100644
--- a/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryTest.java
+++ b/core-dictionary/src/test/java/org/apache/kylin/dict/TrieDictionaryTest.java
@@ -35,13 +35,18 @@ import java.io.InputStreamReader;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.Iterator;
+import java.util.List;
 import java.util.NoSuchElementException;
 import java.util.Random;
 import java.util.TreeSet;
+import java.util.concurrent.TimeUnit;
 
 import org.junit.Assert;
 import org.junit.Test;
 
+import com.google.common.base.Stopwatch;
+import com.google.common.collect.Sets;
+
 public class TrieDictionaryTest {
 
     public static void main(String[] args) throws Exception {
@@ -209,6 +214,31 @@ public class TrieDictionaryTest {
     }
 
     @Test
+    public void testEnumeratorValues() throws Exception {
+        testEnumeratorValues("src/test/resources/dict/english-words.80 (scowl-2015.05.18).txt");
+        testEnumeratorValues("src/test/resources/dict/dw_category_grouping_names.dat");
+    }
+
+    private void testEnumeratorValues(String file) throws Exception {
+        InputStream is = new FileInputStream(file);
+        ArrayList<String> str = loadStrings(is);
+        TrieDictionaryBuilder<String> b = newDictBuilder(str);
+        TrieDictionary<String> dict = b.build(0);
+        System.out.println("Dictionary size for file " + file + " is " + dict.getSize());
+
+        Stopwatch sw = new Stopwatch();
+        sw.start();
+        List<String> values1 = dict.enumeratorValuesByParent();
+        System.out.println("By iterating id visit the time cost " + sw.elapsed(TimeUnit.MILLISECONDS) + " ms");
+        sw.reset();
+        sw.start();
+        List<String> values2 = dict.enumeratorValues();
+        System.out.println("By pre-order visit the time cost " + sw.elapsed(TimeUnit.MILLISECONDS) + " ms");
+        sw.stop();
+        assertEquals(Sets.newHashSet(values1), Sets.newHashSet(values2));
+    }
+
+    @Test
     public void englishWordsTest() throws Exception {
         InputStream is = new FileInputStream("src/test/resources/dict/english-words.80 (scowl-2015.05.18).txt");
         ArrayList<String> str = loadStrings(is);