You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by em...@apache.org on 2019/08/16 02:55:18 UTC

[arrow] branch master updated: ARROW-5862: [Java] Provide dictionary builder

This is an automated email from the ASF dual-hosted git repository.

emkornfield pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 1f5ebd0  ARROW-5862: [Java] Provide dictionary builder
1f5ebd0 is described below

commit 1f5ebd0fae2c49831d9c52c64bd5e1b81e1b860a
Author: liyafan82 <fa...@foxmail.com>
AuthorDate: Thu Aug 15 19:54:18 2019 -0700

    ARROW-5862: [Java] Provide dictionary builder
    
    The dictionary builder servers for the following scenario which is frequently encountered in practice when dictionary encoding is involved: the dictionary values are not known a priori, so they are determined dynamically, as new data arrive continually.
    
    In particular, when a new value arrives, it is tested to check if it is already in the dictionary. If so, it is simply neglected, otherwise, it is added to the dictionary.
    
    When all values have been evaluated, the dictionary can be considered complete. So encoding can start afterward.
    
    The code snippet using a dictionary builder should be like this:
    
    DictonaryBuilder<IntVector> dictionaryBuilder = ...
    dictionaryBuilder.startBuild();
    ...
    dictionaryBuild.addValue(newValue);
    ...
    dictionaryBuilder.endBuild();
    
    Closes #4813 from liyafan82/fly_0705_build and squashes the following commits:
    
    2007b87c7 <liyafan82>  Provide dictionary builder
    
    Authored-by: liyafan82 <fa...@foxmail.com>
    Signed-off-by: Micah Kornfield <em...@gmail.com>
---
 .../SearchTreeBasedDictionaryBuilder.java          | 162 +++++++++++++++
 .../TestSearchTreeBasedDictionaryBuilder.java      | 222 +++++++++++++++++++++
 2 files changed, 384 insertions(+)

diff --git a/java/algorithm/src/main/java/org/apache/arrow/algorithm/dictionary/SearchTreeBasedDictionaryBuilder.java b/java/algorithm/src/main/java/org/apache/arrow/algorithm/dictionary/SearchTreeBasedDictionaryBuilder.java
new file mode 100644
index 0000000..a6f5642
--- /dev/null
+++ b/java/algorithm/src/main/java/org/apache/arrow/algorithm/dictionary/SearchTreeBasedDictionaryBuilder.java
@@ -0,0 +1,162 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.algorithm.dictionary;
+
+import java.util.TreeSet;
+
+import org.apache.arrow.algorithm.sort.VectorValueComparator;
+import org.apache.arrow.vector.ValueVector;
+
+/**
+ * A dictionary builder is intended for the scenario frequently encountered in practice:
+ * the dictionary is not known a priori, so it is generated dynamically.
+ * In particular, when a new value arrives, it is tested to check if it is already
+ * in the dictionary. If so, it is simply neglected, otherwise, it is added to the dictionary.
+ *
+ * <p>
+ *   This class builds the dictionary based on a binary search tree.
+ *   Each add operation can be finished in O(log(n)) time,
+ *   where n is the current dictionary size.
+ * </p>
+ * <p>
+ *   The dictionary builder is intended to build a single dictionary.
+ *   So it cannot be used for different dictionaries.
+ * </p>
+ * <p>Below gives the sample code for using the dictionary builder
+ * <pre>{@code
+ * SearchTreeBasedDictionaryBuilder dictionaryBuilder = ...
+ * ...
+ * dictionaryBuild.addValue(newValue);
+ * ...
+ * }</pre>
+ * </p>
+ * <p>
+ *  With the above code, the dictionary vector will be populated,
+ *  and it can be retrieved by the {@link SearchTreeBasedDictionaryBuilder#getDictionary()} method.
+ *  After that, dictionary encoding can proceed with the populated dictionary.
+ * </p>
+ * @param <V> the dictionary vector type.
+ */
+public class SearchTreeBasedDictionaryBuilder<V extends ValueVector> {
+
+  /**
+   * The dictionary to be built.
+   */
+  private final V dictionary;
+
+  /**
+   * The criteria for sorting in the search tree.
+   */
+  protected final VectorValueComparator<V> comparator;
+
+  /**
+   * If null should be encoded.
+   */
+  private final boolean encodeNull;
+
+  /**
+   * The search tree for storing the value index.
+   */
+  private TreeSet<Integer> searchTree;
+
+  /**
+   * Construct a search tree-based dictionary builder.
+   * @param dictionary the dictionary vector.
+   * @param comparator the criteria for value equality.
+   */
+  public SearchTreeBasedDictionaryBuilder(V dictionary, VectorValueComparator<V> comparator) {
+    this(dictionary, comparator, false);
+  }
+
+  /**
+   * Construct a search tree-based dictionary builder.
+   * @param dictionary the dictionary vector.
+   * @param comparator the criteria for value equality.
+   * @param encodeNull if null values should be added to the dictionary.
+   */
+  public SearchTreeBasedDictionaryBuilder(V dictionary, VectorValueComparator<V> comparator, boolean encodeNull) {
+    this.dictionary = dictionary;
+    this.comparator = comparator;
+    this.encodeNull = encodeNull;
+    this.comparator.attachVector(dictionary);
+
+    searchTree = new TreeSet<>((index1, index2) -> comparator.compare(index1, index2));
+  }
+
+  /**
+   * Gets the dictionary built.
+   * Please note that the dictionary is not in sorted order.
+   * Instead, its order is determined by the order of element insertion.
+   * To get the dictionary in sorted order, please use
+   * {@link SearchTreeBasedDictionaryBuilder#populateSortedDictionary(ValueVector)}.
+   * @return the dictionary.
+   */
+  public V getDictionary() {
+    return dictionary;
+  }
+
+  /**
+   * Try to add all values from the target vector to the dictionary.
+   * @param targetVector the target vector containing values to probe.
+   * @return the number of values actually added to the dictionary.
+   */
+  public int addValues(V targetVector) {
+    int ret = 0;
+    for (int i = 0; i < targetVector.getValueCount(); i++) {
+      if (!encodeNull && targetVector.isNull(i)) {
+        continue;
+      }
+      if (addValue(targetVector, i)) {
+        dictionary.setValueCount(dictionary.getValueCount() + 1);
+        ret += 1;
+      }
+    }
+    return ret;
+  }
+
+  /**
+   * Try to add an element from the target vector to the dictionary.
+   * @param targetVector the target vector containing new element.
+   * @param targetIndex the index of the new element in the target vector.
+   * @return true if the element is added to the dictionary, and false otherwise.
+   */
+  public boolean addValue(V targetVector, int targetIndex) {
+    // first copy the value to the end of the dictionary
+    int dictSize = dictionary.getValueCount();
+    dictionary.copyFromSafe(targetIndex, dictSize, targetVector);
+
+    // try to add the value to the dictionary,
+    // if an equal element does not exist.
+    // this operation can be done in O(logn) time.
+    boolean ret = searchTree.add(dictSize);
+    return ret;
+  }
+
+  /**
+   * Gets the sorted dictionary.
+   * Note that given the binary search tree, the sort can finish in O(n).
+   */
+  public void populateSortedDictionary(V sortedDictionary) {
+    int idx = 0;
+    for (Integer dictIdx : searchTree) {
+      sortedDictionary.copyFromSafe(dictIdx, idx++, dictionary);
+    }
+
+    sortedDictionary.setValueCount(dictionary.getValueCount());
+  }
+}
diff --git a/java/algorithm/src/test/java/org/apache/arrow/algorithm/dictionary/TestSearchTreeBasedDictionaryBuilder.java b/java/algorithm/src/test/java/org/apache/arrow/algorithm/dictionary/TestSearchTreeBasedDictionaryBuilder.java
new file mode 100644
index 0000000..72b113c
--- /dev/null
+++ b/java/algorithm/src/test/java/org/apache/arrow/algorithm/dictionary/TestSearchTreeBasedDictionaryBuilder.java
@@ -0,0 +1,222 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.algorithm.dictionary;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.arrow.algorithm.sort.DefaultVectorComparators;
+import org.apache.arrow.algorithm.sort.VectorValueComparator;
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.memory.RootAllocator;
+import org.apache.arrow.vector.IntVector;
+import org.apache.arrow.vector.VarCharVector;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Test cases for {@link SearchTreeBasedDictionaryBuilder}.
+ */
+public class TestSearchTreeBasedDictionaryBuilder {
+
+  private BufferAllocator allocator;
+
+  @Before
+  public void prepare() {
+    allocator = new RootAllocator(1024 * 1024);
+  }
+
+  @After
+  public void shutdown() {
+    allocator.close();
+  }
+
+  @Test
+  public void testBuildVariableWidthDictionaryWithNull() {
+    try (VarCharVector vec = new VarCharVector("", allocator);
+         VarCharVector dictionary = new VarCharVector("", allocator);
+         VarCharVector sortedDictionary = new VarCharVector("", allocator)) {
+
+      vec.allocateNew(100, 10);
+      vec.setValueCount(10);
+
+      dictionary.allocateNew();
+      sortedDictionary.allocateNew();
+
+      // fill data
+      vec.set(0, "hello".getBytes());
+      vec.set(1, "abc".getBytes());
+      vec.setNull(2);
+      vec.set(3, "world".getBytes());
+      vec.set(4, "12".getBytes());
+      vec.set(5, "dictionary".getBytes());
+      vec.setNull(6);
+      vec.set(7, "hello".getBytes());
+      vec.set(8, "good".getBytes());
+      vec.set(9, "abc".getBytes());
+
+      VectorValueComparator<VarCharVector> comparator = DefaultVectorComparators.createDefaultComparator(vec);
+      SearchTreeBasedDictionaryBuilder<VarCharVector> dictionaryBuilder =
+              new SearchTreeBasedDictionaryBuilder<>(dictionary, comparator, true);
+
+      int result = dictionaryBuilder.addValues(vec);
+
+      assertEquals(7, result);
+      assertEquals(7, dictionary.getValueCount());
+
+      dictionaryBuilder.populateSortedDictionary(sortedDictionary);
+
+      assertTrue(sortedDictionary.isNull(0));
+      assertEquals("12", new String(sortedDictionary.get(1)));
+      assertEquals("abc", new String(sortedDictionary.get(2)));
+      assertEquals("dictionary", new String(sortedDictionary.get(3)));
+      assertEquals("good", new String(sortedDictionary.get(4)));
+      assertEquals("hello", new String(sortedDictionary.get(5)));
+      assertEquals("world", new String(sortedDictionary.get(6)));
+    }
+  }
+
+  @Test
+  public void testBuildVariableWidthDictionaryWithoutNull() {
+    try (VarCharVector vec = new VarCharVector("", allocator);
+         VarCharVector dictionary = new VarCharVector("", allocator);
+         VarCharVector sortedDictionary = new VarCharVector("", allocator)) {
+
+      vec.allocateNew(100, 10);
+      vec.setValueCount(10);
+
+      dictionary.allocateNew();
+      sortedDictionary.allocateNew();
+
+      // fill data
+      vec.set(0, "hello".getBytes());
+      vec.set(1, "abc".getBytes());
+      vec.setNull(2);
+      vec.set(3, "world".getBytes());
+      vec.set(4, "12".getBytes());
+      vec.set(5, "dictionary".getBytes());
+      vec.setNull(6);
+      vec.set(7, "hello".getBytes());
+      vec.set(8, "good".getBytes());
+      vec.set(9, "abc".getBytes());
+
+      VectorValueComparator<VarCharVector> comparator = DefaultVectorComparators.createDefaultComparator(vec);
+      SearchTreeBasedDictionaryBuilder<VarCharVector> dictionaryBuilder =
+              new SearchTreeBasedDictionaryBuilder<>(dictionary, comparator, false);
+
+      int result = dictionaryBuilder.addValues(vec);
+
+      assertEquals(6, result);
+      assertEquals(6, dictionary.getValueCount());
+
+      dictionaryBuilder.populateSortedDictionary(sortedDictionary);
+
+      assertEquals("12", new String(sortedDictionary.get(0)));
+      assertEquals("abc", new String(sortedDictionary.get(1)));
+      assertEquals("dictionary", new String(sortedDictionary.get(2)));
+      assertEquals("good", new String(sortedDictionary.get(3)));
+      assertEquals("hello", new String(sortedDictionary.get(4)));
+      assertEquals("world", new String(sortedDictionary.get(5)));
+    }
+  }
+
+  @Test
+  public void testBuildFixedWidthDictionaryWithNull() {
+    try (IntVector vec = new IntVector("", allocator);
+         IntVector dictionary = new IntVector("", allocator);
+         IntVector sortedDictionary = new IntVector("", allocator)) {
+      vec.allocateNew(10);
+      vec.setValueCount(10);
+
+      dictionary.allocateNew();
+      sortedDictionary.allocateNew();
+
+      // fill data
+      vec.set(0, 4);
+      vec.set(1, 8);
+      vec.set(2, 32);
+      vec.set(3, 8);
+      vec.set(4, 16);
+      vec.set(5, 32);
+      vec.setNull(6);
+      vec.set(7, 4);
+      vec.set(8, 4);
+      vec.setNull(9);
+
+      VectorValueComparator<IntVector> comparator = DefaultVectorComparators.createDefaultComparator(vec);
+      SearchTreeBasedDictionaryBuilder<IntVector> dictionaryBuilder =
+              new SearchTreeBasedDictionaryBuilder<>(dictionary, comparator, true);
+
+      int result = dictionaryBuilder.addValues(vec);
+
+      assertEquals(5, result);
+      assertEquals(5, dictionary.getValueCount());
+
+      dictionaryBuilder.populateSortedDictionary(sortedDictionary);
+
+      assertTrue(sortedDictionary.isNull(0));
+      assertEquals(4, sortedDictionary.get(1));
+      assertEquals(8, sortedDictionary.get(2));
+      assertEquals(16, sortedDictionary.get(3));
+      assertEquals(32, sortedDictionary.get(4));
+    }
+  }
+
+  @Test
+  public void testBuildFixedWidthDictionaryWithoutNull() {
+    try (IntVector vec = new IntVector("", allocator);
+         IntVector dictionary = new IntVector("", allocator);
+         IntVector sortedDictionary = new IntVector("", allocator)) {
+      vec.allocateNew(10);
+      vec.setValueCount(10);
+
+      dictionary.allocateNew();
+      sortedDictionary.allocateNew();
+
+      // fill data
+      vec.set(0, 4);
+      vec.set(1, 8);
+      vec.set(2, 32);
+      vec.set(3, 8);
+      vec.set(4, 16);
+      vec.set(5, 32);
+      vec.setNull(6);
+      vec.set(7, 4);
+      vec.set(8, 4);
+      vec.setNull(9);
+
+      VectorValueComparator<IntVector> comparator = DefaultVectorComparators.createDefaultComparator(vec);
+      SearchTreeBasedDictionaryBuilder<IntVector> dictionaryBuilder =
+              new SearchTreeBasedDictionaryBuilder<>(dictionary, comparator, false);
+
+      int result = dictionaryBuilder.addValues(vec);
+
+      assertEquals(4, result);
+      assertEquals(4, dictionary.getValueCount());
+
+      dictionaryBuilder.populateSortedDictionary(sortedDictionary);
+
+      assertEquals(4, sortedDictionary.get(0));
+      assertEquals(8, sortedDictionary.get(1));
+      assertEquals(16, sortedDictionary.get(2));
+      assertEquals(32, sortedDictionary.get(3));
+    }
+  }
+}