You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ko...@apache.org on 2019/07/04 05:52:10 UTC

[arrow] 31/38: ARROW-5814: [Java] Implement a HashMap for DictionaryEncoder

This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 7b6dfa58a2f3f37dcabc1c54b7398f3f891e837c
Author: tianchen <ni...@alibaba-inc.com>
AuthorDate: Tue Jul 2 21:13:04 2019 -0700

    ARROW-5814: [Java] Implement a <Object, int> HashMap for DictionaryEncoder
    
    Related to [ARROW-5814](https://issues.apache.org/jira/browse/ARROW-5814).
    
    As a follow-up of https://github.com/apache/arrow/pull/4698. Implement a Map<Object, int> for DictionaryEncoder to reduce boxing/unboxing operations.
    
    Benchmark:
    DictionaryEncodeHashMapBenchmarks.testHashMap: avgt 5 31151.345 ± 1661.878 ns/op
    DictionaryEncodeHashMapBenchmarks.testDictionaryEncodeHashMap: avgt 5 15549.902 ± 771.647 ns/op
    
    Author: tianchen <ni...@alibaba-inc.com>
    
    Closes #4765 from tianchen92/map and squashes the following commits:
    
    38ee5a4af <tianchen> add UT
    f62003337 <tianchen> add javadoc and change package
    10596ad87 <tianchen> fix style
    86eb350b3 <tianchen> add interface
    98f4c5593 <tianchen> init
---
 .../DictionaryEncodeHashMapBenchmarks.java         | 117 +++++++
 .../vector/dictionary/DictionaryEncodeHashMap.java | 368 +++++++++++++++++++++
 .../arrow/vector/dictionary/ObjectIntMap.java      |  50 +++
 .../arrow/vector/TestDictionaryEncodeHashMap.java  | 123 +++++++
 4 files changed, 658 insertions(+)

diff --git a/java/performance/src/test/java/org/apache/arrow/vector/dictionary/DictionaryEncodeHashMapBenchmarks.java b/java/performance/src/test/java/org/apache/arrow/vector/dictionary/DictionaryEncodeHashMapBenchmarks.java
new file mode 100644
index 0000000..e97bff2
--- /dev/null
+++ b/java/performance/src/test/java/org/apache/arrow/vector/dictionary/DictionaryEncodeHashMapBenchmarks.java
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector.dictionary;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Random;
+import java.util.concurrent.TimeUnit;
+
+import org.junit.Test;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.runner.Runner;
+import org.openjdk.jmh.runner.RunnerException;
+import org.openjdk.jmh.runner.options.Options;
+import org.openjdk.jmh.runner.options.OptionsBuilder;
+
+/**
+ * Benchmarks for {@link DictionaryEncodeHashMap}.
+ */
+@State(Scope.Benchmark)
+public class DictionaryEncodeHashMapBenchmarks {
+  private static final int SIZE = 1000;
+
+  private static final int KEY_LENGTH = 10;
+
+  private List<String> testData = new ArrayList<>();
+
+  private HashMap<String, Integer> hashMap =  new HashMap();
+  private DictionaryEncodeHashMap<String> dictionaryEncodeHashMap = new DictionaryEncodeHashMap();
+
+  /**
+   * Setup benchmarks.
+   */
+  @Setup
+  public void prepare() {
+    for (int i = 0; i < SIZE; i++) {
+      testData.add(getRandomString(KEY_LENGTH));
+    }
+  }
+
+  private String getRandomString(int length) {
+    String str = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
+    Random random = new Random();
+    StringBuffer sb = new StringBuffer();
+    for (int i = 0; i < length; i++) {
+      int number = random.nextInt(62);
+      sb.append(str.charAt(number));
+    }
+    return sb.toString();
+  }
+
+  /**
+   * Test set/get int values for {@link HashMap}.
+   * @return useless. To avoid DCE by JIT.
+   */
+  @Benchmark
+  @BenchmarkMode(Mode.AverageTime)
+  @OutputTimeUnit(TimeUnit.NANOSECONDS)
+  public int testHashMap() {
+    for (int i = 0; i < SIZE; i++) {
+      hashMap.put(testData.get(i), i);
+    }
+    for (int i = 0; i < SIZE; i++) {
+      hashMap.get(testData.get(i));
+    }
+    return 0;
+  }
+
+  /**
+   * Test set/get int values for {@link DictionaryEncodeHashMap}.
+   * @return useless. To avoid DCE by JIT.
+   */
+  @Benchmark
+  @BenchmarkMode(Mode.AverageTime)
+  @OutputTimeUnit(TimeUnit.NANOSECONDS)
+  public int testDictionaryEncodeHashMap() {
+    for (int i = 0; i < SIZE; i++) {
+      dictionaryEncodeHashMap.put(testData.get(i), i);
+    }
+    for (int i = 0; i < SIZE; i++) {
+      dictionaryEncodeHashMap.get(testData.get(i));
+    }
+    return 0;
+  }
+
+  @Test
+  public void evaluate() throws RunnerException {
+    Options opt = new OptionsBuilder()
+        .include(DictionaryEncodeHashMapBenchmarks.class.getSimpleName())
+        .forks(1)
+        .build();
+
+    new Runner(opt).run();
+  }
+}
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncodeHashMap.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncodeHashMap.java
new file mode 100644
index 0000000..659a8d6
--- /dev/null
+++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncodeHashMap.java
@@ -0,0 +1,368 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector.dictionary;
+
+import java.util.Objects;
+
+/**
+ * Implementation of the {@link ObjectIntMap} interface, used for DictionaryEncoder.
+ * Note that value in this map is always not less than 0, and -1 represents a null value.
+ * @param <K> key type.
+ */
+public class DictionaryEncodeHashMap<K> implements ObjectIntMap<K> {
+
+  /**
+   * Represents a null value in map.
+   */
+  static final int NULL_VALUE = -1;
+
+  /**
+   * The default initial capacity - MUST be a power of two.
+   */
+  static final int DEFAULT_INITIAL_CAPACITY = 1 << 4;
+
+  /**
+   * The maximum capacity, used if a higher value is implicitly specified
+   * by either of the constructors with arguments.
+   */
+  static final int MAXIMUM_CAPACITY = 1 << 30;
+
+  /**
+   * The load factor used when none specified in constructor.
+   */
+  static final float DEFAULT_LOAD_FACTOR = 0.75f;
+
+  static final Entry<?>[] EMPTY_TABLE = {};
+
+  /**
+   * The table, initialized on first use, and resized as
+   * necessary. When allocated, length is always a power of two.
+   */
+  transient Entry<K>[] table = (Entry<K>[]) EMPTY_TABLE;
+
+  /**
+   * The number of key-value mappings contained in this map.
+   */
+  transient int size;
+
+  /**
+   * The next size value at which to resize (capacity * load factor).
+   */
+  int threshold;
+
+  /**
+   * The load factor for the hash table.
+   */
+  final float loadFactor;
+
+  /**
+   * Constructs an empty map with the specified initial capacity and load factor.
+   */
+  public DictionaryEncodeHashMap(int initialCapacity, float loadFactor) {
+    if (initialCapacity < 0) {
+      throw new IllegalArgumentException("Illegal initial capacity: " +
+          initialCapacity);
+    }
+    if (initialCapacity > MAXIMUM_CAPACITY) {
+      initialCapacity = MAXIMUM_CAPACITY;
+    }
+    if (loadFactor <= 0 || Float.isNaN(loadFactor)) {
+      throw new IllegalArgumentException("Illegal load factor: " +
+          loadFactor);
+    }
+    this.loadFactor = loadFactor;
+    this.threshold = initialCapacity;
+  }
+
+  public DictionaryEncodeHashMap(int initialCapacity) {
+    this(initialCapacity, DEFAULT_LOAD_FACTOR);
+  }
+
+  public DictionaryEncodeHashMap() {
+    this(DEFAULT_INITIAL_CAPACITY, DEFAULT_LOAD_FACTOR);
+  }
+
+  /**
+   * Compute the capacity with given threshold and create init table.
+   */
+  private void inflateTable(int threshold) {
+    int capacity = roundUpToPowerOf2(threshold);
+    this.threshold = (int) Math.min(capacity * loadFactor, MAXIMUM_CAPACITY + 1);
+    table = new Entry[capacity];
+  }
+
+  /**
+   * Computes key.hashCode() and spreads (XORs) higher bits of hash to lower.
+   */
+  static final int hash(Object key) {
+    int h;
+    return (key == null) ? 0 : (h = key.hashCode()) ^ (h >>> 16);
+  }
+
+  /**
+   * Computes the storage location in an array for the given hashCode.
+   */
+  static int indexFor(int h, int length) {
+    return h & (length - 1);
+  }
+
+  /**
+   * Returns a power of two size for the given size.
+   */
+  static final int roundUpToPowerOf2(int size) {
+    int n = size - 1;
+    n |= n >>> 1;
+    n |= n >>> 2;
+    n |= n >>> 4;
+    n |= n >>> 8;
+    n |= n >>> 16;
+    return (n < 0) ? 1 : (n >= MAXIMUM_CAPACITY) ? MAXIMUM_CAPACITY : n + 1;
+  }
+
+  /**
+   * Returns the value to which the specified key is mapped,
+   * or -1 if this map contains no mapping for the key.
+   */
+  @Override
+  public int get(K key) {
+    int hash = hash(key);
+    int index = indexFor(hash, table.length);
+    for (Entry<K> e = table[index] ; e != null ; e = e.next) {
+      if ((e.hash == hash) && e.key.equals(key)) {
+        return e.value;
+      }
+    }
+    return NULL_VALUE;
+  }
+
+  /**
+   * Associates the specified value with the specified key in this map.
+   * If the map previously contained a mapping for the key, the old
+   * value is replaced.
+   */
+  @Override
+  public int put(K key, int value) {
+    if (table == EMPTY_TABLE) {
+      inflateTable(threshold);
+    }
+
+    if (key == null) {
+      return putForNullKey(value);
+    }
+
+    int hash = hash(key);
+    int i = indexFor(hash, table.length);
+    for (Entry<K> e = table[i]; e != null; e = e.next) {
+      Object k;
+      if (e.hash == hash && ((k = e.key) == key || key.equals(k))) {
+        int oldValue = e.value;
+        e.value = value;
+        return oldValue;
+      }
+    }
+
+    addEntry(hash, key, value, i);
+    return NULL_VALUE;
+  }
+
+  /**
+   * Removes the mapping for the specified key from this map if present.
+   * @param key key whose mapping is to be removed from the map
+   * @return the previous value associated with <tt>key</tt>, or
+   *         -1 if there was no mapping for <tt>key</tt>.
+   */
+  @Override
+  public int remove(K key) {
+    Entry<K> e = removeEntryForKey(key);
+    return (e == null ? NULL_VALUE : e.value);
+  }
+
+  /**
+   * Create a new Entry at the specific position of table.
+   */
+  void createEntry(int hash, K key, int value, int bucketIndex) {
+    Entry<K> e = table[bucketIndex];
+    table[bucketIndex] = new Entry<>(hash, key, value, e);
+    size++;
+  }
+
+  /**
+   * Put value when key is null.
+   */
+  private int putForNullKey(int value) {
+    for (Entry<K> e = table[0]; e != null; e = e.next) {
+      if (e.key == null) {
+        int oldValue = e.value;
+        e.value = value;
+        return oldValue;
+      }
+    }
+    addEntry(0, null, value, 0);
+    return NULL_VALUE;
+  }
+
+  /**
+   * Add Entry at the specified location of the table.
+   */
+  void addEntry(int hash, K key, int value, int bucketIndex) {
+    if ((size >= threshold) && (null != table[bucketIndex])) {
+      resize(2 * table.length);
+      hash = (null != key) ? hash(key) : 0;
+      bucketIndex = indexFor(hash, table.length);
+    }
+
+    createEntry(hash, key, value, bucketIndex);
+  }
+
+  /**
+   * Resize table with given new capacity.
+   */
+  void resize(int newCapacity) {
+    Entry[] oldTable = table;
+    int oldCapacity = oldTable.length;
+    if (oldCapacity == MAXIMUM_CAPACITY) {
+      threshold = Integer.MAX_VALUE;
+      return;
+    }
+
+    Entry[] newTable = new Entry[newCapacity];
+    transfer(newTable);
+    table = newTable;
+    threshold = (int)Math.min(newCapacity * loadFactor, MAXIMUM_CAPACITY + 1);
+  }
+
+  /**
+   * Transfer entries into new table from old table.
+   * @param newTable new table
+   */
+  void transfer(Entry[] newTable) {
+    int newCapacity = newTable.length;
+    for (Entry<K> e : table) {
+      while (null != e) {
+        Entry<K> next = e.next;
+        int i = indexFor(e.hash, newCapacity);
+        e.next = newTable[i];
+        newTable[i] = e;
+        e = next;
+      }
+    }
+  }
+
+  /**
+   * Remove entry associated with the given key.
+   */
+  final Entry<K> removeEntryForKey(Object key) {
+    if (size == 0) {
+      return null;
+    }
+    int hash = (key == null) ? 0 : hash(key);
+    int i = indexFor(hash, table.length);
+    Entry<K> prev = table[i];
+    Entry<K> e = prev;
+
+    while (e != null) {
+      Entry<K> next = e.next;
+      Object k;
+      if (e.hash == hash && ((k = e.key) == key || (key != null && key.equals(k)))) {
+        size--;
+        if (prev == e) {
+          table[i] = next;
+        } else {
+          prev.next = next;
+        }
+
+        return e;
+      }
+      prev = e;
+      e = next;
+    }
+
+    return e;
+  }
+
+  /**
+   * Returns the number of mappings in this Map.
+   */
+  public int size() {
+    return size;
+  }
+
+  /**
+   * Removes all elements from this map, leaving it empty.
+   */
+  public void clear() {
+    size = 0;
+    for (int i = 0; i < table.length; i++) {
+      table[i] = null;
+    }
+  }
+
+  /**
+   * Class to keep key-value data within hash map.
+   * @param <K> key type.
+   */
+  static class Entry<K> {
+    final K key;
+    int value;
+    Entry<K> next;
+    int hash;
+
+    Entry(int hash, K key, int value, Entry<K> next) {
+      this.key = key;
+      this.value = value;
+      this.hash = hash;
+      this.next = next;
+    }
+
+    public final K getKey() {
+      return key;
+    }
+
+    public final int getValue() {
+      return value;
+    }
+
+    public final int setValue(int newValue) {
+      int oldValue = value;
+      value = newValue;
+      return oldValue;
+    }
+
+    public final boolean equals(Object o) {
+      if (!(o instanceof DictionaryEncodeHashMap.Entry)) {
+        return false;
+      }
+      Entry e = (Entry) o;
+      if (Objects.equals(key, e.getKey())) {
+        if (value == e.getValue()) {
+          return true;
+        }
+      }
+      return false;
+    }
+
+    public final int hashCode() {
+      return Objects.hashCode(key) ^ Objects.hashCode(value);
+    }
+
+    public final String toString() {
+      return getKey() + "=" + getValue();
+    }
+  }
+
+}
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/ObjectIntMap.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/ObjectIntMap.java
new file mode 100644
index 0000000..582bb56
--- /dev/null
+++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/ObjectIntMap.java
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector.dictionary;
+
+/**
+ * Specific hash map for int type value, reducing boxing/unboxing operations.
+ * @param <K> key type.
+ */
+public interface ObjectIntMap<K> {
+
+  /**
+   * Associates the specified value with the specified key in this map.
+   * If the map previously contained a mapping for the key, the old
+   * value is replaced.
+   * @param key key with which the specified value is to be associated
+   * @param value value to be associated with the specified key
+   * @return the previous value associated with <tt>key</tt>, or
+   *         -1 if there was no mapping for <tt>key</tt>.
+   */
+  int put(K key, int value);
+
+  /**
+   * Returns the value to which the specified key is mapped,
+   * or -1 if this map contains no mapping for the key.
+   */
+  int get(K key);
+
+  /**
+   * Removes the mapping for the specified key from this map if present.
+   * @param key key whose mapping is to be removed from the map
+   * @return the previous value associated with <tt>key</tt>, or
+   *         -1 if there was no mapping for <tt>key</tt>.
+   */
+  int remove(K key);
+}
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryEncodeHashMap.java b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryEncodeHashMap.java
new file mode 100644
index 0000000..5f4e710
--- /dev/null
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryEncodeHashMap.java
@@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector;
+
+import static org.junit.Assert.assertEquals;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+import org.apache.arrow.vector.dictionary.DictionaryEncodeHashMap;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+
+
+public class TestDictionaryEncodeHashMap {
+
+  private List<String> testData = new ArrayList<>();
+
+  private static final int SIZE = 100;
+
+  private static final int KEY_LENGTH = 5;
+
+  private DictionaryEncodeHashMap<String> map = new DictionaryEncodeHashMap();
+
+  @Before
+  public void init() {
+    for (int i = 0; i < SIZE; i++) {
+      testData.add(generateUniqueKey(KEY_LENGTH));
+    }
+  }
+
+  @After
+  public void terminate() throws Exception {
+    testData.clear();
+  }
+
+  private String generateUniqueKey(int length) {
+    String str = "abcdefghijklmnopqrstuvwxyz";
+    Random random = new Random();
+    StringBuffer sb = new StringBuffer();
+    for (int i = 0; i < length; i++) {
+      int number = random.nextInt(26);
+      sb.append(str.charAt(number));
+    }
+    if (testData.contains(sb.toString())) {
+      return generateUniqueKey(length);
+    }
+    return sb.toString();
+  }
+
+  @Test
+  public void testPutAndGet() {
+    for (int i = 0; i < SIZE; i++) {
+      map.put(testData.get(i), i);
+    }
+
+    for (int i = 0; i < SIZE; i++) {
+      assertEquals(i, map.get(testData.get(i)));
+    }
+  }
+
+  @Test
+  public void testPutExistKey() {
+    for (int i = 0; i < SIZE; i++) {
+      map.put(testData.get(i), i);
+    }
+    map.put("test_key", 101);
+    assertEquals(101, map.get("test_key"));
+    map.put("test_key", 102);
+    assertEquals(102, map.get("test_key"));
+  }
+
+  @Test
+  public void testGetNonExistKey() {
+    for (int i = 0; i < SIZE; i++) {
+      map.put(testData.get(i), i);
+    }
+    //remove if exists.
+    map.remove("test_key");
+    assertEquals(-1, map.get("test_key"));
+  }
+
+  @Test
+  public void testRemove() {
+    for (int i = 0; i < SIZE; i++) {
+      map.put(testData.get(i), i);
+    }
+    map.put("test_key", 10000);
+    assertEquals(SIZE + 1, map.size());
+    assertEquals(10000, map.get("test_key"));
+    map.remove("test_key");
+    assertEquals(SIZE, map.size());
+    assertEquals(-1, map.get("test_key"));
+  }
+
+  @Test
+  public void testSize() {
+    assertEquals(0, map.size());
+    for (int i = 0; i < SIZE; i++) {
+      map.put(testData.get(i), i);
+    }
+    assertEquals(SIZE, map.size());
+  }
+}