You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ko...@apache.org on 2019/07/04 05:52:10 UTC
[arrow] 31/38: ARROW-5814: [Java] Implement a
This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 7b6dfa58a2f3f37dcabc1c54b7398f3f891e837c
Author: tianchen <ni...@alibaba-inc.com>
AuthorDate: Tue Jul 2 21:13:04 2019 -0700
ARROW-5814: [Java] Implement a <Object, int> HashMap for DictionaryEncoder
Related to [ARROW-5814](https://issues.apache.org/jira/browse/ARROW-5814).
As a follow-up of https://github.com/apache/arrow/pull/4698. Implement a Map<Object, int> for DictionaryEncoder to reduce boxing/unboxing operations.
Benchmark:
DictionaryEncodeHashMapBenchmarks.testHashMap: avgt 5 31151.345 ± 1661.878 ns/op
DictionaryEncodeHashMapBenchmarks.testDictionaryEncodeHashMap: avgt 5 15549.902 ± 771.647 ns/op
Author: tianchen <ni...@alibaba-inc.com>
Closes #4765 from tianchen92/map and squashes the following commits:
38ee5a4af <tianchen> add UT
f62003337 <tianchen> add javadoc and change package
10596ad87 <tianchen> fix style
86eb350b3 <tianchen> add interface
98f4c5593 <tianchen> init
---
.../DictionaryEncodeHashMapBenchmarks.java | 117 +++++++
.../vector/dictionary/DictionaryEncodeHashMap.java | 368 +++++++++++++++++++++
.../arrow/vector/dictionary/ObjectIntMap.java | 50 +++
.../arrow/vector/TestDictionaryEncodeHashMap.java | 123 +++++++
4 files changed, 658 insertions(+)
diff --git a/java/performance/src/test/java/org/apache/arrow/vector/dictionary/DictionaryEncodeHashMapBenchmarks.java b/java/performance/src/test/java/org/apache/arrow/vector/dictionary/DictionaryEncodeHashMapBenchmarks.java
new file mode 100644
index 0000000..e97bff2
--- /dev/null
+++ b/java/performance/src/test/java/org/apache/arrow/vector/dictionary/DictionaryEncodeHashMapBenchmarks.java
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector.dictionary;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Random;
+import java.util.concurrent.TimeUnit;
+
+import org.junit.Test;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.runner.Runner;
+import org.openjdk.jmh.runner.RunnerException;
+import org.openjdk.jmh.runner.options.Options;
+import org.openjdk.jmh.runner.options.OptionsBuilder;
+
+/**
+ * Benchmarks for {@link DictionaryEncodeHashMap}.
+ */
+@State(Scope.Benchmark)
+public class DictionaryEncodeHashMapBenchmarks {
+ private static final int SIZE = 1000;
+
+ private static final int KEY_LENGTH = 10;
+
+ private List<String> testData = new ArrayList<>();
+
+ private HashMap<String, Integer> hashMap = new HashMap();
+ private DictionaryEncodeHashMap<String> dictionaryEncodeHashMap = new DictionaryEncodeHashMap();
+
+ /**
+ * Setup benchmarks.
+ */
+ @Setup
+ public void prepare() {
+ for (int i = 0; i < SIZE; i++) {
+ testData.add(getRandomString(KEY_LENGTH));
+ }
+ }
+
+ private String getRandomString(int length) {
+ String str = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
+ Random random = new Random();
+ StringBuffer sb = new StringBuffer();
+ for (int i = 0; i < length; i++) {
+ int number = random.nextInt(62);
+ sb.append(str.charAt(number));
+ }
+ return sb.toString();
+ }
+
+ /**
+ * Test set/get int values for {@link HashMap}.
+ * @return useless. To avoid DCE by JIT.
+ */
+ @Benchmark
+ @BenchmarkMode(Mode.AverageTime)
+ @OutputTimeUnit(TimeUnit.NANOSECONDS)
+ public int testHashMap() {
+ for (int i = 0; i < SIZE; i++) {
+ hashMap.put(testData.get(i), i);
+ }
+ for (int i = 0; i < SIZE; i++) {
+ hashMap.get(testData.get(i));
+ }
+ return 0;
+ }
+
+ /**
+ * Test set/get int values for {@link DictionaryEncodeHashMap}.
+ * @return useless. To avoid DCE by JIT.
+ */
+ @Benchmark
+ @BenchmarkMode(Mode.AverageTime)
+ @OutputTimeUnit(TimeUnit.NANOSECONDS)
+ public int testDictionaryEncodeHashMap() {
+ for (int i = 0; i < SIZE; i++) {
+ dictionaryEncodeHashMap.put(testData.get(i), i);
+ }
+ for (int i = 0; i < SIZE; i++) {
+ dictionaryEncodeHashMap.get(testData.get(i));
+ }
+ return 0;
+ }
+
+ @Test
+ public void evaluate() throws RunnerException {
+ Options opt = new OptionsBuilder()
+ .include(DictionaryEncodeHashMapBenchmarks.class.getSimpleName())
+ .forks(1)
+ .build();
+
+ new Runner(opt).run();
+ }
+}
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncodeHashMap.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncodeHashMap.java
new file mode 100644
index 0000000..659a8d6
--- /dev/null
+++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncodeHashMap.java
@@ -0,0 +1,368 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector.dictionary;
+
+import java.util.Objects;
+
+/**
+ * Implementation of the {@link ObjectIntMap} interface, used for DictionaryEncoder.
+ * Note that value in this map is always not less than 0, and -1 represents a null value.
+ * @param <K> key type.
+ */
+public class DictionaryEncodeHashMap<K> implements ObjectIntMap<K> {
+
+ /**
+ * Represents a null value in map.
+ */
+ static final int NULL_VALUE = -1;
+
+ /**
+ * The default initial capacity - MUST be a power of two.
+ */
+ static final int DEFAULT_INITIAL_CAPACITY = 1 << 4;
+
+ /**
+ * The maximum capacity, used if a higher value is implicitly specified
+ * by either of the constructors with arguments.
+ */
+ static final int MAXIMUM_CAPACITY = 1 << 30;
+
+ /**
+ * The load factor used when none specified in constructor.
+ */
+ static final float DEFAULT_LOAD_FACTOR = 0.75f;
+
+ static final Entry<?>[] EMPTY_TABLE = {};
+
+ /**
+ * The table, initialized on first use, and resized as
+ * necessary. When allocated, length is always a power of two.
+ */
+ transient Entry<K>[] table = (Entry<K>[]) EMPTY_TABLE;
+
+ /**
+ * The number of key-value mappings contained in this map.
+ */
+ transient int size;
+
+ /**
+ * The next size value at which to resize (capacity * load factor).
+ */
+ int threshold;
+
+ /**
+ * The load factor for the hash table.
+ */
+ final float loadFactor;
+
+ /**
+ * Constructs an empty map with the specified initial capacity and load factor.
+ */
+ public DictionaryEncodeHashMap(int initialCapacity, float loadFactor) {
+ if (initialCapacity < 0) {
+ throw new IllegalArgumentException("Illegal initial capacity: " +
+ initialCapacity);
+ }
+ if (initialCapacity > MAXIMUM_CAPACITY) {
+ initialCapacity = MAXIMUM_CAPACITY;
+ }
+ if (loadFactor <= 0 || Float.isNaN(loadFactor)) {
+ throw new IllegalArgumentException("Illegal load factor: " +
+ loadFactor);
+ }
+ this.loadFactor = loadFactor;
+ this.threshold = initialCapacity;
+ }
+
+ public DictionaryEncodeHashMap(int initialCapacity) {
+ this(initialCapacity, DEFAULT_LOAD_FACTOR);
+ }
+
+ public DictionaryEncodeHashMap() {
+ this(DEFAULT_INITIAL_CAPACITY, DEFAULT_LOAD_FACTOR);
+ }
+
+ /**
+ * Compute the capacity with given threshold and create init table.
+ */
+ private void inflateTable(int threshold) {
+ int capacity = roundUpToPowerOf2(threshold);
+ this.threshold = (int) Math.min(capacity * loadFactor, MAXIMUM_CAPACITY + 1);
+ table = new Entry[capacity];
+ }
+
+ /**
+ * Computes key.hashCode() and spreads (XORs) higher bits of hash to lower.
+ */
+ static final int hash(Object key) {
+ int h;
+ return (key == null) ? 0 : (h = key.hashCode()) ^ (h >>> 16);
+ }
+
+ /**
+ * Computes the storage location in an array for the given hashCode.
+ */
+ static int indexFor(int h, int length) {
+ return h & (length - 1);
+ }
+
+ /**
+ * Returns a power of two size for the given size.
+ */
+ static final int roundUpToPowerOf2(int size) {
+ int n = size - 1;
+ n |= n >>> 1;
+ n |= n >>> 2;
+ n |= n >>> 4;
+ n |= n >>> 8;
+ n |= n >>> 16;
+ return (n < 0) ? 1 : (n >= MAXIMUM_CAPACITY) ? MAXIMUM_CAPACITY : n + 1;
+ }
+
+ /**
+ * Returns the value to which the specified key is mapped,
+ * or -1 if this map contains no mapping for the key.
+ */
+ @Override
+ public int get(K key) {
+ int hash = hash(key);
+ int index = indexFor(hash, table.length);
+ for (Entry<K> e = table[index] ; e != null ; e = e.next) {
+ if ((e.hash == hash) && e.key.equals(key)) {
+ return e.value;
+ }
+ }
+ return NULL_VALUE;
+ }
+
+ /**
+ * Associates the specified value with the specified key in this map.
+ * If the map previously contained a mapping for the key, the old
+ * value is replaced.
+ */
+ @Override
+ public int put(K key, int value) {
+ if (table == EMPTY_TABLE) {
+ inflateTable(threshold);
+ }
+
+ if (key == null) {
+ return putForNullKey(value);
+ }
+
+ int hash = hash(key);
+ int i = indexFor(hash, table.length);
+ for (Entry<K> e = table[i]; e != null; e = e.next) {
+ Object k;
+ if (e.hash == hash && ((k = e.key) == key || key.equals(k))) {
+ int oldValue = e.value;
+ e.value = value;
+ return oldValue;
+ }
+ }
+
+ addEntry(hash, key, value, i);
+ return NULL_VALUE;
+ }
+
+ /**
+ * Removes the mapping for the specified key from this map if present.
+ * @param key key whose mapping is to be removed from the map
+ * @return the previous value associated with <tt>key</tt>, or
+ * -1 if there was no mapping for <tt>key</tt>.
+ */
+ @Override
+ public int remove(K key) {
+ Entry<K> e = removeEntryForKey(key);
+ return (e == null ? NULL_VALUE : e.value);
+ }
+
+ /**
+ * Create a new Entry at the specific position of table.
+ */
+ void createEntry(int hash, K key, int value, int bucketIndex) {
+ Entry<K> e = table[bucketIndex];
+ table[bucketIndex] = new Entry<>(hash, key, value, e);
+ size++;
+ }
+
+ /**
+ * Put value when key is null.
+ */
+ private int putForNullKey(int value) {
+ for (Entry<K> e = table[0]; e != null; e = e.next) {
+ if (e.key == null) {
+ int oldValue = e.value;
+ e.value = value;
+ return oldValue;
+ }
+ }
+ addEntry(0, null, value, 0);
+ return NULL_VALUE;
+ }
+
+ /**
+ * Add Entry at the specified location of the table.
+ */
+ void addEntry(int hash, K key, int value, int bucketIndex) {
+ if ((size >= threshold) && (null != table[bucketIndex])) {
+ resize(2 * table.length);
+ hash = (null != key) ? hash(key) : 0;
+ bucketIndex = indexFor(hash, table.length);
+ }
+
+ createEntry(hash, key, value, bucketIndex);
+ }
+
+ /**
+ * Resize table with given new capacity.
+ */
+ void resize(int newCapacity) {
+ Entry[] oldTable = table;
+ int oldCapacity = oldTable.length;
+ if (oldCapacity == MAXIMUM_CAPACITY) {
+ threshold = Integer.MAX_VALUE;
+ return;
+ }
+
+ Entry[] newTable = new Entry[newCapacity];
+ transfer(newTable);
+ table = newTable;
+ threshold = (int)Math.min(newCapacity * loadFactor, MAXIMUM_CAPACITY + 1);
+ }
+
+ /**
+ * Transfer entries into new table from old table.
+ * @param newTable new table
+ */
+ void transfer(Entry[] newTable) {
+ int newCapacity = newTable.length;
+ for (Entry<K> e : table) {
+ while (null != e) {
+ Entry<K> next = e.next;
+ int i = indexFor(e.hash, newCapacity);
+ e.next = newTable[i];
+ newTable[i] = e;
+ e = next;
+ }
+ }
+ }
+
+ /**
+ * Remove entry associated with the given key.
+ */
+ final Entry<K> removeEntryForKey(Object key) {
+ if (size == 0) {
+ return null;
+ }
+ int hash = (key == null) ? 0 : hash(key);
+ int i = indexFor(hash, table.length);
+ Entry<K> prev = table[i];
+ Entry<K> e = prev;
+
+ while (e != null) {
+ Entry<K> next = e.next;
+ Object k;
+ if (e.hash == hash && ((k = e.key) == key || (key != null && key.equals(k)))) {
+ size--;
+ if (prev == e) {
+ table[i] = next;
+ } else {
+ prev.next = next;
+ }
+
+ return e;
+ }
+ prev = e;
+ e = next;
+ }
+
+ return e;
+ }
+
+ /**
+ * Returns the number of mappings in this Map.
+ */
+ public int size() {
+ return size;
+ }
+
+ /**
+ * Removes all elements from this map, leaving it empty.
+ */
+ public void clear() {
+ size = 0;
+ for (int i = 0; i < table.length; i++) {
+ table[i] = null;
+ }
+ }
+
+ /**
+ * Class to keep key-value data within hash map.
+ * @param <K> key type.
+ */
+ static class Entry<K> {
+ final K key;
+ int value;
+ Entry<K> next;
+ int hash;
+
+ Entry(int hash, K key, int value, Entry<K> next) {
+ this.key = key;
+ this.value = value;
+ this.hash = hash;
+ this.next = next;
+ }
+
+ public final K getKey() {
+ return key;
+ }
+
+ public final int getValue() {
+ return value;
+ }
+
+ public final int setValue(int newValue) {
+ int oldValue = value;
+ value = newValue;
+ return oldValue;
+ }
+
+ public final boolean equals(Object o) {
+ if (!(o instanceof DictionaryEncodeHashMap.Entry)) {
+ return false;
+ }
+ Entry e = (Entry) o;
+ if (Objects.equals(key, e.getKey())) {
+ if (value == e.getValue()) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ public final int hashCode() {
+ return Objects.hashCode(key) ^ Objects.hashCode(value);
+ }
+
+ public final String toString() {
+ return getKey() + "=" + getValue();
+ }
+ }
+
+}
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/ObjectIntMap.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/ObjectIntMap.java
new file mode 100644
index 0000000..582bb56
--- /dev/null
+++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/ObjectIntMap.java
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector.dictionary;
+
+/**
+ * Specific hash map for int type value, reducing boxing/unboxing operations.
+ * @param <K> key type.
+ */
+public interface ObjectIntMap<K> {
+
+ /**
+ * Associates the specified value with the specified key in this map.
+ * If the map previously contained a mapping for the key, the old
+ * value is replaced.
+ * @param key key with which the specified value is to be associated
+ * @param value value to be associated with the specified key
+ * @return the previous value associated with <tt>key</tt>, or
+ * -1 if there was no mapping for <tt>key</tt>.
+ */
+ int put(K key, int value);
+
+ /**
+ * Returns the value to which the specified key is mapped,
+ * or -1 if this map contains no mapping for the key.
+ */
+ int get(K key);
+
+ /**
+ * Removes the mapping for the specified key from this map if present.
+ * @param key key whose mapping is to be removed from the map
+ * @return the previous value associated with <tt>key</tt>, or
+ * -1 if there was no mapping for <tt>key</tt>.
+ */
+ int remove(K key);
+}
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryEncodeHashMap.java b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryEncodeHashMap.java
new file mode 100644
index 0000000..5f4e710
--- /dev/null
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryEncodeHashMap.java
@@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector;
+
+import static org.junit.Assert.assertEquals;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+import org.apache.arrow.vector.dictionary.DictionaryEncodeHashMap;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+
+
+public class TestDictionaryEncodeHashMap {
+
+ private List<String> testData = new ArrayList<>();
+
+ private static final int SIZE = 100;
+
+ private static final int KEY_LENGTH = 5;
+
+ private DictionaryEncodeHashMap<String> map = new DictionaryEncodeHashMap();
+
+ @Before
+ public void init() {
+ for (int i = 0; i < SIZE; i++) {
+ testData.add(generateUniqueKey(KEY_LENGTH));
+ }
+ }
+
+ @After
+ public void terminate() throws Exception {
+ testData.clear();
+ }
+
+ private String generateUniqueKey(int length) {
+ String str = "abcdefghijklmnopqrstuvwxyz";
+ Random random = new Random();
+ StringBuffer sb = new StringBuffer();
+ for (int i = 0; i < length; i++) {
+ int number = random.nextInt(26);
+ sb.append(str.charAt(number));
+ }
+ if (testData.contains(sb.toString())) {
+ return generateUniqueKey(length);
+ }
+ return sb.toString();
+ }
+
+ @Test
+ public void testPutAndGet() {
+ for (int i = 0; i < SIZE; i++) {
+ map.put(testData.get(i), i);
+ }
+
+ for (int i = 0; i < SIZE; i++) {
+ assertEquals(i, map.get(testData.get(i)));
+ }
+ }
+
+ @Test
+ public void testPutExistKey() {
+ for (int i = 0; i < SIZE; i++) {
+ map.put(testData.get(i), i);
+ }
+ map.put("test_key", 101);
+ assertEquals(101, map.get("test_key"));
+ map.put("test_key", 102);
+ assertEquals(102, map.get("test_key"));
+ }
+
+ @Test
+ public void testGetNonExistKey() {
+ for (int i = 0; i < SIZE; i++) {
+ map.put(testData.get(i), i);
+ }
+ //remove if exists.
+ map.remove("test_key");
+ assertEquals(-1, map.get("test_key"));
+ }
+
+ @Test
+ public void testRemove() {
+ for (int i = 0; i < SIZE; i++) {
+ map.put(testData.get(i), i);
+ }
+ map.put("test_key", 10000);
+ assertEquals(SIZE + 1, map.size());
+ assertEquals(10000, map.get("test_key"));
+ map.remove("test_key");
+ assertEquals(SIZE, map.size());
+ assertEquals(-1, map.get("test_key"));
+ }
+
+ @Test
+ public void testSize() {
+ assertEquals(0, map.size());
+ for (int i = 0; i < SIZE; i++) {
+ map.put(testData.get(i), i);
+ }
+ assertEquals(SIZE, map.size());
+ }
+}