You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by em...@apache.org on 2019/07/12 06:54:37 UTC

[arrow] branch master updated: ARROW-5832: [Java] Support search operations for vector data

This is an automated email from the ASF dual-hosted git repository.

emkornfield pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 03360e1  ARROW-5832: [Java] Support search operations for vector data
03360e1 is described below

commit 03360e112521bbfb62e0cb9eecdfb6f57e04d494
Author: liyafan82 <fa...@foxmail.com>
AuthorDate: Thu Jul 11 23:54:06 2019 -0700

    ARROW-5832: [Java] Support search operations for vector data
    
    Support searching for a particular data item in a sorted/unsorted vector.
    
    Author: liyafan82 <fa...@foxmail.com>
    
    Closes #4788 from liyafan82/fly_0703_search and squashes the following commits:
    
    769113e44 <liyafan82>  Add negative test cases
    34be18e2c <liyafan82>  Support search operations for vector data
---
 .../arrow/algorithm/search/VectorSearcher.java     |  89 ++++++++++
 .../arrow/algorithm/search/TestVectorSearcher.java | 188 +++++++++++++++++++++
 2 files changed, 277 insertions(+)

diff --git a/java/algorithm/src/main/java/org/apache/arrow/algorithm/search/VectorSearcher.java b/java/algorithm/src/main/java/org/apache/arrow/algorithm/search/VectorSearcher.java
new file mode 100644
index 0000000..8bed811
--- /dev/null
+++ b/java/algorithm/src/main/java/org/apache/arrow/algorithm/search/VectorSearcher.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.algorithm.search;
+
+import org.apache.arrow.algorithm.sort.VectorValueComparator;
+import org.apache.arrow.vector.ValueVector;
+
+/**
+ * Search for a particular element in the vector.
+ */
+public final class VectorSearcher {
+
+  /**
+   * Search for a particular element from the key vector in the target vector by binary search.
+   * The target vector must be sorted.
+   * @param targetVector the vector from which to perform the sort.
+   * @param comparator the criterion for the sort.
+   * @param keyVector the vector containing the element to search.
+   * @param keyIndex the index of the search key in the key vector.
+   * @param <V> the vector type.
+   * @return the index of a matched element if any, and -1 otherwise.
+   */
+  public static <V extends ValueVector> int binarySearch(
+          V targetVector, VectorValueComparator<V> comparator, V keyVector, int keyIndex) {
+    comparator.attachVectors(keyVector, targetVector);
+
+    // perform binary search
+    int low = 0;
+    int high = targetVector.getValueCount() - 1;
+
+    while (low <= high) {
+      int mid = (high + low) / 2;
+
+      if (mid < 0) {
+        // overflow has occurred, so calculate the mid by converting to long first
+        mid = (int) (((long) high + (long) low) / 2L);
+      }
+
+      int cmp = comparator.compare(keyIndex, mid);
+      if (cmp < 0) {
+        high = mid - 1;
+      } else if (cmp > 0) {
+        low = mid + 1;
+      } else {
+        return mid;
+      }
+    }
+    return -1;
+  }
+
+  /**
+   * Search for a particular element from the key vector in the target vector by traversing the vector in sequence.
+   * @param targetVector the vector from which to perform the sort.
+   * @param comparator the criterion for element equality.
+   * @param keyVector the vector containing the element to search.
+   * @param keyIndex the index of the search key in the key vector.
+   * @param <V> the vector type.
+   * @return the index of a matched element if any, and -1 otherwise.
+   */
+  public static <V extends ValueVector> int linearSearch(
+          V targetVector, VectorValueComparator<V> comparator, V keyVector, int keyIndex) {
+    comparator.attachVectors(keyVector, targetVector);
+    for (int i = 0; i < targetVector.getValueCount(); i++) {
+      if (comparator.compare(keyIndex, i) == 0) {
+        return i;
+      }
+    }
+    return -1;
+  }
+
+  private VectorSearcher() {
+
+  }
+}
diff --git a/java/algorithm/src/test/java/org/apache/arrow/algorithm/search/TestVectorSearcher.java b/java/algorithm/src/test/java/org/apache/arrow/algorithm/search/TestVectorSearcher.java
new file mode 100644
index 0000000..f5c2912
--- /dev/null
+++ b/java/algorithm/src/test/java/org/apache/arrow/algorithm/search/TestVectorSearcher.java
@@ -0,0 +1,188 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.algorithm.search;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.arrow.algorithm.sort.DefaultVectorComparators;
+import org.apache.arrow.algorithm.sort.VectorValueComparator;
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.memory.RootAllocator;
+import org.apache.arrow.vector.IntVector;
+import org.apache.arrow.vector.VarCharVector;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Test cases for {@link org.apache.arrow.algorithm.search.VectorSearcher}.
+ */
+public class TestVectorSearcher {
+
+  private final int VECTOR_LENGTH = 100;
+
+  private BufferAllocator allocator;
+
+  @Before
+  public void prepare() {
+    allocator = new RootAllocator(1024 * 1024);
+  }
+
+  @After
+  public void shutdown() {
+    allocator.close();
+  }
+
+  @Test
+  public void testBinarySearchInt() {
+    try (IntVector rawVector = new IntVector("", allocator);
+         IntVector negVector = new IntVector("", allocator)) {
+      rawVector.allocateNew(VECTOR_LENGTH);
+      rawVector.setValueCount(VECTOR_LENGTH);
+      negVector.allocateNew(1);
+      negVector.setValueCount(1);
+
+      // prepare data in sorted order
+      for (int i = 0; i < VECTOR_LENGTH; i++) {
+        if (i == 0) {
+          rawVector.setNull(i);
+        } else {
+          rawVector.set(i, i);
+        }
+      }
+      negVector.set(0, -333);
+
+      // do search
+      VectorValueComparator<IntVector> comparator = new DefaultVectorComparators.IntComparator();
+      for (int i = 0; i < VECTOR_LENGTH; i++) {
+        int result = VectorSearcher.binarySearch(rawVector, comparator, rawVector, i);
+        assertEquals(i, result);
+      }
+
+      // negative case
+      assertEquals(-1, VectorSearcher.binarySearch(rawVector, comparator, negVector, 0));
+    }
+  }
+
+  @Test
+  public void testLinearSearchInt() {
+    try (IntVector rawVector = new IntVector("", allocator);
+         IntVector negVector = new IntVector("", allocator)) {
+      rawVector.allocateNew(VECTOR_LENGTH);
+      rawVector.setValueCount(VECTOR_LENGTH);
+      negVector.allocateNew(1);
+      negVector.setValueCount(1);
+
+      // prepare data in sorted order
+      for (int i = 0; i < VECTOR_LENGTH; i++) {
+        if (i == 0) {
+          rawVector.setNull(i);
+        } else {
+          rawVector.set(i, i);
+        }
+      }
+      negVector.set(0, -333);
+
+      // do search
+      VectorValueComparator<IntVector> comparator = new DefaultVectorComparators.IntComparator();
+      for (int i = 0; i < VECTOR_LENGTH; i++) {
+        int result = VectorSearcher.linearSearch(rawVector, comparator, rawVector, i);
+        assertEquals(i, result);
+      }
+
+      // negative case
+      assertEquals(-1, VectorSearcher.linearSearch(rawVector, comparator, negVector, 0));
+    }
+  }
+
+  @Test
+  public void testBinarySearchVarChar() {
+    try (VarCharVector rawVector = new VarCharVector("", allocator);
+         VarCharVector negVector = new VarCharVector("", allocator)) {
+      rawVector.allocateNew(VECTOR_LENGTH * 16, VECTOR_LENGTH);
+      rawVector.setValueCount(VECTOR_LENGTH);
+      negVector.allocateNew(VECTOR_LENGTH, 1);
+      negVector.setValueCount(1);
+
+      byte[] content = new byte[2];
+
+      // prepare data in sorted order
+      for (int i = 0; i < VECTOR_LENGTH; i++) {
+        if (i == 0) {
+          rawVector.setNull(i);
+        } else {
+          int q = i / 10;
+          int r = i % 10;
+
+          content[0] = (byte) ('a' + q);
+          content[1] = (byte) r;
+          rawVector.set(i, content);
+        }
+      }
+      negVector.set(0, "abcd".getBytes());
+
+      // do search
+      VectorValueComparator<VarCharVector> comparator = new DefaultVectorComparators.VarCharComparator();
+      for (int i = 0; i < VECTOR_LENGTH; i++) {
+        int result = VectorSearcher.binarySearch(rawVector, comparator, rawVector, i);
+        assertEquals(i, result);
+      }
+
+      // negative case
+      assertEquals(-1, VectorSearcher.binarySearch(rawVector, comparator, negVector, 0));
+    }
+  }
+
+  @Test
+  public void testLinearSearchVarChar() {
+    try (VarCharVector rawVector = new VarCharVector("", allocator);
+         VarCharVector negVector = new VarCharVector("", allocator)) {
+      rawVector.allocateNew(VECTOR_LENGTH * 16, VECTOR_LENGTH);
+      rawVector.setValueCount(VECTOR_LENGTH);
+      negVector.allocateNew(VECTOR_LENGTH, 1);
+      negVector.setValueCount(1);
+
+      byte[] content = new byte[2];
+
+      // prepare data in sorted order
+      for (int i = 0; i < VECTOR_LENGTH; i++) {
+        if (i == 0) {
+          rawVector.setNull(i);
+        } else {
+          int q = i / 10;
+          int r = i % 10;
+
+          content[0] = (byte) ('a' + q);
+          content[1] = (byte) r;
+          rawVector.set(i, content);
+        }
+      }
+      negVector.set(0, "abcd".getBytes());
+
+      // do search
+      VectorValueComparator<VarCharVector> comparator = new DefaultVectorComparators.VarCharComparator();
+      for (int i = 0; i < VECTOR_LENGTH; i++) {
+        int result = VectorSearcher.linearSearch(rawVector, comparator, rawVector, i);
+        assertEquals(i, result);
+      }
+
+      // negative case
+      assertEquals(-1, VectorSearcher.linearSearch(rawVector, comparator, negVector, 0));
+    }
+  }
+}