You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ra...@apache.org on 2019/08/01 09:17:05 UTC
[arrow] branch master updated: ARROW-6013: [Java] Support range
searcher
This is an automated email from the ASF dual-hosted git repository.
ravindra pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new e3ba3de ARROW-6013: [Java] Support range searcher
e3ba3de is described below
commit e3ba3dedd162744bf7e506da0d977a65762674c9
Author: liyafan82 <fa...@foxmail.com>
AuthorDate: Thu Aug 1 14:46:42 2019 +0530
ARROW-6013: [Java] Support range searcher
For a sorted vector, the range searcher finds the first/last occurrence of a particular element.
The search is based on binary search, which takes O(logn) time.
Closes #4925 from liyafan82/fly_0723_range and squashes the following commits:
4690f6956 <liyafan82> Support range searcher
Authored-by: liyafan82 <fa...@foxmail.com>
Signed-off-by: Pindikura Ravindra <ra...@dremio.com>
---
.../algorithm/search/VectorRangeSearcher.java | 138 ++++++++++++++++
.../arrow/algorithm/search/VectorSearcher.java | 17 +-
.../algorithm/search/TestVectorRangeSearcher.java | 178 +++++++++++++++++++++
3 files changed, 324 insertions(+), 9 deletions(-)
diff --git a/java/algorithm/src/main/java/org/apache/arrow/algorithm/search/VectorRangeSearcher.java b/java/algorithm/src/main/java/org/apache/arrow/algorithm/search/VectorRangeSearcher.java
new file mode 100644
index 0000000..e7ce95b
--- /dev/null
+++ b/java/algorithm/src/main/java/org/apache/arrow/algorithm/search/VectorRangeSearcher.java
@@ -0,0 +1,138 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.algorithm.search;
+
+import org.apache.arrow.algorithm.sort.VectorValueComparator;
+import org.apache.arrow.vector.ValueVector;
+
+/**
+ * Search for the range of a particular element in the target vector.
+ */
+public class VectorRangeSearcher {
+
+ /**
+ * Result returned when a search fails.
+ */
+ public static final int SEARCH_FAIL_RESULT = -1;
+
+ /**
+ * Search for the first occurrence of an element.
+ * The search is based on the binary search algorithm. So the target vector must be sorted.
+ * @param targetVector the vector from which to perform the search.
+ * @param comparator the criterion for the comparison.
+ * @param keyVector the vector containing the element to search.
+ * @param keyIndex the index of the search key in the key vector.
+ * @param <V> the vector type.
+ * @return the index of the first matched element if any, and -1 otherwise.
+ */
+ public static <V extends ValueVector> int getFirstMatch(
+ V targetVector, VectorValueComparator<V> comparator, V keyVector, int keyIndex) {
+ comparator.attachVectors(keyVector, targetVector);
+
+ int low = 0;
+ int high = targetVector.getValueCount() - 1;
+
+ while (low <= high) {
+ int mid = low + (high - low) / 2;
+ int result = comparator.compare(keyIndex, mid);
+ if (result < 0) {
+ // the key is smaller
+ high = mid - 1;
+ } else if (result > 0) {
+ // the key is larger
+ low = mid + 1;
+ } else {
+ // the key equals the mid value, find the lower bound by going left-ward.
+
+ // compare with the left neighbour
+ int left = mid - 1;
+ if (left == -1) {
+ // this is the first value in the vector
+ return mid;
+ } else {
+ int leftResult = comparator.compare(keyIndex, left);
+ if (leftResult > 0) {
+ // the key is greater than the left neighbour, and equal to the current one
+ // we find it
+ return mid;
+ } else if (leftResult == 0) {
+ // the left neighbour is also equal, continue to go left
+ high = mid - 1;
+ } else {
+ // the key is larger than the left neighbour, this is not possible
+ throw new IllegalStateException("The target vector is not sorted ");
+ }
+ }
+ }
+ }
+ return SEARCH_FAIL_RESULT;
+ }
+
+ /**
+ * Search for the last occurrence of an element.
+ * The search is based on the binary search algorithm. So the target vector must be sorted.
+ * @param targetVector the vector from which to perform the search.
+ * @param comparator the criterion for the comparison.
+ * @param keyVector the vector containing the element to search.
+ * @param keyIndex the index of the search key in the key vector.
+ * @param <V> the vector type.
+ * @return the index of the last matched element if any, and -1 otherwise.
+ */
+ public static <V extends ValueVector> int getLastMatch(
+ V targetVector, VectorValueComparator<V> comparator, V keyVector, int keyIndex) {
+ comparator.attachVectors(keyVector, targetVector);
+
+ int low = 0;
+ int high = targetVector.getValueCount() - 1;
+
+ while (low <= high) {
+ int mid = low + (high - low) / 2;
+ int result = comparator.compare(keyIndex, mid);
+ if (result < 0) {
+ // the key is smaller
+ high = mid - 1;
+ } else if (result > 0) {
+ // the key is larger
+ low = mid + 1;
+ } else {
+ // the key equals the mid value, find the upper bound by going right-ward.
+
+ // compare with the right neighbour
+ int right = mid + 1;
+ if (right == targetVector.getValueCount()) {
+ // this is the last value in the vector
+ return mid;
+ } else {
+ int rightResult = comparator.compare(keyIndex, right);
+ if (rightResult < 0) {
+ // the key is smaller than the right neighbour, and equal to the current one
+ // we find it
+ return mid;
+ } else if (rightResult == 0) {
+ // the right neighbour is also equal, continue to go right
+ low = mid + 1;
+ } else {
+ // the key is smaller than the right neighbour, this is not possible
+ throw new IllegalStateException("The target vector is not sorted ");
+ }
+ }
+ }
+ }
+ return SEARCH_FAIL_RESULT;
+ }
+}
diff --git a/java/algorithm/src/main/java/org/apache/arrow/algorithm/search/VectorSearcher.java b/java/algorithm/src/main/java/org/apache/arrow/algorithm/search/VectorSearcher.java
index 8bed811..d88fcae 100644
--- a/java/algorithm/src/main/java/org/apache/arrow/algorithm/search/VectorSearcher.java
+++ b/java/algorithm/src/main/java/org/apache/arrow/algorithm/search/VectorSearcher.java
@@ -26,6 +26,11 @@ import org.apache.arrow.vector.ValueVector;
public final class VectorSearcher {
/**
+ * Result returned when a search fails.
+ */
+ public static final int SEARCH_FAIL_RESULT = -1;
+
+ /**
* Search for a particular element from the key vector in the target vector by binary search.
* The target vector must be sorted.
* @param targetVector the vector from which to perform the sort.
@@ -44,13 +49,7 @@ public final class VectorSearcher {
int high = targetVector.getValueCount() - 1;
while (low <= high) {
- int mid = (high + low) / 2;
-
- if (mid < 0) {
- // overflow has occurred, so calculate the mid by converting to long first
- mid = (int) (((long) high + (long) low) / 2L);
- }
-
+ int mid = low + (high - low) / 2;
int cmp = comparator.compare(keyIndex, mid);
if (cmp < 0) {
high = mid - 1;
@@ -60,7 +59,7 @@ public final class VectorSearcher {
return mid;
}
}
- return -1;
+ return SEARCH_FAIL_RESULT;
}
/**
@@ -80,7 +79,7 @@ public final class VectorSearcher {
return i;
}
}
- return -1;
+ return SEARCH_FAIL_RESULT;
}
private VectorSearcher() {
diff --git a/java/algorithm/src/test/java/org/apache/arrow/algorithm/search/TestVectorRangeSearcher.java b/java/algorithm/src/test/java/org/apache/arrow/algorithm/search/TestVectorRangeSearcher.java
new file mode 100644
index 0000000..88f6e23
--- /dev/null
+++ b/java/algorithm/src/test/java/org/apache/arrow/algorithm/search/TestVectorRangeSearcher.java
@@ -0,0 +1,178 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.algorithm.search;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.arrow.algorithm.sort.DefaultVectorComparators;
+import org.apache.arrow.algorithm.sort.VectorValueComparator;
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.memory.RootAllocator;
+import org.apache.arrow.vector.IntVector;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Test cases for {@link VectorRangeSearcher}.
+ */
+public class TestVectorRangeSearcher {
+
+ private BufferAllocator allocator;
+
+ @Before
+ public void prepare() {
+ allocator = new RootAllocator(1024 * 1024);
+ }
+
+ @After
+ public void shutdown() {
+ allocator.close();
+ }
+
+ @Test
+ public void testGetLowerBounds() {
+ final int maxValue = 100;
+ final int repeat = 5;
+ try (IntVector intVector = new IntVector("int vec", allocator)) {
+ // allocate vector
+ intVector.allocateNew(maxValue * repeat);
+ intVector.setValueCount(maxValue * repeat);
+
+ // prepare data in sorted order
+ // each value is repeated some times
+ for (int i = 0; i < maxValue; i++) {
+ for (int j = 0; j < repeat; j++) {
+ if (i == 0) {
+ intVector.setNull(i * repeat + j);
+ } else {
+ intVector.set(i * repeat + j, i);
+ }
+ }
+ }
+
+ // do search
+ VectorValueComparator<IntVector> comparator = DefaultVectorComparators.createDefaultComparator(intVector);
+ for (int i = 0; i < maxValue; i++) {
+ int result = VectorRangeSearcher.getFirstMatch(intVector, comparator, intVector, i * repeat);
+ assertEquals(i * repeat, result);
+ }
+ }
+ }
+
+ @Test
+ public void testGetLowerBoundsNegative() {
+ final int maxValue = 100;
+ final int repeat = 5;
+ try (IntVector intVector = new IntVector("int vec", allocator);
+ IntVector negVector = new IntVector("neg vec", allocator)) {
+ // allocate vector
+ intVector.allocateNew(maxValue * repeat);
+ intVector.setValueCount(maxValue * repeat);
+
+ negVector.allocateNew(maxValue);
+ negVector.setValueCount(maxValue);
+
+ // prepare data in sorted order
+ // each value is repeated some times
+ for (int i = 0; i < maxValue; i++) {
+ for (int j = 0; j < repeat; j++) {
+ if (i == 0) {
+ intVector.setNull(i * repeat + j);
+ } else {
+ intVector.set(i * repeat + j, i);
+ }
+ }
+ negVector.set(i, maxValue + i);
+ }
+
+ // do search
+ VectorValueComparator<IntVector> comparator = DefaultVectorComparators.createDefaultComparator(intVector);
+ for (int i = 0; i < maxValue; i++) {
+ int result = VectorRangeSearcher.getFirstMatch(intVector, comparator, negVector, i);
+ assertEquals(-1, result);
+ }
+ }
+ }
+
+ @Test
+ public void testGetUpperBounds() {
+ final int maxValue = 100;
+ final int repeat = 5;
+ try (IntVector intVector = new IntVector("int vec", allocator)) {
+ // allocate vector
+ intVector.allocateNew(maxValue * repeat);
+ intVector.setValueCount(maxValue * repeat);
+
+ // prepare data in sorted order
+ // each value is repeated some times
+ for (int i = 0; i < maxValue; i++) {
+ for (int j = 0; j < repeat; j++) {
+ if (i == 0) {
+ intVector.setNull(i * repeat + j);
+ } else {
+ intVector.set(i * repeat + j, i);
+ }
+ }
+ }
+
+ // do search
+ VectorValueComparator<IntVector> comparator = DefaultVectorComparators.createDefaultComparator(intVector);
+ for (int i = 0; i < maxValue; i++) {
+ int result = VectorRangeSearcher.getLastMatch(intVector, comparator, intVector, i * repeat);
+ assertEquals((i + 1) * repeat - 1, result);
+ }
+ }
+ }
+
+ @Test
+ public void testGetUpperBoundsNegative() {
+ final int maxValue = 100;
+ final int repeat = 5;
+ try (IntVector intVector = new IntVector("int vec", allocator);
+ IntVector negVector = new IntVector("neg vec", allocator)) {
+ // allocate vector
+ intVector.allocateNew(maxValue * repeat);
+ intVector.setValueCount(maxValue * repeat);
+
+ negVector.allocateNew(maxValue);
+ negVector.setValueCount(maxValue);
+
+ // prepare data in sorted order
+ // each value is repeated some times
+ for (int i = 0; i < maxValue; i++) {
+ for (int j = 0; j < repeat; j++) {
+ if (i == 0) {
+ intVector.setNull(i * repeat + j);
+ } else {
+ intVector.set(i * repeat + j, i);
+ }
+ }
+ negVector.set(i, maxValue + i);
+ }
+
+ // do search
+ VectorValueComparator<IntVector> comparator = DefaultVectorComparators.createDefaultComparator(intVector);
+ for (int i = 0; i < maxValue; i++) {
+ int result = VectorRangeSearcher.getLastMatch(intVector, comparator, negVector, i);
+ assertEquals(-1, result);
+ }
+ }
+ }
+}