You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by li...@apache.org on 2021/12/03 14:31:45 UTC

[arrow] branch master updated: ARROW-13811: [Java] Provide a general out-of-place sorter

This is an automated email from the ASF dual-hosted git repository.

liyafan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 6cd288e  ARROW-13811: [Java] Provide a general out-of-place sorter
6cd288e is described below

commit 6cd288e686abcdf2957582d6652ddae3d031d0fa
Author: liyafan82 <fa...@foxmail.com>
AuthorDate: Fri Dec 3 22:29:23 2021 +0800

    ARROW-13811: [Java] Provide a general out-of-place sorter
    
    The sorter should work for any type of vectors, with a time complexity of `O(n*log( n ))`.
    
    Since it does not make any assumptions about the memory layout of the vector, its performance can be sub-optimal. So if another sorter is applicable (e.g. `FixedWidthInPlaceVectorSorter`), it should be used in preference.
    
    Closes #11035 from liyafan82/fly_0831_st
    
    Authored-by: liyafan82 <fa...@foxmail.com>
    Signed-off-by: liyafan82 <fa...@foxmail.com>
---
 .../sort/GeneralOutOfPlaceVectorSorter.java        |  59 +++++++++
 .../sort/TestFixedWidthOutOfPlaceVectorSorter.java |  25 ++--
 .../sort/TestGeneralOutOfPlaceVectorSorter.java    | 142 +++++++++++++++++++++
 .../algorithm/sort/TestOutOfPlaceVectorSorter.java |  46 +++++++
 .../TestVariableWidthOutOfPlaceVectorSorter.java   |  13 +-
 5 files changed, 275 insertions(+), 10 deletions(-)

diff --git a/java/algorithm/src/main/java/org/apache/arrow/algorithm/sort/GeneralOutOfPlaceVectorSorter.java b/java/algorithm/src/main/java/org/apache/arrow/algorithm/sort/GeneralOutOfPlaceVectorSorter.java
new file mode 100644
index 0000000..9ea39f6
--- /dev/null
+++ b/java/algorithm/src/main/java/org/apache/arrow/algorithm/sort/GeneralOutOfPlaceVectorSorter.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.algorithm.sort;
+
+import org.apache.arrow.util.Preconditions;
+import org.apache.arrow.vector.IntVector;
+import org.apache.arrow.vector.ValueVector;
+
+/**
+ * An out-of-place sorter for vectors of arbitrary type, with time complexity O(n*log(n)).
+ * Since it does not make any assumptions about the memory layout of the vector, its performance
+ * can be sub-optimal. So if another sorter is applicable ({@link FixedWidthInPlaceVectorSorter}),
+ * it should be used in preference.
+ *
+ * @param <V> vector type.
+ */
+public class GeneralOutOfPlaceVectorSorter<V extends ValueVector> implements OutOfPlaceVectorSorter<V> {
+
+  @Override
+  public void sortOutOfPlace(V srcVector, V dstVector, VectorValueComparator<V> comparator) {
+    comparator.attachVector(srcVector);
+
+    // check vector capacity
+    Preconditions.checkArgument(dstVector.getValueCapacity() >= srcVector.getValueCount(),
+        "Not enough capacity for the target vector. " +
+            "Expected capacity %s, actual capacity %s", srcVector.getValueCount(), dstVector.getValueCapacity());
+
+    // sort value indices
+    try (IntVector sortedIndices = new IntVector("", srcVector.getAllocator())) {
+      sortedIndices.allocateNew(srcVector.getValueCount());
+      sortedIndices.setValueCount(srcVector.getValueCount());
+
+      IndexSorter<V> indexSorter = new IndexSorter<>();
+      indexSorter.sort(srcVector, sortedIndices, comparator);
+
+      // copy sorted values to the output vector
+      for (int dstIndex = 0; dstIndex < sortedIndices.getValueCount(); dstIndex++) {
+        int srcIndex = sortedIndices.get(dstIndex);
+
+        dstVector.copyFromSafe(srcIndex, dstIndex, srcVector);
+      }
+    }
+  }
+}
diff --git a/java/algorithm/src/test/java/org/apache/arrow/algorithm/sort/TestFixedWidthOutOfPlaceVectorSorter.java b/java/algorithm/src/test/java/org/apache/arrow/algorithm/sort/TestFixedWidthOutOfPlaceVectorSorter.java
index e3701f1..cc13e7f 100644
--- a/java/algorithm/src/test/java/org/apache/arrow/algorithm/sort/TestFixedWidthOutOfPlaceVectorSorter.java
+++ b/java/algorithm/src/test/java/org/apache/arrow/algorithm/sort/TestFixedWidthOutOfPlaceVectorSorter.java
@@ -24,6 +24,7 @@ import java.util.stream.IntStream;
 
 import org.apache.arrow.memory.BufferAllocator;
 import org.apache.arrow.memory.RootAllocator;
+import org.apache.arrow.vector.BaseFixedWidthVector;
 import org.apache.arrow.vector.BigIntVector;
 import org.apache.arrow.vector.Float4Vector;
 import org.apache.arrow.vector.Float8Vector;
@@ -39,10 +40,18 @@ import org.junit.Test;
 /**
  * Test cases for {@link FixedWidthOutOfPlaceVectorSorter}.
  */
-public class TestFixedWidthOutOfPlaceVectorSorter {
+public class TestFixedWidthOutOfPlaceVectorSorter extends TestOutOfPlaceVectorSorter {
 
   private BufferAllocator allocator;
 
+  public TestFixedWidthOutOfPlaceVectorSorter(boolean generalSorter) {
+    super(generalSorter);
+  }
+
+  <V extends BaseFixedWidthVector> OutOfPlaceVectorSorter<V> getSorter() {
+    return generalSorter ? new GeneralOutOfPlaceVectorSorter<>() : new FixedWidthOutOfPlaceVectorSorter<>();
+  }
+
   @Before
   public void prepare() {
     allocator = new RootAllocator(1024 * 1024);
@@ -72,7 +81,7 @@ public class TestFixedWidthOutOfPlaceVectorSorter {
       vec.set(9, 2);
 
       // sort the vector
-      FixedWidthOutOfPlaceVectorSorter sorter = new FixedWidthOutOfPlaceVectorSorter();
+      OutOfPlaceVectorSorter<TinyIntVector> sorter = getSorter();
       VectorValueComparator<TinyIntVector> comparator = DefaultVectorComparators.createDefaultComparator(vec);
 
       TinyIntVector sortedVec =
@@ -119,7 +128,7 @@ public class TestFixedWidthOutOfPlaceVectorSorter {
       vec.set(9, 2);
 
       // sort the vector
-      FixedWidthOutOfPlaceVectorSorter sorter = new FixedWidthOutOfPlaceVectorSorter();
+      OutOfPlaceVectorSorter<SmallIntVector> sorter = getSorter();
       VectorValueComparator<SmallIntVector> comparator = DefaultVectorComparators.createDefaultComparator(vec);
 
       SmallIntVector sortedVec =
@@ -166,7 +175,7 @@ public class TestFixedWidthOutOfPlaceVectorSorter {
       vec.set(9, 2);
 
       // sort the vector
-      FixedWidthOutOfPlaceVectorSorter sorter = new FixedWidthOutOfPlaceVectorSorter();
+      OutOfPlaceVectorSorter<IntVector> sorter = getSorter();
       VectorValueComparator<IntVector> comparator = DefaultVectorComparators.createDefaultComparator(vec);
 
       IntVector sortedVec = (IntVector) vec.getField().getFieldType().createNewSingleVector("", allocator, null);
@@ -212,7 +221,7 @@ public class TestFixedWidthOutOfPlaceVectorSorter {
       vec.set(9, 2L);
 
       // sort the vector
-      FixedWidthOutOfPlaceVectorSorter sorter = new FixedWidthOutOfPlaceVectorSorter();
+      OutOfPlaceVectorSorter<BigIntVector> sorter = getSorter();
       VectorValueComparator<BigIntVector> comparator = DefaultVectorComparators.createDefaultComparator(vec);
 
       BigIntVector sortedVec = (BigIntVector) vec.getField().getFieldType().createNewSingleVector("", allocator, null);
@@ -258,7 +267,7 @@ public class TestFixedWidthOutOfPlaceVectorSorter {
       vec.set(9, 2f);
 
       // sort the vector
-      FixedWidthOutOfPlaceVectorSorter sorter = new FixedWidthOutOfPlaceVectorSorter();
+      OutOfPlaceVectorSorter<Float4Vector> sorter = getSorter();
       VectorValueComparator<Float4Vector> comparator = DefaultVectorComparators.createDefaultComparator(vec);
 
       Float4Vector sortedVec = (Float4Vector) vec.getField().getFieldType().createNewSingleVector("", allocator, null);
@@ -304,7 +313,7 @@ public class TestFixedWidthOutOfPlaceVectorSorter {
       vec.set(9, 2);
 
       // sort the vector
-      FixedWidthOutOfPlaceVectorSorter sorter = new FixedWidthOutOfPlaceVectorSorter();
+      OutOfPlaceVectorSorter<Float8Vector> sorter = getSorter();
       VectorValueComparator<Float8Vector> comparator = DefaultVectorComparators.createDefaultComparator(vec);
 
       Float8Vector sortedVec = (Float8Vector) vec.getField().getFieldType().createNewSingleVector("", allocator, null);
@@ -341,7 +350,7 @@ public class TestFixedWidthOutOfPlaceVectorSorter {
           66, 67, 68, 69, 70, 71);
 
       // sort the vector
-      FixedWidthOutOfPlaceVectorSorter sorter = new FixedWidthOutOfPlaceVectorSorter();
+      OutOfPlaceVectorSorter<IntVector> sorter = getSorter();
       VectorValueComparator<IntVector> comparator = DefaultVectorComparators.createDefaultComparator(vec);
 
       try (IntVector sortedVec = (IntVector) vec.getField().getFieldType().createNewSingleVector("", allocator, null)) {
diff --git a/java/algorithm/src/test/java/org/apache/arrow/algorithm/sort/TestGeneralOutOfPlaceVectorSorter.java b/java/algorithm/src/test/java/org/apache/arrow/algorithm/sort/TestGeneralOutOfPlaceVectorSorter.java
new file mode 100644
index 0000000..07a6b54
--- /dev/null
+++ b/java/algorithm/src/test/java/org/apache/arrow/algorithm/sort/TestGeneralOutOfPlaceVectorSorter.java
@@ -0,0 +1,142 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.algorithm.sort;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.memory.RootAllocator;
+import org.apache.arrow.vector.IntVector;
+import org.apache.arrow.vector.complex.StructVector;
+import org.apache.arrow.vector.testing.ValueVectorDataPopulator;
+import org.apache.arrow.vector.types.pojo.ArrowType;
+import org.apache.arrow.vector.types.pojo.FieldType;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Test cases for {@link GeneralOutOfPlaceVectorSorter}.
+ */
+public class TestGeneralOutOfPlaceVectorSorter {
+
+  private BufferAllocator allocator;
+
+  @Before
+  public void prepare() {
+    allocator = new RootAllocator(1024 * 1024);
+  }
+
+  @After
+  public void shutdown() {
+    allocator.close();
+  }
+
+  VectorValueComparator<StructVector> getComparator(StructVector structVector) {
+    IntVector child0 = structVector.getChild("column0", IntVector.class);
+    VectorValueComparator<IntVector> childComp0 = DefaultVectorComparators.createDefaultComparator(child0);
+    childComp0.attachVector(child0);
+
+    IntVector child1 = structVector.getChild("column1", IntVector.class);
+    VectorValueComparator<IntVector> childComp1 = DefaultVectorComparators.createDefaultComparator(child1);
+    childComp1.attachVector(child1);
+
+    VectorValueComparator<StructVector> comp = new VectorValueComparator<StructVector>() {
+
+      @Override
+      public int compareNotNull(int index1, int index2) {
+        // compare values by lexicographic order
+        int result0 = childComp0.compare(index1, index2);
+        if (result0 != 0) {
+          return result0;
+        }
+        return childComp1.compare(index1, index2);
+      }
+
+      @Override
+      public VectorValueComparator createNew() {
+        return this;
+      }
+    };
+
+    return comp;
+  }
+
+  @Test
+  public void testSortStructVector() {
+    final int vectorLength = 7;
+    try (StructVector srcVector = StructVector.empty("src struct", allocator);
+         StructVector dstVector = StructVector.empty("dst struct", allocator)) {
+
+      IntVector srcChild0 =
+          srcVector.addOrGet("column0", FieldType.nullable(new ArrowType.Int(32, true)), IntVector.class);
+      IntVector srcChild1 =
+          srcVector.addOrGet("column1", FieldType.nullable(new ArrowType.Int(32, true)), IntVector.class);
+
+      IntVector dstChild0 =
+          dstVector.addOrGet("column0", FieldType.nullable(new ArrowType.Int(32, true)), IntVector.class);
+      IntVector dstChild1 =
+          dstVector.addOrGet("column1", FieldType.nullable(new ArrowType.Int(32, true)), IntVector.class);
+
+      // src struct vector values:
+      // [
+      //   (2, 1)
+      //   (3, 4)
+      //   (5, 4)
+      //   (null, 3)
+      //   (7, null)
+      //   (null, null)
+      //   (6, 6)
+      // ]
+
+      ValueVectorDataPopulator.setVector(srcChild0, 2, 3, 5, null, 7, null, 6);
+      ValueVectorDataPopulator.setVector(srcChild1, 1, 4, 4, 3, null, null, 6);
+      srcVector.setIndexDefined(0);
+      srcVector.setIndexDefined(1);
+      srcVector.setIndexDefined(2);
+      srcVector.setIndexDefined(3);
+      srcVector.setIndexDefined(4);
+      srcVector.setIndexDefined(6);
+      srcVector.setValueCount(vectorLength);
+
+      dstChild0.allocateNew(vectorLength);
+      dstChild1.allocateNew(vectorLength);
+      dstVector.setValueCount(vectorLength);
+
+      // construct the comparator
+      VectorValueComparator<StructVector> comp = getComparator(srcVector);
+
+      // sort the vector
+      GeneralOutOfPlaceVectorSorter<StructVector> sorter = new GeneralOutOfPlaceVectorSorter<>();
+      sorter.sortOutOfPlace(srcVector, dstVector, comp);
+
+      // validate results
+      assertEquals(vectorLength, dstVector.getValueCount());
+      assertEquals(
+          "[" +
+                    "null, " +
+                    "{\"column1\":3}, " +
+                    "{\"column0\":2,\"column1\":1}, " +
+                    "{\"column0\":3,\"column1\":4}, " +
+                    "{\"column0\":5,\"column1\":4}, " +
+                    "{\"column0\":6,\"column1\":6}, " +
+                    "{\"column0\":7}" +
+              "]", dstVector.toString());
+    }
+  }
+}
diff --git a/java/algorithm/src/test/java/org/apache/arrow/algorithm/sort/TestOutOfPlaceVectorSorter.java b/java/algorithm/src/test/java/org/apache/arrow/algorithm/sort/TestOutOfPlaceVectorSorter.java
new file mode 100644
index 0000000..66b75cb
--- /dev/null
+++ b/java/algorithm/src/test/java/org/apache/arrow/algorithm/sort/TestOutOfPlaceVectorSorter.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.algorithm.sort;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+/**
+ * Test cases for out-of-place sorters.
+ */
+@RunWith(Parameterized.class)
+public abstract class TestOutOfPlaceVectorSorter {
+
+  protected final boolean generalSorter;
+
+  public TestOutOfPlaceVectorSorter(boolean generalSorter) {
+    this.generalSorter = generalSorter;
+  }
+
+  @Parameterized.Parameters(name = "general sorter = {0}")
+  public static Collection<Object[]> getParameter() {
+    List<Object[]> params = new ArrayList<>();
+    params.add(new Object[] {true});
+    params.add(new Object[] {false});
+    return params;
+  }
+}
diff --git a/java/algorithm/src/test/java/org/apache/arrow/algorithm/sort/TestVariableWidthOutOfPlaceVectorSorter.java b/java/algorithm/src/test/java/org/apache/arrow/algorithm/sort/TestVariableWidthOutOfPlaceVectorSorter.java
index 46b3060..8f4e3b8 100644
--- a/java/algorithm/src/test/java/org/apache/arrow/algorithm/sort/TestVariableWidthOutOfPlaceVectorSorter.java
+++ b/java/algorithm/src/test/java/org/apache/arrow/algorithm/sort/TestVariableWidthOutOfPlaceVectorSorter.java
@@ -32,10 +32,19 @@ import org.junit.Test;
 /**
  * Test cases for {@link VariableWidthOutOfPlaceVectorSorter}.
  */
-public class TestVariableWidthOutOfPlaceVectorSorter {
+public class TestVariableWidthOutOfPlaceVectorSorter extends TestOutOfPlaceVectorSorter {
 
   private BufferAllocator allocator;
 
+  public TestVariableWidthOutOfPlaceVectorSorter(boolean generalSorter) {
+    super(generalSorter);
+  }
+
+  <V extends BaseVariableWidthVector> OutOfPlaceVectorSorter<V> getSorter() {
+    return generalSorter ? new GeneralOutOfPlaceVectorSorter<>() : new VariableWidthOutOfPlaceVectorSorter<V>();
+  }
+
+
   @Before
   public void prepare() {
     allocator = new RootAllocator(1024 * 1024);
@@ -65,7 +74,7 @@ public class TestVariableWidthOutOfPlaceVectorSorter {
       vec.set(9, "yes".getBytes());
 
       // sort the vector
-      VariableWidthOutOfPlaceVectorSorter sorter = new VariableWidthOutOfPlaceVectorSorter();
+      OutOfPlaceVectorSorter<BaseVariableWidthVector> sorter = getSorter();
       VectorValueComparator<BaseVariableWidthVector> comparator =
               DefaultVectorComparators.createDefaultComparator(vec);