You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by do...@apache.org on 2022/01/14 18:41:50 UTC

[orc] branch main updated: ORC-1004: Java ORC writer supports the selection vector (#994)

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/main by this push:
     new 677ed8b  ORC-1004: Java ORC writer supports the selection vector (#994)
677ed8b is described below

commit 677ed8b17aa6cb6a6cd2a5c1297ee8601096a90c
Author: Gang Wu <ga...@alibaba-inc.com>
AuthorDate: Sat Jan 15 02:41:41 2022 +0800

    ORC-1004: Java ORC writer supports the selection vector (#994)
    
    ### What changes were proposed in this pull request?
    
    This PR enables the Java ORC writer to respect the selection vector of a VectorizedRowBatch. To get the best performance, the implementation writes the selected rows in runs, each run being the longest stretch of contiguously selected rows.
    
    ### Why are the changes needed?
    
    Currently the ORC writer doesn't support the selection vector. Clients that expect it to be supported could therefore get garbage (unselected) rows in the output.
    
    ### How was this patch tested?
    
    Enabled the previously disabled unit tests in TestSelectedVector.java.
---
 .../src/java/org/apache/orc/impl/WriterImpl.java   | 31 ++++++++++++++++++++--
 .../test/org/apache/orc/TestSelectedVector.java    |  5 ----
 2 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/java/core/src/java/org/apache/orc/impl/WriterImpl.java b/java/core/src/java/org/apache/orc/impl/WriterImpl.java
index c0a2b52..b6f5b56 100644
--- a/java/core/src/java/org/apache/orc/impl/WriterImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/WriterImpl.java
@@ -696,7 +696,18 @@ public class WriterImpl implements WriterInternal, MemoryManager.Callback {
         while (posn < batch.size) {
           int chunkSize = Math.min(batch.size - posn,
               rowIndexStride - rowsInIndex);
-          treeWriter.writeRootBatch(batch, posn, chunkSize);
+          if (batch.isSelectedInUse()) {
+            // find the longest chunk that is continuously selected from posn
+            for (int len = 1; len < chunkSize; ++len) {
+              if (batch.selected[posn + len] - batch.selected[posn] != len) {
+                chunkSize = len;
+                break;
+              }
+            }
+            treeWriter.writeRootBatch(batch, batch.selected[posn], chunkSize);
+          } else {
+            treeWriter.writeRootBatch(batch, posn, chunkSize);
+          }
           posn += chunkSize;
           rowsInIndex += chunkSize;
           rowsInStripe += chunkSize;
@@ -705,8 +716,24 @@ public class WriterImpl implements WriterInternal, MemoryManager.Callback {
           }
         }
       } else {
+        if (batch.isSelectedInUse()) {
+          int posn = 0;
+          while (posn < batch.size) {
+            int chunkSize = 1;
+            while (posn + chunkSize < batch.size) {
+              // find the longest chunk that is continuously selected from posn
+              if (batch.selected[posn + chunkSize] - batch.selected[posn] != chunkSize) {
+                break;
+              }
+              ++chunkSize;
+            }
+            treeWriter.writeRootBatch(batch, batch.selected[posn], chunkSize);
+            posn += chunkSize;
+          }
+        } else {
+          treeWriter.writeRootBatch(batch, 0, batch.size);
+        }
         rowsInStripe += batch.size;
-        treeWriter.writeRootBatch(batch, 0, batch.size);
       }
       rowsSinceCheck += batch.size;
       previousAllocation = memoryManager.checkMemory(previousAllocation, this);
diff --git a/java/core/src/test/org/apache/orc/TestSelectedVector.java b/java/core/src/test/org/apache/orc/TestSelectedVector.java
index 5178fe6..3e2e475 100644
--- a/java/core/src/test/org/apache/orc/TestSelectedVector.java
+++ b/java/core/src/test/org/apache/orc/TestSelectedVector.java
@@ -36,7 +36,6 @@ import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
 import org.apache.orc.impl.KeyProvider;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.TestInfo;
 
@@ -74,7 +73,6 @@ public class TestSelectedVector {
   }
 
   @Test
-  @Disabled("Disable until the impl of ORC-1004 is merged")
   public void testWriteBaseTypeUseSelectedVector() throws IOException {
     TypeDescription schema =
         TypeDescription.fromString("struct<a:boolean,b:tinyint,c:smallint,d:int,e:bigint," +
@@ -234,7 +232,6 @@ public class TestSelectedVector {
   }
 
   @Test
-  @Disabled("Disable until the impl of ORC-1004 is merged")
   public void testWriteComplexTypeUseSelectedVector() throws IOException {
     TypeDescription schema =
         TypeDescription.fromString("struct<a:map<int,uniontype<int,string>>," +
@@ -381,7 +378,6 @@ public class TestSelectedVector {
   }
 
   @Test
-  @Disabled("Disable until the impl of ORC-1004 is merged")
   public void testWriteRepeatedUseSelectedVector() throws IOException {
     TypeDescription schema =
         TypeDescription.fromString("struct<a:int,b:string,c:decimal(20,5)>");
@@ -465,7 +461,6 @@ public class TestSelectedVector {
   }
 
   @Test
-  @Disabled("Disable until the impl of ORC-1004 is merged")
   public void testWriteEncryptionUseSelectedVector() throws IOException {
     TypeDescription schema =
         TypeDescription.fromString("struct<id:int,name:string>");