You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by do...@apache.org on 2023/01/20 00:46:42 UTC

[orc] branch main updated: ORC-1357: Handle missing compression block size (#1385)

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/main by this push:
     new 481a9f000 ORC-1357: Handle missing compression block size (#1385)
481a9f000 is described below

commit 481a9f00015c19407524592844778d800b4ae48b
Author: Huw Campbell <hu...@gmail.com>
AuthorDate: Fri Jan 20 11:46:34 2023 +1100

    ORC-1357: Handle missing compression block size (#1385)
    
    ### What changes were proposed in this pull request?
    
    The C++ implementation handles when the postscript doesn't include compressionBlockSize by checking if it is set and using 256 * 1024 if it is not.
    
    As compressionBlockSize is optional this means valid files couldn't be read with the Java implementation. This change brings the Java implemenation in line with C++ by first checking and using the same default value.
    
    ### Why are the changes needed?
    
    Some files which can be read by C++ fail to be read when compression is enabled in the Java implementation because it uses a default buffer of size 0 (which is just an arbitrary protobuf 'nil' conversion).
    
    ### How was this patch tested?
    
    New file / test case added which failed to read before in Java but could be read by C++.
---
 ...TestOrcFile.testWithoutCompressionBlockSize.orc | Bin 0 -> 63 bytes
 .../core/src/java/org/apache/orc/impl/OrcTail.java |   3 ++-
 .../src/java/org/apache/orc/impl/ReaderImpl.java   |  17 ++++++++++++++--
 .../src/test/org/apache/orc/TestVectorOrcFile.java |   5 +++--
 .../test/org/apache/orc/impl/TestReaderImpl.java   |  22 +++++++++++++++++++++
 5 files changed, 42 insertions(+), 5 deletions(-)

diff --git a/examples/TestOrcFile.testWithoutCompressionBlockSize.orc b/examples/TestOrcFile.testWithoutCompressionBlockSize.orc
new file mode 100644
index 000000000..552f4e776
Binary files /dev/null and b/examples/TestOrcFile.testWithoutCompressionBlockSize.orc differ
diff --git a/java/core/src/java/org/apache/orc/impl/OrcTail.java b/java/core/src/java/org/apache/orc/impl/OrcTail.java
index a7fb330d1..9297f953c 100644
--- a/java/core/src/java/org/apache/orc/impl/OrcTail.java
+++ b/java/core/src/java/org/apache/orc/impl/OrcTail.java
@@ -135,7 +135,8 @@ public final class OrcTail {
   }
 
   public int getCompressionBufferSize() {
-    return (int) fileTail.getPostscript().getCompressionBlockSize();
+    OrcProto.PostScript postScript = fileTail.getPostscript();
+    return ReaderImpl.getCompressionBlockSize(postScript);
   }
 
   public int getMetadataSize() {
diff --git a/java/core/src/java/org/apache/orc/impl/ReaderImpl.java b/java/core/src/java/org/apache/orc/impl/ReaderImpl.java
index 5c4ced6b3..6a4830c38 100644
--- a/java/core/src/java/org/apache/orc/impl/ReaderImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/ReaderImpl.java
@@ -64,6 +64,7 @@ public class ReaderImpl implements Reader {
   private static final Logger LOG = LoggerFactory.getLogger(ReaderImpl.class);
 
   private static final int DIRECTORY_SIZE_GUESS = 16 * 1024;
+  public static final int DEFAULT_COMPRESSION_BLOCK_SIZE = 256 * 1024;
 
   private final long maxLength;
   protected final Path path;
@@ -714,6 +715,18 @@ public class ReaderImpl implements Reader {
     return extractFileTail(buffer, -1,-1);
   }
 
+  /**
+   * Read compression block size from the postscript if it is set; otherwise,
+   * use the same 256k default the C++ implementation uses.
+   */
+  public static int getCompressionBlockSize(OrcProto.PostScript postScript) {
+    if (postScript.hasCompressionBlockSize()) {
+      return (int) postScript.getCompressionBlockSize();
+    } else {
+      return DEFAULT_COMPRESSION_BLOCK_SIZE;
+    }
+  }
+
   /**
    * @deprecated Use {@link ReaderImpl#extractFileTail(FileSystem, Path, long)} instead.
    * This is for backward compatibility.
@@ -741,7 +754,7 @@ public class ReaderImpl implements Reader {
     try (CompressionCodec codec = OrcCodecPool.getCodec(compressionKind)){
       if (codec != null) {
         compression.withCodec(codec)
-            .withBufferSize((int) ps.getCompressionBlockSize());
+            .withBufferSize(getCompressionBlockSize(ps));
       }
 
       OrcProto.Footer footer =
@@ -823,7 +836,7 @@ public class ReaderImpl implements Reader {
       try (CompressionCodec codec = OrcCodecPool.getCodec(compressionKind)) {
         if (codec != null) {
           compression.withCodec(codec)
-              .withBufferSize((int) ps.getCompressionBlockSize());
+              .withBufferSize(getCompressionBlockSize(ps));
         }
         OrcProto.Footer footer =
             OrcProto.Footer.parseFrom(
diff --git a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
index 1c38732c1..9a1431c68 100644
--- a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
+++ b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
@@ -85,6 +85,7 @@ import java.util.UUID;
 import java.util.function.IntFunction;
 import java.util.stream.Stream;
 
+import static org.apache.orc.impl.ReaderImpl.DEFAULT_COMPRESSION_BLOCK_SIZE;
 import static org.apache.orc.impl.mask.SHA256MaskFactory.printHexBinary;
 import static org.junit.jupiter.api.Assertions.assertArrayEquals;
 import static org.junit.jupiter.api.Assertions.assertEquals;
@@ -1495,7 +1496,7 @@ public class TestVectorOrcFile {
     assertFalse(reader.rows().nextBatch(batch));
     assertEquals(CompressionKind.NONE, reader.getCompressionKind());
     assertEquals(0, reader.getNumberOfRows());
-    assertEquals(0, reader.getCompressionSize());
+    assertEquals(DEFAULT_COMPRESSION_BLOCK_SIZE, reader.getCompressionSize());
     assertFalse(reader.getMetadataKeys().iterator().hasNext());
     assertEquals(3, reader.getContentLength());
     assertFalse(reader.getStripes().iterator().hasNext());
@@ -4126,7 +4127,7 @@ public class TestVectorOrcFile {
     assertEquals(CompressionKind.NONE, reader.getCompressionKind());
     assertEquals(0, reader.getRawDataSize());
     assertEquals(0, reader.getRowIndexStride());
-    assertEquals(0, reader.getCompressionSize());
+    assertEquals(DEFAULT_COMPRESSION_BLOCK_SIZE, reader.getCompressionSize());
     assertEquals(0, reader.getMetadataSize());
     assertEquals(OrcFile.Version.CURRENT, reader.getFileVersion());
     assertEquals(0, reader.getStripes().size());
diff --git a/java/core/src/test/org/apache/orc/impl/TestReaderImpl.java b/java/core/src/test/org/apache/orc/impl/TestReaderImpl.java
index d038a8623..8369b03c0 100644
--- a/java/core/src/test/org/apache/orc/impl/TestReaderImpl.java
+++ b/java/core/src/test/org/apache/orc/impl/TestReaderImpl.java
@@ -505,4 +505,26 @@ public class TestReaderImpl {
       assertEquals(tail.getFooter().getStripesList(), extractedTail.getFooter().getStripesList());
     }
   }
+
+  @Test
+  public void testWithoutCompressionBlockSize() throws IOException {
+    Configuration conf = new Configuration();
+    Path path = new Path(workDir, "TestOrcFile.testWithoutCompressionBlockSize.orc");
+    FileSystem fs = path.getFileSystem(conf);
+    try (ReaderImpl reader = (ReaderImpl) OrcFile.createReader(path,
+            OrcFile.readerOptions(conf).filesystem(fs))) {
+      try (RecordReader rows = reader.rows()) {
+        TypeDescription schema = reader.getSchema();
+        assertEquals("bigint", schema.toString());
+        VectorizedRowBatch batch = schema.createRowBatchV2();
+        assertTrue(rows.nextBatch(batch), "No rows read out!");
+        assertEquals(100, batch.size);
+        LongColumnVector col1 = (LongColumnVector) batch.cols[0];
+        for (int i = 0; i < batch.size; ++i) {
+          assertEquals(1L + i, col1.vector[i]);
+        }
+        assertFalse(rows.nextBatch(batch));
+      }
+    }
+  }
 }