You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by do...@apache.org on 2023/01/20 00:46:42 UTC
[orc] branch main updated: ORC-1357: Handle missing compression block size (#1385)
This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/main by this push:
new 481a9f000 ORC-1357: Handle missing compression block size (#1385)
481a9f000 is described below
commit 481a9f00015c19407524592844778d800b4ae48b
Author: Huw Campbell <hu...@gmail.com>
AuthorDate: Fri Jan 20 11:46:34 2023 +1100
ORC-1357: Handle missing compression block size (#1385)
### What changes were proposed in this pull request?
The C++ implementation handles when the postscript doesn't include compressionBlockSize by checking if it is set and using 256 * 1024 if it is not.
As compressionBlockSize is optional this means valid files couldn't be read with the Java implementation. This change brings the Java implemenation in line with C++ by first checking and using the same default value.
### Why are the changes needed?
Some files which can be read by C++ fail to be read when compression is enabled in the Java implementation because it uses a default buffer of size 0 (which is just an arbitrary protobuf 'nil' conversion).
### How was this patch tested?
New file / test case added which failed to read before in Java but could be read by C++.
---
...TestOrcFile.testWithoutCompressionBlockSize.orc | Bin 0 -> 63 bytes
.../core/src/java/org/apache/orc/impl/OrcTail.java | 3 ++-
.../src/java/org/apache/orc/impl/ReaderImpl.java | 17 ++++++++++++++--
.../src/test/org/apache/orc/TestVectorOrcFile.java | 5 +++--
.../test/org/apache/orc/impl/TestReaderImpl.java | 22 +++++++++++++++++++++
5 files changed, 42 insertions(+), 5 deletions(-)
diff --git a/examples/TestOrcFile.testWithoutCompressionBlockSize.orc b/examples/TestOrcFile.testWithoutCompressionBlockSize.orc
new file mode 100644
index 000000000..552f4e776
Binary files /dev/null and b/examples/TestOrcFile.testWithoutCompressionBlockSize.orc differ
diff --git a/java/core/src/java/org/apache/orc/impl/OrcTail.java b/java/core/src/java/org/apache/orc/impl/OrcTail.java
index a7fb330d1..9297f953c 100644
--- a/java/core/src/java/org/apache/orc/impl/OrcTail.java
+++ b/java/core/src/java/org/apache/orc/impl/OrcTail.java
@@ -135,7 +135,8 @@ public final class OrcTail {
}
public int getCompressionBufferSize() {
- return (int) fileTail.getPostscript().getCompressionBlockSize();
+ OrcProto.PostScript postScript = fileTail.getPostscript();
+ return ReaderImpl.getCompressionBlockSize(postScript);
}
public int getMetadataSize() {
diff --git a/java/core/src/java/org/apache/orc/impl/ReaderImpl.java b/java/core/src/java/org/apache/orc/impl/ReaderImpl.java
index 5c4ced6b3..6a4830c38 100644
--- a/java/core/src/java/org/apache/orc/impl/ReaderImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/ReaderImpl.java
@@ -64,6 +64,7 @@ public class ReaderImpl implements Reader {
private static final Logger LOG = LoggerFactory.getLogger(ReaderImpl.class);
private static final int DIRECTORY_SIZE_GUESS = 16 * 1024;
+ public static final int DEFAULT_COMPRESSION_BLOCK_SIZE = 256 * 1024;
private final long maxLength;
protected final Path path;
@@ -714,6 +715,18 @@ public class ReaderImpl implements Reader {
return extractFileTail(buffer, -1,-1);
}
+ /**
+ * Read compression block size from the postscript if it is set; otherwise,
+ * use the same 256k default the C++ implementation uses.
+ */
+ public static int getCompressionBlockSize(OrcProto.PostScript postScript) {
+ if (postScript.hasCompressionBlockSize()) {
+ return (int) postScript.getCompressionBlockSize();
+ } else {
+ return DEFAULT_COMPRESSION_BLOCK_SIZE;
+ }
+ }
+
/**
* @deprecated Use {@link ReaderImpl#extractFileTail(FileSystem, Path, long)} instead.
* This is for backward compatibility.
@@ -741,7 +754,7 @@ public class ReaderImpl implements Reader {
try (CompressionCodec codec = OrcCodecPool.getCodec(compressionKind)){
if (codec != null) {
compression.withCodec(codec)
- .withBufferSize((int) ps.getCompressionBlockSize());
+ .withBufferSize(getCompressionBlockSize(ps));
}
OrcProto.Footer footer =
@@ -823,7 +836,7 @@ public class ReaderImpl implements Reader {
try (CompressionCodec codec = OrcCodecPool.getCodec(compressionKind)) {
if (codec != null) {
compression.withCodec(codec)
- .withBufferSize((int) ps.getCompressionBlockSize());
+ .withBufferSize(getCompressionBlockSize(ps));
}
OrcProto.Footer footer =
OrcProto.Footer.parseFrom(
diff --git a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
index 1c38732c1..9a1431c68 100644
--- a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
+++ b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
@@ -85,6 +85,7 @@ import java.util.UUID;
import java.util.function.IntFunction;
import java.util.stream.Stream;
+import static org.apache.orc.impl.ReaderImpl.DEFAULT_COMPRESSION_BLOCK_SIZE;
import static org.apache.orc.impl.mask.SHA256MaskFactory.printHexBinary;
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertEquals;
@@ -1495,7 +1496,7 @@ public class TestVectorOrcFile {
assertFalse(reader.rows().nextBatch(batch));
assertEquals(CompressionKind.NONE, reader.getCompressionKind());
assertEquals(0, reader.getNumberOfRows());
- assertEquals(0, reader.getCompressionSize());
+ assertEquals(DEFAULT_COMPRESSION_BLOCK_SIZE, reader.getCompressionSize());
assertFalse(reader.getMetadataKeys().iterator().hasNext());
assertEquals(3, reader.getContentLength());
assertFalse(reader.getStripes().iterator().hasNext());
@@ -4126,7 +4127,7 @@ public class TestVectorOrcFile {
assertEquals(CompressionKind.NONE, reader.getCompressionKind());
assertEquals(0, reader.getRawDataSize());
assertEquals(0, reader.getRowIndexStride());
- assertEquals(0, reader.getCompressionSize());
+ assertEquals(DEFAULT_COMPRESSION_BLOCK_SIZE, reader.getCompressionSize());
assertEquals(0, reader.getMetadataSize());
assertEquals(OrcFile.Version.CURRENT, reader.getFileVersion());
assertEquals(0, reader.getStripes().size());
diff --git a/java/core/src/test/org/apache/orc/impl/TestReaderImpl.java b/java/core/src/test/org/apache/orc/impl/TestReaderImpl.java
index d038a8623..8369b03c0 100644
--- a/java/core/src/test/org/apache/orc/impl/TestReaderImpl.java
+++ b/java/core/src/test/org/apache/orc/impl/TestReaderImpl.java
@@ -505,4 +505,26 @@ public class TestReaderImpl {
assertEquals(tail.getFooter().getStripesList(), extractedTail.getFooter().getStripesList());
}
}
+
+ @Test
+ public void testWithoutCompressionBlockSize() throws IOException {
+ Configuration conf = new Configuration();
+ Path path = new Path(workDir, "TestOrcFile.testWithoutCompressionBlockSize.orc");
+ FileSystem fs = path.getFileSystem(conf);
+ try (ReaderImpl reader = (ReaderImpl) OrcFile.createReader(path,
+ OrcFile.readerOptions(conf).filesystem(fs))) {
+ try (RecordReader rows = reader.rows()) {
+ TypeDescription schema = reader.getSchema();
+ assertEquals("bigint", schema.toString());
+ VectorizedRowBatch batch = schema.createRowBatchV2();
+ assertTrue(rows.nextBatch(batch), "No rows read out!");
+ assertEquals(100, batch.size);
+ LongColumnVector col1 = (LongColumnVector) batch.cols[0];
+ for (int i = 0; i < batch.size; ++i) {
+ assertEquals(1L + i, col1.vector[i]);
+ }
+ assertFalse(rows.nextBatch(batch));
+ }
+ }
+ }
}