You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by do...@apache.org on 2022/04/06 04:13:13 UTC
[orc] branch main updated: ORC-1147: Use isNaN instead of isFinite to determine the contain NaN values
This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/main by this push:
new 6b053d4ba ORC-1147: Use isNaN instead of isFinite to determine the contain NaN values
6b053d4ba is described below
commit 6b053d4ba4df036e2691f84331bd407354a7c3b2
Author: Guiyanakuang <gu...@gmail.com>
AuthorDate: Tue Apr 5 21:12:42 2022 -0700
ORC-1147: Use isNaN instead of isFinite to determine the contain NaN values
### What changes were proposed in this pull request?
This PR uses `isNaN` instead of `isFinite` to determine whether the column statistics contain NaN values.
I want to exclude both the Double.POSITIVE_INFINITY and Double.NEGATIVE_INFINITY cases, and match only NaN.
### Why are the changes needed?
When the sum overflows to infinity, we can still apply predicate pushdown to skip the corresponding stripe.
### How was this patch tested?
Added a unit test.
Closes #1080
Signed-off-by: Dongjoon Hyun <do...@apache.org>
---
.../java/org/apache/orc/impl/RecordReaderImpl.java | 2 +-
.../src/test/org/apache/orc/TestVectorOrcFile.java | 90 ++++++++++++++++++++++
2 files changed, 91 insertions(+), 1 deletion(-)
diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
index f77542d84..ade91c6e3 100644
--- a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
@@ -696,7 +696,7 @@ public class RecordReaderImpl implements RecordReader {
} else if (category == TypeDescription.Category.DOUBLE
|| category == TypeDescription.Category.FLOAT) {
DoubleColumnStatistics dstas = (DoubleColumnStatistics) cs;
- if (!Double.isFinite(dstas.getSum())) {
+ if (Double.isNaN(dstas.getSum())) {
LOG.debug("Not using predication pushdown on {} because stats contain NaN values",
predicate.getColumnName());
return dstas.hasNull() ? TruthValue.YES_NO_NULL : TruthValue.YES_NO;
diff --git a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
index a06dc1b4b..7c8d8cf1e 100644
--- a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
+++ b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
@@ -4384,6 +4384,96 @@ public class TestVectorOrcFile {
assertEquals(0, batch.size);
}
+ @ParameterizedTest
+ @MethodSource("data")
+ public void testPredicatePushdownWithSumOverflow(Version fileFormat) throws Exception {
+ TypeDescription schema = TypeDescription.createStruct()
+ .addField("double1", TypeDescription.createDouble())
+ .addField("float1", TypeDescription.createFloat());
+
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(400000L)
+ .compress(CompressionKind.NONE)
+ .bufferSize(500)
+ .rowIndexStride(1000)
+ .version(fileFormat));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ batch.ensureSize(3500);
+ batch.size = 3500;
+ batch.cols[0].noNulls = true;
+ batch.cols[1].noNulls = true;
+
+ DoubleColumnVector dbcol = ((DoubleColumnVector) batch.cols[0]);
+ DoubleColumnVector fcol = ((DoubleColumnVector) batch.cols[1]);
+
+ double largeNumber = Double.MAX_VALUE / 2 + Double.MAX_VALUE / 4;
+
+ // Here we are writing 3500 rows of data, with stripeSize set to 400000
+ // and rowIndexStride set to 1000, so 1 stripe will be written,
+ // indexed in 4 strides.
+ // Two large values are written in the first and fourth strides,
+ // causing the statistical sum to overflow, sum is not a finite value,
+ // but this does not prevent pushing down (range comparisons work fine)
+ fcol.vector[0] = dbcol.vector[0] = largeNumber;
+ fcol.vector[1] = dbcol.vector[1] = largeNumber;
+ for (int i=2; i < 3500; ++i) {
+ if (i >= 3200 && i<= 3201) {
+ fcol.vector[i] = dbcol.vector[i] = largeNumber;
+ } else {
+ dbcol.vector[i] = i;
+ fcol.vector[i] = i;
+ }
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ assertEquals(3500, reader.getNumberOfRows());
+
+ // Test double category push down
+ SearchArgument sarg = SearchArgumentFactory.newBuilder()
+ .startAnd()
+ .lessThan("double1", PredicateLeaf.Type.FLOAT, 100d)
+ .end()
+ .build();
+
+ RecordReader rows = reader.rows(reader.options()
+ .range(0L, Long.MAX_VALUE)
+ .searchArgument(sarg, new String[]{"double1"}));
+ batch = reader.getSchema().createRowBatch(3500);
+
+ rows.nextBatch(batch);
+ // First stride should be read
+ assertEquals(1000, batch.size);
+
+ rows.nextBatch(batch);
+ // Last stride should not be read, even if sum is not finite
+ assertEquals(0, batch.size);
+
+ // Test float category push down
+ sarg = SearchArgumentFactory.newBuilder()
+ .startAnd()
+ .lessThan("float1", PredicateLeaf.Type.FLOAT, 100d)
+ .end()
+ .build();
+
+ rows = reader.rows(reader.options()
+ .range(0L, Long.MAX_VALUE)
+ .searchArgument(sarg, new String[]{"float1"}));
+ batch = reader.getSchema().createRowBatch(3500);
+
+ rows.nextBatch(batch);
+ // First stride should be read
+ assertEquals(1000, batch.size);
+
+ rows.nextBatch(batch);
+ // Last stride should not be read, even if sum is not finite
+ assertEquals(0, batch.size);
+ }
+
/**
* Test predicate pushdown on nulls, with different combinations of
* values and nulls.