Posted to commits@orc.apache.org by do...@apache.org on 2022/04/06 04:13:13 UTC

[orc] branch main updated: ORC-1147: Use isNaN instead of isFinite to determine whether stats contain NaN values

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/main by this push:
     new 6b053d4ba ORC-1147: Use isNaN instead of isFinite to determine whether stats contain NaN values
6b053d4ba is described below

commit 6b053d4ba4df036e2691f84331bd407354a7c3b2
Author: Guiyanakuang <gu...@gmail.com>
AuthorDate: Tue Apr 5 21:12:42 2022 -0700

    ORC-1147: Use isNaN instead of isFinite to determine whether stats contain NaN values
    
    ### What changes were proposed in this pull request?
    
    This PR uses `isNaN` instead of `isFinite` to determine whether the column statistics contain NaN values.
    We want to exclude both the Double.POSITIVE_INFINITY and Double.NEGATIVE_INFINITY cases and match only NaN.
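    
    For illustration only (not part of this patch), a minimal sketch of how the two checks differ for infinities and NaN; the class name is hypothetical:
    
    ```java
    // Illustrative sketch: compare Double.isFinite and Double.isNaN.
    public class NaNCheckSketch {
      public static void main(String[] args) {
        double posInf = Double.POSITIVE_INFINITY;
        double negInf = Double.NEGATIVE_INFINITY;
        double nan = Double.NaN;

        // !isFinite matches infinities as well as NaN.
        System.out.println(!Double.isFinite(posInf)); // true
        System.out.println(!Double.isFinite(negInf)); // true
        System.out.println(!Double.isFinite(nan));    // true

        // isNaN matches only NaN, so infinite (overflowed) sums are not treated as NaN.
        System.out.println(Double.isNaN(posInf)); // false
        System.out.println(Double.isNaN(negInf)); // false
        System.out.println(Double.isNaN(nan));    // true
      }
    }
    ```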
    
    ### Why are the changes needed?
    
    When the statistics sum overflows, predicate pushdown can still be applied to skip the corresponding stripe, because the min/max statistics remain valid for range comparisons.
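    
    For example (a rough sketch, not taken from the patch; the class name is hypothetical), an overflowed sum becomes an infinity rather than NaN, so range predicates can still prune:
    
    ```java
    // Illustrative sketch: a sum that exceeds Double.MAX_VALUE overflows to
    // +Infinity, not NaN, so range-based predicate pushdown can still prune.
    public class SumOverflowSketch {
      public static void main(String[] args) {
        double large = Double.MAX_VALUE / 2 + Double.MAX_VALUE / 4;
        double sum = large + large;                // overflows past Double.MAX_VALUE

        System.out.println(sum);                   // Infinity
        System.out.println(Double.isNaN(sum));     // false -> new check keeps pushdown
        System.out.println(!Double.isFinite(sum)); // true  -> old check disabled pushdown
      }
    }
    ```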
    
    ### How was this patch tested?
    
    Added unit test.
    
    Closes #1080
    
    Signed-off-by: Dongjoon Hyun <do...@apache.org>
---
 .../java/org/apache/orc/impl/RecordReaderImpl.java |  2 +-
 .../src/test/org/apache/orc/TestVectorOrcFile.java | 90 ++++++++++++++++++++++
 2 files changed, 91 insertions(+), 1 deletion(-)

diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
index f77542d84..ade91c6e3 100644
--- a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
@@ -696,7 +696,7 @@ public class RecordReaderImpl implements RecordReader {
     } else if (category == TypeDescription.Category.DOUBLE
         || category == TypeDescription.Category.FLOAT) {
       DoubleColumnStatistics dstas = (DoubleColumnStatistics) cs;
-      if (!Double.isFinite(dstas.getSum())) {
+      if (Double.isNaN(dstas.getSum())) {
         LOG.debug("Not using predication pushdown on {} because stats contain NaN values",
                 predicate.getColumnName());
         return dstas.hasNull() ? TruthValue.YES_NO_NULL : TruthValue.YES_NO;
diff --git a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
index a06dc1b4b..7c8d8cf1e 100644
--- a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
+++ b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
@@ -4384,6 +4384,96 @@ public class TestVectorOrcFile {
     assertEquals(0, batch.size);
   }
 
+  @ParameterizedTest
+  @MethodSource("data")
+  public void testPredicatePushdownWithSumOverflow(Version fileFormat) throws Exception {
+    TypeDescription schema = TypeDescription.createStruct()
+        .addField("double1", TypeDescription.createDouble())
+        .addField("float1", TypeDescription.createFloat());
+
+    Writer writer = OrcFile.createWriter(testFilePath,
+        OrcFile.writerOptions(conf)
+            .setSchema(schema)
+            .stripeSize(400000L)
+            .compress(CompressionKind.NONE)
+            .bufferSize(500)
+            .rowIndexStride(1000)
+            .version(fileFormat));
+    VectorizedRowBatch batch = schema.createRowBatch();
+    batch.ensureSize(3500);
+    batch.size = 3500;
+    batch.cols[0].noNulls = true;
+    batch.cols[1].noNulls = true;
+
+    DoubleColumnVector dbcol = ((DoubleColumnVector) batch.cols[0]);
+    DoubleColumnVector fcol = ((DoubleColumnVector) batch.cols[1]);
+
+    double largeNumber = Double.MAX_VALUE / 2 + Double.MAX_VALUE / 4;
+
+    // Here we write 3500 rows of data with stripeSize set to 400000
+    // and rowIndexStride set to 1000, so a single stripe is written,
+    // indexed in 4 row-group strides.
+    // Two large values are written in the first and fourth strides,
+    // causing the statistics sum to overflow. The sum is not a finite value,
+    // but this does not prevent predicate pushdown (range comparisons still work).
+    fcol.vector[0] = dbcol.vector[0] = largeNumber;
+    fcol.vector[1] = dbcol.vector[1] = largeNumber;
+    for (int i = 2; i < 3500; ++i) {
+      if (i >= 3200 && i <= 3201) {
+        fcol.vector[i] = dbcol.vector[i] = largeNumber;
+      } else {
+        dbcol.vector[i] = i;
+        fcol.vector[i] = i;
+      }
+    }
+    writer.addRowBatch(batch);
+    writer.close();
+
+    Reader reader = OrcFile.createReader(testFilePath,
+        OrcFile.readerOptions(conf).filesystem(fs));
+    assertEquals(3500, reader.getNumberOfRows());
+
+    // Test double category push down
+    SearchArgument sarg = SearchArgumentFactory.newBuilder()
+        .startAnd()
+        .lessThan("double1", PredicateLeaf.Type.FLOAT, 100d)
+        .end()
+        .build();
+
+    RecordReader rows = reader.rows(reader.options()
+        .range(0L, Long.MAX_VALUE)
+        .searchArgument(sarg, new String[]{"double1"}));
+    batch = reader.getSchema().createRowBatch(3500);
+
+    rows.nextBatch(batch);
+    // First stride should be read
+    assertEquals(1000, batch.size);
+
+    rows.nextBatch(batch);
+    // Last stride should not be read, even if sum is not finite
+    assertEquals(0, batch.size);
+
+    // Test float category push down
+    sarg = SearchArgumentFactory.newBuilder()
+        .startAnd()
+        .lessThan("float1", PredicateLeaf.Type.FLOAT, 100d)
+        .end()
+        .build();
+
+    rows = reader.rows(reader.options()
+        .range(0L, Long.MAX_VALUE)
+        .searchArgument(sarg, new String[]{"float1"}));
+    batch = reader.getSchema().createRowBatch(3500);
+
+    rows.nextBatch(batch);
+    // First stride should be read
+    assertEquals(1000, batch.size);
+
+    rows.nextBatch(batch);
+    // Last stride should not be read, even if sum is not finite
+    assertEquals(0, batch.size);
+  }
+
   /**
    * Test predicate pushdown on nulls, with different combinations of
    * values and nulls.