You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by do...@apache.org on 2022/04/04 17:22:03 UTC

[orc] branch main updated: ORC-1146: Float category missing check if the statistic sum is a finite value

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/main by this push:
     new 93a350520 ORC-1146: Float category missing check if the statistic sum is a finite value
93a350520 is described below

commit 93a350520f5f9f1582fbb3415d3a567e880c29da
Author: Guiyanakuang <gu...@gmail.com>
AuthorDate: Mon Apr 4 10:16:25 2022 -0700

    ORC-1146: Float category missing check if the statistic sum is a finite value
    
    ### What changes were proposed in this pull request?
    
    This PR adds a check of whether the statistics sum of a FLOAT column is a finite value, as is already done for DOUBLE columns.
    
    ### Why are the changes needed?
    
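    When an ORC FLOAT column is written with NaN values, its statistics sum is not finite, so predicate pushdown must not be applied to that column. Previously this check was performed only for the DOUBLE category.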
    
    ### How was this patch tested?
    
    Added unit test.
    
    Closes #1077
    
    Signed-off-by: Dongjoon Hyun <do...@apache.org>
---
 .../java/org/apache/orc/impl/RecordReaderImpl.java |  3 ++-
 .../src/test/org/apache/orc/TestVectorOrcFile.java | 31 +++++++++++++++++++++-
 2 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
index 3ca5aed58..f77542d84 100644
--- a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
@@ -693,7 +693,8 @@ public class RecordReaderImpl implements RecordReader {
                    " include ORC-517. Writer version: {}",
           predicate.getColumnName(), writerVersion);
       return TruthValue.YES_NO_NULL;
-    } else if (category == TypeDescription.Category.DOUBLE) {
+    } else if (category == TypeDescription.Category.DOUBLE
+        || category == TypeDescription.Category.FLOAT) {
       DoubleColumnStatistics dstas = (DoubleColumnStatistics) cs;
       if (!Double.isFinite(dstas.getSum())) {
         LOG.debug("Not using predication pushdown on {} because stats contain NaN values",
diff --git a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
index 13fa6b8cf..a06dc1b4b 100644
--- a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
+++ b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
@@ -4301,7 +4301,8 @@ public class TestVectorOrcFile {
   @MethodSource("data")
   public void testPredicatePushdownWithNan(Version fileFormat) throws Exception {
     TypeDescription schema = TypeDescription.createStruct()
-            .addField("double1", TypeDescription.createDouble());
+        .addField("double1", TypeDescription.createDouble())
+        .addField("float1", TypeDescription.createFloat());
 
     Writer writer = OrcFile.createWriter(testFilePath,
             OrcFile.writerOptions(conf)
@@ -4315,14 +4316,18 @@ public class TestVectorOrcFile {
     batch.ensureSize(3500);
     batch.size = 3500;
     batch.cols[0].noNulls = true;
+    batch.cols[1].noNulls = true;
 
     DoubleColumnVector dbcol = ((DoubleColumnVector) batch.cols[0]);
+    DoubleColumnVector fcol = ((DoubleColumnVector) batch.cols[1]);
 
     // first row NaN (resulting to min/max and sum columnStats of stride to be NaN)
     // NaN in the middle of a stride causes Sum of last stride to be NaN
     dbcol.vector[0] = Double.NaN;
+    fcol.vector[0] = Double.NaN;
     for (int i=1; i < 3500; ++i) {
       dbcol.vector[i] = i == 3200 ? Double.NaN : i;
+      fcol.vector[i] = i == 3200 ? Double.NaN : i;
     }
     writer.addRowBatch(batch);
     writer.close();
@@ -4332,6 +4337,7 @@ public class TestVectorOrcFile {
     assertEquals(3500, reader.getNumberOfRows());
 
     // Only the first stride matches the predicate, just need to make sure NaN stats are ignored
+    // Test double category push down
     SearchArgument sarg = SearchArgumentFactory.newBuilder()
             .startAnd()
             .lessThan("double1", PredicateLeaf.Type.FLOAT, 100d)
@@ -4353,6 +4359,29 @@ public class TestVectorOrcFile {
 
     rows.nextBatch(batch);
     assertEquals(0, batch.size);
+
+    // Test float category push down
+    sarg = SearchArgumentFactory.newBuilder()
+        .startAnd()
+        .lessThan("float1", PredicateLeaf.Type.FLOAT, 100d)
+        .end()
+        .build();
+
+    rows = reader.rows(reader.options()
+        .range(0L, Long.MAX_VALUE)
+        .searchArgument(sarg, new String[]{"float1"}));
+    batch = reader.getSchema().createRowBatch(3500);
+
+    rows.nextBatch(batch);
+    // First stride should be read as NaN sum is ignored
+    assertEquals(1000, batch.size);
+
+    rows.nextBatch(batch);
+    // Last stride should be read as NaN sum is ignored
+    assertEquals(500, batch.size);
+
+    rows.nextBatch(batch);
+    assertEquals(0, batch.size);
   }
 
   /**