You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2021/09/30 10:44:05 UTC

[arrow-rs] branch master updated: parquet: Avoid NaN check for non-floats (#798)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new e8438bb  parquet: Avoid NaN check for non-floats (#798)
e8438bb is described below

commit e8438bb54472a3f62044cd64ee68dd2e4ff59ba9
Author: Kornelijus Survila <ko...@gmail.com>
AuthorDate: Thu Sep 30 04:43:57 2021 -0600

    parquet: Avoid NaN check for non-floats (#798)
    
    The NaN check was especially expensive for `ByteArray` columns, potentially
    taking as long as the rest of the encoding.
---
 parquet/src/column/writer.rs | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/parquet/src/column/writer.rs b/parquet/src/column/writer.rs
index 0da9439..162941a 100644
--- a/parquet/src/column/writer.rs
+++ b/parquet/src/column/writer.rs
@@ -951,23 +951,27 @@ impl<T: DataType> ColumnWriterImpl<T> {
 
     #[allow(clippy::eq_op)]
     fn update_page_min_max(&mut self, val: &T::T) {
-        // simple "isNaN" check that works for all types
-        if val == val {
-            if self
-                .min_page_value
-                .as_ref()
-                .map_or(true, |min| self.compare_greater(min, val))
-            {
-                self.min_page_value = Some(val.clone());
-            }
-            if self
-                .max_page_value
-                .as_ref()
-                .map_or(true, |max| self.compare_greater(val, max))
-            {
-                self.max_page_value = Some(val.clone());
+        if let Type::FLOAT | Type::DOUBLE = T::get_physical_type() {
+            // Skip NaN values
+            if val != val {
+                return;
             }
         }
+
+        if self
+            .min_page_value
+            .as_ref()
+            .map_or(true, |min| self.compare_greater(min, val))
+        {
+            self.min_page_value = Some(val.clone());
+        }
+        if self
+            .max_page_value
+            .as_ref()
+            .map_or(true, |max| self.compare_greater(val, max))
+        {
+            self.max_page_value = Some(val.clone());
+        }
     }
 
     fn update_column_min_max(&mut self) {