You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by ga...@apache.org on 2018/08/23 07:43:47 UTC

[parquet-mr] branch column-indexes updated: PARQUET-1386: Fix issues of NaN and +-0.0 in case of float/double column indexes (#515)

This is an automated email from the ASF dual-hosted git repository.

gabor pushed a commit to branch column-indexes
in repository https://gitbox.apache.org/repos/asf/parquet-mr.git


The following commit(s) were added to refs/heads/column-indexes by this push:
     new 1f95eca  PARQUET-1386: Fix issues of NaN and +-0.0 in case of float/double column indexes (#515)
1f95eca is described below

commit 1f95ecac93eba8df7ee57d8271755d8a0be0401a
Author: Gabor Szadovszky <ga...@apache.org>
AuthorDate: Thu Aug 23 09:43:44 2018 +0200

    PARQUET-1386: Fix issues of NaN and +-0.0 in case of float/double column indexes (#515)
---
 .../column/columnindex/ColumnIndexBuilder.java     |  4 ++
 .../columnindex/DoubleColumnIndexBuilder.java      | 23 +++++++++++-
 .../columnindex/FloatColumnIndexBuilder.java       | 23 +++++++++++-
 .../column/columnindex/TestColumnIndexBuilder.java | 43 +++++++++++++++++++++-
 4 files changed, 87 insertions(+), 6 deletions(-)

diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java
index 9633b61..b28fdde 100644
--- a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java
+++ b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java
@@ -557,6 +557,10 @@ public abstract class ColumnIndexBuilder {
       return null;
     }
     ColumnIndexBase<?> columnIndex = createColumnIndex(type);
+    if (columnIndex == null) {
+      // Might happen if the specialized builder discovers invalid min/max values
+      return null;
+    }
     columnIndex.nullPages = nullPages.toBooleanArray();
     // Null counts is optional so keep it null if the builder has no values
     if (!nullCounts.isEmpty()) {
diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/DoubleColumnIndexBuilder.java b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/DoubleColumnIndexBuilder.java
index 24be02c..074d025 100644
--- a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/DoubleColumnIndexBuilder.java
+++ b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/DoubleColumnIndexBuilder.java
@@ -84,6 +84,7 @@ class DoubleColumnIndexBuilder extends ColumnIndexBuilder {
 
   private final DoubleList minValues = new DoubleArrayList();
   private final DoubleList maxValues = new DoubleArrayList();
+  private boolean invalid;
 
   private static double convert(ByteBuffer buffer) {
     return buffer.order(LITTLE_ENDIAN).getDouble(0);
@@ -101,12 +102,30 @@ class DoubleColumnIndexBuilder extends ColumnIndexBuilder {
 
   @Override
   void addMinMax(Object min, Object max) {
-    minValues.add((double) min);
-    maxValues.add((double) max);
+    double dMin = (double) min;
+    double dMax = (double) max;
+    if (Double.isNaN(dMin) || Double.isNaN(dMax)) {
+      // Invalidate this column index in case of NaN as the sorting order of values is undefined for this case
+      invalid = true;
+    }
+
+    // Sorting order is undefined for -0.0 so let min = -0.0 and max = +0.0 to ensure that no 0.0 values are skipped
+    if (Double.compare(dMin, +0.0) == 0) {
+      dMin = -0.0;
+    }
+    if (Double.compare(dMax, -0.0) == 0) {
+      dMax = +0.0;
+    }
+
+    minValues.add(dMin);
+    maxValues.add(dMax);
   }
 
   @Override
   ColumnIndexBase<Double> createColumnIndex(PrimitiveType type) {
+    if (invalid) {
+      return null;
+    }
     DoubleColumnIndex columnIndex = new DoubleColumnIndex(type);
     columnIndex.minValues = minValues.toDoubleArray();
     columnIndex.maxValues = maxValues.toDoubleArray();
diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/FloatColumnIndexBuilder.java b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/FloatColumnIndexBuilder.java
index 4452746..cbcdf94 100644
--- a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/FloatColumnIndexBuilder.java
+++ b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/FloatColumnIndexBuilder.java
@@ -84,6 +84,7 @@ class FloatColumnIndexBuilder extends ColumnIndexBuilder {
 
   private final FloatList minValues = new FloatArrayList();
   private final FloatList maxValues = new FloatArrayList();
+  private boolean invalid;
 
   private static float convert(ByteBuffer buffer) {
     return buffer.order(LITTLE_ENDIAN).getFloat(0);
@@ -101,12 +102,30 @@ class FloatColumnIndexBuilder extends ColumnIndexBuilder {
 
   @Override
   void addMinMax(Object min, Object max) {
-    minValues.add((float) min);
-    maxValues.add((float) max);
+    float fMin = (float) min;
+    float fMax = (float) max;
+    if (Float.isNaN(fMin) || Float.isNaN(fMax)) {
+      // Invalidate this column index in case of NaN as the sorting order of values is undefined for this case
+      invalid = true;
+    }
+
+    // Sorting order is undefined for -0.0 so let min = -0.0 and max = +0.0 to ensure that no 0.0 values are skipped
+    if (Float.compare(fMin, +0.0f) == 0) {
+      fMin = -0.0f;
+    }
+    if (Float.compare(fMax, -0.0f) == 0) {
+      fMax = +0.0f;
+    }
+
+    minValues.add(fMin);
+    maxValues.add(fMax);
   }
 
   @Override
   ColumnIndexBase<Float> createColumnIndex(PrimitiveType type) {
+    if (invalid) {
+      return null;
+    }
     FloatColumnIndex columnIndex = new FloatColumnIndex(type);
     columnIndex.minValues = minValues.toFloatArray();
     columnIndex.maxValues = maxValues.toFloatArray();
diff --git a/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilder.java b/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilder.java
index e4655b2..5a3947c 100644
--- a/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilder.java
+++ b/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilder.java
@@ -48,6 +48,7 @@ import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertThat;
+import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 
 import java.math.BigDecimal;
@@ -811,6 +812,25 @@ public class TestColumnIndexBuilder {
   }
 
   @Test
+  public void testBuildDoubleZeroNaN() {
+    PrimitiveType type = Types.required(DOUBLE).named("test_double");
+    ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);
+    StatsBuilder sb = new StatsBuilder();
+    builder.add(sb.stats(type, -1.0, -0.0));
+    builder.add(sb.stats(type, 0.0, 1.0));
+    builder.add(sb.stats(type, 1.0, 100.0));
+    ColumnIndex columnIndex = builder.build();
+    assertCorrectValues(columnIndex.getMinValues(), -1.0, -0.0, 1.0);
+    assertCorrectValues(columnIndex.getMaxValues(), 0.0, 1.0, 100.0);
+
+    builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);
+    builder.add(sb.stats(type, -1.0, -0.0));
+    builder.add(sb.stats(type, 0.0, Double.NaN));
+    builder.add(sb.stats(type, 1.0, 100.0));
+    assertNull(builder.build());
+  }
+
+  @Test
   public void testStaticBuildDouble() {
     ColumnIndex columnIndex = ColumnIndexBuilder.build(
         Types.required(DOUBLE).named("test_double"),
@@ -922,6 +942,25 @@ public class TestColumnIndexBuilder {
   }
 
   @Test
+  public void testBuildFloatZeroNaN() {
+    PrimitiveType type = Types.required(FLOAT).named("test_float");
+    ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);
+    StatsBuilder sb = new StatsBuilder();
+    builder.add(sb.stats(type, -1.0f, -0.0f));
+    builder.add(sb.stats(type, 0.0f, 1.0f));
+    builder.add(sb.stats(type, 1.0f, 100.0f));
+    ColumnIndex columnIndex = builder.build();
+    assertCorrectValues(columnIndex.getMinValues(), -1.0f, -0.0f, 1.0f);
+    assertCorrectValues(columnIndex.getMaxValues(), 0.0f, 1.0f, 100.0f);
+
+    builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);
+    builder.add(sb.stats(type, -1.0f, -0.0f));
+    builder.add(sb.stats(type, 0.0f, Float.NaN));
+    builder.add(sb.stats(type, 1.0f, 100.0f));
+    assertNull(builder.build());
+  }
+
+  @Test
   public void testStaticBuildFloat() {
     ColumnIndex columnIndex = ColumnIndexBuilder.build(
         Types.required(FLOAT).named("test_float"),
@@ -1391,7 +1430,7 @@ public class TestColumnIndexBuilder {
         assertFalse("The byte buffer should be empty for null pages", value.hasRemaining());
       } else {
         assertEquals("The byte buffer should be 8 bytes long for double", 8, value.remaining());
-        assertEquals("Invalid value for page " + i, expectedValue.doubleValue(), value.getDouble(0), 0.0);
+        assertTrue("Invalid value for page " + i, Double.compare(expectedValue.doubleValue(), value.getDouble(0)) == 0);
       }
     }
   }
@@ -1405,7 +1444,7 @@ public class TestColumnIndexBuilder {
         assertFalse("The byte buffer should be empty for null pages", value.hasRemaining());
       } else {
         assertEquals("The byte buffer should be 4 bytes long for double", 4, value.remaining());
-        assertEquals("Invalid value for page " + i, expectedValue.floatValue(), value.getFloat(0), 0.0f);
+        assertTrue("Invalid value for page " + i, Float.compare(expectedValue.floatValue(), value.getFloat(0)) == 0);
       }
     }
   }