You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by ga...@apache.org on 2018/08/23 07:43:47 UTC
[parquet-mr] branch column-indexes updated: PARQUET-1386: Fix
issues of NaN and +-0.0 in case of float/double column indexes (#515)
This is an automated email from the ASF dual-hosted git repository.
gabor pushed a commit to branch column-indexes
in repository https://gitbox.apache.org/repos/asf/parquet-mr.git
The following commit(s) were added to refs/heads/column-indexes by this push:
new 1f95eca PARQUET-1386: Fix issues of NaN and +-0.0 in case of float/double column indexes (#515)
1f95eca is described below
commit 1f95ecac93eba8df7ee57d8271755d8a0be0401a
Author: Gabor Szadovszky <ga...@apache.org>
AuthorDate: Thu Aug 23 09:43:44 2018 +0200
PARQUET-1386: Fix issues of NaN and +-0.0 in case of float/double column indexes (#515)
---
.../column/columnindex/ColumnIndexBuilder.java | 4 ++
.../columnindex/DoubleColumnIndexBuilder.java | 23 +++++++++++-
.../columnindex/FloatColumnIndexBuilder.java | 23 +++++++++++-
.../column/columnindex/TestColumnIndexBuilder.java | 43 +++++++++++++++++++++-
4 files changed, 87 insertions(+), 6 deletions(-)
diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java
index 9633b61..b28fdde 100644
--- a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java
+++ b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java
@@ -557,6 +557,10 @@ public abstract class ColumnIndexBuilder {
return null;
}
ColumnIndexBase<?> columnIndex = createColumnIndex(type);
+ if (columnIndex == null) {
+ // Might happen if the specialized builder discovers invalid min/max values
+ return null;
+ }
columnIndex.nullPages = nullPages.toBooleanArray();
// Null counts is optional so keep it null if the builder has no values
if (!nullCounts.isEmpty()) {
diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/DoubleColumnIndexBuilder.java b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/DoubleColumnIndexBuilder.java
index 24be02c..074d025 100644
--- a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/DoubleColumnIndexBuilder.java
+++ b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/DoubleColumnIndexBuilder.java
@@ -84,6 +84,7 @@ class DoubleColumnIndexBuilder extends ColumnIndexBuilder {
private final DoubleList minValues = new DoubleArrayList();
private final DoubleList maxValues = new DoubleArrayList();
+ private boolean invalid;
private static double convert(ByteBuffer buffer) {
return buffer.order(LITTLE_ENDIAN).getDouble(0);
@@ -101,12 +102,30 @@ class DoubleColumnIndexBuilder extends ColumnIndexBuilder {
@Override
void addMinMax(Object min, Object max) {
- minValues.add((double) min);
- maxValues.add((double) max);
+ double dMin = (double) min;
+ double dMax = (double) max;
+ if (Double.isNaN(dMin) || Double.isNaN(dMax)) {
+ // Invalidate this column index in case of NaN as the sorting order of values is undefined for this case
+ invalid = true;
+ }
+
+ // Sorting order is undefined for -0.0 so let min = -0.0 and max = +0.0 to ensure that no 0.0 values are skipped
+ if (Double.compare(dMin, +0.0) == 0) {
+ dMin = -0.0;
+ }
+ if (Double.compare(dMax, -0.0) == 0) {
+ dMax = +0.0;
+ }
+
+ minValues.add(dMin);
+ maxValues.add(dMax);
}
@Override
ColumnIndexBase<Double> createColumnIndex(PrimitiveType type) {
+ if (invalid) {
+ return null;
+ }
DoubleColumnIndex columnIndex = new DoubleColumnIndex(type);
columnIndex.minValues = minValues.toDoubleArray();
columnIndex.maxValues = maxValues.toDoubleArray();
diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/FloatColumnIndexBuilder.java b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/FloatColumnIndexBuilder.java
index 4452746..cbcdf94 100644
--- a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/FloatColumnIndexBuilder.java
+++ b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/FloatColumnIndexBuilder.java
@@ -84,6 +84,7 @@ class FloatColumnIndexBuilder extends ColumnIndexBuilder {
private final FloatList minValues = new FloatArrayList();
private final FloatList maxValues = new FloatArrayList();
+ private boolean invalid;
private static float convert(ByteBuffer buffer) {
return buffer.order(LITTLE_ENDIAN).getFloat(0);
@@ -101,12 +102,30 @@ class FloatColumnIndexBuilder extends ColumnIndexBuilder {
@Override
void addMinMax(Object min, Object max) {
- minValues.add((float) min);
- maxValues.add((float) max);
+ float fMin = (float) min;
+ float fMax = (float) max;
+ if (Float.isNaN(fMin) || Float.isNaN(fMax)) {
+ // Invalidate this column index in case of NaN as the sorting order of values is undefined for this case
+ invalid = true;
+ }
+
+ // Sorting order is undefined for -0.0 so let min = -0.0 and max = +0.0 to ensure that no 0.0 values are skipped
+ if (Float.compare(fMin, +0.0f) == 0) {
+ fMin = -0.0f;
+ }
+ if (Float.compare(fMax, -0.0f) == 0) {
+ fMax = +0.0f;
+ }
+
+ minValues.add(fMin);
+ maxValues.add(fMax);
}
@Override
ColumnIndexBase<Float> createColumnIndex(PrimitiveType type) {
+ if (invalid) {
+ return null;
+ }
FloatColumnIndex columnIndex = new FloatColumnIndex(type);
columnIndex.minValues = minValues.toFloatArray();
columnIndex.maxValues = maxValues.toFloatArray();
diff --git a/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilder.java b/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilder.java
index e4655b2..5a3947c 100644
--- a/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilder.java
+++ b/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilder.java
@@ -48,6 +48,7 @@ import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertThat;
+import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import java.math.BigDecimal;
@@ -811,6 +812,25 @@ public class TestColumnIndexBuilder {
}
@Test
+ public void testBuildDoubleZeroNaN() {
+ PrimitiveType type = Types.required(DOUBLE).named("test_double");
+ ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);
+ StatsBuilder sb = new StatsBuilder();
+ builder.add(sb.stats(type, -1.0, -0.0));
+ builder.add(sb.stats(type, 0.0, 1.0));
+ builder.add(sb.stats(type, 1.0, 100.0));
+ ColumnIndex columnIndex = builder.build();
+ assertCorrectValues(columnIndex.getMinValues(), -1.0, -0.0, 1.0);
+ assertCorrectValues(columnIndex.getMaxValues(), 0.0, 1.0, 100.0);
+
+ builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);
+ builder.add(sb.stats(type, -1.0, -0.0));
+ builder.add(sb.stats(type, 0.0, Double.NaN));
+ builder.add(sb.stats(type, 1.0, 100.0));
+ assertNull(builder.build());
+ }
+
+ @Test
public void testStaticBuildDouble() {
ColumnIndex columnIndex = ColumnIndexBuilder.build(
Types.required(DOUBLE).named("test_double"),
@@ -922,6 +942,25 @@ public class TestColumnIndexBuilder {
}
@Test
+ public void testBuildFloatZeroNaN() {
+ PrimitiveType type = Types.required(FLOAT).named("test_float");
+ ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);
+ StatsBuilder sb = new StatsBuilder();
+ builder.add(sb.stats(type, -1.0f, -0.0f));
+ builder.add(sb.stats(type, 0.0f, 1.0f));
+ builder.add(sb.stats(type, 1.0f, 100.0f));
+ ColumnIndex columnIndex = builder.build();
+ assertCorrectValues(columnIndex.getMinValues(), -1.0f, -0.0f, 1.0f);
+ assertCorrectValues(columnIndex.getMaxValues(), 0.0f, 1.0f, 100.0f);
+
+ builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);
+ builder.add(sb.stats(type, -1.0f, -0.0f));
+ builder.add(sb.stats(type, 0.0f, Float.NaN));
+ builder.add(sb.stats(type, 1.0f, 100.0f));
+ assertNull(builder.build());
+ }
+
+ @Test
public void testStaticBuildFloat() {
ColumnIndex columnIndex = ColumnIndexBuilder.build(
Types.required(FLOAT).named("test_float"),
@@ -1391,7 +1430,7 @@ public class TestColumnIndexBuilder {
assertFalse("The byte buffer should be empty for null pages", value.hasRemaining());
} else {
assertEquals("The byte buffer should be 8 bytes long for double", 8, value.remaining());
- assertEquals("Invalid value for page " + i, expectedValue.doubleValue(), value.getDouble(0), 0.0);
+ assertTrue("Invalid value for page " + i, Double.compare(expectedValue.doubleValue(), value.getDouble(0)) == 0);
}
}
}
@@ -1405,7 +1444,7 @@ public class TestColumnIndexBuilder {
assertFalse("The byte buffer should be empty for null pages", value.hasRemaining());
} else {
assertEquals("The byte buffer should be 4 bytes long for double", 4, value.remaining());
- assertEquals("Invalid value for page " + i, expectedValue.floatValue(), value.getFloat(0), 0.0f);
+ assertTrue("Invalid value for page " + i, Float.compare(expectedValue.floatValue(), value.getFloat(0)) == 0);
}
}
}