You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by zi...@apache.org on 2018/02/27 13:20:52 UTC
parquet-mr git commit: PARQUET-1217: Incorrect handling of missing
values in Statistics
Repository: parquet-mr
Updated Branches:
refs/heads/master 8bbc6cb95 -> b82d96218
PARQUET-1217: Incorrect handling of missing values in Statistics
In parquet-format every value in Statistics is optional while parquet-mr does not properly handle these scenarios:
- null_count is set but min/max or min_value/max_value are not: filtering may fail with an NPE or produce incorrect results
fix: check if min/max is set before comparing to the related values
- null_count is not set: filtering handles null_count as if it were 0 -> incorrect filtering may occur
fix: introduce a new method in the Statistics object to check whether num_nulls is set, and use it to verify num_nulls is set before relying on its value for filtering
Author: Gabor Szadovszky <ga...@cloudera.com>
Closes #458 from gszadovszky/PARQUET-1217 and squashes the following commits:
9d14090 [Gabor Szadovszky] Updates according to rdblue's comments
116d1d3 [Gabor Szadovszky] PARQUET-1217: Updates according to zi's comments
c264b50 [Gabor Szadovszky] PARQUET-1217: fix handling of unset nullCount
2ec2fb1 [Gabor Szadovszky] PARQUET-1217: Incorrect handling of missing values in Statistics
Project: http://git-wip-us.apache.org/repos/asf/parquet-mr/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-mr/commit/b82d9621
Tree: http://git-wip-us.apache.org/repos/asf/parquet-mr/tree/b82d9621
Diff: http://git-wip-us.apache.org/repos/asf/parquet-mr/diff/b82d9621
Branch: refs/heads/master
Commit: b82d96218bfd37f6df95a2e8d7675d091ab61970
Parents: 8bbc6cb
Author: Gabor Szadovszky <ga...@cloudera.com>
Authored: Tue Feb 27 14:19:14 2018 +0100
Committer: Zoltan Ivanfi <zi...@cloudera.com>
Committed: Tue Feb 27 14:19:14 2018 +0100
----------------------------------------------------------------------
.../cli/commands/ParquetMetadataCommand.java | 4 +-
.../parquet/cli/commands/ShowPagesCommand.java | 2 +-
.../parquet/column/statistics/Statistics.java | 78 ++++++++++++++++++--
.../column/statistics/TestStatistics.java | 1 +
.../statisticslevel/StatisticsFilter.java | 42 ++++++++++-
.../converter/ParquetMetadataConverter.java | 19 +++--
.../statisticslevel/TestStatisticsFilter.java | 64 +++++++++++++++-
.../converter/TestParquetMetadataConverter.java | 33 +++++++++
.../hadoop/TestColumnChunkPageWriteStore.java | 4 +-
.../parquet/hadoop/TestParquetFileWriter.java | 48 ++++++------
10 files changed, 249 insertions(+), 46 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b82d9621/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ParquetMetadataCommand.java
----------------------------------------------------------------------
diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ParquetMetadataCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ParquetMetadataCommand.java
index 0bd77a3..54fe657 100644
--- a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ParquetMetadataCommand.java
+++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ParquetMetadataCommand.java
@@ -168,12 +168,12 @@ public class ParquetMetadataCommand extends BaseCommand {
if (typeName == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
console.info(String.format("%-" + width + "s FIXED[%d] %s %-7s %-9d %-8s %-7s %s",
name, type.getTypeLength(), shortCodec(codec), encodingSummary, count,
- humanReadable(perValue), stats == null ? "" : String.valueOf(stats.getNumNulls()),
+ humanReadable(perValue), stats == null || !stats.isNumNullsSet() ? "" : String.valueOf(stats.getNumNulls()),
minMaxAsString(stats, type.getOriginalType())));
} else {
console.info(String.format("%-" + width + "s %-9s %s %-7s %-9d %-10s %-7s %s",
name, typeName, shortCodec(codec), encodingSummary, count, humanReadable(perValue),
- stats == null ? "" : String.valueOf(stats.getNumNulls()),
+ stats == null || !stats.isNumNullsSet() ? "" : String.valueOf(stats.getNumNulls()),
minMaxAsString(stats, type.getOriginalType())));
}
}
http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b82d9621/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowPagesCommand.java
----------------------------------------------------------------------
diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowPagesCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowPagesCommand.java
index beda452..4d0e2c9 100644
--- a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowPagesCommand.java
+++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowPagesCommand.java
@@ -191,7 +191,7 @@ public class ShowPagesCommand extends BaseCommand {
String enc = encodingAsString(page.getValueEncoding(), false);
long totalSize = page.getCompressedSize();
int count = page.getValueCount();
- long numNulls = page.getStatistics().getNumNulls();
+ String numNulls = page.getStatistics().isNumNullsSet() ? Long.toString(page.getStatistics().getNumNulls()) : "";
float perValue = ((float) totalSize) / count;
String minMax = minMaxAsString(page.getStatistics(), type.getOriginalType());
return String.format("%3d-%-3d %-5s %s %-2s %-7d %-10s %-10s %-8s %-7s %s",
http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b82d9621/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java
----------------------------------------------------------------------
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java
index 00d0bbf..a087c5f 100644
--- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java
+++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java
@@ -35,6 +35,44 @@ import org.apache.parquet.schema.Type;
*/
public abstract class Statistics<T extends Comparable<T>> {
+ /**
+ * Builder class to build Statistics objects. Used to read the statistics from the Parquet file.
+ */
+ public static class Builder {
+ private final PrimitiveType type;
+ private byte[] min;
+ private byte[] max;
+ private long numNulls = -1;
+
+ private Builder(PrimitiveType type) {
+ this.type = type;
+ }
+
+ public Builder withMin(byte[] min) {
+ this.min = min;
+ return this;
+ }
+
+ public Builder withMax(byte[] max) {
+ this.max = max;
+ return this;
+ }
+
+ public Builder withNumNulls(long numNulls) {
+ this.numNulls = numNulls;
+ return this;
+ }
+
+ public Statistics<?> build() {
+ Statistics<?> stats = createStats(type);
+ if (min != null && max != null) {
+ stats.setMinMaxFromBytes(min, max);
+ }
+ stats.num_nulls = this.numNulls;
+ return stats;
+ }
+ }
+
private final PrimitiveType type;
private final PrimitiveComparator<T> comparator;
private boolean hasNonNullValue;
@@ -110,6 +148,17 @@ public abstract class Statistics<T extends Comparable<T>> {
}
/**
+ * Returns a builder to create new statistics object. Used to read the statistics from the parquet file.
+ *
+ * @param type
+ * type of the column
+ * @return builder to create new statistics object
+ */
+ public static Builder getBuilder(PrimitiveType type) {
+ return new Builder(type);
+ }
+
+ /**
* updates statistics min and max using the passed value
* @param value value to use to update min and max
*/
@@ -217,7 +266,9 @@ public abstract class Statistics<T extends Comparable<T>> {
* Abstract method to set min and max values from byte arrays.
* @param minBytes byte array to set the min value to
* @param maxBytes byte array to set the max value to
+ * @deprecated will be removed in 2.0.0. Use {@link #getBuilder(PrimitiveType)} instead.
*/
+ @Deprecated
abstract public void setMinMaxFromBytes(byte[] minBytes, byte[] maxBytes);
/**
@@ -310,9 +361,13 @@ public abstract class Statistics<T extends Comparable<T>> {
@Override
public String toString() {
- if (this.hasNonNullValue())
- return String.format("min: %s, max: %s, num_nulls: %d", minAsString(), maxAsString(), this.getNumNulls());
- else if (!this.isEmpty())
+ if (this.hasNonNullValue()) {
+ if (isNumNullsSet()) {
+ return String.format("min: %s, max: %s, num_nulls: %d", minAsString(), maxAsString(), this.getNumNulls());
+ } else {
+ return String.format("min: %s, max: %s, num_nulls not defined", minAsString(), maxAsString());
+ }
+ } else if (!this.isEmpty())
return String.format("num_nulls: %d, min/max not defined", this.getNumNulls());
else
return "no stats for this column";
@@ -335,7 +390,7 @@ public abstract class Statistics<T extends Comparable<T>> {
/**
* Returns the null count
- * @return null count
+ * @return null count or {@code -1} if the null count is not set
*/
public long getNumNulls() {
return num_nulls;
@@ -343,8 +398,12 @@ public abstract class Statistics<T extends Comparable<T>> {
/**
* Sets the number of nulls to the parameter value
- * @param nulls null count to set the count to
+ *
+ * @param nulls
+ * null count to set the count to
+ * @deprecated will be removed in 2.0.0. Use {@link #getBuilder(PrimitiveType)} instead.
*/
+ @Deprecated
public void setNumNulls(long nulls) {
num_nulls = nulls;
}
@@ -355,7 +414,7 @@ public abstract class Statistics<T extends Comparable<T>> {
* @return true if object is empty, false otherwise
*/
public boolean isEmpty() {
- return !hasNonNullValue && num_nulls == 0;
+ return !hasNonNullValue && !isNumNullsSet();
}
/**
@@ -366,6 +425,13 @@ public abstract class Statistics<T extends Comparable<T>> {
}
/**
+ * @return whether numNulls is set and can be used
+ */
+ public boolean isNumNullsSet() {
+ return num_nulls >= 0;
+ }
+
+ /**
* Sets the page/column as having a valid non-null value
* kind of misnomer here
*/
http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b82d9621/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java
----------------------------------------------------------------------
diff --git a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java
index 8ca1ca6..5e5d5fd 100644
--- a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java
+++ b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java
@@ -42,6 +42,7 @@ public class TestStatistics {
@Test
public void testNumNulls() {
IntStatistics stats = new IntStatistics();
+ assertTrue(stats.isNumNullsSet());
assertEquals(stats.getNumNulls(), 0);
stats.incrementNumNulls();
http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b82d9621/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java
index f168a60..446c8a3 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java
@@ -40,7 +40,6 @@ import org.apache.parquet.filter2.predicate.Operators.UserDefined;
import org.apache.parquet.filter2.predicate.UserDefinedPredicate;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
-import static org.apache.parquet.Preconditions.checkArgument;
import static org.apache.parquet.Preconditions.checkNotNull;
/**
@@ -122,6 +121,10 @@ public class StatisticsFilter implements FilterPredicate.Visitor<Boolean> {
}
if (value == null) {
+ // We don't know anything about the nulls in this chunk
+ if (!stats.isNumNullsSet()) {
+ return BLOCK_MIGHT_MATCH;
+ }
// we are looking for records where v eq(null)
// so drop if there are no nulls in this chunk
return !hasNulls(meta);
@@ -133,6 +136,11 @@ public class StatisticsFilter implements FilterPredicate.Visitor<Boolean> {
return BLOCK_CANNOT_MATCH;
}
+ if (!stats.hasNonNullValue()) {
+ // stats does not contain min/max values, we cannot drop any chunks
+ return BLOCK_MIGHT_MATCH;
+ }
+
// drop if value < min || value > max
return stats.compareMinToValue(value) > 0 || stats.compareMaxToValue(value) < 0;
}
@@ -166,12 +174,17 @@ public class StatisticsFilter implements FilterPredicate.Visitor<Boolean> {
return isAllNulls(meta);
}
- if (hasNulls(meta)) {
+ if (stats.isNumNullsSet() && hasNulls(meta)) {
// we are looking for records where v notEq(someNonNull)
// but this chunk contains nulls, we cannot drop it
return BLOCK_MIGHT_MATCH;
}
+ if (!stats.hasNonNullValue()) {
+ // stats does not contain min/max values, we cannot drop any chunks
+ return BLOCK_MIGHT_MATCH;
+ }
+
// drop if this is a column where min = max = value
return stats.compareMinToValue(value) == 0 && stats.compareMaxToValue(value) == 0;
}
@@ -201,6 +214,11 @@ public class StatisticsFilter implements FilterPredicate.Visitor<Boolean> {
return BLOCK_CANNOT_MATCH;
}
+ if (!stats.hasNonNullValue()) {
+ // stats does not contain min/max values, we cannot drop any chunks
+ return BLOCK_MIGHT_MATCH;
+ }
+
T value = lt.getValue();
// drop if value <= min
@@ -232,6 +250,11 @@ public class StatisticsFilter implements FilterPredicate.Visitor<Boolean> {
return BLOCK_CANNOT_MATCH;
}
+ if (!stats.hasNonNullValue()) {
+ // stats does not contain min/max values, we cannot drop any chunks
+ return BLOCK_MIGHT_MATCH;
+ }
+
T value = ltEq.getValue();
// drop if value < min
@@ -263,6 +286,11 @@ public class StatisticsFilter implements FilterPredicate.Visitor<Boolean> {
return BLOCK_CANNOT_MATCH;
}
+ if (!stats.hasNonNullValue()) {
+ // stats does not contain min/max values, we cannot drop any chunks
+ return BLOCK_MIGHT_MATCH;
+ }
+
T value = gt.getValue();
// drop if value >= max
@@ -294,6 +322,11 @@ public class StatisticsFilter implements FilterPredicate.Visitor<Boolean> {
return BLOCK_CANNOT_MATCH;
}
+ if (!stats.hasNonNullValue()) {
+ // stats does not contain min/max values, we cannot drop any chunks
+ return BLOCK_MIGHT_MATCH;
+ }
+
T value = gtEq.getValue();
// drop if value > max
@@ -355,6 +388,11 @@ public class StatisticsFilter implements FilterPredicate.Visitor<Boolean> {
}
}
+ if (!stats.hasNonNullValue()) {
+ // stats does not contain min/max values, we cannot drop any chunks
+ return BLOCK_MIGHT_MATCH;
+ }
+
org.apache.parquet.filter2.predicate.Statistics<T> udpStats =
new org.apache.parquet.filter2.predicate.Statistics<T>(stats.genericGetMin(), stats.genericGetMax(),
stats.comparator());
http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b82d9621/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
index c4e5da3..0daabb6 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
@@ -401,7 +401,8 @@ public class ParquetMetadataConverter {
static org.apache.parquet.column.statistics.Statistics fromParquetStatisticsInternal
(String createdBy, Statistics formatStats, PrimitiveType type, SortOrder typeSortOrder) {
// create stats object based on the column type
- org.apache.parquet.column.statistics.Statistics stats = org.apache.parquet.column.statistics.Statistics.createStats(type);
+ org.apache.parquet.column.statistics.Statistics.Builder statsBuilder =
+ org.apache.parquet.column.statistics.Statistics.getBuilder(type);
if (formatStats != null) {
// Use the new V2 min-max statistics over the former one if it is filled
@@ -409,9 +410,12 @@ public class ParquetMetadataConverter {
byte[] min = formatStats.min_value.array();
byte[] max = formatStats.max_value.array();
if (isMinMaxStatsSupported(type) || Arrays.equals(min, max)) {
- stats.setMinMaxFromBytes(min, max);
+ statsBuilder.withMin(min);
+ statsBuilder.withMax(max);
+ }
+ if (formatStats.isSetNull_count()) {
+ statsBuilder.withNumNulls(formatStats.null_count);
}
- stats.setNumNulls(formatStats.null_count);
} else {
boolean isSet = formatStats.isSetMax() && formatStats.isSetMin();
boolean maxEqualsMin = isSet ? Arrays.equals(formatStats.getMin(), formatStats.getMax()) : false;
@@ -424,13 +428,16 @@ public class ParquetMetadataConverter {
if (!CorruptStatistics.shouldIgnoreStatistics(createdBy, type.getPrimitiveTypeName()) &&
(sortOrdersMatch || maxEqualsMin)) {
if (isSet) {
- stats.setMinMaxFromBytes(formatStats.min.array(), formatStats.max.array());
+ statsBuilder.withMin(formatStats.min.array());
+ statsBuilder.withMax(formatStats.max.array());
+ }
+ if (formatStats.isSetNull_count()) {
+ statsBuilder.withNumNulls(formatStats.null_count);
}
- stats.setNumNulls(formatStats.null_count);
}
}
}
- return stats;
+ return statsBuilder.build();
}
public org.apache.parquet.column.statistics.Statistics fromParquetStatistics(
http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b82d9621/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java
index d8b4407..6fdec2a 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java
@@ -22,7 +22,6 @@ import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
-import org.apache.parquet.io.api.Binary;
import org.junit.Test;
import org.apache.parquet.column.Encoding;
@@ -39,6 +38,7 @@ import org.apache.parquet.filter2.predicate.UserDefinedPredicate;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
+import org.apache.parquet.schema.Types;
import static org.apache.parquet.filter2.predicate.FilterApi.binaryColumn;
import static org.apache.parquet.io.api.Binary.fromString;
@@ -62,7 +62,8 @@ import static org.apache.parquet.filter2.statisticslevel.StatisticsFilter.canDro
public class TestStatisticsFilter {
- private static ColumnChunkMetaData getIntColumnMeta(IntStatistics stats, long valueCount) {
+ private static ColumnChunkMetaData getIntColumnMeta(org.apache.parquet.column.statistics.Statistics<?> stats,
+ long valueCount) {
return ColumnChunkMetaData.get(ColumnPath.get("int", "column"),
PrimitiveTypeName.INT32,
CompressionCodecName.GZIP,
@@ -71,7 +72,8 @@ public class TestStatisticsFilter {
0L, 0L, valueCount, 0L, 0L);
}
- private static ColumnChunkMetaData getDoubleColumnMeta(DoubleStatistics stats, long valueCount) {
+ private static ColumnChunkMetaData getDoubleColumnMeta(org.apache.parquet.column.statistics.Statistics<?> stats,
+ long valueCount) {
return ColumnChunkMetaData.get(ColumnPath.get("double", "column"),
PrimitiveTypeName.DOUBLE,
CompressionCodecName.GZIP,
@@ -87,13 +89,16 @@ public class TestStatisticsFilter {
private static final IntStatistics intStats = new IntStatistics();
private static final IntStatistics nullIntStats = new IntStatistics();
+ private static final org.apache.parquet.column.statistics.Statistics<?> emptyIntStats = org.apache.parquet.column.statistics.Statistics
+ .getBuilder(Types.required(PrimitiveTypeName.INT32).named("test_int32")).build();
private static final DoubleStatistics doubleStats = new DoubleStatistics();
+ private static final org.apache.parquet.column.statistics.Statistics<?> missingMinMaxDoubleStats = org.apache.parquet.column.statistics.Statistics
+ .getBuilder(Types.required(PrimitiveTypeName.DOUBLE).named("test_double")).withNumNulls(100).build();
static {
intStats.setMinMax(10, 100);
doubleStats.setMinMax(10, 100);
- nullIntStats.setMinMax(0, 0);
nullIntStats.setNumNulls(177);
}
@@ -105,6 +110,9 @@ public class TestStatisticsFilter {
getIntColumnMeta(nullIntStats, 177L), // column of all nulls
getDoubleColumnMeta(doubleStats, 177L));
+ private static final List<ColumnChunkMetaData> missingMinMaxColumnMetas = Arrays.asList(
+ getIntColumnMeta(emptyIntStats, 177L), // missing min/max values and numNulls => stats is empty
+ getDoubleColumnMeta(missingMinMaxDoubleStats, 177L)); // missing min/max, some null values
@Test
public void testEqNonNull() {
@@ -116,6 +124,9 @@ public class TestStatisticsFilter {
// drop columns of all nulls when looking for non-null value
assertTrue(canDrop(eq(intColumn, 0), nullColumnMetas));
assertTrue(canDrop(eq(missingColumn, fromString("any")), columnMetas));
+
+ assertFalse(canDrop(eq(intColumn, 50), missingMinMaxColumnMetas));
+ assertFalse(canDrop(eq(doubleColumn, 50.0), missingMinMaxColumnMetas));
}
@Test
@@ -137,6 +148,9 @@ public class TestStatisticsFilter {
getDoubleColumnMeta(doubleStats, 177L))));
assertFalse(canDrop(eq(missingColumn, null), columnMetas));
+
+ assertFalse(canDrop(eq(intColumn, null), missingMinMaxColumnMetas));
+ assertFalse(canDrop(eq(doubleColumn, null), missingMinMaxColumnMetas));
}
@Test
@@ -163,6 +177,9 @@ public class TestStatisticsFilter {
getDoubleColumnMeta(doubleStats, 177L))));
assertFalse(canDrop(notEq(missingColumn, fromString("any")), columnMetas));
+
+ assertFalse(canDrop(notEq(intColumn, 50), missingMinMaxColumnMetas));
+ assertFalse(canDrop(notEq(doubleColumn, 50.0), missingMinMaxColumnMetas));
}
@Test
@@ -192,6 +209,9 @@ public class TestStatisticsFilter {
getDoubleColumnMeta(doubleStats, 177L))));
assertTrue(canDrop(notEq(missingColumn, null), columnMetas));
+
+ assertFalse(canDrop(notEq(intColumn, null), missingMinMaxColumnMetas));
+ assertFalse(canDrop(notEq(doubleColumn, null), missingMinMaxColumnMetas));
}
@Test
@@ -205,6 +225,9 @@ public class TestStatisticsFilter {
assertTrue(canDrop(lt(intColumn, 7), nullColumnMetas));
assertTrue(canDrop(lt(missingColumn, fromString("any")), columnMetas));
+
+ assertFalse(canDrop(lt(intColumn, 0), missingMinMaxColumnMetas));
+ assertFalse(canDrop(lt(doubleColumn, 0.0), missingMinMaxColumnMetas));
}
@Test
@@ -218,6 +241,9 @@ public class TestStatisticsFilter {
assertTrue(canDrop(ltEq(intColumn, 7), nullColumnMetas));
assertTrue(canDrop(ltEq(missingColumn, fromString("any")), columnMetas));
+
+ assertFalse(canDrop(ltEq(intColumn, -1), missingMinMaxColumnMetas));
+ assertFalse(canDrop(ltEq(doubleColumn, -0.1), missingMinMaxColumnMetas));
}
@Test
@@ -231,6 +257,9 @@ public class TestStatisticsFilter {
assertTrue(canDrop(gt(intColumn, 7), nullColumnMetas));
assertTrue(canDrop(gt(missingColumn, fromString("any")), columnMetas));
+
+ assertFalse(canDrop(gt(intColumn, 0), missingMinMaxColumnMetas));
+ assertFalse(canDrop(gt(doubleColumn, 0.0), missingMinMaxColumnMetas));
}
@Test
@@ -244,6 +273,9 @@ public class TestStatisticsFilter {
assertTrue(canDrop(gtEq(intColumn, 7), nullColumnMetas));
assertTrue(canDrop(gtEq(missingColumn, fromString("any")), columnMetas));
+
+ assertFalse(canDrop(gtEq(intColumn, 1), missingMinMaxColumnMetas));
+ assertFalse(canDrop(gtEq(doubleColumn, 0.1), missingMinMaxColumnMetas));
}
@Test
@@ -297,6 +329,26 @@ public class TestStatisticsFilter {
}
}
+ public static class AllPositiveUdp extends UserDefinedPredicate<Double> {
+ @Override
+ public boolean keep(Double value) {
+ if (value == null) {
+ return true;
+ }
+ throw new RuntimeException("this method should not be called with value != null");
+ }
+
+ @Override
+ public boolean canDrop(Statistics<Double> statistics) {
+ return statistics.getMin() <= 0.0;
+ }
+
+ @Override
+ public boolean inverseCanDrop(Statistics<Double> statistics) {
+ return statistics.getMin() > 0.0;
+ }
+ }
+
@Test
public void testUdp() {
FilterPredicate pred = userDefined(intColumn, SevensAndEightsUdp.class);
@@ -308,6 +360,8 @@ public class TestStatisticsFilter {
FilterPredicate udpKeepMissingColumn = userDefined(missingColumn2, SevensAndEightsUdp.class);
FilterPredicate invUdpKeepMissingColumn = LogicalInverseRewriter.rewrite(not(userDefined(missingColumn2, SevensAndEightsUdp.class)));
+ FilterPredicate allPositivePred = userDefined(doubleColumn, AllPositiveUdp.class);
+
IntStatistics seven = new IntStatistics();
seven.setMinMax(7, 7);
@@ -392,6 +446,8 @@ public class TestStatisticsFilter {
assertTrue(canDrop(invUdpKeepMissingColumn, Arrays.asList(
getIntColumnMeta(neither, 177L),
getDoubleColumnMeta(doubleStats, 177L))));
+
+ assertFalse(canDrop(allPositivePred, missingMinMaxColumnMetas));
}
@Test
http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b82d9621/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
index b83da5d..6cce32f 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
@@ -658,6 +658,7 @@ public class TestParquetMetadataConverter {
binaryType);
Assert.assertFalse("Stats should not be empty", convertedStats.isEmpty());
+ Assert.assertTrue(convertedStats.isNumNullsSet());
Assert.assertEquals("Should have 3 nulls", 3, convertedStats.getNumNulls());
if (helper == StatsHelper.V1) {
assertFalse("Min-max should be null for V1 stats", convertedStats.hasNonNullValue());
@@ -670,6 +671,38 @@ public class TestParquetMetadataConverter {
}
@Test
+ public void testMissingValuesFromStats() {
+ ParquetMetadataConverter converter = new ParquetMetadataConverter();
+ PrimitiveType type = Types.required(PrimitiveTypeName.INT32).named("test_int32");
+
+ org.apache.parquet.format.Statistics formatStats = new org.apache.parquet.format.Statistics();
+ Statistics<?> stats = converter.fromParquetStatistics(Version.FULL_VERSION, formatStats, type);
+ assertFalse(stats.isNumNullsSet());
+ assertFalse(stats.hasNonNullValue());
+ assertTrue(stats.isEmpty());
+ assertEquals(-1, stats.getNumNulls());
+
+ formatStats.clear();
+ formatStats.setMin(BytesUtils.intToBytes(-100));
+ formatStats.setMax(BytesUtils.intToBytes(100));
+ stats = converter.fromParquetStatistics(Version.FULL_VERSION, formatStats, type);
+ assertFalse(stats.isNumNullsSet());
+ assertTrue(stats.hasNonNullValue());
+ assertFalse(stats.isEmpty());
+ assertEquals(-1, stats.getNumNulls());
+ assertEquals(-100, stats.genericGetMin());
+ assertEquals(100, stats.genericGetMax());
+
+ formatStats.clear();
+ formatStats.setNull_count(2000);
+ stats = converter.fromParquetStatistics(Version.FULL_VERSION, formatStats, type);
+ assertTrue(stats.isNumNullsSet());
+ assertFalse(stats.hasNonNullValue());
+ assertFalse(stats.isEmpty());
+ assertEquals(2000, stats.getNumNulls());
+ }
+
+ @Test
public void testSkippedV2Stats() {
testSkippedV2Stats(
Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(12).as(OriginalType.INTERVAL).named(""),
http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b82d9621/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java
index 87574cd..0b7b951 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java
@@ -60,6 +60,7 @@ import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;
+import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Types;
import org.apache.parquet.bytes.HeapByteBufferAllocator;
@@ -92,7 +93,8 @@ public class TestColumnChunkPageWriteStore {
int v = 3;
BytesInput definitionLevels = BytesInput.fromInt(d);
BytesInput repetitionLevels = BytesInput.fromInt(r);
- Statistics<?> statistics = new BinaryStatistics();
+ Statistics<?> statistics = Statistics.getBuilder(Types.required(PrimitiveTypeName.BINARY).named("test_binary"))
+ .build();
BytesInput data = BytesInput.fromInt(v);
int rowCount = 5;
int nullCount = 1;
http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/b82d9621/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java
index 4243e9b..c73e569 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java
@@ -27,7 +27,6 @@ import org.apache.hadoop.fs.Path;
import org.apache.parquet.Version;
import org.apache.parquet.bytes.BytesUtils;
import org.apache.parquet.hadoop.ParquetOutputFormat.JobSummaryLevel;
-import org.apache.parquet.hadoop.util.HadoopOutputFile;
import org.junit.Assume;
import org.junit.Rule;
import org.junit.Test;
@@ -48,6 +47,7 @@ import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
+import org.apache.parquet.schema.Types;
import java.io.File;
import java.io.IOException;
@@ -95,8 +95,8 @@ public class TestParquetFileWriter {
private static final byte[] BYTES4 = { 3, 4, 5, 6 };
private static final CompressionCodecName CODEC = CompressionCodecName.UNCOMPRESSED;
- private static final BinaryStatistics STATS1 = new BinaryStatistics();
- private static final BinaryStatistics STATS2 = new BinaryStatistics();
+ private static final org.apache.parquet.column.statistics.Statistics<?> EMPTY_STATS = org.apache.parquet.column.statistics.Statistics
+ .getBuilder(Types.required(PrimitiveTypeName.BINARY).named("test_binary")).build();
private String writeSchema;
@@ -145,24 +145,24 @@ public class TestParquetFileWriter {
w.startBlock(3);
w.startColumn(C1, 5, CODEC);
long c1Starts = w.getPos();
- w.writeDataPage(2, 4, BytesInput.from(BYTES1), STATS1, BIT_PACKED, BIT_PACKED, PLAIN);
- w.writeDataPage(3, 4, BytesInput.from(BYTES1), STATS1, BIT_PACKED, BIT_PACKED, PLAIN);
+ w.writeDataPage(2, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
+ w.writeDataPage(3, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
w.endColumn();
long c1Ends = w.getPos();
w.startColumn(C2, 6, CODEC);
long c2Starts = w.getPos();
- w.writeDataPage(2, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
- w.writeDataPage(3, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
- w.writeDataPage(1, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
+ w.writeDataPage(2, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
+ w.writeDataPage(3, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
+ w.writeDataPage(1, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
w.endColumn();
long c2Ends = w.getPos();
w.endBlock();
w.startBlock(4);
w.startColumn(C1, 7, CODEC);
- w.writeDataPage(7, 4, BytesInput.from(BYTES3), STATS1, BIT_PACKED, BIT_PACKED, PLAIN);
+ w.writeDataPage(7, 4, BytesInput.from(BYTES3), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
w.endColumn();
w.startColumn(C2, 8, CODEC);
- w.writeDataPage(8, 4, BytesInput.from(BYTES4), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
+ w.writeDataPage(8, 4, BytesInput.from(BYTES4), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
w.endColumn();
w.endBlock();
w.end(new HashMap<String, String>());
@@ -225,15 +225,15 @@ public class TestParquetFileWriter {
w.startBlock(3);
w.startColumn(C1, 5, CODEC);
long c1Starts = w.getPos();
- w.writeDataPage(2, 4, BytesInput.from(BYTES1), STATS1, BIT_PACKED, BIT_PACKED, PLAIN);
- w.writeDataPage(3, 4, BytesInput.from(BYTES1), STATS1, BIT_PACKED, BIT_PACKED, PLAIN);
+ w.writeDataPage(2, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
+ w.writeDataPage(3, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
w.endColumn();
long c1Ends = w.getPos();
w.startColumn(C2, 6, CODEC);
long c2Starts = w.getPos();
- w.writeDataPage(2, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
- w.writeDataPage(3, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
- w.writeDataPage(1, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
+ w.writeDataPage(2, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
+ w.writeDataPage(3, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
+ w.writeDataPage(1, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
w.endColumn();
long c2Ends = w.getPos();
w.endBlock();
@@ -242,10 +242,10 @@ public class TestParquetFileWriter {
w.startBlock(4);
w.startColumn(C1, 7, CODEC);
- w.writeDataPage(7, 4, BytesInput.from(BYTES3), STATS1, BIT_PACKED, BIT_PACKED, PLAIN);
+ w.writeDataPage(7, 4, BytesInput.from(BYTES3), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
w.endColumn();
w.startColumn(C2, 8, CODEC);
- w.writeDataPage(8, 4, BytesInput.from(BYTES4), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
+ w.writeDataPage(8, 4, BytesInput.from(BYTES4), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
w.endColumn();
w.endBlock();
@@ -330,15 +330,15 @@ public class TestParquetFileWriter {
w.startBlock(3);
w.startColumn(C1, 5, CODEC);
long c1Starts = w.getPos();
- w.writeDataPage(2, 4, BytesInput.from(BYTES1), STATS1, BIT_PACKED, BIT_PACKED, PLAIN);
- w.writeDataPage(3, 4, BytesInput.from(BYTES1), STATS1, BIT_PACKED, BIT_PACKED, PLAIN);
+ w.writeDataPage(2, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
+ w.writeDataPage(3, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
w.endColumn();
long c1Ends = w.getPos();
w.startColumn(C2, 6, CODEC);
long c2Starts = w.getPos();
- w.writeDataPage(2, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
- w.writeDataPage(3, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
- w.writeDataPage(1, 4, BytesInput.from(BYTES2), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
+ w.writeDataPage(2, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
+ w.writeDataPage(3, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
+ w.writeDataPage(1, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
w.endColumn();
long c2Ends = w.getPos();
w.endBlock();
@@ -347,10 +347,10 @@ public class TestParquetFileWriter {
w.startBlock(4);
w.startColumn(C1, 7, CODEC);
- w.writeDataPage(7, 4, BytesInput.from(BYTES3), STATS1, BIT_PACKED, BIT_PACKED, PLAIN);
+ w.writeDataPage(7, 4, BytesInput.from(BYTES3), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
w.endColumn();
w.startColumn(C2, 8, CODEC);
- w.writeDataPage(8, 4, BytesInput.from(BYTES4), STATS2, BIT_PACKED, BIT_PACKED, PLAIN);
+ w.writeDataPage(8, 4, BytesInput.from(BYTES4), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
w.endColumn();
w.endBlock();