You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@iceberg.apache.org by bl...@apache.org on 2019/08/26 17:23:17 UTC
[incubator-iceberg] branch master updated: Use null counts in metrics evaluators (#412)
This is an automated email from the ASF dual-hosted git repository.
blue pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-iceberg.git
The following commit(s) were added to refs/heads/master by this push:
new e7197a7 Use null counts in metrics evaluators (#412)
e7197a7 is described below
commit e7197a7e9cda676ddce402a258ca6e3ebfb2dcad
Author: Anton Okolnychyi <ao...@apple.com>
AuthorDate: Mon Aug 26 18:23:12 2019 +0100
Use null counts in metrics evaluators (#412)
---
.../expressions/InclusiveMetricsEvaluator.java | 30 ++++++++++--
.../expressions/StrictMetricsEvaluator.java | 38 ++++++++++++++--
.../expressions/TestInclusiveMetricsEvaluator.java | 15 ++++++
.../expressions/TestStrictMetricsEvaluator.java | 53 ++++++++++++++++++++++
4 files changed, 130 insertions(+), 6 deletions(-)
diff --git a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java
index 64d34e9..d905440 100644
--- a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java
+++ b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java
@@ -140,9 +140,7 @@ public class InclusiveMetricsEvaluator {
// if the column has no non-null values, the expression cannot match
Integer id = ref.fieldId();
- if (valueCounts != null && valueCounts.containsKey(id) &&
- nullCounts != null && nullCounts.containsKey(id) &&
- valueCounts.get(id) - nullCounts.get(id) == 0) {
+ if (containsNullsOnly(id)) {
return ROWS_CANNOT_MATCH;
}
@@ -153,6 +151,10 @@ public class InclusiveMetricsEvaluator {
public <T> Boolean lt(BoundReference<T> ref, Literal<T> lit) {
Integer id = ref.fieldId();
+ if (containsNullsOnly(id)) {
+ return ROWS_CANNOT_MATCH;
+ }
+
if (lowerBounds != null && lowerBounds.containsKey(id)) {
T lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id));
@@ -169,6 +171,10 @@ public class InclusiveMetricsEvaluator {
public <T> Boolean ltEq(BoundReference<T> ref, Literal<T> lit) {
Integer id = ref.fieldId();
+ if (containsNullsOnly(id)) {
+ return ROWS_CANNOT_MATCH;
+ }
+
if (lowerBounds != null && lowerBounds.containsKey(id)) {
T lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id));
@@ -185,6 +191,10 @@ public class InclusiveMetricsEvaluator {
public <T> Boolean gt(BoundReference<T> ref, Literal<T> lit) {
Integer id = ref.fieldId();
+ if (containsNullsOnly(id)) {
+ return ROWS_CANNOT_MATCH;
+ }
+
if (upperBounds != null && upperBounds.containsKey(id)) {
T upper = Conversions.fromByteBuffer(ref.type(), upperBounds.get(id));
@@ -201,6 +211,10 @@ public class InclusiveMetricsEvaluator {
public <T> Boolean gtEq(BoundReference<T> ref, Literal<T> lit) {
Integer id = ref.fieldId();
+ if (containsNullsOnly(id)) {
+ return ROWS_CANNOT_MATCH;
+ }
+
if (upperBounds != null && upperBounds.containsKey(id)) {
T upper = Conversions.fromByteBuffer(ref.type(), upperBounds.get(id));
@@ -217,6 +231,10 @@ public class InclusiveMetricsEvaluator {
public <T> Boolean eq(BoundReference<T> ref, Literal<T> lit) {
Integer id = ref.fieldId();
+ if (containsNullsOnly(id)) {
+ return ROWS_CANNOT_MATCH;
+ }
+
if (lowerBounds != null && lowerBounds.containsKey(id)) {
T lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id));
@@ -254,5 +272,11 @@ public class InclusiveMetricsEvaluator {
public <T> Boolean notIn(BoundReference<T> ref, Literal<T> lit) {
return ROWS_MIGHT_MATCH;
}
+
+ private boolean containsNullsOnly(Integer id) {
+ return valueCounts != null && valueCounts.containsKey(id) &&
+ nullCounts != null && nullCounts.containsKey(id) &&
+ valueCounts.get(id) - nullCounts.get(id) == 0;
+ }
}
}
diff --git a/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java
index 6af5b21..17ef0b1 100644
--- a/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java
+++ b/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java
@@ -126,9 +126,7 @@ public class StrictMetricsEvaluator {
Preconditions.checkNotNull(struct.field(id),
"Cannot filter by nested column: %s", schema.findField(id));
- if (valueCounts != null && valueCounts.containsKey(id) &&
- nullCounts != null && nullCounts.containsKey(id) &&
- valueCounts.get(id) - nullCounts.get(id) == 0) {
+ if (containsNullsOnly(id)) {
return ROWS_MUST_MATCH;
}
@@ -157,6 +155,10 @@ public class StrictMetricsEvaluator {
Types.NestedField field = struct.field(id);
Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id));
+ if (canContainNulls(id)) {
+ return ROWS_MIGHT_NOT_MATCH;
+ }
+
if (upperBounds != null && upperBounds.containsKey(id)) {
T upper = Conversions.fromByteBuffer(field.type(), upperBounds.get(id));
@@ -176,6 +178,10 @@ public class StrictMetricsEvaluator {
Types.NestedField field = struct.field(id);
Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id));
+ if (canContainNulls(id)) {
+ return ROWS_MIGHT_NOT_MATCH;
+ }
+
if (upperBounds != null && upperBounds.containsKey(id)) {
T upper = Conversions.fromByteBuffer(field.type(), upperBounds.get(id));
@@ -195,6 +201,10 @@ public class StrictMetricsEvaluator {
Types.NestedField field = struct.field(id);
Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id));
+ if (canContainNulls(id)) {
+ return ROWS_MIGHT_NOT_MATCH;
+ }
+
if (lowerBounds != null && lowerBounds.containsKey(id)) {
T lower = Conversions.fromByteBuffer(field.type(), lowerBounds.get(id));
@@ -214,6 +224,10 @@ public class StrictMetricsEvaluator {
Types.NestedField field = struct.field(id);
Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id));
+ if (canContainNulls(id)) {
+ return ROWS_MIGHT_NOT_MATCH;
+ }
+
if (lowerBounds != null && lowerBounds.containsKey(id)) {
T lower = Conversions.fromByteBuffer(field.type(), lowerBounds.get(id));
@@ -233,6 +247,10 @@ public class StrictMetricsEvaluator {
Types.NestedField field = struct.field(id);
Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id));
+ if (canContainNulls(id)) {
+ return ROWS_MIGHT_NOT_MATCH;
+ }
+
if (lowerBounds != null && lowerBounds.containsKey(id) &&
upperBounds != null && upperBounds.containsKey(id)) {
T lower = Conversions.fromByteBuffer(struct.field(id).type(), lowerBounds.get(id));
@@ -262,6 +280,10 @@ public class StrictMetricsEvaluator {
Types.NestedField field = struct.field(id);
Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id));
+ if (containsNullsOnly(id)) {
+ return ROWS_MUST_MATCH;
+ }
+
if (lowerBounds != null && lowerBounds.containsKey(id)) {
T lower = Conversions.fromByteBuffer(struct.field(id).type(), lowerBounds.get(id));
@@ -292,5 +314,15 @@ public class StrictMetricsEvaluator {
public <T> Boolean notIn(BoundReference<T> ref, Literal<T> lit) {
return ROWS_MIGHT_NOT_MATCH;
}
+
+ private boolean canContainNulls(Integer id) {
+ return nullCounts == null || nullCounts.containsKey(id) && nullCounts.get(id) > 0;
+ }
+
+ private boolean containsNullsOnly(Integer id) {
+ return valueCounts != null && valueCounts.containsKey(id) &&
+ nullCounts != null && nullCounts.containsKey(id) &&
+ valueCounts.get(id) - nullCounts.get(id) == 0;
+ }
}
}
diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java
index 6014ce6..cca8b28 100644
--- a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java
+++ b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java
@@ -79,6 +79,21 @@ public class TestInclusiveMetricsEvaluator {
boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notNull("all_nulls")).eval(FILE);
Assert.assertFalse("Should skip: no non-null value in all null column", shouldRead);
+ shouldRead = new InclusiveMetricsEvaluator(SCHEMA, lessThan("all_nulls", "a")).eval(FILE);
+ Assert.assertFalse("Should skip: lessThan on all null column", shouldRead);
+
+ shouldRead = new InclusiveMetricsEvaluator(SCHEMA, lessThanOrEqual("all_nulls", "a")).eval(FILE);
+ Assert.assertFalse("Should skip: lessThanOrEqual on all null column", shouldRead);
+
+ shouldRead = new InclusiveMetricsEvaluator(SCHEMA, greaterThan("all_nulls", "a")).eval(FILE);
+ Assert.assertFalse("Should skip: greaterThan on all null column", shouldRead);
+
+ shouldRead = new InclusiveMetricsEvaluator(SCHEMA, greaterThanOrEqual("all_nulls", "a")).eval(FILE);
+ Assert.assertFalse("Should skip: greaterThanOrEqual on all null column", shouldRead);
+
+ shouldRead = new InclusiveMetricsEvaluator(SCHEMA, equal("all_nulls", "a")).eval(FILE);
+ Assert.assertFalse("Should skip: equal on all null column", shouldRead);
+
shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notNull("some_nulls")).eval(FILE);
Assert.assertTrue("Should read: column with some nulls contains a non-null value", shouldRead);
diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java
index 2269c07..0a7251c 100644
--- a/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java
+++ b/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java
@@ -77,6 +77,38 @@ public class TestStrictMetricsEvaluator {
1, toByteBuffer(IntegerType.get(), 79),
7, toByteBuffer(IntegerType.get(), 5)));
+ private static final DataFile FILE_2 = new TestDataFile("file_2.avro", Row.of(), 50,
+ // any value counts, including nulls
+ ImmutableMap.of(
+ 4, 50L,
+ 5, 50L,
+ 6, 50L),
+ // null value counts
+ ImmutableMap.of(
+ 4, 50L,
+ 5, 10L,
+ 6, 0L),
+ // lower bounds
+ ImmutableMap.of(5, toByteBuffer(StringType.get(), "bbb")),
+ // upper bounds
+ ImmutableMap.of(5, toByteBuffer(StringType.get(), "eee")));
+
+ private static final DataFile FILE_3 = new TestDataFile("file_3.avro", Row.of(), 50,
+ // any value counts, including nulls
+ ImmutableMap.of(
+ 4, 50L,
+ 5, 50L,
+ 6, 50L),
+ // null value counts
+ ImmutableMap.of(
+ 4, 50L,
+ 5, 10L,
+ 6, 0L),
+ // lower bounds
+ ImmutableMap.of(5, toByteBuffer(StringType.get(), "bbb")),
+ // upper bounds
+ ImmutableMap.of(5, toByteBuffer(StringType.get(), "bbb")));
+
@Test
public void testAllNulls() {
boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, notNull("all_nulls")).eval(FILE);
@@ -87,6 +119,9 @@ public class TestStrictMetricsEvaluator {
shouldRead = new StrictMetricsEvaluator(SCHEMA, notNull("no_nulls")).eval(FILE);
Assert.assertTrue("Should match: non-null column contains no null values", shouldRead);
+
+ shouldRead = new StrictMetricsEvaluator(SCHEMA, notEqual("all_nulls", "a")).eval(FILE);
+ Assert.assertTrue("Should match: notEqual on all nulls column", shouldRead);
}
@Test
@@ -102,6 +137,24 @@ public class TestStrictMetricsEvaluator {
}
@Test
+ public void testSomeNulls() {
+ boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, lessThan("some_nulls", "ggg")).eval(FILE_2);
+ Assert.assertFalse("Should not match: lessThan on some nulls column", shouldRead);
+
+ shouldRead = new StrictMetricsEvaluator(SCHEMA, lessThanOrEqual("some_nulls", "eee")).eval(FILE_2);
+ Assert.assertFalse("Should not match: lessThanOrEqual on some nulls column", shouldRead);
+
+ shouldRead = new StrictMetricsEvaluator(SCHEMA, greaterThan("some_nulls", "aaa")).eval(FILE_2);
+ Assert.assertFalse("Should not match: greaterThan on some nulls column", shouldRead);
+
+ shouldRead = new StrictMetricsEvaluator(SCHEMA, greaterThanOrEqual("some_nulls", "bbb")).eval(FILE_2);
+ Assert.assertFalse("Should not match: greaterThanOrEqual on some nulls column", shouldRead);
+
+ shouldRead = new StrictMetricsEvaluator(SCHEMA, equal("some_nulls", "bbb")).eval(FILE_3);
+ Assert.assertFalse("Should not match: equal on some nulls column", shouldRead);
+ }
+
+ @Test
public void testRequiredColumn() {
boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, notNull("required")).eval(FILE);
Assert.assertTrue("Should match: required columns are always non-null", shouldRead);