You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@iceberg.apache.org by bl...@apache.org on 2019/08/26 17:23:17 UTC

[incubator-iceberg] branch master updated: Use null counts in metrics evaluators (#412)

This is an automated email from the ASF dual-hosted git repository.

blue pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-iceberg.git


The following commit(s) were added to refs/heads/master by this push:
     new e7197a7  Use null counts in metrics evaluators (#412)
e7197a7 is described below

commit e7197a7e9cda676ddce402a258ca6e3ebfb2dcad
Author: Anton Okolnychyi <ao...@apple.com>
AuthorDate: Mon Aug 26 18:23:12 2019 +0100

    Use null counts in metrics evaluators (#412)
---
 .../expressions/InclusiveMetricsEvaluator.java     | 30 ++++++++++--
 .../expressions/StrictMetricsEvaluator.java        | 38 ++++++++++++++--
 .../expressions/TestInclusiveMetricsEvaluator.java | 15 ++++++
 .../expressions/TestStrictMetricsEvaluator.java    | 53 ++++++++++++++++++++++
 4 files changed, 130 insertions(+), 6 deletions(-)

diff --git a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java
index 64d34e9..d905440 100644
--- a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java
+++ b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java
@@ -140,9 +140,7 @@ public class InclusiveMetricsEvaluator {
       // if the column has no non-null values, the expression cannot match
       Integer id = ref.fieldId();
 
-      if (valueCounts != null && valueCounts.containsKey(id) &&
-          nullCounts != null && nullCounts.containsKey(id) &&
-          valueCounts.get(id) - nullCounts.get(id) == 0) {
+      if (containsNullsOnly(id)) {
         return ROWS_CANNOT_MATCH;
       }
 
@@ -153,6 +151,10 @@ public class InclusiveMetricsEvaluator {
     public <T> Boolean lt(BoundReference<T> ref, Literal<T> lit) {
       Integer id = ref.fieldId();
 
+      if (containsNullsOnly(id)) {
+        return ROWS_CANNOT_MATCH;
+      }
+
       if (lowerBounds != null && lowerBounds.containsKey(id)) {
         T lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id));
 
@@ -169,6 +171,10 @@ public class InclusiveMetricsEvaluator {
     public <T> Boolean ltEq(BoundReference<T> ref, Literal<T> lit) {
       Integer id = ref.fieldId();
 
+      if (containsNullsOnly(id)) {
+        return ROWS_CANNOT_MATCH;
+      }
+
       if (lowerBounds != null && lowerBounds.containsKey(id)) {
         T lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id));
 
@@ -185,6 +191,10 @@ public class InclusiveMetricsEvaluator {
     public <T> Boolean gt(BoundReference<T> ref, Literal<T> lit) {
       Integer id = ref.fieldId();
 
+      if (containsNullsOnly(id)) {
+        return ROWS_CANNOT_MATCH;
+      }
+
       if (upperBounds != null && upperBounds.containsKey(id)) {
         T upper = Conversions.fromByteBuffer(ref.type(), upperBounds.get(id));
 
@@ -201,6 +211,10 @@ public class InclusiveMetricsEvaluator {
     public <T> Boolean gtEq(BoundReference<T> ref, Literal<T> lit) {
       Integer id = ref.fieldId();
 
+      if (containsNullsOnly(id)) {
+        return ROWS_CANNOT_MATCH;
+      }
+
       if (upperBounds != null && upperBounds.containsKey(id)) {
         T upper = Conversions.fromByteBuffer(ref.type(), upperBounds.get(id));
 
@@ -217,6 +231,10 @@ public class InclusiveMetricsEvaluator {
     public <T> Boolean eq(BoundReference<T> ref, Literal<T> lit) {
       Integer id = ref.fieldId();
 
+      if (containsNullsOnly(id)) {
+        return ROWS_CANNOT_MATCH;
+      }
+
       if (lowerBounds != null && lowerBounds.containsKey(id)) {
         T lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id));
 
@@ -254,5 +272,11 @@ public class InclusiveMetricsEvaluator {
     public <T> Boolean notIn(BoundReference<T> ref, Literal<T> lit) {
       return ROWS_MIGHT_MATCH;
     }
+
+    private boolean containsNullsOnly(Integer id) {
+      return valueCounts != null && valueCounts.containsKey(id) &&
+          nullCounts != null && nullCounts.containsKey(id) &&
+          valueCounts.get(id) - nullCounts.get(id) == 0;
+    }
   }
 }
diff --git a/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java
index 6af5b21..17ef0b1 100644
--- a/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java
+++ b/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java
@@ -126,9 +126,7 @@ public class StrictMetricsEvaluator {
       Preconditions.checkNotNull(struct.field(id),
           "Cannot filter by nested column: %s", schema.findField(id));
 
-      if (valueCounts != null && valueCounts.containsKey(id) &&
-          nullCounts != null && nullCounts.containsKey(id) &&
-          valueCounts.get(id) - nullCounts.get(id) == 0) {
+      if (containsNullsOnly(id)) {
         return ROWS_MUST_MATCH;
       }
 
@@ -157,6 +155,10 @@ public class StrictMetricsEvaluator {
       Types.NestedField field = struct.field(id);
       Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id));
 
+      if (canContainNulls(id)) {
+        return ROWS_MIGHT_NOT_MATCH;
+      }
+
       if (upperBounds != null && upperBounds.containsKey(id)) {
         T upper = Conversions.fromByteBuffer(field.type(), upperBounds.get(id));
 
@@ -176,6 +178,10 @@ public class StrictMetricsEvaluator {
       Types.NestedField field = struct.field(id);
       Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id));
 
+      if (canContainNulls(id)) {
+        return ROWS_MIGHT_NOT_MATCH;
+      }
+
       if (upperBounds != null && upperBounds.containsKey(id)) {
         T upper = Conversions.fromByteBuffer(field.type(), upperBounds.get(id));
 
@@ -195,6 +201,10 @@ public class StrictMetricsEvaluator {
       Types.NestedField field = struct.field(id);
       Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id));
 
+      if (canContainNulls(id)) {
+        return ROWS_MIGHT_NOT_MATCH;
+      }
+
       if (lowerBounds != null && lowerBounds.containsKey(id)) {
         T lower = Conversions.fromByteBuffer(field.type(), lowerBounds.get(id));
 
@@ -214,6 +224,10 @@ public class StrictMetricsEvaluator {
       Types.NestedField field = struct.field(id);
       Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id));
 
+      if (canContainNulls(id)) {
+        return ROWS_MIGHT_NOT_MATCH;
+      }
+
       if (lowerBounds != null && lowerBounds.containsKey(id)) {
         T lower = Conversions.fromByteBuffer(field.type(), lowerBounds.get(id));
 
@@ -233,6 +247,10 @@ public class StrictMetricsEvaluator {
       Types.NestedField field = struct.field(id);
       Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id));
 
+      if (canContainNulls(id)) {
+        return ROWS_MIGHT_NOT_MATCH;
+      }
+
       if (lowerBounds != null && lowerBounds.containsKey(id) &&
           upperBounds != null && upperBounds.containsKey(id)) {
         T lower = Conversions.fromByteBuffer(struct.field(id).type(), lowerBounds.get(id));
@@ -262,6 +280,10 @@ public class StrictMetricsEvaluator {
       Types.NestedField field = struct.field(id);
       Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id));
 
+      if (containsNullsOnly(id)) {
+        return ROWS_MUST_MATCH;
+      }
+
       if (lowerBounds != null && lowerBounds.containsKey(id)) {
         T lower = Conversions.fromByteBuffer(struct.field(id).type(), lowerBounds.get(id));
 
@@ -292,5 +314,15 @@ public class StrictMetricsEvaluator {
     public <T> Boolean notIn(BoundReference<T> ref, Literal<T> lit) {
       return ROWS_MIGHT_NOT_MATCH;
     }
+
+    private boolean canContainNulls(Integer id) {
+      return nullCounts == null || nullCounts.containsKey(id) && nullCounts.get(id) > 0;
+    }
+
+    private boolean containsNullsOnly(Integer id) {
+      return valueCounts != null && valueCounts.containsKey(id) &&
+          nullCounts != null && nullCounts.containsKey(id) &&
+          valueCounts.get(id) - nullCounts.get(id) == 0;
+    }
   }
 }
diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java
index 6014ce6..cca8b28 100644
--- a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java
+++ b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java
@@ -79,6 +79,21 @@ public class TestInclusiveMetricsEvaluator {
     boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notNull("all_nulls")).eval(FILE);
     Assert.assertFalse("Should skip: no non-null value in all null column", shouldRead);
 
+    shouldRead = new InclusiveMetricsEvaluator(SCHEMA, lessThan("all_nulls", "a")).eval(FILE);
+    Assert.assertFalse("Should skip: lessThan on all null column", shouldRead);
+
+    shouldRead = new InclusiveMetricsEvaluator(SCHEMA, lessThanOrEqual("all_nulls", "a")).eval(FILE);
+    Assert.assertFalse("Should skip: lessThanOrEqual on all null column", shouldRead);
+
+    shouldRead = new InclusiveMetricsEvaluator(SCHEMA, greaterThan("all_nulls", "a")).eval(FILE);
+    Assert.assertFalse("Should skip: greaterThan on all null column", shouldRead);
+
+    shouldRead = new InclusiveMetricsEvaluator(SCHEMA, greaterThanOrEqual("all_nulls", "a")).eval(FILE);
+    Assert.assertFalse("Should skip: greaterThanOrEqual on all null column", shouldRead);
+
+    shouldRead = new InclusiveMetricsEvaluator(SCHEMA, equal("all_nulls", "a")).eval(FILE);
+    Assert.assertFalse("Should skip: equal on all null column", shouldRead);
+
     shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notNull("some_nulls")).eval(FILE);
     Assert.assertTrue("Should read: column with some nulls contains a non-null value", shouldRead);
 
diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java
index 2269c07..0a7251c 100644
--- a/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java
+++ b/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java
@@ -77,6 +77,38 @@ public class TestStrictMetricsEvaluator {
           1, toByteBuffer(IntegerType.get(), 79),
           7, toByteBuffer(IntegerType.get(), 5)));
 
+  private static final DataFile FILE_2 = new TestDataFile("file_2.avro", Row.of(), 50,
+      // any value counts, including nulls
+      ImmutableMap.of(
+          4, 50L,
+          5, 50L,
+          6, 50L),
+      // null value counts
+      ImmutableMap.of(
+          4, 50L,
+          5, 10L,
+          6, 0L),
+      // lower bounds
+      ImmutableMap.of(5, toByteBuffer(StringType.get(), "bbb")),
+      // upper bounds
+      ImmutableMap.of(5, toByteBuffer(StringType.get(), "eee")));
+
+  private static final DataFile FILE_3 = new TestDataFile("file_3.avro", Row.of(), 50,
+      // any value counts, including nulls
+      ImmutableMap.of(
+          4, 50L,
+          5, 50L,
+          6, 50L),
+      // null value counts
+      ImmutableMap.of(
+          4, 50L,
+          5, 10L,
+          6, 0L),
+      // lower bounds
+      ImmutableMap.of(5, toByteBuffer(StringType.get(), "bbb")),
+      // upper bounds
+      ImmutableMap.of(5, toByteBuffer(StringType.get(), "bbb")));
+
   @Test
   public void testAllNulls() {
     boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, notNull("all_nulls")).eval(FILE);
@@ -87,6 +119,9 @@ public class TestStrictMetricsEvaluator {
 
     shouldRead = new StrictMetricsEvaluator(SCHEMA, notNull("no_nulls")).eval(FILE);
     Assert.assertTrue("Should match: non-null column contains no null values", shouldRead);
+
+    shouldRead = new StrictMetricsEvaluator(SCHEMA, notEqual("all_nulls", "a")).eval(FILE);
+    Assert.assertTrue("Should match: notEqual on all nulls column", shouldRead);
   }
 
   @Test
@@ -102,6 +137,24 @@ public class TestStrictMetricsEvaluator {
   }
 
   @Test
+  public void testSomeNulls() {
+    boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, lessThan("some_nulls", "ggg")).eval(FILE_2);
+    Assert.assertFalse("Should not match: lessThan on some nulls column", shouldRead);
+
+    shouldRead = new StrictMetricsEvaluator(SCHEMA, lessThanOrEqual("some_nulls", "eee")).eval(FILE_2);
+    Assert.assertFalse("Should not match: lessThanOrEqual on some nulls column", shouldRead);
+
+    shouldRead = new StrictMetricsEvaluator(SCHEMA, greaterThan("some_nulls", "aaa")).eval(FILE_2);
+    Assert.assertFalse("Should not match: greaterThan on some nulls column", shouldRead);
+
+    shouldRead = new StrictMetricsEvaluator(SCHEMA, greaterThanOrEqual("some_nulls", "bbb")).eval(FILE_2);
+    Assert.assertFalse("Should not match: greaterThanOrEqual on some nulls column", shouldRead);
+
+    shouldRead = new StrictMetricsEvaluator(SCHEMA, equal("some_nulls", "bbb")).eval(FILE_3);
+    Assert.assertFalse("Should not match: equal on some nulls column", shouldRead);
+  }
+
+  @Test
   public void testRequiredColumn() {
     boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, notNull("required")).eval(FILE);
     Assert.assertTrue("Should match: required columns are always non-null", shouldRead);