You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by ga...@apache.org on 2020/01/22 12:09:18 UTC

[parquet-mr] 03/04: PARQUET-1744: Some filters throws ArrayIndexOutOfBoundsException (#732)

This is an automated email from the ASF dual-hosted git repository.

gabor pushed a commit to branch parquet-1.11.x
in repository https://gitbox.apache.org/repos/asf/parquet-mr.git

commit 49de5b475c21c241dff9b5020c17401cc6707b87
Author: Gabor Szadovszky <ga...@apache.org>
AuthorDate: Fri Jan 10 07:08:01 2020 +0100

    PARQUET-1744: Some filters throws ArrayIndexOutOfBoundsException (#732)
    
    
    (cherry picked from commit f0fc29fd3341046f7d46a4a02a7c9ec3d7cd3e46)
---
 .../internal/column/columnindex/BoundaryOrder.java |  32 +++++++
 .../column/columnindex/TestBoundaryOrder.java      |  56 +++++++++++
 .../filter2/columnindex/TestColumnIndexFilter.java | 106 ++++++++++++++-------
 3 files changed, 159 insertions(+), 35 deletions(-)

diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BoundaryOrder.java b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BoundaryOrder.java
index e47b5b3..8a7cee0 100644
--- a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BoundaryOrder.java
+++ b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BoundaryOrder.java
@@ -84,6 +84,10 @@ public enum BoundaryOrder {
     @Override
     OfInt gt(ColumnIndexBase<?>.ValueComparator comparator) {
       int length = comparator.arrayLength();
+      if (length == 0) {
+        // No matching rows if the column index contains null pages only
+        return IndexIterator.EMPTY;
+      }
       int left = 0;
       int right = length;
       do {
@@ -100,6 +104,10 @@ public enum BoundaryOrder {
     @Override
     OfInt gtEq(ColumnIndexBase<?>.ValueComparator comparator) {
       int length = comparator.arrayLength();
+      if (length == 0) {
+        // No matching rows if the column index contains null pages only
+        return IndexIterator.EMPTY;
+      }
       int left = 0;
       int right = length;
       do {
@@ -116,6 +124,10 @@ public enum BoundaryOrder {
     @Override
     OfInt lt(ColumnIndexBase<?>.ValueComparator comparator) {
       int length = comparator.arrayLength();
+      if (length == 0) {
+        // No matching rows if the column index contains null pages only
+        return IndexIterator.EMPTY;
+      }
       int left = -1;
       int right = length - 1;
       do {
@@ -132,6 +144,10 @@ public enum BoundaryOrder {
     @Override
     OfInt ltEq(ColumnIndexBase<?>.ValueComparator comparator) {
       int length = comparator.arrayLength();
+      if (length == 0) {
+        // No matching rows if the column index contains null pages only
+        return IndexIterator.EMPTY;
+      }
       int left = -1;
       int right = length - 1;
       do {
@@ -207,6 +223,10 @@ public enum BoundaryOrder {
     @Override
     OfInt gt(ColumnIndexBase<?>.ValueComparator comparator) {
       int length = comparator.arrayLength();
+      if (length == 0) {
+        // No matching rows if the column index contains null pages only
+        return IndexIterator.EMPTY;
+      }
       int left = -1;
       int right = length - 1;
       do {
@@ -223,6 +243,10 @@ public enum BoundaryOrder {
     @Override
     OfInt gtEq(ColumnIndexBase<?>.ValueComparator comparator) {
       int length = comparator.arrayLength();
+      if (length == 0) {
+        // No matching rows if the column index contains null pages only
+        return IndexIterator.EMPTY;
+      }
       int left = -1;
       int right = length - 1;
       do {
@@ -239,6 +263,10 @@ public enum BoundaryOrder {
     @Override
     OfInt lt(ColumnIndexBase<?>.ValueComparator comparator) {
       int length = comparator.arrayLength();
+      if (length == 0) {
+        // No matching rows if the column index contains null pages only
+        return IndexIterator.EMPTY;
+      }
       int left = 0;
       int right = length;
       do {
@@ -255,6 +283,10 @@ public enum BoundaryOrder {
     @Override
     OfInt ltEq(ColumnIndexBase<?>.ValueComparator comparator) {
       int length = comparator.arrayLength();
+      if (length == 0) {
+        // No matching rows if the column index contains null pages only
+        return IndexIterator.EMPTY;
+      }
       int left = 0;
       int right = length;
       do {
diff --git a/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestBoundaryOrder.java b/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestBoundaryOrder.java
index 3d2a924..97cbf07 100644
--- a/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestBoundaryOrder.java
+++ b/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestBoundaryOrder.java
@@ -183,6 +183,7 @@ public class TestBoundaryOrder {
   private static final int RAND_COUNT = 2000;
   private static final ColumnIndexBase<?> RAND_ASCENDING;
   private static final ColumnIndexBase<?> RAND_DESCENDING;
+  private static final ColumnIndexBase<?> ALL_NULL_PAGES;
   private static final SpyValueComparatorBuilder SPY_COMPARATOR_BUILDER = new SpyValueComparatorBuilder();
   static {
     ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(TYPE, Integer.MAX_VALUE);
@@ -233,6 +234,13 @@ public class TestBoundaryOrder {
       builder.add(stats(it.next(), it.next()));
     }
     RAND_DESCENDING = (ColumnIndexBase<?>) builder.build();
+
+    builder = ColumnIndexBuilder.getBuilder(TYPE, Integer.MAX_VALUE);
+    // Add a couple of empty stats so column index will contain null pages only
+    for (int i = 0; i < 10; ++i) {
+      builder.add(Statistics.createStats(TYPE));
+    }
+    ALL_NULL_PAGES = (ColumnIndexBase<?>) builder.build();
   }
 
   private static Statistics<?> stats(int min, int max) {
@@ -267,6 +275,14 @@ public class TestBoundaryOrder {
           BoundaryOrder.UNORDERED::eq,
           BoundaryOrder.DESCENDING::eq,
           DESCENDING.createValueComparator(i));
+      validateOperator("Mismatching page indexes for all null pages and value " + i + " with ASCENDING order in ",
+          BoundaryOrder.UNORDERED::eq,
+          BoundaryOrder.ASCENDING::eq,
+          ALL_NULL_PAGES.createValueComparator(i));
+      validateOperator("Mismatching page indexes for all null pages and value " + i + " with DESCENDING order in ",
+          BoundaryOrder.UNORDERED::eq,
+          BoundaryOrder.DESCENDING::eq,
+          ALL_NULL_PAGES.createValueComparator(i));
     }
     for (int i = SINGLE_FROM - 1; i <= SINGLE_TO + 1; ++i) {
       ColumnIndexBase<?>.ValueComparator singleComparator = SINGLE.createValueComparator(i);
@@ -305,6 +321,14 @@ public class TestBoundaryOrder {
           BoundaryOrder.UNORDERED::gt,
           BoundaryOrder.DESCENDING::gt,
           DESCENDING.createValueComparator(i));
+      validateOperator("Mismatching page indexes for all null pages and value " + i + " with ASCENDING order in ",
+          BoundaryOrder.UNORDERED::gt,
+          BoundaryOrder.ASCENDING::gt,
+          ALL_NULL_PAGES.createValueComparator(i));
+      validateOperator("Mismatching page indexes for all null pages and value " + i + " with DESCENDING order in ",
+          BoundaryOrder.UNORDERED::gt,
+          BoundaryOrder.DESCENDING::gt,
+          ALL_NULL_PAGES.createValueComparator(i));
     }
     for (int i = SINGLE_FROM - 1; i <= SINGLE_TO + 1; ++i) {
       ColumnIndexBase<?>.ValueComparator singleComparator = SINGLE.createValueComparator(i);
@@ -343,6 +367,14 @@ public class TestBoundaryOrder {
           BoundaryOrder.UNORDERED::gtEq,
           BoundaryOrder.DESCENDING::gtEq,
           DESCENDING.createValueComparator(i));
+      validateOperator("Mismatching page indexes for all null pages and value " + i + " with ASCENDING order in ",
+          BoundaryOrder.UNORDERED::gtEq,
+          BoundaryOrder.ASCENDING::gtEq,
+          ALL_NULL_PAGES.createValueComparator(i));
+      validateOperator("Mismatching page indexes for all null pages and value " + i + " with DESCENDING order in ",
+          BoundaryOrder.UNORDERED::gtEq,
+          BoundaryOrder.DESCENDING::gtEq,
+          ALL_NULL_PAGES.createValueComparator(i));
     }
     for (int i = SINGLE_FROM - 1; i <= SINGLE_TO + 1; ++i) {
       ColumnIndexBase<?>.ValueComparator singleComparator = SINGLE.createValueComparator(i);
@@ -381,6 +413,14 @@ public class TestBoundaryOrder {
           BoundaryOrder.UNORDERED::lt,
           BoundaryOrder.DESCENDING::lt,
           DESCENDING.createValueComparator(i));
+      validateOperator("Mismatching page indexes for all null pages and value " + i + " with ASCENDING order in ",
+          BoundaryOrder.UNORDERED::lt,
+          BoundaryOrder.ASCENDING::lt,
+          ALL_NULL_PAGES.createValueComparator(i));
+      validateOperator("Mismatching page indexes for all null pages and value " + i + " with DESCENDING order in ",
+          BoundaryOrder.UNORDERED::lt,
+          BoundaryOrder.DESCENDING::lt,
+          ALL_NULL_PAGES.createValueComparator(i));
     }
     for (int i = SINGLE_FROM - 1; i <= SINGLE_TO + 1; ++i) {
       ColumnIndexBase<?>.ValueComparator singleComparator = SINGLE.createValueComparator(i);
@@ -419,6 +459,14 @@ public class TestBoundaryOrder {
           BoundaryOrder.UNORDERED::ltEq,
           BoundaryOrder.DESCENDING::ltEq,
           DESCENDING.createValueComparator(i));
+      validateOperator("Mismatching page indexes for all null pages and value " + i + " with ASCENDING order in ",
+          BoundaryOrder.UNORDERED::ltEq,
+          BoundaryOrder.ASCENDING::ltEq,
+          ALL_NULL_PAGES.createValueComparator(i));
+      validateOperator("Mismatching page indexes for all null pages and value " + i + " with DESCENDING order in ",
+          BoundaryOrder.UNORDERED::ltEq,
+          BoundaryOrder.DESCENDING::ltEq,
+          ALL_NULL_PAGES.createValueComparator(i));
     }
     for (int i = SINGLE_FROM - 1; i <= SINGLE_TO + 1; ++i) {
       ColumnIndexBase<?>.ValueComparator singleComparator = SINGLE.createValueComparator(i);
@@ -457,6 +505,14 @@ public class TestBoundaryOrder {
           BoundaryOrder.UNORDERED::notEq,
           BoundaryOrder.DESCENDING::notEq,
           DESCENDING.createValueComparator(i));
+      validateOperator("Mismatching page indexes for all null pages and value " + i + " with ASCENDING order in ",
+          BoundaryOrder.UNORDERED::notEq,
+          BoundaryOrder.ASCENDING::notEq,
+          ALL_NULL_PAGES.createValueComparator(i));
+      validateOperator("Mismatching page indexes for all null pages and value " + i + " with DESCENDING order in ",
+          BoundaryOrder.UNORDERED::notEq,
+          BoundaryOrder.DESCENDING::notEq,
+          ALL_NULL_PAGES.createValueComparator(i));
     }
     for (int i = FROM - 1; i <= TO + 1; ++i) {
       ColumnIndexBase<?>.ValueComparator singleComparator = SINGLE.createValueComparator(i);
diff --git a/parquet-column/src/test/java/org/apache/parquet/internal/filter2/columnindex/TestColumnIndexFilter.java b/parquet-column/src/test/java/org/apache/parquet/internal/filter2/columnindex/TestColumnIndexFilter.java
index ae27214..f37a343 100644
--- a/parquet-column/src/test/java/org/apache/parquet/internal/filter2/columnindex/TestColumnIndexFilter.java
+++ b/parquet-column/src/test/java/org/apache/parquet/internal/filter2/columnindex/TestColumnIndexFilter.java
@@ -27,6 +27,7 @@ import static org.apache.parquet.filter2.predicate.FilterApi.eq;
 import static org.apache.parquet.filter2.predicate.FilterApi.gt;
 import static org.apache.parquet.filter2.predicate.FilterApi.gtEq;
 import static org.apache.parquet.filter2.predicate.FilterApi.intColumn;
+import static org.apache.parquet.filter2.predicate.FilterApi.longColumn;
 import static org.apache.parquet.filter2.predicate.FilterApi.lt;
 import static org.apache.parquet.filter2.predicate.FilterApi.ltEq;
 import static org.apache.parquet.filter2.predicate.FilterApi.notEq;
@@ -38,10 +39,11 @@ import static org.apache.parquet.internal.column.columnindex.BoundaryOrder.DESCE
 import static org.apache.parquet.internal.column.columnindex.BoundaryOrder.UNORDERED;
 import static org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter.calculateRowRanges;
 import static org.apache.parquet.io.api.Binary.fromString;
-import static org.apache.parquet.schema.OriginalType.UTF8;
+import static org.apache.parquet.schema.LogicalTypeAnnotation.stringType;
 import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
 import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE;
 import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;
+import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64;
 import static org.apache.parquet.schema.Types.optional;
 import static org.junit.Assert.assertArrayEquals;
 
@@ -161,55 +163,56 @@ public class TestColumnIndexFilter {
 
   /**
    * <pre>
-   * row     column1        column2        column3        column4 (no column index)
-   *      ------0------  ------0------  ------0------  ------0------
-   * 0.   1              Zulu           2.03
-   *      ------1------  ------1------  ------1------  ------1------
-   * 1.   2              Yankee         4.67
-   * 2.   3              Xray           3.42
-   * 3.   4              Whiskey        8.71
+   * row     column1        column2        column3        column4        column5
+   *                                                 (no column index)
+   *      ------0------  ------0------  ------0------  ------0------  ------0------
+   * 0.   1              Zulu           2.03                          null
+   *      ------1------  ------1------  ------1------  ------1------  ------1------
+   * 1.   2              Yankee         4.67                          null
+   * 2.   3              Xray           3.42                          null
+   * 3.   4              Whiskey        8.71                          null
    *                     ------2------                 ------2------
-   * 4.   5              Victor         0.56
-   * 5.   6              Uniform        4.30
+   * 4.   5              Victor         0.56                          null
+   * 5.   6              Uniform        4.30                          null
    *                                    ------2------  ------3------
-   * 6.   null           null           null
+   * 6.   null           null           null                          null
    *      ------2------                                ------4------
-   * 7.   7              Tango          3.50
+   * 7.   7              Tango          3.50                          null
    *                     ------3------
-   * 8.   7              null           3.14
+   * 8.   7              null           3.14                          null
    *      ------3------
-   * 9.   7              null           null
+   * 9.   7              null           null                          null
    *                                    ------3------
-   * 10.  null           null           9.99
+   * 10.  null           null           9.99                          null
    *                     ------4------
-   * 11.  8              Sierra         8.78
+   * 11.  8              Sierra         8.78                          null
    *                                                   ------5------
-   * 12.  9              Romeo          9.56
-   * 13.  10             Quebec         2.71
+   * 12.  9              Romeo          9.56                          null
+   * 13.  10             Quebec         2.71                          null
    *      ------4------
-   * 14.  11             Papa           5.71
-   * 15.  12             Oscar          4.09
+   * 14.  11             Papa           5.71                          null
+   * 15.  12             Oscar          4.09                          null
    *                     ------5------  ------4------  ------6------
-   * 16.  13             November       null
-   * 17.  14             Mike           null
-   * 18.  15             Lima           0.36
-   * 19.  16             Kilo           2.94
-   * 20.  17             Juliett        4.23
+   * 16.  13             November       null                          null
+   * 17.  14             Mike           null                          null
+   * 18.  15             Lima           0.36                          null
+   * 19.  16             Kilo           2.94                          null
+   * 20.  17             Juliett        4.23                          null
    *      ------5------  ------6------                 ------7------
-   * 21.  18             India          null
-   * 22.  19             Hotel          5.32
+   * 21.  18             India          null                          null
+   * 22.  19             Hotel          5.32                          null
    *                                    ------5------
-   * 23.  20             Golf           4.17
-   * 24.  21             Foxtrot        7.92
-   * 25.  22             Echo           7.95
+   * 23.  20             Golf           4.17                          null
+   * 24.  21             Foxtrot        7.92                          null
+   * 25.  22             Echo           7.95                          null
    *                                   ------6------
-   * 26.  23             Delta          null
+   * 26.  23             Delta          null                          null
    *      ------6------
-   * 27.  24             Charlie        null
+   * 27.  24             Charlie        null                          null
    *                                                   ------8------
-   * 28.  25             Bravo          null
+   * 28.  25             Bravo          null                          null
    *                     ------7------
-   * 29.  26             Alfa           null
+   * 29.  26             Alfa           null                          null
    * </pre>
    */
   private static final long TOTAL_ROW_COUNT = 30;
@@ -231,7 +234,7 @@ public class TestColumnIndexFilter {
       .addPage(6)
       .addPage(3)
       .build();
-  private static final ColumnIndex COLUMN2_CI = new CIBuilder(optional(BINARY).as(UTF8).named("column2"), DESCENDING)
+  private static final ColumnIndex COLUMN2_CI = new CIBuilder(optional(BINARY).as(stringType()).named("column2"), DESCENDING)
       .addPage(0, "Zulu", "Zulu")
       .addPage(0, "Whiskey", "Yankee")
       .addPage(1, "Tango", "Victor")
@@ -281,6 +284,14 @@ public class TestColumnIndexFilter {
       .addPage(7)
       .addPage(2)
       .build();
+  private static final ColumnIndex COLUMN5_CI = new CIBuilder(optional(INT64).named("column5"), ASCENDING)
+      .addNullPage(1)
+      .addNullPage(29)
+      .build();
+  private static final OffsetIndex COLUMN5_OI = new OIBuilder()
+      .addPage(1)
+      .addPage(29)
+      .build();
   private static final ColumnIndexStore STORE = new ColumnIndexStore() {
     @Override
     public ColumnIndex getColumnIndex(ColumnPath column) {
@@ -293,6 +304,8 @@ public class TestColumnIndexFilter {
           return COLUMN3_CI;
         case "column4":
           return COLUMN4_CI;
+        case "column5":
+          return COLUMN5_CI;
         default:
           return null;
       }
@@ -309,6 +322,8 @@ public class TestColumnIndexFilter {
           return COLUMN3_OI;
         case "column4":
           return COLUMN4_OI;
+        case "column5":
+          return COLUMN5_OI;
         default:
           throw new MissingOffsetIndexException(column);
       }
@@ -461,4 +476,25 @@ public class TestColumnIndexFilter {
         TOTAL_ROW_COUNT);
   }
 
+  @Test
+  public void testFilteringWithAllNullPages() {
+    Set<ColumnPath> paths = paths("column1", "column5");
+
+    assertAllRows(calculateRowRanges(FilterCompat.get(
+        notEq(longColumn("column5"), 1234567L)),
+        STORE, paths, TOTAL_ROW_COUNT),
+        TOTAL_ROW_COUNT);
+    assertAllRows(calculateRowRanges(FilterCompat.get(
+        or(gtEq(intColumn("column1"), 10),
+            notEq(longColumn("column5"), 1234567L))),
+        STORE, paths, TOTAL_ROW_COUNT),
+        TOTAL_ROW_COUNT);
+    assertRows(calculateRowRanges(FilterCompat.get(
+        eq(longColumn("column5"), 1234567L)),
+        STORE, paths, TOTAL_ROW_COUNT));
+    assertRows(calculateRowRanges(FilterCompat.get(
+        and(lt(intColumn("column1"), 20),
+            gtEq(longColumn("column5"), 1234567L))),
+        STORE, paths, TOTAL_ROW_COUNT));
+  }
 }