You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by bl...@apache.org on 2016/04/23 02:42:39 UTC

parquet-mr git commit: PARQUET-569: Separate metadata filtering for ranges and offsets.

Repository: parquet-mr
Updated Branches:
  refs/heads/master 3dd2210e7 -> 2f22533ef


PARQUET-569: Separate metadata filtering for ranges and offsets.

Range filtering should select row groups by their midpoint, and offset
filtering should select them by their start offset.

Author: Ryan Blue <bl...@apache.org>

Closes #337 from rdblue/PARQUET-569-fix-metadata-filter and squashes the following commits:

6171af4 [Ryan Blue] PARQUET-569: Add tests for new offset metadata filter.
3fe2d5e [Ryan Blue] PARQUET-569: Separate metadata filtering for ranges and offsets.


Project: http://git-wip-us.apache.org/repos/asf/parquet-mr/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-mr/commit/2f22533e
Tree: http://git-wip-us.apache.org/repos/asf/parquet-mr/tree/2f22533e
Diff: http://git-wip-us.apache.org/repos/asf/parquet-mr/diff/2f22533e

Branch: refs/heads/master
Commit: 2f22533ef41533e2b839a6b41b262dca59e6dbf9
Parents: 3dd2210
Author: Ryan Blue <bl...@apache.org>
Authored: Fri Apr 22 17:42:35 2016 -0700
Committer: Ryan Blue <bl...@apache.org>
Committed: Fri Apr 22 17:42:35 2016 -0700

----------------------------------------------------------------------
 .../converter/ParquetMetadataConverter.java     | 36 ++++++++++++++------
 .../converter/TestParquetMetadataConverter.java | 31 +++++++++++++++--
 2 files changed, 54 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/2f22533e/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
index 6feb4a2..75b07fd 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
@@ -478,6 +478,7 @@ public class ParquetMetadataConverter {
   private static interface MetadataFilterVisitor<T, E extends Throwable> {
     T visit(NoFilter filter) throws E;
     T visit(SkipMetadataFilter filter) throws E;
+    T visit(RangeMetadataFilter filter) throws E;
     T visit(OffsetMetadataFilter filter) throws E;
   }
 
@@ -501,7 +502,7 @@ public class ParquetMetadataConverter {
     for (long offset : offsets) {
       set.add(offset);
     }
-    return new OffsetListMetadataFilter(set);
+    return new OffsetMetadataFilter(set);
   }
 
   private static final class NoFilter extends MetadataFilter {
@@ -527,16 +528,12 @@ public class ParquetMetadataConverter {
     }
   }
 
-  interface OffsetMetadataFilter {
-    boolean contains(long offset);
-  }
-
   /**
    * [ startOffset, endOffset )
    * @author Julien Le Dem
    */
   // Visible for testing
-  static final class RangeMetadataFilter extends MetadataFilter implements OffsetMetadataFilter {
+  static final class RangeMetadataFilter extends MetadataFilter {
     final long startOffset;
     final long endOffset;
 
@@ -551,7 +548,6 @@ public class ParquetMetadataConverter {
       return visitor.visit(this);
     }
 
-    @Override
     public boolean contains(long offset) {
       return offset >= this.startOffset && offset < this.endOffset;
     }
@@ -562,10 +558,10 @@ public class ParquetMetadataConverter {
     }
   }
 
-  static final class OffsetListMetadataFilter extends MetadataFilter implements OffsetMetadataFilter {
+  static final class OffsetMetadataFilter extends MetadataFilter {
     private final Set<Long> offsets;
 
-    public OffsetListMetadataFilter(Set<Long> offsets) {
+    public OffsetMetadataFilter(Set<Long> offsets) {
       this.offsets = offsets;
     }
 
@@ -585,7 +581,7 @@ public class ParquetMetadataConverter {
   }
 
   // Visible for testing
-  static FileMetaData filterFileMetaData(FileMetaData metaData, OffsetMetadataFilter filter) {
+  static FileMetaData filterFileMetaDataByMidpoint(FileMetaData metaData, RangeMetadataFilter filter) {
     List<RowGroup> rowGroups = metaData.getRow_groups();
     List<RowGroup> newRowGroups = new ArrayList<RowGroup>();
     for (RowGroup rowGroup : rowGroups) {
@@ -604,6 +600,19 @@ public class ParquetMetadataConverter {
   }
 
   // Visible for testing
+  static FileMetaData filterFileMetaDataByStart(FileMetaData metaData, OffsetMetadataFilter filter) {
+    List<RowGroup> rowGroups = metaData.getRow_groups();
+    List<RowGroup> newRowGroups = new ArrayList<RowGroup>();
+    for (RowGroup rowGroup : rowGroups) {
+      long startIndex = getOffset(rowGroup.getColumns().get(0));
+      if (filter.contains(startIndex)) {
+        newRowGroups.add(rowGroup);
+      }
+    }
+    metaData.setRow_groups(newRowGroups);
+    return metaData;
+  }
+
   static long getOffset(RowGroup rowGroup) {
     return getOffset(rowGroup.getColumns().get(0));
   }
@@ -631,7 +640,12 @@ public class ParquetMetadataConverter {
 
       @Override
       public FileMetaData visit(OffsetMetadataFilter filter) throws IOException {
-        return filterFileMetaData(readFileMetaData(from), filter);
+        return filterFileMetaDataByStart(readFileMetaData(from), filter);
+      }
+
+      @Override
+      public FileMetaData visit(RangeMetadataFilter filter) throws IOException {
+        return filterFileMetaDataByMidpoint(readFileMetaData(from), filter);
       }
     });
     if (Log.DEBUG) LOG.debug(fileMetaData);

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/2f22533e/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
index eb109c0..b9cfde7 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
@@ -19,6 +19,7 @@
 package org.apache.parquet.format.converter;
 
 import static java.util.Collections.emptyList;
+import static org.apache.parquet.format.converter.ParquetMetadataConverter.filterFileMetaDataByStart;
 import static org.apache.parquet.schema.MessageTypeParser.parseMessageType;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertSame;
@@ -27,7 +28,7 @@ import static org.apache.parquet.format.CompressionCodec.UNCOMPRESSED;
 import static org.apache.parquet.format.Type.INT32;
 import static org.apache.parquet.format.Util.readPageHeader;
 import static org.apache.parquet.format.Util.writePageHeader;
-import static org.apache.parquet.format.converter.ParquetMetadataConverter.filterFileMetaData;
+import static org.apache.parquet.format.converter.ParquetMetadataConverter.filterFileMetaDataByMidpoint;
 import static org.apache.parquet.format.converter.ParquetMetadataConverter.getOffset;
 
 import java.io.ByteArrayInputStream;
@@ -43,6 +44,7 @@ import java.util.Random;
 import java.util.Set;
 import java.util.TreeSet;
 
+import com.google.common.collect.Sets;
 import org.apache.parquet.column.statistics.BinaryStatistics;
 import org.apache.parquet.hadoop.metadata.BlockMetaData;
 import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
@@ -170,7 +172,20 @@ public class TestParquetMetadataConverter {
   }
 
   private FileMetaData filter(FileMetaData md, long start, long end) {
-    return filterFileMetaData(new FileMetaData(md), new ParquetMetadataConverter.RangeMetadataFilter(start, end));
+    return filterFileMetaDataByMidpoint(new FileMetaData(md),
+        new ParquetMetadataConverter.RangeMetadataFilter(start, end));
+  }
+
+  private FileMetaData find(FileMetaData md, Long... blockStart) {
+    return filterFileMetaDataByStart(new FileMetaData(md),
+        new ParquetMetadataConverter.OffsetMetadataFilter(
+            Sets.newHashSet((Long[]) blockStart)));
+  }
+
+  private FileMetaData find(FileMetaData md, long blockStart) {
+    return filterFileMetaDataByStart(new FileMetaData(md),
+        new ParquetMetadataConverter.OffsetMetadataFilter(
+            Sets.newHashSet(blockStart)));
   }
 
   private void verifyMD(FileMetaData md, long... offsets) {
@@ -243,6 +258,18 @@ public class TestParquetMetadataConverter {
   }
 
   @Test
+  public void testFindRowGroups() {
+    verifyMD(find(metadata(50, 50, 50), 0), 0);
+    verifyMD(find(metadata(50, 50, 50), 50), 50);
+    verifyMD(find(metadata(50, 50, 50), 100), 100);
+    verifyMD(find(metadata(50, 50, 50), 0L, 50L), 0, 50);
+    verifyMD(find(metadata(50, 50, 50), 0L, 50L, 100L), 0, 50, 100);
+    verifyMD(find(metadata(50, 50, 50), 50L, 100L), 50, 100);
+    // doesn't find an offset that isn't the start of a row group.
+    verifyMD(find(metadata(50, 50, 50), 10));
+  }
+
+  @Test
   public void randomTestFilterMetaData() {
     // randomized property based testing
     // if it fails add the case above