You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by bl...@apache.org on 2016/04/23 02:42:39 UTC
parquet-mr git commit: PARQUET-569: Separate metadata filtering for ranges and offsets.
Repository: parquet-mr
Updated Branches:
refs/heads/master 3dd2210e7 -> 2f22533ef
PARQUET-569: Separate metadata filtering for ranges and offsets.
Range filtering should use the row group midpoint and offset filtering
should use the start offset.
Author: Ryan Blue <bl...@apache.org>
Closes #337 from rdblue/PARQUET-569-fix-metadata-filter and squashes the following commits:
6171af4 [Ryan Blue] PARQUET-569: Add tests for new offset metadata filter.
3fe2d5e [Ryan Blue] PARQUET-569: Separate metadata filtering for ranges and offsets.
Project: http://git-wip-us.apache.org/repos/asf/parquet-mr/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-mr/commit/2f22533e
Tree: http://git-wip-us.apache.org/repos/asf/parquet-mr/tree/2f22533e
Diff: http://git-wip-us.apache.org/repos/asf/parquet-mr/diff/2f22533e
Branch: refs/heads/master
Commit: 2f22533ef41533e2b839a6b41b262dca59e6dbf9
Parents: 3dd2210
Author: Ryan Blue <bl...@apache.org>
Authored: Fri Apr 22 17:42:35 2016 -0700
Committer: Ryan Blue <bl...@apache.org>
Committed: Fri Apr 22 17:42:35 2016 -0700
----------------------------------------------------------------------
.../converter/ParquetMetadataConverter.java | 36 ++++++++++++++------
.../converter/TestParquetMetadataConverter.java | 31 +++++++++++++++--
2 files changed, 54 insertions(+), 13 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/2f22533e/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
index 6feb4a2..75b07fd 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
@@ -478,6 +478,7 @@ public class ParquetMetadataConverter {
private static interface MetadataFilterVisitor<T, E extends Throwable> {
T visit(NoFilter filter) throws E;
T visit(SkipMetadataFilter filter) throws E;
+ T visit(RangeMetadataFilter filter) throws E;
T visit(OffsetMetadataFilter filter) throws E;
}
@@ -501,7 +502,7 @@ public class ParquetMetadataConverter {
for (long offset : offsets) {
set.add(offset);
}
- return new OffsetListMetadataFilter(set);
+ return new OffsetMetadataFilter(set);
}
private static final class NoFilter extends MetadataFilter {
@@ -527,16 +528,12 @@ public class ParquetMetadataConverter {
}
}
- interface OffsetMetadataFilter {
- boolean contains(long offset);
- }
-
/**
* [ startOffset, endOffset )
* @author Julien Le Dem
*/
// Visible for testing
- static final class RangeMetadataFilter extends MetadataFilter implements OffsetMetadataFilter {
+ static final class RangeMetadataFilter extends MetadataFilter {
final long startOffset;
final long endOffset;
@@ -551,7 +548,6 @@ public class ParquetMetadataConverter {
return visitor.visit(this);
}
- @Override
public boolean contains(long offset) {
return offset >= this.startOffset && offset < this.endOffset;
}
@@ -562,10 +558,10 @@ public class ParquetMetadataConverter {
}
}
- static final class OffsetListMetadataFilter extends MetadataFilter implements OffsetMetadataFilter {
+ static final class OffsetMetadataFilter extends MetadataFilter {
private final Set<Long> offsets;
- public OffsetListMetadataFilter(Set<Long> offsets) {
+ public OffsetMetadataFilter(Set<Long> offsets) {
this.offsets = offsets;
}
@@ -585,7 +581,7 @@ public class ParquetMetadataConverter {
}
// Visible for testing
- static FileMetaData filterFileMetaData(FileMetaData metaData, OffsetMetadataFilter filter) {
+ static FileMetaData filterFileMetaDataByMidpoint(FileMetaData metaData, RangeMetadataFilter filter) {
List<RowGroup> rowGroups = metaData.getRow_groups();
List<RowGroup> newRowGroups = new ArrayList<RowGroup>();
for (RowGroup rowGroup : rowGroups) {
@@ -604,6 +600,19 @@ public class ParquetMetadataConverter {
}
// Visible for testing
+ static FileMetaData filterFileMetaDataByStart(FileMetaData metaData, OffsetMetadataFilter filter) {
+ List<RowGroup> rowGroups = metaData.getRow_groups();
+ List<RowGroup> newRowGroups = new ArrayList<RowGroup>();
+ for (RowGroup rowGroup : rowGroups) {
+ long startIndex = getOffset(rowGroup.getColumns().get(0));
+ if (filter.contains(startIndex)) {
+ newRowGroups.add(rowGroup);
+ }
+ }
+ metaData.setRow_groups(newRowGroups);
+ return metaData;
+ }
+
static long getOffset(RowGroup rowGroup) {
return getOffset(rowGroup.getColumns().get(0));
}
@@ -631,7 +640,12 @@ public class ParquetMetadataConverter {
@Override
public FileMetaData visit(OffsetMetadataFilter filter) throws IOException {
- return filterFileMetaData(readFileMetaData(from), filter);
+ return filterFileMetaDataByStart(readFileMetaData(from), filter);
+ }
+
+ @Override
+ public FileMetaData visit(RangeMetadataFilter filter) throws IOException {
+ return filterFileMetaDataByMidpoint(readFileMetaData(from), filter);
}
});
if (Log.DEBUG) LOG.debug(fileMetaData);
http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/2f22533e/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
----------------------------------------------------------------------
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
index eb109c0..b9cfde7 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
@@ -19,6 +19,7 @@
package org.apache.parquet.format.converter;
import static java.util.Collections.emptyList;
+import static org.apache.parquet.format.converter.ParquetMetadataConverter.filterFileMetaDataByStart;
import static org.apache.parquet.schema.MessageTypeParser.parseMessageType;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertSame;
@@ -27,7 +28,7 @@ import static org.apache.parquet.format.CompressionCodec.UNCOMPRESSED;
import static org.apache.parquet.format.Type.INT32;
import static org.apache.parquet.format.Util.readPageHeader;
import static org.apache.parquet.format.Util.writePageHeader;
-import static org.apache.parquet.format.converter.ParquetMetadataConverter.filterFileMetaData;
+import static org.apache.parquet.format.converter.ParquetMetadataConverter.filterFileMetaDataByMidpoint;
import static org.apache.parquet.format.converter.ParquetMetadataConverter.getOffset;
import java.io.ByteArrayInputStream;
@@ -43,6 +44,7 @@ import java.util.Random;
import java.util.Set;
import java.util.TreeSet;
+import com.google.common.collect.Sets;
import org.apache.parquet.column.statistics.BinaryStatistics;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
@@ -170,7 +172,20 @@ public class TestParquetMetadataConverter {
}
private FileMetaData filter(FileMetaData md, long start, long end) {
- return filterFileMetaData(new FileMetaData(md), new ParquetMetadataConverter.RangeMetadataFilter(start, end));
+ return filterFileMetaDataByMidpoint(new FileMetaData(md),
+ new ParquetMetadataConverter.RangeMetadataFilter(start, end));
+ }
+
+ private FileMetaData find(FileMetaData md, Long... blockStart) {
+ return filterFileMetaDataByStart(new FileMetaData(md),
+ new ParquetMetadataConverter.OffsetMetadataFilter(
+ Sets.newHashSet((Long[]) blockStart)));
+ }
+
+ private FileMetaData find(FileMetaData md, long blockStart) {
+ return filterFileMetaDataByStart(new FileMetaData(md),
+ new ParquetMetadataConverter.OffsetMetadataFilter(
+ Sets.newHashSet(blockStart)));
}
private void verifyMD(FileMetaData md, long... offsets) {
@@ -243,6 +258,18 @@ public class TestParquetMetadataConverter {
}
@Test
+ public void testFindRowGroups() {
+ verifyMD(find(metadata(50, 50, 50), 0), 0);
+ verifyMD(find(metadata(50, 50, 50), 50), 50);
+ verifyMD(find(metadata(50, 50, 50), 100), 100);
+ verifyMD(find(metadata(50, 50, 50), 0L, 50L), 0, 50);
+ verifyMD(find(metadata(50, 50, 50), 0L, 50L, 100L), 0, 50, 100);
+ verifyMD(find(metadata(50, 50, 50), 50L, 100L), 50, 100);
+ // doesn't find an offset that isn't the start of a row group.
+ verifyMD(find(metadata(50, 50, 50), 10));
+ }
+
+ @Test
public void randomTestFilterMetaData() {
// randomized property based testing
// if it fails add the case above