You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by om...@apache.org on 2018/05/01 20:57:13 UTC
orc git commit: ORC-323: Add support for predicate pushdown of nested
fields.
Repository: orc
Updated Branches:
refs/heads/master 837a1bd7c -> 9eda0e4d5
ORC-323: Add support for predicate pushdown of nested fields.
Fixes #232
Signed-off-by: Owen O'Malley <om...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/orc/repo
Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/9eda0e4d
Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/9eda0e4d
Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/9eda0e4d
Branch: refs/heads/master
Commit: 9eda0e4d53716ceb1f9ed350bc4d5404111bf679
Parents: 837a1bd
Author: Aashish Kumar Sharma <aa...@flipkart.com>
Authored: Fri Mar 16 17:53:36 2018 +0530
Committer: Owen O'Malley <om...@apache.org>
Committed: Tue May 1 13:56:52 2018 -0700
----------------------------------------------------------------------
.../org/apache/orc/impl/RecordReaderImpl.java | 20 ++--
.../org/apache/orc/impl/SchemaEvolution.java | 23 +++-
.../test/org/apache/orc/TestVectorOrcFile.java | 118 +++++++++++++++++++
3 files changed, 146 insertions(+), 15 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/orc/blob/9eda0e4d/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
index 0dacc70..83eb039 100644
--- a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
@@ -104,21 +104,16 @@ public class RecordReaderImpl implements RecordReader {
* Given a list of column names, find the given column and return the index.
*
* @param evolution the mapping from reader to file schema
- * @param columnName the column name to look for
+ * @param columnName the fully qualified column name to look for
* @return the file column number or -1 if the column wasn't found
*/
static int findColumns(SchemaEvolution evolution,
String columnName) {
- TypeDescription readerSchema = evolution.getReaderBaseSchema();
- List<String> fieldNames = readerSchema.getFieldNames();
- List<TypeDescription> children = readerSchema.getChildren();
- for (int i = 0; i < fieldNames.size(); ++i) {
- if (columnName.equals(fieldNames.get(i))) {
- TypeDescription result = evolution.getFileType(children.get(i));
- return result == null ? -1 : result.getId();
- }
+ try {
+ return evolution.getFileSchema().findSubtype(columnName).getId();
+ } catch (IllegalArgumentException e) {
+ return -1;
}
- return -1;
}
/**
@@ -128,8 +123,9 @@ public class RecordReaderImpl implements RecordReader {
* @return an array mapping the sarg leaves to concrete column numbers in the
* file
*/
- public static int[] mapSargColumnsToOrcInternalColIdx(List<PredicateLeaf> sargLeaves,
- SchemaEvolution evolution) {
+ public static int[] mapSargColumnsToOrcInternalColIdx(
+ List<PredicateLeaf> sargLeaves,
+ SchemaEvolution evolution) {
int[] result = new int[sargLeaves.size()];
Arrays.fill(result, -1);
for(int i=0; i < result.length; ++i) {
http://git-wip-us.apache.org/repos/asf/orc/blob/9eda0e4d/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java b/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java
index 78acd41..571c0d9 100644
--- a/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java
+++ b/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java
@@ -242,15 +242,32 @@ public class SchemaEvolution {
boolean[] result = new boolean[readerSchema.getMaximumId() + 1];
boolean safePpd = validatePPDConversion(fileSchema, readerSchema);
result[readerSchema.getId()] = safePpd;
- List<TypeDescription> children = readerSchema.getChildren();
+ return populatePpdSafeConversionForChildern(result,
+ readerSchema.getChildren());
+ }
+
+ /**
+ * Recursively checks whether the conversion of each nested field is safe.
+ *
+ * @param ppdSafeConversion boolean array specifying which columns are safe.
+ * @param children reader schema children.
+ *
+ * @return boolean array indicating, per column, whether it is safe or not.
+ */
+ private boolean[] populatePpdSafeConversionForChildern(
+ boolean[] ppdSafeConversion,
+ List<TypeDescription> children) {
+ boolean safePpd;
if (children != null) {
for (TypeDescription child : children) {
TypeDescription fileType = getFileType(child.getId());
safePpd = validatePPDConversion(fileType, child);
- result[child.getId()] = safePpd;
+ ppdSafeConversion[child.getId()] = safePpd;
+ populatePpdSafeConversionForChildern(ppdSafeConversion,
+ child.getChildren());
}
}
- return result;
+ return ppdSafeConversion;
}
private boolean validatePPDConversion(final TypeDescription fileType,
http://git-wip-us.apache.org/repos/asf/orc/blob/9eda0e4d/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
----------------------------------------------------------------------
diff --git a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
index fdf20a4..81d248c 100644
--- a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
+++ b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
@@ -931,6 +931,16 @@ public class TestVectorOrcFile {
.addField("string1", TypeDescription.createString());
}
+ private static TypeDescription createComplexInnerSchema()
+ { // builds struct<int1:int,complex:struct<int2:int,String1:string>>
+ return TypeDescription.createStruct()
+ .addField("int1", TypeDescription.createInt()) // top-level int column
+ .addField("complex", // nested struct holding the two inner fields below
+ TypeDescription.createStruct()
+ .addField("int2", TypeDescription.createInt()) // nested int, referenced as "complex.int2" in testPredicatePushdownForComplex
+ .addField("String1",TypeDescription.createString())); // nested string; note the capital 'S' in the field name
+ }
+
private static TypeDescription createBigRowSchema() {
return TypeDescription.createStruct()
.addField("boolean1", TypeDescription.createBoolean())
@@ -3474,4 +3484,112 @@ public class TestVectorOrcFile {
assertFalse(rows.nextBatch(batch));
rows.close();
}
+
+ @Test
+ public void testPredicatePushdownForComplex() throws Exception { // exercises search arguments on the nested field "complex.int2"
+ TypeDescription schema = createComplexInnerSchema();
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .setSchema(schema)
+ .stripeSize(400000L)
+ .compress(CompressionKind.NONE)
+ .bufferSize(500)
+ .rowIndexStride(1000)); // row index stride of 1000; the batches asserted below start at rows 1000, 0 and 3000
+ VectorizedRowBatch batch = schema.createRowBatch();
+ batch.ensureSize(3500);
+ batch.size = 3500;
+ for(int i=0; i < 3500; ++i) { // row i: cols[0] = i, cols[1].fields[0] = 300 * i, cols[1].fields[1] = hex string of 10 * i
+ ((LongColumnVector) batch.cols[0]).vector[i] = i;
+ ((LongColumnVector)((StructColumnVector) batch.cols[1]).fields[0]).vector[i] = i * 300;
+ ((BytesColumnVector)((StructColumnVector) batch.cols[1]).fields[1]).setVal(i,
+ Integer.toHexString(10*i).getBytes());
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ assertEquals(3500, reader.getNumberOfRows());
+ // Case 1: NOT(complex.int2 < 300000) AND complex.int2 < 600000, i.e. rows 1000-1999 given int2 = 300 * i.
+ SearchArgument sarg = SearchArgumentFactory.newBuilder()
+ .startAnd()
+ .startNot()
+ .lessThan("complex.int2", PredicateLeaf.Type.LONG, 300000L)
+ .end()
+ .lessThan("complex.int2", PredicateLeaf.Type.LONG, 600000L)
+ .end()
+ .build();
+
+ RecordReader rows = reader.rows(reader.options()
+ .range(0L, Long.MAX_VALUE)
+ .include(new boolean[]{true, true, true, true, true})
+ .searchArgument(sarg, new String[]{null, "int1", "complex","int2","string1"})); // NOTE(review): "string1" differs in case from the schema field "String1" - confirm this is intended
+ batch = reader.getSchema().createRowBatch(2000);
+ LongColumnVector ints1 = (LongColumnVector) batch.cols[0];
+ StructColumnVector struct1 = (StructColumnVector) batch.cols[1];
+ LongColumnVector ints2 = (LongColumnVector) struct1.fields[0];
+ BytesColumnVector strs = (BytesColumnVector) struct1.fields[1];
+ // NOTE(review): the two println calls below look like leftover debug output - confirm whether they should stay.
+ System.out.println("------------------------------------------------------------------------------------------------------------------------");
+ System.out.println(rows.getRowNumber());
+ // The reader is expected to be positioned at row 1000 before the first batch is read.
+ Assert.assertEquals(1000L, rows.getRowNumber());
+ Assert.assertEquals(true, rows.nextBatch(batch));
+ assertEquals(1000, batch.size);
+ // Every value in the single returned batch must match what was written for rows 1000-1999.
+ for(int i=1000; i < 2000; ++i) {
+ assertEquals(i,ints1.vector[i-1000]);
+ assertEquals(300 * i, ints2.vector[i - 1000]);
+ assertEquals(Integer.toHexString(10*i), strs.toString(i - 1000));
+ }
+ Assert.assertEquals(false, rows.nextBatch(batch));
+ Assert.assertEquals(3500, rows.getRowNumber());
+
+
+ // Case 2: complex.int2 < 0 matches no written value, so the reader should report row 3500 and return no batch
+ sarg = SearchArgumentFactory.newBuilder()
+ .startAnd()
+ .lessThan("complex.int2", PredicateLeaf.Type.LONG, 0L)
+ .end()
+ .build();
+ rows = reader.rows(reader.options()
+ .range(0L, Long.MAX_VALUE)
+ .include(new boolean[]{true, true, true, true, true})
+ .searchArgument(sarg, new String[]{null, "int1",null,"int2","string1"}));
+ Assert.assertEquals(3500L, rows.getRowNumber());
+ assertTrue(!rows.nextBatch(batch));
+
+ // Case 3: the predicate matches only the first 100 and last 100 rows (int2 < 30000 OR NOT(int2 < 1020000))
+ sarg = SearchArgumentFactory.newBuilder()
+ .startOr()
+ .lessThan("complex.int2", PredicateLeaf.Type.LONG, 300L * 100)
+ .startNot()
+ .lessThan("complex.int2", PredicateLeaf.Type.LONG, 300L * 3400)
+ .end()
+ .end()
+ .build();
+ rows = reader.rows(reader.options()
+ .range(0L, Long.MAX_VALUE)
+ .include(new boolean[]{true, true,true,true, true})
+ .searchArgument(sarg, new String[]{null, "int1",null, "int2","string1"}));
+ Assert.assertEquals(0, rows.getRowNumber());
+ Assert.assertEquals(true, rows.nextBatch(batch));
+ assertEquals(1000, batch.size);
+ Assert.assertEquals(3000, rows.getRowNumber());
+ // The first batch is expected to contain rows 0-999 in full, not only the 100 rows matching the predicate.
+ for(int i=0; i < 1000; ++i) {
+ assertEquals(300 * i, ints2.vector[i]);
+ assertEquals(Integer.toHexString(10*i), strs.toString(i));
+ }
+ // The second batch is expected to contain the remaining 500 rows, 3000-3499.
+ Assert.assertEquals(true, rows.nextBatch(batch));
+ assertEquals(500, batch.size);
+ Assert.assertEquals(3500, rows.getRowNumber());
+ for(int i=3000; i < 3500; ++i) {
+ assertEquals(300 * i, ints2.vector[i - 3000]);
+ assertEquals(Integer.toHexString(10*i), strs.toString(i - 3000));
+ }
+ Assert.assertEquals(false, rows.nextBatch(batch));
+ Assert.assertEquals(3500, rows.getRowNumber());
+ }
+
}