You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by om...@apache.org on 2018/05/01 20:57:13 UTC
orc git commit: ORC-323: Add support for predicate pushdown of nested fields.

Repository: orc
Updated Branches:
  refs/heads/master 837a1bd7c -> 9eda0e4d5


ORC-323: Add support for predicate pushdown of nested fields.

Fixes #232

Signed-off-by: Owen O'Malley <om...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/orc/repo
Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/9eda0e4d
Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/9eda0e4d
Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/9eda0e4d

Branch: refs/heads/master
Commit: 9eda0e4d53716ceb1f9ed350bc4d5404111bf679
Parents: 837a1bd
Author: Aashish Kumar Sharma <aa...@flipkart.com>
Authored: Fri Mar 16 17:53:36 2018 +0530
Committer: Owen O'Malley <om...@apache.org>
Committed: Tue May 1 13:56:52 2018 -0700

----------------------------------------------------------------------
 .../org/apache/orc/impl/RecordReaderImpl.java   |  20 ++--
 .../org/apache/orc/impl/SchemaEvolution.java    |  23 +++-
 .../test/org/apache/orc/TestVectorOrcFile.java  | 118 +++++++++++++++++++
 3 files changed, 146 insertions(+), 15 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/orc/blob/9eda0e4d/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
index 0dacc70..83eb039 100644
--- a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
@@ -104,21 +104,16 @@ public class RecordReaderImpl implements RecordReader {
    * Given a list of column names, find the given column and return the index.
    *
    * @param evolution the mapping from reader to file schema
-   * @param columnName  the column name to look for
+   * @param columnName  the fully qualified column name to look for
    * @return the file column number or -1 if the column wasn't found
    */
   static int findColumns(SchemaEvolution evolution,
                          String columnName) {
-    TypeDescription readerSchema = evolution.getReaderBaseSchema();
-    List<String> fieldNames = readerSchema.getFieldNames();
-    List<TypeDescription> children = readerSchema.getChildren();
-    for (int i = 0; i < fieldNames.size(); ++i) {
-      if (columnName.equals(fieldNames.get(i))) {
-        TypeDescription result = evolution.getFileType(children.get(i));
-        return result == null ? -1 : result.getId();
-      }
+    try {  // NOTE(review): the name is now resolved against the file schema; the removed loop matched reader-schema field names and mapped them through evolution.getFileType -- confirm this is intended
+      return evolution.getFileSchema().findSubtype(columnName).getId();  // id of the matching type in the file schema
+    } catch (IllegalArgumentException e) {  // presumably thrown by findSubtype for a name it cannot resolve -- TODO confirm
+      return -1;  // column not found
     }
-    return -1;
   }
 
   /**
@@ -128,8 +123,9 @@ public class RecordReaderImpl implements RecordReader {
    * @return an array mapping the sarg leaves to concrete column numbers in the
    * file
    */
-  public static int[] mapSargColumnsToOrcInternalColIdx(List<PredicateLeaf> sargLeaves,
-                             SchemaEvolution evolution) {
+  public static int[] mapSargColumnsToOrcInternalColIdx(
+                            List<PredicateLeaf> sargLeaves,
+                            SchemaEvolution evolution) {
     int[] result = new int[sargLeaves.size()];
     Arrays.fill(result, -1);
     for(int i=0; i < result.length; ++i) {

http://git-wip-us.apache.org/repos/asf/orc/blob/9eda0e4d/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java b/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java
index 78acd41..571c0d9 100644
--- a/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java
+++ b/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java
@@ -242,15 +242,32 @@ public class SchemaEvolution {
     boolean[] result = new boolean[readerSchema.getMaximumId() + 1];
     boolean safePpd = validatePPDConversion(fileSchema, readerSchema);
     result[readerSchema.getId()] = safePpd;
-    List<TypeDescription> children = readerSchema.getChildren();
+    return populatePpdSafeConversionForChildern(result,
+        readerSchema.getChildren());
+  }
+
+  /**
+   * Recursively marks which nested reader fields are safe for predicate pushdown.
+   *
+   * @param ppdSafeConversion flags indexed by column id; updated in place.
+   * @param children reader schema children to visit; may be null.
+   *
+   * @return the ppdSafeConversion array that was passed in.
+   */
+  private boolean[] populatePpdSafeConversionForChildern(
+                        boolean[] ppdSafeConversion,
+                        List<TypeDescription> children) {
+    boolean safePpd;
     if (children != null) {
       for (TypeDescription child : children) {
         TypeDescription fileType = getFileType(child.getId());
         safePpd = validatePPDConversion(fileType, child);
-        result[child.getId()] = safePpd;
+        ppdSafeConversion[child.getId()] = safePpd;
+        populatePpdSafeConversionForChildern(ppdSafeConversion,
+            child.getChildren());  // descend so fields at every nesting depth get a flag
       }
     }
-    return result;
+    return  ppdSafeConversion;
   }
 
   private boolean validatePPDConversion(final TypeDescription fileType,

http://git-wip-us.apache.org/repos/asf/orc/blob/9eda0e4d/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
----------------------------------------------------------------------
diff --git a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
index fdf20a4..81d248c 100644
--- a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
+++ b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java
@@ -931,6 +931,16 @@ public class TestVectorOrcFile {
         .addField("string1", TypeDescription.createString());
   }
 
+  private static TypeDescription createComplexInnerSchema()
+  {  // builds struct<int1:int,complex:struct<int2:int,String1:string>>
+    return TypeDescription.createStruct()
+            .addField("int1", TypeDescription.createInt())
+            .addField("complex",
+                      TypeDescription.createStruct()
+                              .addField("int2", TypeDescription.createInt())
+                              .addField("String1",TypeDescription.createString()));
+  }
+
   private static TypeDescription createBigRowSchema() {
     return TypeDescription.createStruct()
         .addField("boolean1", TypeDescription.createBoolean())
@@ -3474,4 +3484,112 @@ public class TestVectorOrcFile {
     assertFalse(rows.nextBatch(batch));
     rows.close();
   }
+
+  @Test
+  public void testPredicatePushdownForComplex() throws Exception {  // predicates below target the nested field complex.int2
+    TypeDescription schema = createComplexInnerSchema();
+    Writer writer = OrcFile.createWriter(testFilePath,
+            OrcFile.writerOptions(conf)
+                    .setSchema(schema)
+                    .stripeSize(400000L)
+                    .compress(CompressionKind.NONE)
+                    .bufferSize(500)
+                    .rowIndexStride(1000));  // row index stride of 1000; the row-number assertions below land on multiples of 1000
+    VectorizedRowBatch batch = schema.createRowBatch();
+    batch.ensureSize(3500);
+    batch.size = 3500;
+    for(int i=0; i < 3500; ++i) {  // row i: int1 = i, complex.int2 = 300 * i, complex.String1 = hex(10 * i)
+      ((LongColumnVector) batch.cols[0]).vector[i] = i;
+      ((LongColumnVector)((StructColumnVector) batch.cols[1]).fields[0]).vector[i] = i * 300;
+      ((BytesColumnVector)((StructColumnVector) batch.cols[1]).fields[1]).setVal(i,
+              Integer.toHexString(10*i).getBytes());  // NOTE(review): getBytes() without a charset uses the platform default; the hex digits are ASCII
+    }
+    writer.addRowBatch(batch);
+    writer.close();
+    Reader reader = OrcFile.createReader(testFilePath,
+            OrcFile.readerOptions(conf).filesystem(fs));
+    assertEquals(3500, reader.getNumberOfRows());
+
+    SearchArgument sarg = SearchArgumentFactory.newBuilder()  // NOT(int2 < 300000) AND int2 < 600000, i.e. rows 1000..1999 since int2 = 300 * i
+            .startAnd()
+            .startNot()
+            .lessThan("complex.int2", PredicateLeaf.Type.LONG, 300000L)
+            .end()
+            .lessThan("complex.int2", PredicateLeaf.Type.LONG, 600000L)
+            .end()
+            .build();
+
+    RecordReader rows = reader.rows(reader.options()
+            .range(0L, Long.MAX_VALUE)
+            .include(new boolean[]{true, true, true, true, true})
+            .searchArgument(sarg, new String[]{null, "int1", "complex","int2","string1"}));  // NOTE(review): the schema field is String1 (capital S) but this array says string1 -- confirm the names array does not affect the lookup
+    batch = reader.getSchema().createRowBatch(2000);
+    LongColumnVector ints1 = (LongColumnVector) batch.cols[0];
+    StructColumnVector struct1 = (StructColumnVector) batch.cols[1];
+    LongColumnVector ints2 = (LongColumnVector) struct1.fields[0];
+    BytesColumnVector strs = (BytesColumnVector) struct1.fields[1];
+
+    System.out.println("------------------------------------------------------------------------------------------------------------------------");
+    System.out.println(rows.getRowNumber());  // NOTE(review): this and the previous line look like leftover debug output -- consider removing
+
+    Assert.assertEquals(1000L, rows.getRowNumber());
+    Assert.assertEquals(true, rows.nextBatch(batch));
+    assertEquals(1000, batch.size);
+
+    for(int i=1000; i < 2000; ++i) {  // the single returned batch must hold rows 1000..1999 with the values written above
+      assertEquals(i,ints1.vector[i-1000]);
+      assertEquals(300 * i, ints2.vector[i - 1000]);
+      assertEquals(Integer.toHexString(10*i), strs.toString(i - 1000));
+    }
+    Assert.assertEquals(false, rows.nextBatch(batch));
+    Assert.assertEquals(3500, rows.getRowNumber());
+
+
+    // look through the file with no rows selected: complex.int2 = 300 * i is never < 0
+    sarg = SearchArgumentFactory.newBuilder()
+            .startAnd()
+            .lessThan("complex.int2", PredicateLeaf.Type.LONG, 0L)
+            .end()
+            .build();
+    rows = reader.rows(reader.options()
+            .range(0L, Long.MAX_VALUE)
+            .include(new boolean[]{true, true, true, true, true})
+            .searchArgument(sarg, new String[]{null, "int1",null,"int2","string1"}));
+    Assert.assertEquals(3500L, rows.getRowNumber());
+    assertTrue(!rows.nextBatch(batch));
+
+    // predicate matches the first 100 and last 100 rows; the reads below return rows 0..999 and 3000..3499
+    sarg = SearchArgumentFactory.newBuilder()
+            .startOr()
+            .lessThan("complex.int2", PredicateLeaf.Type.LONG, 300L * 100)
+            .startNot()
+            .lessThan("complex.int2", PredicateLeaf.Type.LONG, 300L * 3400)
+            .end()
+            .end()
+            .build();
+    rows = reader.rows(reader.options()
+            .range(0L, Long.MAX_VALUE)
+            .include(new boolean[]{true, true,true,true, true})
+            .searchArgument(sarg, new String[]{null, "int1",null, "int2","string1"}));
+    Assert.assertEquals(0, rows.getRowNumber());
+    Assert.assertEquals(true, rows.nextBatch(batch));
+    assertEquals(1000, batch.size);
+    Assert.assertEquals(3000, rows.getRowNumber());  // after the first batch the reader is expected to sit at row 3000
+
+    for(int i=0; i < 1000; ++i) {
+      assertEquals(300 * i, ints2.vector[i]);
+      assertEquals(Integer.toHexString(10*i), strs.toString(i));
+    }
+
+    Assert.assertEquals(true, rows.nextBatch(batch));
+    assertEquals(500, batch.size);
+    Assert.assertEquals(3500, rows.getRowNumber());
+    for(int i=3000; i < 3500; ++i) {  // second batch holds rows 3000..3499
+      assertEquals(300 * i, ints2.vector[i - 3000]);
+      assertEquals(Integer.toHexString(10*i), strs.toString(i - 3000));
+    }
+    Assert.assertEquals(false, rows.nextBatch(batch));
+    Assert.assertEquals(3500, rows.getRowNumber());
+  }
+
 }