You are viewing a plain text version of this content. The canonical link for it was not preserved in this plain text copy.
Posted to commits@orc.apache.org by pr...@apache.org on 2019/03/22 20:27:04 UTC

[orc] branch master updated: ORC-477: BloomFilter for ACID table does not get created (#374)

This is an automated email from the ASF dual-hosted git repository.

prasanthj pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/master by this push:
     new d082718  ORC-477: BloomFilter for ACID table does not get created (#374)
d082718 is described below

commit d082718a0bf8d703c96f3bf39ea02307498d1800
Author: Denys Kuzmenko <de...@gmail.com>
AuthorDate: Fri Mar 22 21:26:58 2019 +0100

    ORC-477: BloomFilter for ACID table does not get created (#374)
---
 java/core/src/java/org/apache/orc/OrcUtils.java    |  54 +++++++----
 .../java/org/apache/orc/impl/SchemaEvolution.java  |   4 +-
 .../src/test/org/apache/orc/util/TestOrcUtils.java | 100 +++++++++++++++++++++
 3 files changed, 139 insertions(+), 19 deletions(-)

diff --git a/java/core/src/java/org/apache/orc/OrcUtils.java b/java/core/src/java/org/apache/orc/OrcUtils.java
index 76e5e16..220fa13 100644
--- a/java/core/src/java/org/apache/orc/OrcUtils.java
+++ b/java/core/src/java/org/apache/orc/OrcUtils.java
@@ -18,12 +18,15 @@
 package org.apache.orc;
 
 import org.apache.orc.impl.ReaderImpl;
+import org.apache.orc.impl.SchemaEvolution;
 
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 
+import static org.apache.hadoop.util.StringUtils.COMMA_STR;
+
 public class OrcUtils {
 
   /**
@@ -51,14 +54,16 @@ public class OrcUtils {
       Arrays.fill(results, true);
       return results;
     }
+    TypeDescription baseSchema = SchemaEvolution.checkAcidSchema(schema) ?
+        SchemaEvolution.getBaseRow(schema) : schema;
+
     if (selectedColumns != null &&
-        schema.getCategory() == TypeDescription.Category.STRUCT) {
-      List<String> fieldNames = schema.getFieldNames();
-      List<TypeDescription> fields = schema.getChildren();
-      for (String column: selectedColumns.split((","))) {
-        TypeDescription col = findColumn(column, fieldNames, fields);
-        if (col != null) {
-          for(int i=col.getId(); i <= col.getMaximumId(); ++i) {
+        baseSchema.getCategory() == TypeDescription.Category.STRUCT) {
+
+      for (String columnName : selectedColumns.split(COMMA_STR)) {
+        TypeDescription column = findColumn(baseSchema, columnName.trim());
+        if (column != null) {
+          for (int i = column.getId(); i <= column.getMaximumId(); ++i) {
             results[i] = true;
           }
         }
@@ -67,18 +72,33 @@ public class OrcUtils {
     return results;
   }
 
-  private static TypeDescription findColumn(String columnName,
-                                            List<String> fieldNames,
-                                            List<TypeDescription> fields) {
-    int i = 0;
-    for(String fieldName: fieldNames) {
-      if (fieldName.equalsIgnoreCase(columnName)) {
-        return fields.get(i);
-      } else {
-        i += 1;
+  private static TypeDescription findColumn(TypeDescription schema, String column) {
+    TypeDescription result = schema;
+    String[] columnMatcher = column.split("\\.");
+
+    int index = 0;
+    while (index < columnMatcher.length &&
+        result.getCategory() == TypeDescription.Category.STRUCT) {
+
+      String columnName = columnMatcher[index];
+      int prevIndex = index;
+
+      List<TypeDescription> fields = result.getChildren();
+      List<String> fieldNames = result.getFieldNames();
+
+      for (int i = 0; i < fields.size(); i++) {
+        if (columnName.equalsIgnoreCase(fieldNames.get(i))) {
+          result = fields.get(i);
+          index++;
+
+          break;
+        }
+      }
+      if (prevIndex == index) {
+        return null;
       }
     }
-    return null;
+    return result;
   }
 
   public static List<OrcProto.Type> getOrcTypes(TypeDescription typeDescr) {
diff --git a/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java b/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java
index aa7ba9c..1d4cc67 100644
--- a/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java
+++ b/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java
@@ -581,7 +581,7 @@ public class SchemaEvolution {
     }
   }
 
-  private static boolean checkAcidSchema(TypeDescription type) {
+  public static boolean checkAcidSchema(TypeDescription type) {
     if (type.getCategory().equals(TypeDescription.Category.STRUCT)) {
       List<String> rootFields = type.getFieldNames();
       if (rootFields.size() != acidEventFieldNames.size()) {
@@ -617,7 +617,7 @@ public class SchemaEvolution {
    * @param typeDescription the ACID event schema.
    * @return the subtype for the real row
    */
-  static TypeDescription getBaseRow(TypeDescription typeDescription) {
+  public static TypeDescription getBaseRow(TypeDescription typeDescription) {
     final int ACID_ROW_OFFSET = 5;
     return typeDescription.getChildren().get(ACID_ROW_OFFSET);
   }
diff --git a/java/core/src/test/org/apache/orc/util/TestOrcUtils.java b/java/core/src/test/org/apache/orc/util/TestOrcUtils.java
new file mode 100644
index 0000000..26c5f08
--- /dev/null
+++ b/java/core/src/test/org/apache/orc/util/TestOrcUtils.java
@@ -0,0 +1,100 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.util;
+
+import java.util.Arrays;
+
+import org.apache.orc.OrcUtils;
+import org.apache.orc.TypeDescription;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Tests for OrcUtils.
+ */
+public class TestOrcUtils {
+
+  @Test
+  public void testBloomFilterIncludeColumns() {
+    TypeDescription schema = TypeDescription.createStruct()
+        .addField("msisdn", TypeDescription.createString())
+        .addField("imsi",  TypeDescription.createVarchar())
+        .addField("imei", TypeDescription.createInt());
+
+    boolean[] includeColumns = new boolean[3+1];
+    includeColumns[1] = true;
+    includeColumns[3] = true;
+
+    Assert.assertTrue(Arrays.equals(includeColumns,
+        OrcUtils.includeColumns("msisdn, imei", schema)));
+  }
+
+  @Test
+  public void testBloomFilterIncludeColumns_ACID() {
+    TypeDescription rowSchema = TypeDescription.createStruct()
+        .addField("msisdn", TypeDescription.createString())
+        .addField("imei", TypeDescription.createInt());
+
+    TypeDescription schema = TypeDescription.createStruct()
+        .addField("operation", TypeDescription.createString())
+        .addField("originalTransaction", TypeDescription.createInt())
+        .addField("bucket", TypeDescription.createInt())
+        .addField("rowId", TypeDescription.createInt())
+        .addField("currentTransaction", TypeDescription.createInt())
+        .addField("row", rowSchema);
+
+    boolean[] includeColumns = new boolean[8+1];
+    includeColumns[7] = true;
+
+    Assert.assertTrue(Arrays.equals(includeColumns,
+        OrcUtils.includeColumns("msisdn", schema)));
+  }
+
+  @Test
+  public void testBloomFilterIncludeColumns_Nested() {
+    TypeDescription rowSchema = TypeDescription.createStruct()
+        .addField("msisdn", TypeDescription.createString())
+        .addField("imei", TypeDescription.createInt());
+
+    TypeDescription schema = TypeDescription.createStruct()
+        .addField("row", rowSchema);
+
+    boolean[] includeColumns = new boolean[3+1];
+    includeColumns[2] = true;
+
+    Assert.assertTrue(Arrays.equals(includeColumns,
+        OrcUtils.includeColumns("row.msisdn", schema)));
+  }
+
+  @Test
+  public void testBloomFilterIncludeColumns_NonExisting() {
+    TypeDescription rowSchema = TypeDescription.createStruct()
+        .addField("msisdn", TypeDescription.createString())
+        .addField("imei", TypeDescription.createInt());
+
+    TypeDescription schema = TypeDescription.createStruct()
+        .addField("row", rowSchema);
+
+    boolean[] includeColumns = new boolean[3+1];
+
+    Assert.assertTrue(Arrays.equals(includeColumns,
+        OrcUtils.includeColumns("msisdn, row.msisdn2", schema)));
+  }
+}