You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by pv...@apache.org on 2021/11/08 09:56:17 UTC
[hive] branch master updated: HIVE-25673: Column pruning fix for MR tasks (Peter Vary reviewed by Marton Bod) (#2765)

This is an automated email from the ASF dual-hosted git repository.

pvary pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new 85b98a0  HIVE-25673: Column pruning fix for MR tasks (Peter Vary reviewed by Marton Bod) (#2765)
85b98a0 is described below

commit 85b98a011f98952f5c5755c7a0c036b48b2bd17a
Author: pvary <pv...@cloudera.com>
AuthorDate: Mon Nov 8 10:56:06 2021 +0100

    HIVE-25673: Column pruning fix for MR tasks (Peter Vary reviewed by Marton Bod) (#2765)
---
 .../iceberg/mr/hive/TestHiveIcebergSelects.java    | 26 ++++++++++++++++++++++
 .../org/apache/iceberg/mr/hive/TestTables.java     |  2 +-
 .../apache/hadoop/hive/ql/exec/MapOperator.java    | 16 +++++++------
 3 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergSelects.java b/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergSelects.java
index 84e8b57..96e5dc2 100644
--- a/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergSelects.java
+++ b/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergSelects.java
@@ -33,6 +33,7 @@ import org.apache.iceberg.types.Types;
 import org.junit.Assert;
 import org.junit.Test;
 
+import static org.apache.iceberg.types.Types.NestedField.optional;
 import static org.apache.iceberg.types.Types.NestedField.required;
 
 /**
@@ -203,4 +204,29 @@ public class TestHiveIcebergSelects extends HiveIcebergStorageHandlerWithEngineB
     Assert.assertArrayEquals(new Object[] {0L, "Alice", "Brown"}, rows.get(0));
     Assert.assertArrayEquals(new Object[] {1L, "Bob", "Green"}, rows.get(1));
   }
+
+  /**
+   * Column pruning could become problematic when a single Map Task contains multiple TableScan operators where
+   * different columns are pruned. This only occurs on MR, as Tez initializes a single Map task for every TableScan
+   * operator. Regression test for HIVE-25673: joins two tables on "fk" but selects only "val" from the second one.
+   */
+  @Test
+  public void testMultiColumnPruning() throws IOException {
+    shell.setHiveSessionValue("hive.cbo.enable", true);
+
+    Schema schema1 = new Schema(optional(1, "fk", Types.StringType.get()));
+    List<Record> records1 = TestHelper.RecordsBuilder.newInstance(schema1).add("fk1").build();
+    testTables.createTable(shell, "table1", schema1, fileFormat, records1);
+
+    Schema schema2 = new Schema(optional(1, "fk", Types.StringType.get()), optional(2, "val", Types.StringType.get()));
+    List<Record> records2 = TestHelper.RecordsBuilder.newInstance(schema2).add("fk1", "val").build();
+    testTables.createTable(shell, "table2", schema2, fileFormat, records2);
+
+    // MR is needed for the reproduction; the engine is switched only after both test tables are created
+    shell.setHiveSessionValue("hive.execution.engine", "mr");
+    String query = "SELECT t2.val FROM table1 t1 JOIN table2 t2 ON t1.fk = t2.fk";
+    List<Object[]> result = shell.executeStatement(query);
+    Assert.assertEquals(1, result.size());
+    Assert.assertArrayEquals(new Object[]{"val"}, result.get(0));
+  }
 }
diff --git a/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestTables.java b/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestTables.java
index 5a6f38c..85ba748 100644
--- a/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestTables.java
+++ b/iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestTables.java
@@ -437,7 +437,7 @@ abstract class TestTables {
       }
 
       Assert.assertTrue(location.delete());
-      return location.toString();
+      return "file://" + location;
     }
 
     @Override
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java
index ea8e634..358dbbb 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java
@@ -37,7 +37,6 @@ import org.apache.hadoop.hive.ql.CompilationOpContext;
 import org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext;
 import org.apache.hadoop.hive.ql.io.AcidUtils;
 import org.apache.hadoop.hive.ql.io.RecordIdentifier;
-import org.apache.hadoop.hive.ql.io.orc.OrcRawRecordMerger;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
 import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
 import org.apache.hadoop.hive.ql.plan.MapWork;
@@ -340,10 +339,10 @@ public class MapOperator extends AbstractMapOperator {
   /**
    * For each source table, combine the nested column pruning information from all its
    * table scan descriptors and set it in a configuration copy. This is necessary since
-   * the configuration property "READ_NESTED_COLUMN_PATH_CONF_STR" is set on a per-table
-   * basis, so we can't just use a single configuration for all the tables.
+   * the configuration properties are set on a per-table basis, so we can't just use a
+   * single configuration for all the tables.
    */
-  private Map<String, Configuration> cloneConfsForNestedColPruning(Configuration hconf) {
+  private Map<String, Configuration> cloneConfsForColPruning(Configuration hconf) {
     Map<String, Configuration> tableNameToConf = new HashMap<>();
 
     for (Map.Entry<Path, List<String>> e : conf.getPathToAliases().entrySet()) {
@@ -369,10 +368,13 @@ public class MapOperator extends AbstractMapOperator {
         if (!tableNameToConf.containsKey(tableName)) {
           Configuration clonedConf = new Configuration(hconf);
           clonedConf.unset(ColumnProjectionUtils.READ_NESTED_COLUMN_PATH_CONF_STR);
+          clonedConf.unset(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR);
+          clonedConf.unset(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR);
           tableNameToConf.put(tableName, clonedConf);
         }
         Configuration newConf = tableNameToConf.get(tableName);
-        ColumnProjectionUtils.appendNestedColumnPaths(newConf, nestedColumnPaths);
+        ColumnProjectionUtils.appendReadColumns(newConf, tableScanDesc.getNeededColumnIDs(),
+            tableScanDesc.getOutputColumnNames(), tableScanDesc.getNeededNestedColumnPaths());
       }
     }
 
@@ -403,7 +405,7 @@ public class MapOperator extends AbstractMapOperator {
     throws SerDeException, Exception {
     setChildOperators(children);
 
-    Map<String, Configuration> tableNameToConf = cloneConfsForNestedColPruning(hconf);
+    Map<String, Configuration> tableNameToConf = cloneConfsForColPruning(hconf);
 
     for (Operator<?> child : children) {
       TableScanOperator tsOp = (TableScanOperator) child;
@@ -426,7 +428,7 @@ public class MapOperator extends AbstractMapOperator {
     List<Operator<? extends OperatorDesc>> children =
         new ArrayList<Operator<? extends OperatorDesc>>();
 
-    Map<String, Configuration> tableNameToConf = cloneConfsForNestedColPruning(hconf);
+    Map<String, Configuration> tableNameToConf = cloneConfsForColPruning(hconf);
     Map<TableDesc, StructObjectInspector> convertedOI = getConvertedOI(tableNameToConf);
 
     for (Map.Entry<Path, List<String>> entry : conf.getPathToAliases().entrySet()) {