Posted to commits@hive.apache.org by se...@apache.org on 2015/09/30 20:37:47 UTC

hive git commit: HIVE-11823 : create a self-contained translation for SARG to be used by metastore (Sergey Shelukhin, reviewed by Prasanth Jayachandran)

Repository: hive
Updated Branches:
  refs/heads/master 8c8cc19fd -> 064e37c46


HIVE-11823 : create a self-contained translation for SARG to be used by metastore (Sergey Shelukhin, reviewed by Prasanth Jayachandran)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/064e37c4
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/064e37c4
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/064e37c4

Branch: refs/heads/master
Commit: 064e37c460d1c464431f740e480a6f08353d69e6
Parents: 8c8cc19
Author: Sergey Shelukhin <se...@apache.org>
Authored: Wed Sep 30 11:35:36 2015 -0700
Committer: Sergey Shelukhin <se...@apache.org>
Committed: Wed Sep 30 11:35:47 2015 -0700

----------------------------------------------------------------------
 .../hadoop/hive/ql/io/orc/OrcInputFormat.java   | 49 ++++++++++++++++++++
 .../hadoop/hive/ql/io/orc/RecordReaderImpl.java | 35 ++++++++++++++
 2 files changed, 84 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/064e37c4/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
index 52e1b06..c45b6e6 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
@@ -61,6 +61,7 @@ import org.apache.hadoop.hive.ql.io.sarg.ConvertAstToSearchArg;
 import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
 import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
 import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.TruthValue;
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory;
 import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
 import org.apache.hadoop.hive.serde2.SerDeStats;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
@@ -254,6 +255,40 @@ public class OrcInputFormat  implements InputFormat<NullWritable, OrcStruct>,
     }
   }
 
+  /**
+   * Modifies the SARG, replacing column names with column indexes in the target table schema.
+   * This does essentially the same thing as the machinery around included columns, except for
+   * the last step, where ORC gets the direct subtypes of the root column and uses the ordered
+   * match to map table columns to file columns. The numbers put into each predicate leaf
+   * should allow going into said subtypes directly by index to get the proper index in the
+   * file. This won't work with schema evolution as it stands, although it would probably make
+   * schema evolution easier to reason about if it were to be supported, because this is a
+   * clear boundary between table schema columns and everything ORC-specific. Nothing
+   * ORC-specific is used here, and nothing from the table schema is used after this point -
+   * ORC doesn't need any extra context to apply the SARG thus modified.
+   */
+  public static void translateSargToTableColIndexes(
+      SearchArgument sarg, Configuration conf, int rootColumn) {
+    String nameStr = getNeededColumnNamesString(conf), idStr = getSargColumnIDsString(conf);
+    String[] knownNames = nameStr.split(",");
+    String[] idStrs = (idStr == null) ? null : idStr.split(",");
+    assert idStrs == null || knownNames.length == idStrs.length;
+    HashMap<String, Integer> nameIdMap = new HashMap<>();
+    for (int i = 0; i < knownNames.length; ++i) {
+      nameIdMap.put(knownNames[i], idStrs != null ? Integer.parseInt(idStrs[i]) : i);
+    }
+    List<PredicateLeaf> leaves = sarg.getLeaves();
+    for (int i = 0; i < leaves.size(); ++i) {
+      PredicateLeaf pl = leaves.get(i);
+      Integer colId = nameIdMap.get(pl.getColumnName());
+      String newColName = RecordReaderImpl.encodeTranslatedSargColumn(rootColumn, colId);
+      SearchArgumentFactory.setPredicateLeafColumn(pl, newColName);
+    }
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("SARG translated into " + sarg);
+    }
+  }
+
   public static boolean[] genIncludedColumns(
       List<OrcProto.Type> types, List<Integer> included, boolean isOriginal) {
     int rootColumn = getRootColumn(isOriginal);
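
For readers following the patch, here is a minimal, self-contained sketch of the
name-to-index rewrite that translateSargToTableColIndexes performs. It uses plain
Java collections instead of the Hive Configuration and SearchArgument classes, and
the column names, IDs, and root column are made up for illustration:

  import java.util.Arrays;
  import java.util.HashMap;
  import java.util.List;
  import java.util.Map;

  public class SargTranslationSketch {
    public static void main(String[] args) {
      // Hypothetical conf contents: needed columns "a,b,c" with SARG column IDs "0,1,2".
      String[] knownNames = "a,b,c".split(",");
      String[] idStrs = "0,1,2".split(",");
      Map<String, Integer> nameIdMap = new HashMap<>();
      for (int i = 0; i < knownNames.length; ++i) {
        nameIdMap.put(knownNames[i], Integer.parseInt(idStrs[i]));
      }
      int rootColumn = 0; // assuming an original (non-ACID) file, whose root type is 0
      // Leaf column names as they might appear in a predicate, plus one unknown column.
      List<String> leafColumns = Arrays.asList("b", "c", "unknown");
      for (String col : leafColumns) {
        Integer colId = nameIdMap.get(col);
        // Mirrors encodeTranslatedSargColumn: "root_index", with -1 for unmapped columns.
        String encoded = rootColumn + "_" + ((colId == null) ? -1 : colId);
        System.out.println(col + " -> " + encoded);
      }
      // Prints: b -> 0_1, c -> 0_2, unknown -> 0_-1
    }
  }
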
@@ -1342,6 +1377,20 @@ public class OrcInputFormat  implements InputFormat<NullWritable, OrcStruct>,
         directory);
   }
 
+  public static boolean[] pickStripesViaTranslatedSarg(SearchArgument sarg,
+      WriterVersion writerVersion, List<OrcProto.Type> types,
+      List<StripeStatistics> stripeStats, int stripeCount) {
+    assert sarg != null;
+    LOG.info("Translated ORC pushdown predicate: " + sarg);
+    if (stripeStats == null || writerVersion == OrcFile.WriterVersion.ORIGINAL) {
+      return null; // only do split pruning if HIVE-8732 has been fixed in the writer
+    }
+    // eliminate stripes that don't satisfy the predicate condition
+    List<PredicateLeaf> sargLeaves = sarg.getLeaves();
+    int[] filterColumns = RecordReaderImpl.mapTranslatedSargColumns(types, sargLeaves);
+    return pickStripesInternal(sarg, filterColumns, stripeStats, stripeCount, null);
+  }
+
   private static boolean[] pickStripes(SearchArgument sarg, String[] sargColNames,
       WriterVersion writerVersion, boolean isOriginal, List<StripeStatistics> stripeStats,
       int stripeCount, Path filePath) {
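
The new pickStripesViaTranslatedSarg entry point above hands the translated leaves to
the existing pickStripesInternal machinery, which compares each leaf against per-stripe
column statistics. As a rough standalone illustration of that kind of stats-based
elimination (greatly simplified: made-up min/max statistics for a single long column and
one fixed "col < 100" predicate, not the actual Hive evaluation code):

  import java.util.Arrays;

  public class StripePruningSketch {
    // Hypothetical per-stripe min/max statistics for one long column.
    static final long[] MIN = {0, 50, 200};
    static final long[] MAX = {49, 199, 500};

    public static void main(String[] args) {
      boolean[] includeStripe = new boolean[MIN.length];
      for (int i = 0; i < includeStripe.length; ++i) {
        // Keep a stripe unless its statistics prove that no row in it can satisfy
        // the predicate; stats-based pruning can only ever prove absence.
        includeStripe[i] = MIN[i] < 100;
      }
      System.out.println(Arrays.toString(includeStripe)); // [true, true, false]
    }
  }
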

http://git-wip-us.apache.org/repos/asf/hive/blob/064e37c4/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java
index ba304ba..77d2cc6 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java
@@ -1221,4 +1221,39 @@ class RecordReaderImpl implements RecordReader {
     // if we aren't to the right row yet, advance in the stripe.
     advanceToNextRow(reader, rowNumber, true);
   }
+
+  private static final String TRANSLATED_SARG_SEPARATOR = "_";
+  public static String encodeTranslatedSargColumn(int rootColumn, Integer indexInSourceTable) {
+    return rootColumn + TRANSLATED_SARG_SEPARATOR
+        + ((indexInSourceTable == null) ? -1 : indexInSourceTable);
+  }
+
+  public static int[] mapTranslatedSargColumns(
+      List<OrcProto.Type> types, List<PredicateLeaf> sargLeaves) {
+    int[] result = new int[sargLeaves.size()];
+    OrcProto.Type lastRoot = null; // The root will be the same for all leaves, for now.
+    String lastRootStr = null;
+    for (int i = 0; i < result.length; ++i) {
+      String[] rootAndIndex = sargLeaves.get(i).getColumnName().split(TRANSLATED_SARG_SEPARATOR);
+      assert rootAndIndex.length == 2;
+      String rootStr = rootAndIndex[0], indexStr = rootAndIndex[1];
+      int index = Integer.parseInt(indexStr);
+      // First, check if the column even maps to anything.
+      if (index == -1) {
+        result[i] = -1;
+        continue;
+      }
+      assert index >= 0;
+      // Then, find the root type if needed.
+      if (!rootStr.equals(lastRootStr)) {
+        lastRoot = types.get(Integer.parseInt(rootStr));
+        lastRootStr = rootStr;
+      }
+      // Subtypes of the root type correspond, in order, to the columns in the table schema
+      // (disregarding schema evolution, which doesn't presently work). Get the index of the
+      // corresponding subtype.
+      result[i] = lastRoot.getSubtypes(index);
+    }
+    return result;
+  }
 }
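
To tie the two files together, here is a self-contained sketch of the encode/decode
round trip: encodeTranslatedSargColumn packs the root type and the table column index
into the leaf's column name, and mapTranslatedSargColumns unpacks it and resolves the
subtype. The type list below is made up, standing in for the OrcProto.Type subtypes:

  import java.util.Arrays;
  import java.util.List;

  public class TranslatedSargMappingSketch {
    public static void main(String[] args) {
      // Hypothetical type tree: type 0 is the root struct, and its subtypes
      // (the file's top-level columns) are types 1, 2 and 3, in table order.
      List<Integer> rootSubtypes = Arrays.asList(1, 2, 3);
      int rootColumn = 0;

      // Encode: table column index 2, and an unmapped column (-1), as in the patch.
      String mapped = rootColumn + "_" + 2;    // "0_2"
      String unmapped = rootColumn + "_" + -1; // "0_-1"

      // Decode: split on the separator, then index into the root's subtypes
      // to get the column's type id in the file.
      for (String encoded : new String[] {mapped, unmapped}) {
        String[] rootAndIndex = encoded.split("_");
        int index = Integer.parseInt(rootAndIndex[1]);
        int fileColumn = (index == -1) ? -1 : rootSubtypes.get(index);
        System.out.println(encoded + " -> file column " + fileColumn);
      }
      // Prints: 0_2 -> file column 3, 0_-1 -> file column -1
    }
  }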