Posted to commits@hive.apache.org by se...@apache.org on 2015/09/30 21:29:25 UTC
[13/14] hive git commit: HIVE-11823 : create a self-contained translation for SARG to be used by metastore (Sergey Shelukhin, reviewed by Prasanth Jayachandran)
HIVE-11823 : create a self-contained translation for SARG to be used by metastore (Sergey Shelukhin, reviewed by Prasanth Jayachandran)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/064e37c4
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/064e37c4
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/064e37c4
Branch: refs/heads/llap
Commit: 064e37c460d1c464431f740e480a6f08353d69e6
Parents: 8c8cc19
Author: Sergey Shelukhin <se...@apache.org>
Authored: Wed Sep 30 11:35:36 2015 -0700
Committer: Sergey Shelukhin <se...@apache.org>
Committed: Wed Sep 30 11:35:47 2015 -0700
----------------------------------------------------------------------
.../hadoop/hive/ql/io/orc/OrcInputFormat.java | 49 ++++++++++++++++++++
.../hadoop/hive/ql/io/orc/RecordReaderImpl.java | 35 ++++++++++++++
2 files changed, 84 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/064e37c4/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
index 52e1b06..c45b6e6 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
@@ -61,6 +61,7 @@ import org.apache.hadoop.hive.ql.io.sarg.ConvertAstToSearchArg;
import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.TruthValue;
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
@@ -254,6 +255,40 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
}
}
+  /**
+   * Modifies the SARG, replacing column names with column indexes in the target table schema.
+   * This does essentially the same thing as all the machinery around included columns, except
+   * for the last step, where ORC takes the direct subtypes of the root column and uses the
+   * ordered match to map table columns to file columns. The numbers put into the predicate
+   * leaves should make it possible to index directly into those subtypes to obtain the proper
+   * index in the file. This won't work with schema evolution, although it would probably be
+   * much easier to reason about if schema evolution were to be supported, because this is a
+   * clear boundary between table schema columns and all things ORC: none of the ORC machinery
+   * is used here, and none of the table schema information is used afterwards, so ORC needs no
+   * extra state to apply the SARG thus modified.
+   */
+  public static void translateSargToTableColIndexes(
+      SearchArgument sarg, Configuration conf, int rootColumn) {
+    String nameStr = getNeededColumnNamesString(conf), idStr = getSargColumnIDsString(conf);
+    String[] knownNames = nameStr.split(",");
+    String[] idStrs = (idStr == null) ? null : idStr.split(",");
+    assert idStrs == null || knownNames.length == idStrs.length;
+    HashMap<String, Integer> nameIdMap = new HashMap<>();
+    for (int i = 0; i < knownNames.length; ++i) {
+      nameIdMap.put(knownNames[i], idStrs != null ? Integer.parseInt(idStrs[i]) : i);
+    }
+    List<PredicateLeaf> leaves = sarg.getLeaves();
+    for (int i = 0; i < leaves.size(); ++i) {
+      PredicateLeaf pl = leaves.get(i);
+      Integer colId = nameIdMap.get(pl.getColumnName());
+      String newColName = RecordReaderImpl.encodeTranslatedSargColumn(rootColumn, colId);
+      SearchArgumentFactory.setPredicateLeafColumn(pl, newColName);
+    }
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("SARG translated into " + sarg);
+    }
+  }
+
   public static boolean[] genIncludedColumns(
       List<OrcProto.Type> types, List<Integer> included, boolean isOriginal) {
     int rootColumn = getRootColumn(isOriginal);
@@ -1342,6 +1377,20 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
directory);
}
+  public static boolean[] pickStripesViaTranslatedSarg(SearchArgument sarg,
+      WriterVersion writerVersion, List<OrcProto.Type> types,
+      List<StripeStatistics> stripeStats, int stripeCount) {
+    LOG.info("Translated ORC pushdown predicate: " + sarg);
+    assert sarg != null;
+    if (stripeStats == null || writerVersion == OrcFile.WriterVersion.ORIGINAL) {
+      return null; // only do split pruning if HIVE-8732 has been fixed in the writer
+    }
+    // Eliminate stripes that don't satisfy the predicate condition.
+    List<PredicateLeaf> sargLeaves = sarg.getLeaves();
+    int[] filterColumns = RecordReaderImpl.mapTranslatedSargColumns(types, sargLeaves);
+    return pickStripesInternal(sarg, filterColumns, stripeStats, stripeCount, null);
+  }
+
   private static boolean[] pickStripes(SearchArgument sarg, String[] sargColNames,
       WriterVersion writerVersion, boolean isOriginal, List<StripeStatistics> stripeStats,
       int stripeCount, Path filePath) {
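
For illustration, the translation described in the javadoc above can be exercised in isolation. The following standalone sketch (not part of this commit) mirrors what translateSargToTableColIndexes does with the conf-provided name and id strings; the class name, column names, ids, and the root column index of 0 are assumptions made up for the example:

import java.util.HashMap;
import java.util.Map;

public class SargTranslationDemo {
  // Mirrors RecordReaderImpl.encodeTranslatedSargColumn: encode a predicate leaf
  // column as "<rootColumn>_<indexInSourceTable>", using -1 for unmapped columns.
  static String encode(int rootColumn, Integer indexInSourceTable) {
    return rootColumn + "_" + ((indexInSourceTable == null) ? -1 : indexInSourceTable);
  }

  public static void main(String[] args) {
    // Needed column names and their ids, as they would arrive via the conf.
    String[] knownNames = "userid,ts,city".split(",");
    String[] idStrs = "0,1,2".split(",");
    Map<String, Integer> nameIdMap = new HashMap<>();
    for (int i = 0; i < knownNames.length; ++i) {
      nameIdMap.put(knownNames[i], Integer.parseInt(idStrs[i]));
    }
    int rootColumn = 0; // assumed root type index for this example
    for (String col : new String[] { "ts", "missing_col" }) {
      Integer colId = nameIdMap.get(col); // null when the leaf column isn't mapped
      System.out.println(col + " -> " + encode(rootColumn, colId));
    }
    // Prints:
    //   ts -> 0_1
    //   missing_col -> 0_-1
  }
}
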
http://git-wip-us.apache.org/repos/asf/hive/blob/064e37c4/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java
index ba304ba..77d2cc6 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java
@@ -1221,4 +1221,39 @@ class RecordReaderImpl implements RecordReader {
     // if we aren't to the right row yet, advance in the stripe.
     advanceToNextRow(reader, rowNumber, true);
   }
+
+  private static final String TRANSLATED_SARG_SEPARATOR = "_";
+  public static String encodeTranslatedSargColumn(int rootColumn, Integer indexInSourceTable) {
+    return rootColumn + TRANSLATED_SARG_SEPARATOR
+        + ((indexInSourceTable == null) ? -1 : indexInSourceTable);
+  }
+
+  public static int[] mapTranslatedSargColumns(
+      List<OrcProto.Type> types, List<PredicateLeaf> sargLeaves) {
+    int[] result = new int[sargLeaves.size()];
+    OrcProto.Type lastRoot = null; // The root will be the same for all leaves for now.
+    String lastRootStr = null;
+    for (int i = 0; i < result.length; ++i) {
+      String[] rootAndIndex = sargLeaves.get(i).getColumnName().split(TRANSLATED_SARG_SEPARATOR);
+      assert rootAndIndex.length == 2;
+      String rootStr = rootAndIndex[0], indexStr = rootAndIndex[1];
+      int index = Integer.parseInt(indexStr);
+      // First, check if the column even maps to anything.
+      if (index == -1) {
+        result[i] = -1;
+        continue;
+      }
+      assert index >= 0;
+      // Then, find the root type if needed.
+      if (!rootStr.equals(lastRootStr)) {
+        lastRoot = types.get(Integer.parseInt(rootStr));
+        lastRootStr = rootStr;
+      }
+      // Subtypes of the root type correspond, in order, to the columns in the table schema
+      // (disregarding schema evolution, which doesn't presently work). Get the index of the
+      // corresponding subtype.
+      result[i] = lastRoot.getSubtypes(index);
+    }
+    return result;
+  }
}
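
On the read side, mapTranslatedSargColumns needs only the encoded "<root>_<index>" strings and the ORC type list: the root type's subtypes, taken in order, give the file column for each table column. The following standalone sketch (not part of this commit) shows that decoding; the int[][] table is a simplified stand-in for List<OrcProto.Type>, and the class name and subtype ids are made up for the example:

public class TranslatedSargDecodeDemo {
  public static void main(String[] args) {
    // Row r holds the subtype ids of type r; type 0 is the root struct, so its
    // entries are the file column ids of the table columns in schema order.
    int[][] subtypes = { { 1, 2, 5 }, {}, { 3, 4 }, {}, {}, {} };
    String[] leafColumns = { "0_1", "0_-1" }; // encoded "<rootColumn>_<tableIndex>"
    int[] result = new int[leafColumns.length];
    for (int i = 0; i < leafColumns.length; ++i) {
      String[] rootAndIndex = leafColumns[i].split("_");
      int root = Integer.parseInt(rootAndIndex[0]);
      int index = Integer.parseInt(rootAndIndex[1]);
      // -1 means the SARG column doesn't map to any table column; keep it as -1.
      result[i] = (index == -1) ? -1 : subtypes[root][index];
      System.out.println(leafColumns[i] + " -> file column " + result[i]);
    }
    // Prints:
    //   0_1 -> file column 2
    //   0_-1 -> file column -1
  }
}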