Posted to commits@hive.apache.org by om...@apache.org on 2016/11/29 23:16:33 UTC
hive git commit: HIVE-15124. Fix OrcInputFormat to use reader's schema for include boolean array. (Owen O'Malley reviewed by Prasanth Jayachandran)
Repository: hive
Updated Branches:
refs/heads/master 591880627 -> 4f2d1f509
HIVE-15124. Fix OrcInputFormat to use reader's schema for include boolean array. (Owen O'Malley reviewed by Prasanth Jayachandran)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/4f2d1f50
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/4f2d1f50
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/4f2d1f50
Branch: refs/heads/master
Commit: 4f2d1f509718e95fbc641abba0b5e53c5e903377
Parents: 5918806
Author: Owen O'Malley <om...@apache.org>
Authored: Fri Nov 4 10:45:56 2016 -0700
Committer: Owen O'Malley <om...@apache.org>
Committed: Tue Nov 29 15:15:45 2016 -0800
----------------------------------------------------------------------
.../hive/llap/io/api/impl/LlapInputFormat.java | 12 +-
.../llap/io/decode/ColumnVectorProducer.java | 6 +-
.../llap/io/decode/OrcColumnVectorProducer.java | 5 +-
.../llap/io/decode/OrcEncodedDataConsumer.java | 9 +
.../hive/llap/io/decode/ReadPipeline.java | 1 +
.../llap/io/encoded/OrcEncodedDataReader.java | 17 +-
.../java/org/apache/orc/TypeDescription.java | 212 ++++++++++++++++++-
.../java/org/apache/orc/impl/ReaderImpl.java | 7 -
.../org/apache/orc/impl/RecordReaderImpl.java | 53 ++---
.../org/apache/orc/impl/SchemaEvolution.java | 77 ++++++-
.../org/apache/orc/impl/TreeReaderFactory.java | 3 +-
pom.xml | 1 +
.../hadoop/hive/ql/io/orc/OrcInputFormat.java | 83 +++-----
.../hive/ql/io/orc/OrcRawRecordMerger.java | 13 +-
.../hadoop/hive/ql/io/orc/ReaderImpl.java | 9 +-
.../ql/io/orc/VectorizedOrcInputFormat.java | 5 +-
.../hive/ql/io/orc/TestInputOutputFormat.java | 198 ++++++++++++++++-
.../clientpositive/llap/orc_llap_counters.q.out | 138 +++++-------
.../llap/orc_llap_counters1.q.out | 2 +-
.../clientpositive/llap/orc_ppd_basic.q.out | 138 +++++-------
20 files changed, 681 insertions(+), 308 deletions(-)
----------------------------------------------------------------------
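[Editor's note, not part of the commit] The core of this change is that the include boolean array handed to ORC is now sized and indexed by the reader schema's column ids (readerSchema.getMaximumId() + 1) rather than by the file's protobuf type list, so column projection stays aligned when the reader schema carries extra evolved columns. A minimal sketch of the post-patch API shown in the diff below; the class name and values here are illustrative only:

    import java.util.Arrays;
    import java.util.List;

    import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
    import org.apache.orc.TypeDescription;

    public class IncludeArraySketch {
      public static void main(String[] args) {
        // Reader schema; column ids are root=0, a=1, b=2, c=3, d=4.
        TypeDescription readerSchema =
            TypeDescription.fromString("struct<a:int,b:struct<c:int>,d:string>");
        // Project top-level columns 0 (a) and 2 (d), as Hive column ids do.
        List<Integer> projected = Arrays.asList(0, 2);
        // New signature from this patch: the result is indexed by reader column id
        // and nested children of a projected column are marked as well.
        boolean[] include = OrcInputFormat.genIncludedColumns(readerSchema, projected);
        // include.length == readerSchema.getMaximumId() + 1 == 5
        // -> [true, true, false, false, true]
        System.out.println(Arrays.toString(include));
      }
    }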
http://git-wip-us.apache.org/repos/asf/hive/blob/4f2d1f50/llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapInputFormat.java
----------------------------------------------------------------------
diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapInputFormat.java b/llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapInputFormat.java
index e803125..290624d 100644
--- a/llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapInputFormat.java
+++ b/llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapInputFormat.java
@@ -72,6 +72,7 @@ import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hive.common.util.HiveStringUtils;
+import org.apache.orc.OrcUtils;
import org.apache.orc.TypeDescription;
import org.apache.orc.impl.SchemaEvolution;
import org.apache.tez.common.counters.TezCounters;
@@ -220,9 +221,11 @@ public class LlapInputFormat implements InputFormat<NullWritable, VectorizedRowB
} else {
partitionValues = null;
}
+ boolean isAcidScan = HiveConf.getBoolVar(jobConf, ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN);
+ TypeDescription schema = OrcInputFormat.getDesiredRowTypeDescr(job, isAcidScan, Integer.MAX_VALUE);
// Create the consumer of encoded data; it will coordinate decoding to CVBs.
- rp = cvp.createReadPipeline(this, split, columnIds, sarg, columnNames, counters);
+ rp = cvp.createReadPipeline(this, split, columnIds, sarg, columnNames, counters, schema);
feedback = rp;
fileSchema = rp.getFileSchema();
includedColumns = rp.getIncludedColumns();
@@ -232,11 +235,8 @@ public class LlapInputFormat implements InputFormat<NullWritable, VectorizedRowB
* Starts the data read pipeline
*/
public boolean init() {
- boolean isAcidScan = HiveConf.getBoolVar(jobConf, ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN);
- TypeDescription readerSchema = OrcInputFormat.getDesiredRowTypeDescr(jobConf, isAcidScan,
- Integer.MAX_VALUE);
- SchemaEvolution schemaEvolution = new SchemaEvolution(fileSchema, readerSchema,
- includedColumns);
+ SchemaEvolution schemaEvolution = new SchemaEvolution(fileSchema,
+ rp.getReaderSchema(), includedColumns);
for (Integer colId : columnIds) {
if (!schemaEvolution.isPPDSafeConversion(colId)) {
LlapIoImpl.LOG.warn("Unsupported schema evolution! Disabling Llap IO for {}", split);
http://git-wip-us.apache.org/repos/asf/hive/blob/4f2d1f50/llap-server/src/java/org/apache/hadoop/hive/llap/io/decode/ColumnVectorProducer.java
----------------------------------------------------------------------
diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/io/decode/ColumnVectorProducer.java b/llap-server/src/java/org/apache/hadoop/hive/llap/io/decode/ColumnVectorProducer.java
index b77dfbb..db86296 100644
--- a/llap-server/src/java/org/apache/hadoop/hive/llap/io/decode/ColumnVectorProducer.java
+++ b/llap-server/src/java/org/apache/hadoop/hive/llap/io/decode/ColumnVectorProducer.java
@@ -26,12 +26,14 @@ import org.apache.hadoop.hive.llap.io.api.impl.ColumnVectorBatch;
import org.apache.hadoop.hive.ql.io.orc.encoded.Consumer;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
import org.apache.hadoop.mapred.FileSplit;
+import org.apache.orc.TypeDescription;
/**
* Entry point used by LlapInputFormat to create read pipeline to get data.
*/
public interface ColumnVectorProducer {
ReadPipeline createReadPipeline(Consumer<ColumnVectorBatch> consumer, FileSplit split,
- List<Integer> columnIds, SearchArgument sarg, String[] columnNames,
- QueryFragmentCounters counters) throws IOException;
+ List<Integer> columnIds, SearchArgument sarg, String[] columnNames,
+ QueryFragmentCounters counters,
+ TypeDescription readerSchema) throws IOException;
}
http://git-wip-us.apache.org/repos/asf/hive/blob/4f2d1f50/llap-server/src/java/org/apache/hadoop/hive/llap/io/decode/OrcColumnVectorProducer.java
----------------------------------------------------------------------
diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/io/decode/OrcColumnVectorProducer.java b/llap-server/src/java/org/apache/hadoop/hive/llap/io/decode/OrcColumnVectorProducer.java
index 12275ac..2e9b9c3 100644
--- a/llap-server/src/java/org/apache/hadoop/hive/llap/io/decode/OrcColumnVectorProducer.java
+++ b/llap-server/src/java/org/apache/hadoop/hive/llap/io/decode/OrcColumnVectorProducer.java
@@ -35,6 +35,7 @@ import org.apache.hadoop.hive.llap.metrics.LlapDaemonIOMetrics;
import org.apache.hadoop.hive.ql.io.orc.encoded.Consumer;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
import org.apache.hadoop.mapred.FileSplit;
+import org.apache.orc.TypeDescription;
public class OrcColumnVectorProducer implements ColumnVectorProducer {
@@ -64,12 +65,12 @@ public class OrcColumnVectorProducer implements ColumnVectorProducer {
public ReadPipeline createReadPipeline(
Consumer<ColumnVectorBatch> consumer, FileSplit split,
List<Integer> columnIds, SearchArgument sarg, String[] columnNames,
- QueryFragmentCounters counters) throws IOException {
+ QueryFragmentCounters counters, TypeDescription readerSchema) throws IOException {
cacheMetrics.incrCacheReadRequests();
OrcEncodedDataConsumer edc = new OrcEncodedDataConsumer(consumer, columnIds.size(),
_skipCorrupt, counters, ioMetrics);
OrcEncodedDataReader reader = new OrcEncodedDataReader(lowLevelCache, bufferManager,
- metadataCache, conf, split, columnIds, sarg, columnNames, edc, counters);
+ metadataCache, conf, split, columnIds, sarg, columnNames, edc, counters, readerSchema);
edc.init(reader, reader);
return edc;
}
http://git-wip-us.apache.org/repos/asf/hive/blob/4f2d1f50/llap-server/src/java/org/apache/hadoop/hive/llap/io/decode/OrcEncodedDataConsumer.java
----------------------------------------------------------------------
diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/io/decode/OrcEncodedDataConsumer.java b/llap-server/src/java/org/apache/hadoop/hive/llap/io/decode/OrcEncodedDataConsumer.java
index 8bd985c..2cb7f79 100644
--- a/llap-server/src/java/org/apache/hadoop/hive/llap/io/decode/OrcEncodedDataConsumer.java
+++ b/llap-server/src/java/org/apache/hadoop/hive/llap/io/decode/OrcEncodedDataConsumer.java
@@ -70,6 +70,7 @@ public class OrcEncodedDataConsumer
private final boolean skipCorrupt; // TODO: get rid of this
private final QueryFragmentCounters counters;
private boolean[] includedColumns;
+ private TypeDescription readerSchema;
public OrcEncodedDataConsumer(
Consumer<ColumnVectorBatch> consumer, int colCount, boolean skipCorrupt,
@@ -273,4 +274,12 @@ public class OrcEncodedDataConsumer
public void setIncludedColumns(final boolean[] includedColumns) {
this.includedColumns = includedColumns;
}
+
+ public void setReaderSchema(TypeDescription readerSchema) {
+ this.readerSchema = readerSchema;
+ }
+
+ public TypeDescription getReaderSchema() {
+ return readerSchema;
+ }
}
http://git-wip-us.apache.org/repos/asf/hive/blob/4f2d1f50/llap-server/src/java/org/apache/hadoop/hive/llap/io/decode/ReadPipeline.java
----------------------------------------------------------------------
diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/io/decode/ReadPipeline.java b/llap-server/src/java/org/apache/hadoop/hive/llap/io/decode/ReadPipeline.java
index 1987451..4e1b851 100644
--- a/llap-server/src/java/org/apache/hadoop/hive/llap/io/decode/ReadPipeline.java
+++ b/llap-server/src/java/org/apache/hadoop/hive/llap/io/decode/ReadPipeline.java
@@ -26,5 +26,6 @@ import org.apache.orc.TypeDescription;
public interface ReadPipeline extends ConsumerFeedback<ColumnVectorBatch> {
public Callable<Void> getReadCallable();
TypeDescription getFileSchema();
+ TypeDescription getReaderSchema();
boolean[] getIncludedColumns();
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/hive/blob/4f2d1f50/llap-server/src/java/org/apache/hadoop/hive/llap/io/encoded/OrcEncodedDataReader.java
----------------------------------------------------------------------
diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/io/encoded/OrcEncodedDataReader.java b/llap-server/src/java/org/apache/hadoop/hive/llap/io/encoded/OrcEncodedDataReader.java
index 121ec25..7944345 100644
--- a/llap-server/src/java/org/apache/hadoop/hive/llap/io/encoded/OrcEncodedDataReader.java
+++ b/llap-server/src/java/org/apache/hadoop/hive/llap/io/encoded/OrcEncodedDataReader.java
@@ -144,6 +144,7 @@ public class OrcEncodedDataReader extends CallableWithNdc<Void>
private final OrcEncodedDataConsumer consumer;
private final QueryFragmentCounters counters;
private final UserGroupInformation ugi;
+ private final SchemaEvolution evolution;
// Read state.
private int stripeIxFrom;
@@ -168,7 +169,7 @@ public class OrcEncodedDataReader extends CallableWithNdc<Void>
public OrcEncodedDataReader(LowLevelCache lowLevelCache, BufferUsageManager bufferManager,
OrcMetadataCache metadataCache, Configuration conf, FileSplit split, List<Integer> columnIds,
SearchArgument sarg, String[] columnNames, OrcEncodedDataConsumer consumer,
- QueryFragmentCounters counters) throws IOException {
+ QueryFragmentCounters counters, TypeDescription readerSchema) throws IOException {
this.lowLevelCache = lowLevelCache;
this.metadataCache = metadataCache;
this.bufferManager = bufferManager;
@@ -197,9 +198,14 @@ public class OrcEncodedDataReader extends CallableWithNdc<Void>
fileKey = determineFileId(fs, split,
HiveConf.getBoolVar(conf, ConfVars.LLAP_CACHE_ALLOW_SYNTHETIC_FILEID));
fileMetadata = getOrReadFileMetadata();
- globalIncludes = OrcInputFormat.genIncludedColumns(fileMetadata.getTypes(), includedColumnIds, true);
+ if (readerSchema == null) {
+ readerSchema = fileMetadata.getSchema();
+ }
+ globalIncludes = OrcInputFormat.genIncludedColumns(readerSchema, includedColumnIds);
+ evolution = new SchemaEvolution(fileMetadata.getSchema(), readerSchema, globalIncludes);
consumer.setFileMetadata(fileMetadata);
consumer.setIncludedColumns(globalIncludes);
+ consumer.setReaderSchema(readerSchema);
}
@Override
@@ -268,8 +274,11 @@ public class OrcEncodedDataReader extends CallableWithNdc<Void>
try {
if (sarg != null && stride != 0) {
// TODO: move this to a common method
+ TypeDescription schema = OrcUtils.convertTypeFromProtobuf(fileMetadata.getTypes(), 0);
+ SchemaEvolution evolution = new SchemaEvolution(schema,
+ null, globalIncludes);
int[] filterColumns = RecordReaderImpl.mapSargColumnsToOrcInternalColIdx(
- sarg.getLeaves(), columnNames, 0);
+ sarg.getLeaves(), evolution);
// included will not be null, row options will fill the array with trues if null
sargColumns = new boolean[globalIncludes.length];
for (int i : filterColumns) {
@@ -694,7 +703,7 @@ public class OrcEncodedDataReader extends CallableWithNdc<Void>
TypeDescription schema = OrcUtils.convertTypeFromProtobuf(types, 0);
SchemaEvolution schemaEvolution = new SchemaEvolution(schema, globalIncludes);
sargApp = new RecordReaderImpl.SargApplier(sarg, colNamesForSarg,
- rowIndexStride, globalIncludes.length, schemaEvolution);
+ rowIndexStride, schemaEvolution);
}
boolean hasAnyData = false;
// readState should have been initialized by this time with an empty array.
http://git-wip-us.apache.org/repos/asf/hive/blob/4f2d1f50/orc/src/java/org/apache/orc/TypeDescription.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/TypeDescription.java b/orc/src/java/org/apache/orc/TypeDescription.java
index 1585e43..2e9328b 100644
--- a/orc/src/java/org/apache/orc/TypeDescription.java
+++ b/orc/src/java/org/apache/orc/TypeDescription.java
@@ -178,6 +178,191 @@ public class TypeDescription
return new TypeDescription(Category.DECIMAL);
}
+ static class StringPosition {
+ final String value;
+ int position;
+ final int length;
+
+ StringPosition(String value) {
+ this.value = value;
+ position = 0;
+ length = value.length();
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder buffer = new StringBuilder();
+ buffer.append('\'');
+ buffer.append(value.substring(0, position));
+ buffer.append('^');
+ buffer.append(value.substring(position));
+ buffer.append('\'');
+ return buffer.toString();
+ }
+ }
+
+ static Category parseCategory(StringPosition source) {
+ int start = source.position;
+ while (source.position < source.length) {
+ char ch = source.value.charAt(source.position);
+ if (!Character.isLetter(ch)) {
+ break;
+ }
+ source.position += 1;
+ }
+ if (source.position != start) {
+ String word = source.value.substring(start, source.position).toLowerCase();
+ for (Category cat : Category.values()) {
+ if (cat.getName().equals(word)) {
+ return cat;
+ }
+ }
+ }
+ throw new IllegalArgumentException("Can't parse category at " + source);
+ }
+
+ static int parseInt(StringPosition source) {
+ int start = source.position;
+ int result = 0;
+ while (source.position < source.length) {
+ char ch = source.value.charAt(source.position);
+ if (!Character.isDigit(ch)) {
+ break;
+ }
+ result = result * 10 + (ch - '0');
+ source.position += 1;
+ }
+ if (source.position == start) {
+ throw new IllegalArgumentException("Missing integer at " + source);
+ }
+ return result;
+ }
+
+ static String parseName(StringPosition source) {
+ int start = source.position;
+ while (source.position < source.length) {
+ char ch = source.value.charAt(source.position);
+ if (!Character.isLetterOrDigit(ch) && ch != '.' && ch != '_') {
+ break;
+ }
+ source.position += 1;
+ }
+ if (source.position == start) {
+ throw new IllegalArgumentException("Missing name at " + source);
+ }
+ return source.value.substring(start, source.position);
+ }
+
+ static void requireChar(StringPosition source, char required) {
+ if (source.position >= source.length ||
+ source.value.charAt(source.position) != required) {
+ throw new IllegalArgumentException("Missing required char '" +
+ required + "' at " + source);
+ }
+ source.position += 1;
+ }
+
+ static boolean consumeChar(StringPosition source, char ch) {
+ boolean result = source.position < source.length &&
+ source.value.charAt(source.position) == ch;
+ if (result) {
+ source.position += 1;
+ }
+ return result;
+ }
+
+ static void parseUnion(TypeDescription type, StringPosition source) {
+ requireChar(source, '<');
+ do {
+ type.addUnionChild(parseType(source));
+ } while (consumeChar(source, ','));
+ requireChar(source, '>');
+ }
+
+ static void parseStruct(TypeDescription type, StringPosition source) {
+ requireChar(source, '<');
+ do {
+ String fieldName = parseName(source);
+ requireChar(source, ':');
+ type.addField(fieldName, parseType(source));
+ } while (consumeChar(source, ','));
+ requireChar(source, '>');
+ }
+
+ static TypeDescription parseType(StringPosition source) {
+ TypeDescription result = new TypeDescription(parseCategory(source));
+ switch (result.getCategory()) {
+ case BINARY:
+ case BOOLEAN:
+ case BYTE:
+ case DATE:
+ case DOUBLE:
+ case FLOAT:
+ case INT:
+ case LONG:
+ case SHORT:
+ case STRING:
+ case TIMESTAMP:
+ break;
+ case CHAR:
+ case VARCHAR:
+ requireChar(source, '(');
+ result.withMaxLength(parseInt(source));
+ requireChar(source, ')');
+ break;
+ case DECIMAL: {
+ requireChar(source, '(');
+ int precision = parseInt(source);
+ requireChar(source, ',');
+ result.withScale(parseInt(source));
+ result.withPrecision(precision);
+ requireChar(source, ')');
+ break;
+ }
+ case LIST:
+ requireChar(source, '<');
+ result.children.add(parseType(source));
+ requireChar(source, '>');
+ break;
+ case MAP:
+ requireChar(source, '<');
+ result.children.add(parseType(source));
+ requireChar(source, ',');
+ result.children.add(parseType(source));
+ requireChar(source, '>');
+ break;
+ case UNION:
+ parseUnion(result, source);
+ break;
+ case STRUCT:
+ parseStruct(result, source);
+ break;
+ default:
+ throw new IllegalArgumentException("Unknown type " +
+ result.getCategory() + " at " + source);
+ }
+ return result;
+ }
+
+ /**
+ * Parse TypeDescription from the Hive type names. This is the inverse
+ * of TypeDescription.toString()
+ * @param typeName the name of the type
+ * @return a new TypeDescription or null if typeName was null
+ * @throws IllegalArgumentException if the string is badly formed
+ */
+ public static TypeDescription fromString(String typeName) {
+ if (typeName == null) {
+ return null;
+ }
+ StringPosition source = new StringPosition(typeName);
+ TypeDescription result = parseType(source);
+ if (source.position != source.length) {
+ throw new IllegalArgumentException("Extra characters at " + source);
+ }
+ return result;
+ }
+
/**
* For decimal types, set the precision.
* @param precision the new precision
@@ -657,4 +842,29 @@ public class TypeDescription
printJsonToBuffer("", buffer, 0);
return buffer.toString();
}
-}
+
+ /**
+ * Locate a subtype by its id.
+ * @param goal the column id to look for
+ * @return the subtype
+ */
+ public TypeDescription findSubtype(int goal) {
+ // call getId method to make sure the ids are assigned
+ int id = getId();
+ if (goal < id || goal > maxId) {
+ throw new IllegalArgumentException("Unknown type id " + id + " in " +
+ toJson());
+ }
+ if (goal == id) {
+ return this;
+ } else {
+ TypeDescription prev = null;
+ for(TypeDescription next: children) {
+ if (next.id > goal) {
+ return prev.findSubtype(goal);
+ }
+ prev = next;
+ }
+ return prev.findSubtype(goal);
+ }
+ }}
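[Editor's note, not part of the commit] The TypeDescription.fromString parser added above is documented as the inverse of TypeDescription.toString(); a quick, illustrative round-trip under that assumption (names and output comments are hypothetical, not asserted by the patch):

    import org.apache.orc.TypeDescription;

    public class FromStringSketch {
      public static void main(String[] args) {
        TypeDescription schema = TypeDescription.fromString(
            "struct<a:int,b:struct<c:int>,d:decimal(10,2)>");
        // toString() reproduces the same Hive type-name string.
        System.out.println(schema);
        // Column ids: root=0, a=1, b=2, c=3, d=4, so the maximum id is 4.
        System.out.println(schema.getMaximumId());
      }
    }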
http://git-wip-us.apache.org/repos/asf/hive/blob/4f2d1f50/orc/src/java/org/apache/orc/impl/ReaderImpl.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/ReaderImpl.java b/orc/src/java/org/apache/orc/impl/ReaderImpl.java
index d6df7d7..93fc0ce 100644
--- a/orc/src/java/org/apache/orc/impl/ReaderImpl.java
+++ b/orc/src/java/org/apache/orc/impl/ReaderImpl.java
@@ -572,13 +572,6 @@ public class ReaderImpl implements Reader {
@Override
public RecordReader rows(Options options) throws IOException {
LOG.info("Reading ORC rows from " + path + " with " + options);
- boolean[] include = options.getInclude();
- // if included columns is null, then include all columns
- if (include == null) {
- include = new boolean[types.size()];
- Arrays.fill(include, true);
- options.include(include);
- }
return new RecordReaderImpl(this, options);
}
http://git-wip-us.apache.org/repos/asf/hive/blob/4f2d1f50/orc/src/java/org/apache/orc/impl/RecordReaderImpl.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/RecordReaderImpl.java b/orc/src/java/org/apache/orc/impl/RecordReaderImpl.java
index 92b6a8b..975804b 100644
--- a/orc/src/java/org/apache/orc/impl/RecordReaderImpl.java
+++ b/orc/src/java/org/apache/orc/impl/RecordReaderImpl.java
@@ -75,6 +75,7 @@ public class RecordReaderImpl implements RecordReader {
private final List<OrcProto.Type> types;
private final int bufferSize;
private final SchemaEvolution evolution;
+ // the file included columns indexed by the file's column ids.
private final boolean[] included;
private final long rowIndexStride;
private long rowInStripe = 0;
@@ -95,17 +96,19 @@ public class RecordReaderImpl implements RecordReader {
/**
* Given a list of column names, find the given column and return the index.
*
- * @param columnNames the list of potential column names
+ * @param evolution the mapping from reader to file schema
* @param columnName the column name to look for
- * @param rootColumn offset the result with the rootColumn
- * @return the column number or -1 if the column wasn't found
+ * @return the file column id or -1 if the column wasn't found
*/
- static int findColumns(String[] columnNames,
- String columnName,
- int rootColumn) {
- for(int i=0; i < columnNames.length; ++i) {
- if (columnName.equals(columnNames[i])) {
- return i + rootColumn;
+ static int findColumns(SchemaEvolution evolution,
+ String columnName) {
+ TypeDescription readerSchema = evolution.getReaderBaseSchema();
+ List<String> fieldNames = readerSchema.getFieldNames();
+ List<TypeDescription> children = readerSchema.getChildren();
+ for (int i = 0; i < fieldNames.size(); ++i) {
+ if (columnName.equals(fieldNames.get(i))) {
+ TypeDescription result = evolution.getFileType(children.get(i).getId());
+ return result == null ? -1 : result.getId();
}
}
return -1;
@@ -114,40 +117,36 @@ public class RecordReaderImpl implements RecordReader {
/**
* Find the mapping from predicate leaves to columns.
* @param sargLeaves the search argument that we need to map
- * @param columnNames the names of the columns
- * @param rootColumn the offset of the top level row, which offsets the
- * result
- * @return an array mapping the sarg leaves to concrete column numbers
+ * @param evolution the mapping from reader to file schema
+ * @return an array mapping the sarg leaves to file column ids
*/
public static int[] mapSargColumnsToOrcInternalColIdx(List<PredicateLeaf> sargLeaves,
- String[] columnNames,
- int rootColumn) {
+ SchemaEvolution evolution) {
int[] result = new int[sargLeaves.size()];
Arrays.fill(result, -1);
for(int i=0; i < result.length; ++i) {
String colName = sargLeaves.get(i).getColumnName();
- result[i] = findColumns(columnNames, colName, rootColumn);
+ result[i] = findColumns(evolution, colName);
}
return result;
}
protected RecordReaderImpl(ReaderImpl fileReader,
Reader.Options options) throws IOException {
- this.included = options.getInclude();
- included[0] = true;
+ boolean[] readerIncluded = options.getInclude();
if (options.getSchema() == null) {
if (LOG.isInfoEnabled()) {
LOG.info("Reader schema not provided -- using file schema " +
fileReader.getSchema());
}
- evolution = new SchemaEvolution(fileReader.getSchema(), included);
+ evolution = new SchemaEvolution(fileReader.getSchema(), readerIncluded);
} else {
// Now that we are creating a record reader for a file, validate that the schema to read
// is compatible with the file schema.
//
evolution = new SchemaEvolution(fileReader.getSchema(),
- options.getSchema(),included);
+ options.getSchema(), readerIncluded);
if (LOG.isDebugEnabled() && evolution.hasConversion()) {
LOG.debug("ORC file " + fileReader.path.toString() +
" has data type conversion --\n" +
@@ -164,7 +163,7 @@ public class RecordReaderImpl implements RecordReader {
SearchArgument sarg = options.getSearchArgument();
if (sarg != null && rowIndexStride != 0) {
sargApp = new SargApplier(sarg, options.getColumnNames(), rowIndexStride,
- included.length, evolution);
+ evolution);
} else {
sargApp = null;
}
@@ -209,9 +208,10 @@ public class RecordReaderImpl implements RecordReader {
}
reader = TreeReaderFactory.createTreeReader(evolution.getReaderSchema(),
- evolution, included, skipCorrupt);
+ evolution, readerIncluded, skipCorrupt);
indexes = new OrcProto.RowIndex[types.size()];
bloomFilterIndices = new OrcProto.BloomFilterIndex[types.size()];
+ this.included = evolution.getFileIncluded();
advanceToNextRow(reader, 0L, true);
}
@@ -710,14 +710,15 @@ public class RecordReaderImpl implements RecordReader {
private final boolean[] sargColumns;
private SchemaEvolution evolution;
- public SargApplier(SearchArgument sarg, String[] columnNames, long rowIndexStride,
- int includedCount, final SchemaEvolution evolution) {
+ public SargApplier(SearchArgument sarg, String[] columnNames,
+ long rowIndexStride,
+ SchemaEvolution evolution) {
this.sarg = sarg;
sargLeaves = sarg.getLeaves();
- filterColumns = mapSargColumnsToOrcInternalColIdx(sargLeaves, columnNames, 0);
+ filterColumns = mapSargColumnsToOrcInternalColIdx(sargLeaves, evolution);
this.rowIndexStride = rowIndexStride;
// included will not be null, row options will fill the array with trues if null
- sargColumns = new boolean[includedCount];
+ sargColumns = new boolean[evolution.getFileIncluded().length];
for (int i : filterColumns) {
// filter columns may have -1 as index which could be partition column in SARG.
if (i > 0) {
http://git-wip-us.apache.org/repos/asf/hive/blob/4f2d1f50/orc/src/java/org/apache/orc/impl/SchemaEvolution.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/SchemaEvolution.java b/orc/src/java/org/apache/orc/impl/SchemaEvolution.java
index 7379de9..bb5bcf7 100644
--- a/orc/src/java/org/apache/orc/impl/SchemaEvolution.java
+++ b/orc/src/java/org/apache/orc/impl/SchemaEvolution.java
@@ -32,7 +32,11 @@ public class SchemaEvolution {
// indexed by reader column id
private final TypeDescription[] readerFileTypes;
// indexed by reader column id
- private final boolean[] included;
+ private final boolean[] readerIncluded;
+ // the offset to the first column id ignoring any ACID columns
+ private final int readerColumnOffset;
+ // indexed by file column id
+ private final boolean[] fileIncluded;
private final TypeDescription fileSchema;
private final TypeDescription readerSchema;
private boolean hasConversion;
@@ -46,20 +50,36 @@ public class SchemaEvolution {
public SchemaEvolution(TypeDescription fileSchema,
TypeDescription readerSchema,
boolean[] includeCols) {
- this.included = includeCols == null ? null : Arrays.copyOf(includeCols, includeCols.length);
+ this.readerIncluded = includeCols == null ? null : Arrays.copyOf(includeCols, includeCols.length);
this.hasConversion = false;
this.fileSchema = fileSchema;
+ boolean isAcid = checkAcidSchema(fileSchema);
+ this.readerColumnOffset = isAcid ? acidEventFieldNames.size() : 0;
if (readerSchema != null) {
- if (checkAcidSchema(fileSchema)) {
+ if (isAcid) {
this.readerSchema = createEventSchema(readerSchema);
} else {
this.readerSchema = readerSchema;
}
+ if (readerIncluded != null &&
+ readerIncluded.length + readerColumnOffset != this.readerSchema.getMaximumId() + 1) {
+ throw new IllegalArgumentException("Include vector the wrong length: " +
+ this.readerSchema.toJson() + " with include length " +
+ readerIncluded.length);
+ }
this.readerFileTypes = new TypeDescription[this.readerSchema.getMaximumId() + 1];
+ this.fileIncluded = new boolean[fileSchema.getMaximumId() + 1];
buildConversionFileTypesArray(fileSchema, this.readerSchema);
} else {
this.readerSchema = fileSchema;
this.readerFileTypes = new TypeDescription[this.readerSchema.getMaximumId() + 1];
+ this.fileIncluded = readerIncluded;
+ if (readerIncluded != null &&
+ readerIncluded.length + readerColumnOffset != this.readerSchema.getMaximumId() + 1) {
+ throw new IllegalArgumentException("Include vector the wrong length: " +
+ this.readerSchema.toJson() + " with include length " +
+ readerIncluded.length);
+ }
buildSameSchemaFileTypesArray();
}
this.ppdSafeConversion = populatePpdSafeConversion();
@@ -70,6 +90,15 @@ public class SchemaEvolution {
}
/**
+ * Returns the non-ACID (aka base) reader type description.
+ *
+ * @return the reader type ignoring the ACID rowid columns, if any
+ */
+ public TypeDescription getReaderBaseSchema() {
+ return readerSchema.findSubtype(readerColumnOffset);
+ }
+
+ /**
* Is there Schema Evolution data type conversion?
* @return
*/
@@ -82,6 +111,22 @@ public class SchemaEvolution {
}
/**
+ * Get whether each column is included from the reader's point of view.
+ * @return a boolean array indexed by reader column id
+ */
+ public boolean[] getReaderIncluded() {
+ return readerIncluded;
+ }
+
+ /**
+ * Get whether each column is included from the file's point of view.
+ * @return a boolean array indexed by file column id
+ */
+ public boolean[] getFileIncluded() {
+ return fileIncluded;
+ }
+
+ /**
* Get the file type by reader type id.
* @param id reader column id
* @return
@@ -189,10 +234,22 @@ public class SchemaEvolution {
return false;
}
+ /**
+ * Should we read the given reader column?
+ * @param readerId the id of column in the extended reader schema
+ * @return true if the column should be read
+ */
+ public boolean includeReaderColumn(int readerId) {
+ return readerIncluded == null ||
+ readerId <= readerColumnOffset ||
+ readerIncluded[readerId - readerColumnOffset];
+ }
+
void buildConversionFileTypesArray(TypeDescription fileType,
TypeDescription readerType) {
// if the column isn't included, don't map it
- if (included != null && !included[readerType.getId()]) {
+ int readerId = readerType.getId();
+ if (!includeReaderColumn(readerId)) {
return;
}
boolean isOk = true;
@@ -266,17 +323,17 @@ public class SchemaEvolution {
hasConversion = true;
}
if (isOk) {
- int id = readerType.getId();
- if (readerFileTypes[id] != null) {
+ if (readerFileTypes[readerId] != null) {
throw new RuntimeException("reader to file type entry already assigned");
}
- readerFileTypes[id] = fileType;
+ readerFileTypes[readerId] = fileType;
+ fileIncluded[fileType.getId()] = true;
} else {
throw new IllegalArgumentException(
String.format(
"ORC does not support type conversion from file type %s (%d) to reader type %s (%d)",
fileType.toString(), fileType.getId(),
- readerType.toString(), readerType.getId()));
+ readerType.toString(), readerId));
}
}
@@ -289,10 +346,10 @@ public class SchemaEvolution {
}
void buildSameSchemaFileTypesArrayRecurse(TypeDescription readerType) {
- if (included != null && !included[readerType.getId()]) {
+ int id = readerType.getId();
+ if (!includeReaderColumn(id)) {
return;
}
- int id = readerType.getId();
if (readerFileTypes[id] != null) {
throw new RuntimeException("reader to file type entry already assigned");
}
http://git-wip-us.apache.org/repos/asf/hive/blob/4f2d1f50/orc/src/java/org/apache/orc/impl/TreeReaderFactory.java
----------------------------------------------------------------------
diff --git a/orc/src/java/org/apache/orc/impl/TreeReaderFactory.java b/orc/src/java/org/apache/orc/impl/TreeReaderFactory.java
index e46a0a4..484209b 100644
--- a/orc/src/java/org/apache/orc/impl/TreeReaderFactory.java
+++ b/orc/src/java/org/apache/orc/impl/TreeReaderFactory.java
@@ -2082,8 +2082,7 @@ public class TreeReaderFactory {
boolean skipCorrupt
) throws IOException {
TypeDescription fileType = evolution.getFileType(readerType);
- if (fileType == null ||
- (included != null && !included[readerType.getId()])) {
+ if (fileType == null || !evolution.includeReaderColumn(readerType.getId())){
return new NullTreeReader(0);
}
TypeDescription.Category readerTypeCategory = readerType.getCategory();
http://git-wip-us.apache.org/repos/asf/hive/blob/4f2d1f50/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 8c58f4c..3fc35bc 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1011,6 +1011,7 @@
<reuseForks>false</reuseForks>
<failIfNoTests>false</failIfNoTests>
<argLine>${maven.test.jvm.args}</argLine>
+ <trimStackTrace>false</trimStackTrace>
<additionalClasspathElements>
<additionalClasspathElement>${test.conf.dir}</additionalClasspathElement>
<additionalClasspathElement>${basedir}/${hive.path.to.root}/conf</additionalClasspathElement>
http://git-wip-us.apache.org/repos/asf/hive/blob/4f2d1f50/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
index 572953a..3fe93ac 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
@@ -314,8 +314,11 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
Reader.Options options = new Reader.Options().range(offset, length);
options.schema(schema);
boolean isOriginal = isOriginal(file);
- List<OrcProto.Type> types = file.getTypes();
- options.include(genIncludedColumns(types, conf, isOriginal));
+ if (schema == null) {
+ schema = file.getSchema();
+ }
+ List<OrcProto.Type> types = OrcUtils.getOrcTypes(schema);
+ options.include(genIncludedColumns(schema, conf));
setSearchArgument(options, types, conf, isOriginal);
return file.rowsOptions(options);
}
@@ -324,35 +327,18 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
return !file.hasMetadataValue(OrcRecordUpdater.ACID_KEY_INDEX_NAME);
}
- /**
- * Recurse down into a type subtree turning on all of the sub-columns.
- * @param types the types of the file
- * @param result the global view of columns that should be included
- * @param typeId the root of tree to enable
- * @param rootColumn the top column
- */
- private static void includeColumnRecursive(List<OrcProto.Type> types,
- boolean[] result,
- int typeId,
- int rootColumn) {
- result[typeId - rootColumn] = true;
- OrcProto.Type type = types.get(typeId);
- int children = type.getSubtypesCount();
- for(int i=0; i < children; ++i) {
- includeColumnRecursive(types, result, type.getSubtypes(i), rootColumn);
- }
- }
+ public static boolean[] genIncludedColumns(TypeDescription readerSchema,
+ List<Integer> included) {
- public static boolean[] genIncludedColumns(
- List<OrcProto.Type> types, List<Integer> included, boolean isOriginal) {
- int rootColumn = getRootColumn(isOriginal);
- int numColumns = types.size() - rootColumn;
- boolean[] result = new boolean[numColumns];
+ boolean[] result = new boolean[readerSchema.getMaximumId() + 1];
result[0] = true;
- OrcProto.Type root = types.get(rootColumn);
- for (int i = 0; i < root.getSubtypesCount(); ++i) {
- if (included.contains(i)) {
- includeColumnRecursive(types, result, root.getSubtypes(i), rootColumn);
+ List<TypeDescription> children = readerSchema.getChildren();
+ for (int columnNumber = 0; columnNumber < children.size(); ++columnNumber) {
+ if (included.contains(columnNumber)) {
+ TypeDescription child = children.get(columnNumber);
+ for(int col = child.getId(); col <= child.getMaximumId(); ++col) {
+ result[col] = true;
+ }
}
}
return result;
@@ -360,15 +346,14 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
/**
* Take the configuration and figure out which columns we need to include.
- * @param types the types for the file
+ * @param readerSchema the types for the reader
* @param conf the configuration
- * @param isOriginal is the file in the original format?
*/
- public static boolean[] genIncludedColumns(
- List<OrcProto.Type> types, Configuration conf, boolean isOriginal) {
+ public static boolean[] genIncludedColumns(TypeDescription readerSchema,
+ Configuration conf) {
if (!ColumnProjectionUtils.isReadAllColumns(conf)) {
List<Integer> included = ColumnProjectionUtils.getReadColumnIDs(conf);
- return genIncludedColumns(types, included, isOriginal);
+ return genIncludedColumns(readerSchema, included);
} else {
return null;
}
@@ -1162,7 +1147,6 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
private List<StripeInformation> stripes;
private List<StripeStatistics> stripeStats;
private List<OrcProto.Type> fileTypes;
- private boolean[] included; // The included columns from the Hive configuration.
private boolean[] readerIncluded; // The included columns of the reader / file schema that
// include ACID columns if present.
private final boolean isOriginal;
@@ -1359,11 +1343,11 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
if ((deltas == null || deltas.isEmpty()) && context.sarg != null) {
String[] colNames =
extractNeededColNames((readerTypes == null ? fileTypes : readerTypes),
- context.conf, included, isOriginal);
+ context.conf, readerIncluded, isOriginal);
if (colNames == null) {
LOG.warn("Skipping split elimination for {} as column names is null", file.getPath());
} else {
- includeStripe = pickStripes(context.sarg, colNames, writerVersion, isOriginal,
+ includeStripe = pickStripes(context.sarg, writerVersion,
stripeStats, stripes.size(), file.getPath(), evolution);
}
}
@@ -1481,20 +1465,12 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
fileTypes = orcTail.getTypes();
TypeDescription fileSchema = OrcUtils.convertTypeFromProtobuf(fileTypes, 0);
if (readerTypes == null) {
- included = genIncludedColumns(fileTypes, context.conf, isOriginal);
- evolution = new SchemaEvolution(fileSchema, included);
- readerIncluded = included;
+ readerIncluded = genIncludedColumns(fileSchema, context.conf);
+ evolution = new SchemaEvolution(fileSchema, readerIncluded);
} else {
// The reader schema always comes in without ACID columns.
- included = genIncludedColumns(readerTypes, context.conf, /* isOriginal */ true);
- if (included != null && !isOriginal) {
- // We shift the include columns here because the SchemaEvolution constructor will
- // add the ACID event metadata the readerSchema...
- readerIncluded = shiftReaderIncludedForAcid(included);
- } else {
- readerIncluded = included;
- }
TypeDescription readerSchema = OrcUtils.convertTypeFromProtobuf(readerTypes, 0);
+ readerIncluded = genIncludedColumns(readerSchema, context.conf);
evolution = new SchemaEvolution(fileSchema, readerSchema, readerIncluded);
if (!isOriginal) {
// The SchemaEvolution class has added the ACID metadata columns. Let's update our
@@ -1992,8 +1968,8 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
Reader.Options readerOptions = new Reader.Options().schema(schema);
// TODO: Convert genIncludedColumns and setSearchArgument to use TypeDescription.
final List<OrcProto.Type> schemaTypes = OrcUtils.getOrcTypes(schema);
- readerOptions.include(OrcInputFormat.genIncludedColumns(schemaTypes, conf, SCHEMA_TYPES_IS_ORIGINAL));
- OrcInputFormat.setSearchArgument(readerOptions, schemaTypes, conf, SCHEMA_TYPES_IS_ORIGINAL);
+ readerOptions.include(OrcInputFormat.genIncludedColumns(schema, conf));
+ OrcInputFormat.setSearchArgument(readerOptions, schemaTypes, conf, true);
return readerOptions;
}
@@ -2037,8 +2013,9 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
return pickStripesInternal(sarg, filterColumns, stripeStats, stripeCount, null, evolution);
}
- private static boolean[] pickStripes(SearchArgument sarg, String[] sargColNames,
- OrcFile.WriterVersion writerVersion, boolean isOriginal, List<StripeStatistics> stripeStats,
+ private static boolean[] pickStripes(SearchArgument sarg,
+ OrcFile.WriterVersion writerVersion,
+ List<StripeStatistics> stripeStats,
int stripeCount, Path filePath, final SchemaEvolution evolution) {
if (sarg == null || stripeStats == null || writerVersion == OrcFile.WriterVersion.ORIGINAL) {
return null; // only do split pruning if HIVE-8732 has been fixed in the writer
@@ -2046,7 +2023,7 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
// eliminate stripes that doesn't satisfy the predicate condition
List<PredicateLeaf> sargLeaves = sarg.getLeaves();
int[] filterColumns = RecordReaderImpl.mapSargColumnsToOrcInternalColIdx(sargLeaves,
- sargColNames, getRootColumn(isOriginal));
+ evolution);
return pickStripesInternal(sarg, filterColumns, stripeStats, stripeCount, filePath, evolution);
}
http://git-wip-us.apache.org/repos/asf/hive/blob/4f2d1f50/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRawRecordMerger.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRawRecordMerger.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRawRecordMerger.java
index efde2db..fb7a6b2 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRawRecordMerger.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRawRecordMerger.java
@@ -393,18 +393,7 @@ public class OrcRawRecordMerger implements AcidInputFormat.RawReader<OrcStruct>{
static Reader.Options createEventOptions(Reader.Options options) {
Reader.Options result = options.clone();
result.range(options.getOffset(), Long.MAX_VALUE);
- // slide the columns down by 6 for the include array
- if (options.getInclude() != null) {
- boolean[] orig = options.getInclude();
- // we always need the base row
- orig[0] = true;
- boolean[] include = new boolean[orig.length + OrcRecordUpdater.FIELDS];
- Arrays.fill(include, 0, OrcRecordUpdater.FIELDS, true);
- for(int i= 0; i < orig.length; ++i) {
- include[i + OrcRecordUpdater.FIELDS] = orig[i];
- }
- result.include(include);
- }
+ result.include(options.getInclude());
// slide the column names down by 6 for the name array
if (options.getColumnNames() != null) {
http://git-wip-us.apache.org/repos/asf/hive/blob/4f2d1f50/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java
index 74c5071..cbbbb15 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java
@@ -26,6 +26,7 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.orc.TypeDescription;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -76,14 +77,6 @@ public class ReaderImpl extends org.apache.orc.impl.ReaderImpl
@Override
public RecordReader rowsOptions(Options options) throws IOException {
LOG.info("Reading ORC rows from " + path + " with " + options);
- boolean[] include = options.getInclude();
- // if included columns is null, then include all columns
- if (include == null) {
- options = options.clone();
- include = new boolean[types.size()];
- Arrays.fill(include, true);
- options.include(include);
- }
return new RecordReaderImpl(this, options);
}
http://git-wip-us.apache.org/repos/asf/hive/blob/4f2d1f50/ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcInputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcInputFormat.java
index 9275aa9..f7388a4 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcInputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcInputFormat.java
@@ -43,6 +43,7 @@ import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.orc.OrcProto;
+import org.apache.orc.OrcUtils;
import org.apache.orc.TypeDescription;
/**
@@ -74,7 +75,6 @@ public class VectorizedOrcInputFormat extends FileInputFormat<NullWritable, Vect
/**
* Do we have schema on read in the configuration variables?
*/
- List<OrcProto.Type> types = file.getTypes();
int dataColumns = rbCtx.getDataColumnCount();
TypeDescription schema =
OrcInputFormat.getDesiredRowTypeDescr(conf, false, dataColumns);
@@ -91,12 +91,13 @@ public class VectorizedOrcInputFormat extends FileInputFormat<NullWritable, Vect
}
}
}
+ List<OrcProto.Type> types = OrcUtils.getOrcTypes(schema);
Reader.Options options = new Reader.Options().schema(schema);
this.offset = fileSplit.getStart();
this.length = fileSplit.getLength();
options.range(offset, length);
- options.include(OrcInputFormat.genIncludedColumns(types, conf, true));
+ options.include(OrcInputFormat.genIncludedColumns(schema, conf));
OrcInputFormat.setSearchArgument(options, types, conf, true);
this.reader = file.rowsOptions(options);
http://git-wip-us.apache.org/repos/asf/hive/blob/4f2d1f50/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
index 4aac90a..d6b48a3 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
@@ -25,6 +25,8 @@ import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
import java.security.PrivilegedExceptionAction;
import java.sql.Date;
import java.sql.Timestamp;
@@ -54,9 +56,11 @@ import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.hive.common.ValidTxnList;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
+import org.apache.hadoop.hive.llap.TypeDesc;
import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
import org.apache.hadoop.hive.ql.exec.SerializationUtilities;
import org.apache.hadoop.hive.ql.exec.Utilities;
@@ -66,6 +70,7 @@ import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx;
@@ -76,6 +81,8 @@ import org.apache.hadoop.hive.ql.io.HiveInputFormat;
import org.apache.hadoop.hive.ql.io.HiveOutputFormat;
import org.apache.hadoop.hive.ql.io.IOConstants;
import org.apache.hadoop.hive.ql.io.InputFormatChecker;
+import org.apache.hadoop.hive.ql.io.RecordIdentifier;
+import org.apache.hadoop.hive.ql.io.RecordUpdater;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.Context;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.SplitStrategy;
import org.apache.hadoop.hive.ql.io.sarg.ConvertAstToSearchArg;
@@ -114,7 +121,7 @@ import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.util.Progressable;
-import org.apache.orc.OrcProto;
+import org.apache.orc.*;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
@@ -2213,7 +2220,7 @@ public class TestInputOutputFormat {
OrcRecordUpdater writer = new OrcRecordUpdater(partDir,
new AcidOutputFormat.Options(conf).maximumTransactionId(10)
.writingBase(true).bucket(0).inspector(inspector).finalDestination(partDir));
- for(int i=0; i < 100; ++i) {
+ for (int i = 0; i < 100; ++i) {
BigRow row = new BigRow(i);
writer.insert(10, row);
}
@@ -2223,7 +2230,7 @@ public class TestInputOutputFormat {
.setBlocks(new MockBlock("host0", "host1"));
// call getsplits
- HiveInputFormat<?,?> inputFormat =
+ HiveInputFormat<?, ?> inputFormat =
new HiveInputFormat<WritableComparable, Writable>();
InputSplit[] splits = inputFormat.getSplits(conf, 10);
assertEquals(1, splits.length);
@@ -2233,7 +2240,7 @@ public class TestInputOutputFormat {
HiveConf.setBoolVar(conf, HiveConf.ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN, true);
org.apache.hadoop.mapred.RecordReader<NullWritable, VectorizedRowBatch>
- reader = inputFormat.getRecordReader(splits[0], conf, Reporter.NULL);
+ reader = inputFormat.getRecordReader(splits[0], conf, Reporter.NULL);
NullWritable key = reader.createKey();
VectorizedRowBatch value = reader.createValue();
assertEquals(true, reader.next(key, value));
@@ -3682,4 +3689,187 @@ public class TestInputOutputFormat {
conf.unset(HiveConf.ConfVars.MAPREDMAXSPLITSIZE.varname);
}
}
+
+ /**
+ * Test schema evolution when using the reader directly.
+ */
+ @Test
+ public void testSchemaEvolution() throws Exception {
+ TypeDescription fileSchema =
+ TypeDescription.fromString("struct<a:int,b:struct<c:int>,d:string>");
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .fileSystem(fs)
+ .setSchema(fileSchema)
+ .compress(org.apache.orc.CompressionKind.NONE));
+ VectorizedRowBatch batch = fileSchema.createRowBatch(1000);
+ batch.size = 1000;
+ LongColumnVector lcv = ((LongColumnVector) ((StructColumnVector) batch.cols[1]).fields[0]);
+ for(int r=0; r < 1000; r++) {
+ ((LongColumnVector) batch.cols[0]).vector[r] = r * 42;
+ lcv.vector[r] = r * 10001;
+ ((BytesColumnVector) batch.cols[2]).setVal(r,
+ Integer.toHexString(r).getBytes(StandardCharsets.UTF_8));
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+ TypeDescription readerSchema = TypeDescription.fromString(
+ "struct<a:int,b:struct<c:int,future1:int>,d:string,future2:int>");
+ Reader reader = OrcFile.createReader(testFilePath,
+ OrcFile.readerOptions(conf).filesystem(fs));
+ RecordReader rows = reader.rowsOptions(new Reader.Options()
+ .schema(readerSchema));
+ batch = readerSchema.createRowBatch();
+ lcv = ((LongColumnVector) ((StructColumnVector) batch.cols[1]).fields[0]);
+ LongColumnVector future1 = ((LongColumnVector) ((StructColumnVector) batch.cols[1]).fields[1]);
+ assertEquals(true, rows.nextBatch(batch));
+ assertEquals(1000, batch.size);
+ assertEquals(true, future1.isRepeating);
+ assertEquals(true, future1.isNull[0]);
+ assertEquals(true, batch.cols[3].isRepeating);
+ assertEquals(true, batch.cols[3].isNull[0]);
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals("row " + r, r * 42, ((LongColumnVector) batch.cols[0]).vector[r]);
+ assertEquals("row " + r, r * 10001, lcv.vector[r]);
+ assertEquals("row " + r, r * 10001, lcv.vector[r]);
+ assertEquals("row " + r, Integer.toHexString(r),
+ ((BytesColumnVector) batch.cols[2]).toString(r));
+ }
+ assertEquals(false, rows.nextBatch(batch));
+ rows.close();
+
+ // try it again with an include vector
+ rows = reader.rowsOptions(new Reader.Options()
+ .schema(readerSchema)
+ .include(new boolean[]{false, true, true, true, false, false, true}));
+ batch = readerSchema.createRowBatch();
+ lcv = ((LongColumnVector) ((StructColumnVector) batch.cols[1]).fields[0]);
+ future1 = ((LongColumnVector) ((StructColumnVector) batch.cols[1]).fields[1]);
+ assertEquals(true, rows.nextBatch(batch));
+ assertEquals(1000, batch.size);
+ assertEquals(true, future1.isRepeating);
+ assertEquals(true, future1.isNull[0]);
+ assertEquals(true, batch.cols[3].isRepeating);
+ assertEquals(true, batch.cols[3].isNull[0]);
+ assertEquals(true, batch.cols[2].isRepeating);
+ assertEquals(true, batch.cols[2].isNull[0]);
+ for(int r=0; r < batch.size; ++r) {
+ assertEquals("row " + r, r * 42, ((LongColumnVector) batch.cols[0]).vector[r]);
+ assertEquals("row " + r, r * 10001, lcv.vector[r]);
+ }
+ assertEquals(false, rows.nextBatch(batch));
+ rows.close();
+ }
+
+ /**
+ * Test column projection when using ACID.
+ */
+ @Test
+ public void testColumnProjectionWithAcid() throws Exception {
+ Path baseDir = new Path(workDir, "base_00100");
+ testFilePath = new Path(baseDir, "bucket_00000");
+ fs.mkdirs(baseDir);
+ fs.delete(testFilePath, true);
+ TypeDescription fileSchema =
+ TypeDescription.fromString("struct<operation:int," +
+ "originalTransaction:bigint,bucket:int,rowId:bigint," +
+ "currentTransaction:bigint," +
+ "row:struct<a:int,b:struct<c:int>,d:string>>");
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf)
+ .fileSystem(fs)
+ .setSchema(fileSchema)
+ .compress(org.apache.orc.CompressionKind.NONE));
+ VectorizedRowBatch batch = fileSchema.createRowBatch(1000);
+ batch.size = 1000;
+ StructColumnVector scv = (StructColumnVector)batch.cols[5];
+ // operation
+ batch.cols[0].isRepeating = true;
+ ((LongColumnVector) batch.cols[0]).vector[0] = 0;
+ // original transaction
+ batch.cols[1].isRepeating = true;
+ ((LongColumnVector) batch.cols[1]).vector[0] = 1;
+ // bucket
+ batch.cols[2].isRepeating = true;
+ ((LongColumnVector) batch.cols[2]).vector[0] = 0;
+ // current transaction
+ batch.cols[4].isRepeating = true;
+ ((LongColumnVector) batch.cols[4]).vector[0] = 1;
+
+ LongColumnVector lcv = (LongColumnVector)
+ ((StructColumnVector) scv.fields[1]).fields[0];
+ for(int r=0; r < 1000; r++) {
+ // row id
+ ((LongColumnVector) batch.cols[3]).vector[r] = r;
+ // a
+ ((LongColumnVector) scv.fields[0]).vector[r] = r * 42;
+ // b.c
+ lcv.vector[r] = r * 10001;
+ // d
+ ((BytesColumnVector) scv.fields[2]).setVal(r,
+ Integer.toHexString(r).getBytes(StandardCharsets.UTF_8));
+ }
+ writer.addRowBatch(batch);
+ writer.addUserMetadata(OrcRecordUpdater.ACID_KEY_INDEX_NAME,
+ ByteBuffer.wrap("0,0,999".getBytes(StandardCharsets.UTF_8)));
+ writer.close();
+ long fileLength = fs.getFileStatus(testFilePath).getLen();
+
+ // test with same schema with include
+ conf.set(ValidTxnList.VALID_TXNS_KEY, "100:99:");
+ conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "a,b,d");
+ conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "int,struct<c:int>,string");
+ conf.set(ColumnProjectionUtils.READ_ALL_COLUMNS, "false");
+ conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0,2");
+ OrcSplit split = new OrcSplit(testFilePath, null, 0, fileLength,
+ new String[0], null, false, true,
+ new ArrayList<AcidInputFormat.DeltaMetaData>(), fileLength, fileLength);
+ OrcInputFormat inputFormat = new OrcInputFormat();
+ AcidInputFormat.RowReader<OrcStruct> reader = inputFormat.getReader(split,
+ new AcidInputFormat.Options(conf));
+ int record = 0;
+ RecordIdentifier id = reader.createKey();
+ OrcStruct struct = reader.createValue();
+ while (reader.next(id, struct)) {
+ assertEquals("id " + record, record, id.getRowId());
+ assertEquals("bucket " + record, 0, id.getBucketId());
+ assertEquals("trans " + record, 1, id.getTransactionId());
+ assertEquals("a " + record,
+ 42 * record, ((IntWritable) struct.getFieldValue(0)).get());
+ assertEquals(null, struct.getFieldValue(1));
+ assertEquals("d " + record,
+ Integer.toHexString(record), struct.getFieldValue(2).toString());
+ record += 1;
+ }
+ assertEquals(1000, record);
+ reader.close();
+
+ // test with schema evolution and include
+ conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "a,b,d,f");
+ conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "int,struct<c:int,e:string>,string,int");
+ conf.set(ColumnProjectionUtils.READ_ALL_COLUMNS, "false");
+ conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0,2,3");
+ split = new OrcSplit(testFilePath, null, 0, fileLength,
+ new String[0], null, false, true,
+ new ArrayList<AcidInputFormat.DeltaMetaData>(), fileLength, fileLength);
+ inputFormat = new OrcInputFormat();
+ reader = inputFormat.getReader(split, new AcidInputFormat.Options(conf));
+ record = 0;
+ id = reader.createKey();
+ struct = reader.createValue();
+ while (reader.next(id, struct)) {
+ assertEquals("id " + record, record, id.getRowId());
+ assertEquals("bucket " + record, 0, id.getBucketId());
+ assertEquals("trans " + record, 1, id.getTransactionId());
+ assertEquals("a " + record,
+ 42 * record, ((IntWritable) struct.getFieldValue(0)).get());
+ assertEquals(null, struct.getFieldValue(1));
+ assertEquals("d " + record,
+ Integer.toHexString(record), struct.getFieldValue(2).toString());
+ assertEquals("f " + record, null, struct.getFieldValue(3));
+ record += 1;
+ }
+ assertEquals(1000, record);
+ reader.close();
+ }
}
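
The test hunk above drives column projection (ColumnProjectionUtils READ_COLUMN_IDS_CONF_STR) against a reader schema that no longer matches the file schema. As a rough, illustrative sketch only -- not part of this patch, and assuming the org.apache.orc.TypeDescription helpers fromString, getId, getMaximumId and getChildren are available -- this is how an include boolean array keyed by the reader's schema can be derived from the projected top-level column ids used in the test:

import org.apache.orc.TypeDescription;
import java.util.Arrays;
import java.util.List;

public class IncludeArraySketch {

  // Build an include array indexed by column id in the READER schema:
  // entry 0 is the root struct, and each selected top-level field is
  // marked together with all of its nested children.
  static boolean[] buildInclude(TypeDescription readerSchema, int... fieldIds) {
    boolean[] include = new boolean[readerSchema.getMaximumId() + 1];
    include[readerSchema.getId()] = true;            // root struct
    List<TypeDescription> fields = readerSchema.getChildren();
    for (int fieldId : fieldIds) {
      TypeDescription field = fields.get(fieldId);
      for (int col = field.getId(); col <= field.getMaximumId(); ++col) {
        include[col] = true;                         // field and its subtree
      }
    }
    return include;
  }

  public static void main(String[] args) {
    // Reader schema from the first half of the test: a, b:struct<c>, d.
    TypeDescription reader = TypeDescription.fromString(
        "struct<a:int,b:struct<c:int>,d:string>");
    // Field ids 0 and 2 mirror READ_COLUMN_IDS_CONF_STR = "0,2" (columns a and d).
    // Prints [true, true, false, false, true]: root, a, and d selected; b.c excluded.
    System.out.println(Arrays.toString(buildInclude(reader, 0, 2)));
  }
}

Because the array is sized from the reader's schema, evolved columns that are absent from the file (such as f in the second half of the test) still get a slot and simply come back as null, which is what the assertions on getFieldValue(3) check.
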
http://git-wip-us.apache.org/repos/asf/hive/blob/4f2d1f50/ql/src/test/results/clientpositive/llap/orc_llap_counters.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/orc_llap_counters.q.out b/ql/src/test/results/clientpositive/llap/orc_llap_counters.q.out
index 81e2277..81f1d74 100644
--- a/ql/src/test/results/clientpositive/llap/orc_llap_counters.q.out
+++ b/ql/src/test/results/clientpositive/llap/orc_llap_counters.q.out
@@ -309,7 +309,7 @@ PREHOOK: type: QUERY
PREHOOK: Input: default@orc_ppd
#### A masked pattern was here ####
Stage-1 FILE SYSTEM COUNTERS:
- HDFS_BYTES_READ: 638
+ HDFS_BYTES_READ: 1501
HDFS_BYTES_WRITTEN: 101
HDFS_READ_OPS: 4
HDFS_LARGE_READ_OPS: 0
@@ -683,17 +683,12 @@ Stage-1 FILE SYSTEM COUNTERS:
Stage-1 HIVE COUNTERS:
CREATED_FILES: 1
DESERIALIZE_ERRORS: 0
- RECORDS_IN_Map_1: 100
+ RECORDS_IN_Map_1: 0
RECORDS_OUT_0: 1
RECORDS_OUT_INTERMEDIATE_Map_1: 0
Stage-1 LLAP IO COUNTERS:
- CACHE_HIT_BYTES: 249
- CACHE_MISS_BYTES: 0
METADATA_CACHE_HIT: 2
- NUM_DECODED_BATCHES: 1
- NUM_VECTOR_BATCHES: 1
- ROWS_EMITTED: 100
- SELECTED_ROWGROUPS: 1
+ SELECTED_ROWGROUPS: 0
0
PREHOOK: query: select count(*) from orc_ppd where t IN (-100, 125, 200)
PREHOOK: type: QUERY
@@ -708,17 +703,12 @@ Stage-1 FILE SYSTEM COUNTERS:
Stage-1 HIVE COUNTERS:
CREATED_FILES: 1
DESERIALIZE_ERRORS: 0
- RECORDS_IN_Map_1: 100
+ RECORDS_IN_Map_1: 0
RECORDS_OUT_0: 1
RECORDS_OUT_INTERMEDIATE_Map_1: 1
Stage-1 LLAP IO COUNTERS:
- CACHE_HIT_BYTES: 249
- CACHE_MISS_BYTES: 0
METADATA_CACHE_HIT: 2
- NUM_DECODED_BATCHES: 1
- NUM_VECTOR_BATCHES: 1
- ROWS_EMITTED: 100
- SELECTED_ROWGROUPS: 1
+ SELECTED_ROWGROUPS: 0
0
PREHOOK: query: -- Row group statistics for column s:
-- Entry 0: count: 1000 hasNull: false min: max: zach young sum: 12907 positions: 0,0,0
@@ -746,7 +736,7 @@ PREHOOK: type: QUERY
PREHOOK: Input: default@orc_ppd
#### A masked pattern was here ####
Stage-1 FILE SYSTEM COUNTERS:
- HDFS_BYTES_READ: 4402
+ HDFS_BYTES_READ: 6166
HDFS_BYTES_WRITTEN: 101
HDFS_READ_OPS: 4
HDFS_LARGE_READ_OPS: 0
@@ -754,7 +744,7 @@ Stage-1 FILE SYSTEM COUNTERS:
Stage-1 HIVE COUNTERS:
CREATED_FILES: 1
DESERIALIZE_ERRORS: 0
- RECORDS_IN_Map_1: 2100
+ RECORDS_IN_Map_1: 1000
RECORDS_OUT_0: 1
RECORDS_OUT_INTERMEDIATE_Map_1: 1
Stage-1 LLAP IO COUNTERS:
@@ -764,10 +754,10 @@ Stage-1 LLAP IO COUNTERS:
CACHE_MISS_BYTES: 3980
METADATA_CACHE_HIT: 1
METADATA_CACHE_MISS: 1
- NUM_DECODED_BATCHES: 3
- NUM_VECTOR_BATCHES: 3
- ROWS_EMITTED: 2100
- SELECTED_ROWGROUPS: 3
+ NUM_DECODED_BATCHES: 1
+ NUM_VECTOR_BATCHES: 1
+ ROWS_EMITTED: 1000
+ SELECTED_ROWGROUPS: 1
2
PREHOOK: query: select count(*) from orc_ppd where s <=> "zach zipper"
PREHOOK: type: QUERY
@@ -782,17 +772,17 @@ Stage-1 FILE SYSTEM COUNTERS:
Stage-1 HIVE COUNTERS:
CREATED_FILES: 1
DESERIALIZE_ERRORS: 0
- RECORDS_IN_Map_1: 1100
+ RECORDS_IN_Map_1: 1000
RECORDS_OUT_0: 1
RECORDS_OUT_INTERMEDIATE_Map_1: 1
Stage-1 LLAP IO COUNTERS:
CACHE_HIT_BYTES: 3980
CACHE_MISS_BYTES: 0
METADATA_CACHE_HIT: 2
- NUM_DECODED_BATCHES: 2
- NUM_VECTOR_BATCHES: 2
- ROWS_EMITTED: 1100
- SELECTED_ROWGROUPS: 2
+ NUM_DECODED_BATCHES: 1
+ NUM_VECTOR_BATCHES: 1
+ ROWS_EMITTED: 1000
+ SELECTED_ROWGROUPS: 1
6
PREHOOK: query: select count(*) from orc_ppd where s <=> ""
PREHOOK: type: QUERY
@@ -874,17 +864,12 @@ Stage-1 FILE SYSTEM COUNTERS:
Stage-1 HIVE COUNTERS:
CREATED_FILES: 1
DESERIALIZE_ERRORS: 0
- RECORDS_IN_Map_1: 1100
+ RECORDS_IN_Map_1: 0
RECORDS_OUT_0: 1
RECORDS_OUT_INTERMEDIATE_Map_1: 0
Stage-1 LLAP IO COUNTERS:
- CACHE_HIT_BYTES: 3980
- CACHE_MISS_BYTES: 0
METADATA_CACHE_HIT: 2
- NUM_DECODED_BATCHES: 2
- NUM_VECTOR_BATCHES: 2
- ROWS_EMITTED: 1100
- SELECTED_ROWGROUPS: 2
+ SELECTED_ROWGROUPS: 0
0
PREHOOK: query: -- INPUT_RECORDS: 1000 (1 row group)
select count(*) from orc_ppd where s = cast("zach young" as char(10))
@@ -900,17 +885,17 @@ Stage-1 FILE SYSTEM COUNTERS:
Stage-1 HIVE COUNTERS:
CREATED_FILES: 1
DESERIALIZE_ERRORS: 0
- RECORDS_IN_Map_1: 2100
+ RECORDS_IN_Map_1: 1000
RECORDS_OUT_0: 1
RECORDS_OUT_INTERMEDIATE_Map_1: 1
Stage-1 LLAP IO COUNTERS:
CACHE_HIT_BYTES: 3980
CACHE_MISS_BYTES: 0
METADATA_CACHE_HIT: 2
- NUM_DECODED_BATCHES: 3
- NUM_VECTOR_BATCHES: 3
- ROWS_EMITTED: 2100
- SELECTED_ROWGROUPS: 3
+ NUM_DECODED_BATCHES: 1
+ NUM_VECTOR_BATCHES: 1
+ ROWS_EMITTED: 1000
+ SELECTED_ROWGROUPS: 1
2
PREHOOK: query: select count(*) from orc_ppd where s = cast("zach young" as varchar(10))
PREHOOK: type: QUERY
@@ -925,17 +910,17 @@ Stage-1 FILE SYSTEM COUNTERS:
Stage-1 HIVE COUNTERS:
CREATED_FILES: 1
DESERIALIZE_ERRORS: 0
- RECORDS_IN_Map_1: 2100
+ RECORDS_IN_Map_1: 1000
RECORDS_OUT_0: 1
RECORDS_OUT_INTERMEDIATE_Map_1: 1
Stage-1 LLAP IO COUNTERS:
CACHE_HIT_BYTES: 3980
CACHE_MISS_BYTES: 0
METADATA_CACHE_HIT: 2
- NUM_DECODED_BATCHES: 3
- NUM_VECTOR_BATCHES: 3
- ROWS_EMITTED: 2100
- SELECTED_ROWGROUPS: 3
+ NUM_DECODED_BATCHES: 1
+ NUM_VECTOR_BATCHES: 1
+ ROWS_EMITTED: 1000
+ SELECTED_ROWGROUPS: 1
2
PREHOOK: query: select count(*) from orc_ppd where s = cast("zach young" as varchar(50))
PREHOOK: type: QUERY
@@ -950,17 +935,17 @@ Stage-1 FILE SYSTEM COUNTERS:
Stage-1 HIVE COUNTERS:
CREATED_FILES: 1
DESERIALIZE_ERRORS: 0
- RECORDS_IN_Map_1: 2100
+ RECORDS_IN_Map_1: 1000
RECORDS_OUT_0: 1
RECORDS_OUT_INTERMEDIATE_Map_1: 1
Stage-1 LLAP IO COUNTERS:
CACHE_HIT_BYTES: 3980
CACHE_MISS_BYTES: 0
METADATA_CACHE_HIT: 2
- NUM_DECODED_BATCHES: 3
- NUM_VECTOR_BATCHES: 3
- ROWS_EMITTED: 2100
- SELECTED_ROWGROUPS: 3
+ NUM_DECODED_BATCHES: 1
+ NUM_VECTOR_BATCHES: 1
+ ROWS_EMITTED: 1000
+ SELECTED_ROWGROUPS: 1
2
PREHOOK: query: -- INPUT_RECORDS: 2000 (2 row groups)
select count(*) from orc_ppd where s < "b"
@@ -1133,17 +1118,12 @@ Stage-1 FILE SYSTEM COUNTERS:
Stage-1 HIVE COUNTERS:
CREATED_FILES: 1
DESERIALIZE_ERRORS: 0
- RECORDS_IN_Map_1: 2100
+ RECORDS_IN_Map_1: 0
RECORDS_OUT_0: 1
RECORDS_OUT_INTERMEDIATE_Map_1: 0
Stage-1 LLAP IO COUNTERS:
- CACHE_HIT_BYTES: 3980
- CACHE_MISS_BYTES: 0
METADATA_CACHE_HIT: 2
- NUM_DECODED_BATCHES: 3
- NUM_VECTOR_BATCHES: 3
- ROWS_EMITTED: 2100
- SELECTED_ROWGROUPS: 3
+ SELECTED_ROWGROUPS: 0
0
PREHOOK: query: select count(*) from orc_ppd where s <=> "apache hive"
PREHOOK: type: QUERY
@@ -1158,17 +1138,12 @@ Stage-1 FILE SYSTEM COUNTERS:
Stage-1 HIVE COUNTERS:
CREATED_FILES: 1
DESERIALIZE_ERRORS: 0
- RECORDS_IN_Map_1: 2000
+ RECORDS_IN_Map_1: 0
RECORDS_OUT_0: 1
RECORDS_OUT_INTERMEDIATE_Map_1: 1
Stage-1 LLAP IO COUNTERS:
- CACHE_HIT_BYTES: 3980
- CACHE_MISS_BYTES: 0
METADATA_CACHE_HIT: 2
- NUM_DECODED_BATCHES: 2
- NUM_VECTOR_BATCHES: 2
- ROWS_EMITTED: 2000
- SELECTED_ROWGROUPS: 2
+ SELECTED_ROWGROUPS: 0
0
PREHOOK: query: select count(*) from orc_ppd where s IN ("a", "z")
PREHOOK: type: QUERY
@@ -1183,17 +1158,12 @@ Stage-1 FILE SYSTEM COUNTERS:
Stage-1 HIVE COUNTERS:
CREATED_FILES: 1
DESERIALIZE_ERRORS: 0
- RECORDS_IN_Map_1: 2100
+ RECORDS_IN_Map_1: 0
RECORDS_OUT_0: 1
RECORDS_OUT_INTERMEDIATE_Map_1: 0
Stage-1 LLAP IO COUNTERS:
- CACHE_HIT_BYTES: 3980
- CACHE_MISS_BYTES: 0
METADATA_CACHE_HIT: 2
- NUM_DECODED_BATCHES: 3
- NUM_VECTOR_BATCHES: 3
- ROWS_EMITTED: 2100
- SELECTED_ROWGROUPS: 3
+ SELECTED_ROWGROUPS: 0
0
PREHOOK: query: -- INPUT_RECORDS: 100
select count(*) from orc_ppd where s = "sarah ovid"
@@ -1209,17 +1179,17 @@ Stage-1 FILE SYSTEM COUNTERS:
Stage-1 HIVE COUNTERS:
CREATED_FILES: 1
DESERIALIZE_ERRORS: 0
- RECORDS_IN_Map_1: 2100
+ RECORDS_IN_Map_1: 100
RECORDS_OUT_0: 1
RECORDS_OUT_INTERMEDIATE_Map_1: 1
Stage-1 LLAP IO COUNTERS:
CACHE_HIT_BYTES: 3980
CACHE_MISS_BYTES: 0
METADATA_CACHE_HIT: 2
- NUM_DECODED_BATCHES: 3
- NUM_VECTOR_BATCHES: 3
- ROWS_EMITTED: 2100
- SELECTED_ROWGROUPS: 3
+ NUM_DECODED_BATCHES: 1
+ NUM_VECTOR_BATCHES: 1
+ ROWS_EMITTED: 100
+ SELECTED_ROWGROUPS: 1
2
PREHOOK: query: -- INPUT_RECORDS: 1100
select count(*) from orc_ppd where s = "wendy king"
@@ -1235,17 +1205,17 @@ Stage-1 FILE SYSTEM COUNTERS:
Stage-1 HIVE COUNTERS:
CREATED_FILES: 1
DESERIALIZE_ERRORS: 0
- RECORDS_IN_Map_1: 2100
+ RECORDS_IN_Map_1: 1100
RECORDS_OUT_0: 1
RECORDS_OUT_INTERMEDIATE_Map_1: 1
Stage-1 LLAP IO COUNTERS:
CACHE_HIT_BYTES: 3980
CACHE_MISS_BYTES: 0
METADATA_CACHE_HIT: 2
- NUM_DECODED_BATCHES: 3
- NUM_VECTOR_BATCHES: 3
- ROWS_EMITTED: 2100
- SELECTED_ROWGROUPS: 3
+ NUM_DECODED_BATCHES: 2
+ NUM_VECTOR_BATCHES: 2
+ ROWS_EMITTED: 1100
+ SELECTED_ROWGROUPS: 2
6
PREHOOK: query: -- INPUT_RECORDS: 1000
select count(*) from orc_ppd where s = "wendy king" and t < 0
@@ -1287,15 +1257,15 @@ Stage-1 FILE SYSTEM COUNTERS:
Stage-1 HIVE COUNTERS:
CREATED_FILES: 1
DESERIALIZE_ERRORS: 0
- RECORDS_IN_Map_1: 1100
+ RECORDS_IN_Map_1: 100
RECORDS_OUT_0: 1
RECORDS_OUT_INTERMEDIATE_Map_1: 1
Stage-1 LLAP IO COUNTERS:
CACHE_HIT_BYTES: 4229
CACHE_MISS_BYTES: 0
METADATA_CACHE_HIT: 2
- NUM_DECODED_BATCHES: 2
- NUM_VECTOR_BATCHES: 2
- ROWS_EMITTED: 1100
- SELECTED_ROWGROUPS: 2
+ NUM_DECODED_BATCHES: 1
+ NUM_VECTOR_BATCHES: 1
+ ROWS_EMITTED: 100
+ SELECTED_ROWGROUPS: 1
2
http://git-wip-us.apache.org/repos/asf/hive/blob/4f2d1f50/ql/src/test/results/clientpositive/llap/orc_llap_counters1.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/orc_llap_counters1.q.out b/ql/src/test/results/clientpositive/llap/orc_llap_counters1.q.out
index 213cb9f..885871f 100644
--- a/ql/src/test/results/clientpositive/llap/orc_llap_counters1.q.out
+++ b/ql/src/test/results/clientpositive/llap/orc_llap_counters1.q.out
@@ -270,7 +270,7 @@ PREHOOK: type: QUERY
PREHOOK: Input: default@orc_ppd
#### A masked pattern was here ####
Stage-1 FILE SYSTEM COUNTERS:
- HDFS_BYTES_READ: 17046
+ HDFS_BYTES_READ: 17909
HDFS_BYTES_WRITTEN: 104
HDFS_READ_OPS: 6
HDFS_LARGE_READ_OPS: 0
http://git-wip-us.apache.org/repos/asf/hive/blob/4f2d1f50/ql/src/test/results/clientpositive/llap/orc_ppd_basic.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/orc_ppd_basic.q.out b/ql/src/test/results/clientpositive/llap/orc_ppd_basic.q.out
index 1b92cd9..b2acef1 100644
--- a/ql/src/test/results/clientpositive/llap/orc_ppd_basic.q.out
+++ b/ql/src/test/results/clientpositive/llap/orc_ppd_basic.q.out
@@ -260,7 +260,7 @@ PREHOOK: type: QUERY
PREHOOK: Input: default@orc_ppd
#### A masked pattern was here ####
Stage-1 FILE SYSTEM COUNTERS:
- HDFS_BYTES_READ: 638
+ HDFS_BYTES_READ: 1501
HDFS_BYTES_WRITTEN: 101
HDFS_READ_OPS: 4
HDFS_LARGE_READ_OPS: 0
@@ -634,17 +634,12 @@ Stage-1 FILE SYSTEM COUNTERS:
Stage-1 HIVE COUNTERS:
CREATED_FILES: 1
DESERIALIZE_ERRORS: 0
- RECORDS_IN_Map_1: 100
+ RECORDS_IN_Map_1: 0
RECORDS_OUT_0: 1
RECORDS_OUT_INTERMEDIATE_Map_1: 1
Stage-1 LLAP IO COUNTERS:
- CACHE_HIT_BYTES: 249
- CACHE_MISS_BYTES: 0
METADATA_CACHE_HIT: 2
- NUM_DECODED_BATCHES: 1
- NUM_VECTOR_BATCHES: 1
- ROWS_EMITTED: 100
- SELECTED_ROWGROUPS: 1
+ SELECTED_ROWGROUPS: 0
0
PREHOOK: query: select count(*) from orc_ppd where t IN (-100, 125, 200)
PREHOOK: type: QUERY
@@ -659,17 +654,12 @@ Stage-1 FILE SYSTEM COUNTERS:
Stage-1 HIVE COUNTERS:
CREATED_FILES: 1
DESERIALIZE_ERRORS: 0
- RECORDS_IN_Map_1: 100
+ RECORDS_IN_Map_1: 0
RECORDS_OUT_0: 1
RECORDS_OUT_INTERMEDIATE_Map_1: 1
Stage-1 LLAP IO COUNTERS:
- CACHE_HIT_BYTES: 249
- CACHE_MISS_BYTES: 0
METADATA_CACHE_HIT: 2
- NUM_DECODED_BATCHES: 1
- NUM_VECTOR_BATCHES: 1
- ROWS_EMITTED: 100
- SELECTED_ROWGROUPS: 1
+ SELECTED_ROWGROUPS: 0
0
PREHOOK: query: -- Row group statistics for column s:
-- Entry 0: count: 1000 hasNull: false min: max: zach young sum: 12907 positions: 0,0,0
@@ -697,7 +687,7 @@ PREHOOK: type: QUERY
PREHOOK: Input: default@orc_ppd
#### A masked pattern was here ####
Stage-1 FILE SYSTEM COUNTERS:
- HDFS_BYTES_READ: 4402
+ HDFS_BYTES_READ: 6166
HDFS_BYTES_WRITTEN: 101
HDFS_READ_OPS: 4
HDFS_LARGE_READ_OPS: 0
@@ -705,7 +695,7 @@ Stage-1 FILE SYSTEM COUNTERS:
Stage-1 HIVE COUNTERS:
CREATED_FILES: 1
DESERIALIZE_ERRORS: 0
- RECORDS_IN_Map_1: 2100
+ RECORDS_IN_Map_1: 1000
RECORDS_OUT_0: 1
RECORDS_OUT_INTERMEDIATE_Map_1: 1
Stage-1 LLAP IO COUNTERS:
@@ -715,10 +705,10 @@ Stage-1 LLAP IO COUNTERS:
CACHE_MISS_BYTES: 3980
METADATA_CACHE_HIT: 1
METADATA_CACHE_MISS: 1
- NUM_DECODED_BATCHES: 3
- NUM_VECTOR_BATCHES: 3
- ROWS_EMITTED: 2100
- SELECTED_ROWGROUPS: 3
+ NUM_DECODED_BATCHES: 1
+ NUM_VECTOR_BATCHES: 1
+ ROWS_EMITTED: 1000
+ SELECTED_ROWGROUPS: 1
2
PREHOOK: query: select count(*) from orc_ppd where s <=> "zach zipper"
PREHOOK: type: QUERY
@@ -733,17 +723,17 @@ Stage-1 FILE SYSTEM COUNTERS:
Stage-1 HIVE COUNTERS:
CREATED_FILES: 1
DESERIALIZE_ERRORS: 0
- RECORDS_IN_Map_1: 1100
+ RECORDS_IN_Map_1: 1000
RECORDS_OUT_0: 1
RECORDS_OUT_INTERMEDIATE_Map_1: 1
Stage-1 LLAP IO COUNTERS:
CACHE_HIT_BYTES: 3980
CACHE_MISS_BYTES: 0
METADATA_CACHE_HIT: 2
- NUM_DECODED_BATCHES: 2
- NUM_VECTOR_BATCHES: 2
- ROWS_EMITTED: 1100
- SELECTED_ROWGROUPS: 2
+ NUM_DECODED_BATCHES: 1
+ NUM_VECTOR_BATCHES: 1
+ ROWS_EMITTED: 1000
+ SELECTED_ROWGROUPS: 1
6
PREHOOK: query: select count(*) from orc_ppd where s <=> ""
PREHOOK: type: QUERY
@@ -825,17 +815,12 @@ Stage-1 FILE SYSTEM COUNTERS:
Stage-1 HIVE COUNTERS:
CREATED_FILES: 1
DESERIALIZE_ERRORS: 0
- RECORDS_IN_Map_1: 1100
+ RECORDS_IN_Map_1: 0
RECORDS_OUT_0: 1
RECORDS_OUT_INTERMEDIATE_Map_1: 1
Stage-1 LLAP IO COUNTERS:
- CACHE_HIT_BYTES: 3980
- CACHE_MISS_BYTES: 0
METADATA_CACHE_HIT: 2
- NUM_DECODED_BATCHES: 2
- NUM_VECTOR_BATCHES: 2
- ROWS_EMITTED: 1100
- SELECTED_ROWGROUPS: 2
+ SELECTED_ROWGROUPS: 0
0
PREHOOK: query: -- INPUT_RECORDS: 1000 (1 row group)
select count(*) from orc_ppd where s = cast("zach young" as char(10))
@@ -851,17 +836,17 @@ Stage-1 FILE SYSTEM COUNTERS:
Stage-1 HIVE COUNTERS:
CREATED_FILES: 1
DESERIALIZE_ERRORS: 0
- RECORDS_IN_Map_1: 2100
+ RECORDS_IN_Map_1: 1000
RECORDS_OUT_0: 1
RECORDS_OUT_INTERMEDIATE_Map_1: 1
Stage-1 LLAP IO COUNTERS:
CACHE_HIT_BYTES: 3980
CACHE_MISS_BYTES: 0
METADATA_CACHE_HIT: 2
- NUM_DECODED_BATCHES: 3
- NUM_VECTOR_BATCHES: 3
- ROWS_EMITTED: 2100
- SELECTED_ROWGROUPS: 3
+ NUM_DECODED_BATCHES: 1
+ NUM_VECTOR_BATCHES: 1
+ ROWS_EMITTED: 1000
+ SELECTED_ROWGROUPS: 1
2
PREHOOK: query: select count(*) from orc_ppd where s = cast("zach young" as varchar(10))
PREHOOK: type: QUERY
@@ -876,17 +861,17 @@ Stage-1 FILE SYSTEM COUNTERS:
Stage-1 HIVE COUNTERS:
CREATED_FILES: 1
DESERIALIZE_ERRORS: 0
- RECORDS_IN_Map_1: 2100
+ RECORDS_IN_Map_1: 1000
RECORDS_OUT_0: 1
RECORDS_OUT_INTERMEDIATE_Map_1: 1
Stage-1 LLAP IO COUNTERS:
CACHE_HIT_BYTES: 3980
CACHE_MISS_BYTES: 0
METADATA_CACHE_HIT: 2
- NUM_DECODED_BATCHES: 3
- NUM_VECTOR_BATCHES: 3
- ROWS_EMITTED: 2100
- SELECTED_ROWGROUPS: 3
+ NUM_DECODED_BATCHES: 1
+ NUM_VECTOR_BATCHES: 1
+ ROWS_EMITTED: 1000
+ SELECTED_ROWGROUPS: 1
2
PREHOOK: query: select count(*) from orc_ppd where s = cast("zach young" as varchar(50))
PREHOOK: type: QUERY
@@ -901,17 +886,17 @@ Stage-1 FILE SYSTEM COUNTERS:
Stage-1 HIVE COUNTERS:
CREATED_FILES: 1
DESERIALIZE_ERRORS: 0
- RECORDS_IN_Map_1: 2100
+ RECORDS_IN_Map_1: 1000
RECORDS_OUT_0: 1
RECORDS_OUT_INTERMEDIATE_Map_1: 1
Stage-1 LLAP IO COUNTERS:
CACHE_HIT_BYTES: 3980
CACHE_MISS_BYTES: 0
METADATA_CACHE_HIT: 2
- NUM_DECODED_BATCHES: 3
- NUM_VECTOR_BATCHES: 3
- ROWS_EMITTED: 2100
- SELECTED_ROWGROUPS: 3
+ NUM_DECODED_BATCHES: 1
+ NUM_VECTOR_BATCHES: 1
+ ROWS_EMITTED: 1000
+ SELECTED_ROWGROUPS: 1
2
PREHOOK: query: -- INPUT_RECORDS: 2000 (2 row groups)
select count(*) from orc_ppd where s < "b"
@@ -1084,17 +1069,12 @@ Stage-1 FILE SYSTEM COUNTERS:
Stage-1 HIVE COUNTERS:
CREATED_FILES: 1
DESERIALIZE_ERRORS: 0
- RECORDS_IN_Map_1: 2100
+ RECORDS_IN_Map_1: 0
RECORDS_OUT_0: 1
RECORDS_OUT_INTERMEDIATE_Map_1: 1
Stage-1 LLAP IO COUNTERS:
- CACHE_HIT_BYTES: 3980
- CACHE_MISS_BYTES: 0
METADATA_CACHE_HIT: 2
- NUM_DECODED_BATCHES: 3
- NUM_VECTOR_BATCHES: 3
- ROWS_EMITTED: 2100
- SELECTED_ROWGROUPS: 3
+ SELECTED_ROWGROUPS: 0
0
PREHOOK: query: select count(*) from orc_ppd where s <=> "apache hive"
PREHOOK: type: QUERY
@@ -1109,17 +1089,12 @@ Stage-1 FILE SYSTEM COUNTERS:
Stage-1 HIVE COUNTERS:
CREATED_FILES: 1
DESERIALIZE_ERRORS: 0
- RECORDS_IN_Map_1: 2000
+ RECORDS_IN_Map_1: 0
RECORDS_OUT_0: 1
RECORDS_OUT_INTERMEDIATE_Map_1: 1
Stage-1 LLAP IO COUNTERS:
- CACHE_HIT_BYTES: 3980
- CACHE_MISS_BYTES: 0
METADATA_CACHE_HIT: 2
- NUM_DECODED_BATCHES: 2
- NUM_VECTOR_BATCHES: 2
- ROWS_EMITTED: 2000
- SELECTED_ROWGROUPS: 2
+ SELECTED_ROWGROUPS: 0
0
PREHOOK: query: select count(*) from orc_ppd where s IN ("a", "z")
PREHOOK: type: QUERY
@@ -1134,17 +1109,12 @@ Stage-1 FILE SYSTEM COUNTERS:
Stage-1 HIVE COUNTERS:
CREATED_FILES: 1
DESERIALIZE_ERRORS: 0
- RECORDS_IN_Map_1: 2100
+ RECORDS_IN_Map_1: 0
RECORDS_OUT_0: 1
RECORDS_OUT_INTERMEDIATE_Map_1: 1
Stage-1 LLAP IO COUNTERS:
- CACHE_HIT_BYTES: 3980
- CACHE_MISS_BYTES: 0
METADATA_CACHE_HIT: 2
- NUM_DECODED_BATCHES: 3
- NUM_VECTOR_BATCHES: 3
- ROWS_EMITTED: 2100
- SELECTED_ROWGROUPS: 3
+ SELECTED_ROWGROUPS: 0
0
PREHOOK: query: -- INPUT_RECORDS: 100
select count(*) from orc_ppd where s = "sarah ovid"
@@ -1160,17 +1130,17 @@ Stage-1 FILE SYSTEM COUNTERS:
Stage-1 HIVE COUNTERS:
CREATED_FILES: 1
DESERIALIZE_ERRORS: 0
- RECORDS_IN_Map_1: 2100
+ RECORDS_IN_Map_1: 100
RECORDS_OUT_0: 1
RECORDS_OUT_INTERMEDIATE_Map_1: 1
Stage-1 LLAP IO COUNTERS:
CACHE_HIT_BYTES: 3980
CACHE_MISS_BYTES: 0
METADATA_CACHE_HIT: 2
- NUM_DECODED_BATCHES: 3
- NUM_VECTOR_BATCHES: 3
- ROWS_EMITTED: 2100
- SELECTED_ROWGROUPS: 3
+ NUM_DECODED_BATCHES: 1
+ NUM_VECTOR_BATCHES: 1
+ ROWS_EMITTED: 100
+ SELECTED_ROWGROUPS: 1
2
PREHOOK: query: -- INPUT_RECORDS: 1100
select count(*) from orc_ppd where s = "wendy king"
@@ -1186,17 +1156,17 @@ Stage-1 FILE SYSTEM COUNTERS:
Stage-1 HIVE COUNTERS:
CREATED_FILES: 1
DESERIALIZE_ERRORS: 0
- RECORDS_IN_Map_1: 2100
+ RECORDS_IN_Map_1: 1100
RECORDS_OUT_0: 1
RECORDS_OUT_INTERMEDIATE_Map_1: 1
Stage-1 LLAP IO COUNTERS:
CACHE_HIT_BYTES: 3980
CACHE_MISS_BYTES: 0
METADATA_CACHE_HIT: 2
- NUM_DECODED_BATCHES: 3
- NUM_VECTOR_BATCHES: 3
- ROWS_EMITTED: 2100
- SELECTED_ROWGROUPS: 3
+ NUM_DECODED_BATCHES: 2
+ NUM_VECTOR_BATCHES: 2
+ ROWS_EMITTED: 1100
+ SELECTED_ROWGROUPS: 2
6
PREHOOK: query: -- INPUT_RECORDS: 1000
select count(*) from orc_ppd where s = "wendy king" and t < 0
@@ -1238,17 +1208,17 @@ Stage-1 FILE SYSTEM COUNTERS:
Stage-1 HIVE COUNTERS:
CREATED_FILES: 1
DESERIALIZE_ERRORS: 0
- RECORDS_IN_Map_1: 1100
+ RECORDS_IN_Map_1: 100
RECORDS_OUT_0: 1
RECORDS_OUT_INTERMEDIATE_Map_1: 1
Stage-1 LLAP IO COUNTERS:
CACHE_HIT_BYTES: 4229
CACHE_MISS_BYTES: 0
METADATA_CACHE_HIT: 2
- NUM_DECODED_BATCHES: 2
- NUM_VECTOR_BATCHES: 2
- ROWS_EMITTED: 1100
- SELECTED_ROWGROUPS: 2
+ NUM_DECODED_BATCHES: 1
+ NUM_VECTOR_BATCHES: 1
+ ROWS_EMITTED: 100
+ SELECTED_ROWGROUPS: 1
2
PREHOOK: query: -- when cbo is disabled constant gets converted to HiveDecimal
select count(*) from orc_ppd where f=74.72