Posted to commits@impala.apache.org by ta...@apache.org on 2018/04/11 05:34:49 UTC
[5/6] impala git commit: IMPALA-5717: Support for reading ORC data files
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/common/thrift/CatalogObjects.thrift
----------------------------------------------------------------------
diff --git a/common/thrift/CatalogObjects.thrift b/common/thrift/CatalogObjects.thrift
index ecd27dc..0f71f5f 100644
--- a/common/thrift/CatalogObjects.thrift
+++ b/common/thrift/CatalogObjects.thrift
@@ -58,7 +58,8 @@ enum THdfsFileFormat {
SEQUENCE_FILE,
AVRO,
PARQUET,
- KUDU
+ KUDU,
+ ORC
}
// TODO: Since compression is also enabled for Kudu columns, we should
@@ -73,7 +74,8 @@ enum THdfsCompression {
SNAPPY_BLOCKED,
LZO,
LZ4,
- ZLIB
+ ZLIB,
+ ZSTD
}
enum TColumnEncoding {
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/fe/src/main/cup/sql-parser.cup
----------------------------------------------------------------------
diff --git a/fe/src/main/cup/sql-parser.cup b/fe/src/main/cup/sql-parser.cup
index 999ed16..f2d7cef 100644
--- a/fe/src/main/cup/sql-parser.cup
+++ b/fe/src/main/cup/sql-parser.cup
@@ -263,8 +263,8 @@ terminal
KW_IN, KW_INCREMENTAL, KW_INIT_FN, KW_INNER, KW_INPATH, KW_INSERT, KW_INT,
KW_INTERMEDIATE, KW_INTERVAL, KW_INTO, KW_INVALIDATE, KW_IREGEXP, KW_IS, KW_JOIN,
KW_KUDU, KW_LAST, KW_LEFT, KW_LIKE, KW_LIMIT, KW_LINES, KW_LOAD, KW_LOCATION, KW_MAP,
- KW_MERGE_FN, KW_METADATA, KW_NOT, KW_NULL, KW_NULLS, KW_OFFSET, KW_ON, KW_OR, KW_ORDER,
- KW_OUTER, KW_OVER, KW_OVERWRITE, KW_PARQUET, KW_PARQUETFILE, KW_PARTITION,
+ KW_MERGE_FN, KW_METADATA, KW_NOT, KW_NULL, KW_NULLS, KW_OFFSET, KW_ON, KW_OR, KW_ORC,
+ KW_ORDER, KW_OUTER, KW_OVER, KW_OVERWRITE, KW_PARQUET, KW_PARQUETFILE, KW_PARTITION,
KW_PARTITIONED, KW_PARTITIONS, KW_PRECEDING, KW_PREPARE_FN, KW_PRIMARY, KW_PRODUCED,
KW_PURGE, KW_RANGE, KW_RCFILE, KW_RECOVER, KW_REFRESH, KW_REGEXP, KW_RENAME,
KW_REPEATABLE, KW_REPLACE, KW_REPLICATION, KW_RESTRICT, KW_RETURNS, KW_REVOKE,
@@ -1562,6 +1562,8 @@ file_format_val ::=
{: RESULT = THdfsFileFormat.PARQUET; :}
| KW_PARQUETFILE
{: RESULT = THdfsFileFormat.PARQUET; :}
+ | KW_ORC
+ {: RESULT = THdfsFileFormat.ORC; :}
| KW_TEXTFILE
{: RESULT = THdfsFileFormat.TEXT; :}
| KW_SEQUENCEFILE
@@ -3487,6 +3489,8 @@ word ::=
{: RESULT = r.toString(); :}
| KW_OR:r
{: RESULT = r.toString(); :}
+ | KW_ORC:r
+ {: RESULT = r.toString(); :}
| KW_ORDER:r
{: RESULT = r.toString(); :}
| KW_OUTER:r
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/fe/src/main/java/org/apache/impala/analysis/ComputeStatsStmt.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/analysis/ComputeStatsStmt.java b/fe/src/main/java/org/apache/impala/analysis/ComputeStatsStmt.java
index 5df3dfa..e442d66 100644
--- a/fe/src/main/java/org/apache/impala/analysis/ComputeStatsStmt.java
+++ b/fe/src/main/java/org/apache/impala/analysis/ComputeStatsStmt.java
@@ -743,10 +743,10 @@ public class ComputeStatsStmt extends StatementBase {
public Set<Column> getValidatedColumnWhitelist() { return validatedColumnWhitelist_; }
/**
- * Returns true if this statement computes stats on Parquet partitions only,
+ * Returns true if this statement computes stats on Parquet/ORC partitions only,
* false otherwise.
*/
- public boolean isParquetOnly() {
+ public boolean isColumnar() {
if (!(table_ instanceof HdfsTable)) return false;
Collection<HdfsPartition> affectedPartitions = null;
if (partitionSet_ != null) {
@@ -755,7 +755,9 @@ public class ComputeStatsStmt extends StatementBase {
affectedPartitions = ((HdfsTable) table_).getPartitions();
}
for (HdfsPartition partition: affectedPartitions) {
- if (partition.getFileFormat() != HdfsFileFormat.PARQUET) return false;
+ if (partition.getFileFormat() != HdfsFileFormat.PARQUET
+ && partition.getFileFormat() != HdfsFileFormat.ORC)
+ return false;
}
return true;
}
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/fe/src/main/java/org/apache/impala/catalog/HdfsFileFormat.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/catalog/HdfsFileFormat.java b/fe/src/main/java/org/apache/impala/catalog/HdfsFileFormat.java
index e4fce60..32cae72 100644
--- a/fe/src/main/java/org/apache/impala/catalog/HdfsFileFormat.java
+++ b/fe/src/main/java/org/apache/impala/catalog/HdfsFileFormat.java
@@ -62,6 +62,10 @@ public enum HdfsFileFormat {
"org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
"org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
true, true),
+ ORC("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat",
+ "org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat",
+ "org.apache.hadoop.hive.ql.io.orc.OrcSerde",
+ false, true),
KUDU("org.apache.kudu.mapreduce.KuduTableInputFormat",
"org.apache.kudu.mapreduce.KuduTableOutputFormat",
"", false, false);
@@ -99,19 +103,23 @@ public enum HdfsFileFormat {
"parquet.hive.MapredParquetInputFormat"
};
- private static final Map<String, HdfsFileFormat> VALID_INPUT_FORMATS =
- ImmutableMap.<String, HdfsFileFormat>builder()
- .put(RC_FILE.inputFormat(), RC_FILE)
- .put(TEXT.inputFormat(), TEXT)
- .put(LZO_TEXT.inputFormat(), TEXT)
- .put(SEQUENCE_FILE.inputFormat(), SEQUENCE_FILE)
- .put(AVRO.inputFormat(), AVRO)
- .put(PARQUET.inputFormat(), PARQUET)
- .put(PARQUET_LEGACY_INPUT_FORMATS[0], PARQUET)
- .put(PARQUET_LEGACY_INPUT_FORMATS[1], PARQUET)
- .put(PARQUET_LEGACY_INPUT_FORMATS[2], PARQUET)
- .put(KUDU.inputFormat(), KUDU)
- .build();
+ private static Map<String, HdfsFileFormat> VALID_INPUT_FORMATS;
+ public static void init(boolean enableOrcScanner) {
+ ImmutableMap.Builder<String, HdfsFileFormat> builder =
+ ImmutableMap.<String, HdfsFileFormat>builder()
+ .put(RC_FILE.inputFormat(), RC_FILE)
+ .put(TEXT.inputFormat(), TEXT)
+ .put(LZO_TEXT.inputFormat(), TEXT)
+ .put(SEQUENCE_FILE.inputFormat(), SEQUENCE_FILE)
+ .put(AVRO.inputFormat(), AVRO)
+ .put(PARQUET.inputFormat(), PARQUET)
+ .put(PARQUET_LEGACY_INPUT_FORMATS[0], PARQUET)
+ .put(PARQUET_LEGACY_INPUT_FORMATS[1], PARQUET)
+ .put(PARQUET_LEGACY_INPUT_FORMATS[2], PARQUET)
+ .put(KUDU.inputFormat(), KUDU);
+ if (enableOrcScanner) builder.put(ORC.inputFormat(), ORC);
+ VALID_INPUT_FORMATS = builder.build();
+ }
/**
* Returns true if the string describes an input format class that we support.
@@ -145,6 +153,7 @@ public enum HdfsFileFormat {
case TEXT: return HdfsFileFormat.TEXT;
case SEQUENCE_FILE: return HdfsFileFormat.SEQUENCE_FILE;
case AVRO: return HdfsFileFormat.AVRO;
+ case ORC: return HdfsFileFormat.ORC;
case PARQUET: return HdfsFileFormat.PARQUET;
case KUDU: return HdfsFileFormat.KUDU;
default:
@@ -159,6 +168,7 @@ public enum HdfsFileFormat {
case TEXT: return THdfsFileFormat.TEXT;
case SEQUENCE_FILE: return THdfsFileFormat.SEQUENCE_FILE;
case AVRO: return THdfsFileFormat.AVRO;
+ case ORC: return THdfsFileFormat.ORC;
case PARQUET: return THdfsFileFormat.PARQUET;
case KUDU: return THdfsFileFormat.KUDU;
default:
@@ -170,6 +180,7 @@ public enum HdfsFileFormat {
public String toSql(HdfsCompression compressionType) {
switch (this) {
case RC_FILE: return "RCFILE";
+ case ORC: return "ORC";
case TEXT:
if (compressionType == HdfsCompression.LZO ||
compressionType == HdfsCompression.LZO_INDEX) {
@@ -240,6 +251,7 @@ public enum HdfsFileFormat {
case SEQUENCE_FILE:
case AVRO:
case PARQUET:
+ case ORC:
return true;
case KUDU:
return false;
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/fe/src/main/java/org/apache/impala/catalog/HdfsStorageDescriptor.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/catalog/HdfsStorageDescriptor.java b/fe/src/main/java/org/apache/impala/catalog/HdfsStorageDescriptor.java
index b4e2564..f51b10e 100644
--- a/fe/src/main/java/org/apache/impala/catalog/HdfsStorageDescriptor.java
+++ b/fe/src/main/java/org/apache/impala/catalog/HdfsStorageDescriptor.java
@@ -57,6 +57,7 @@ public class HdfsStorageDescriptor {
"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", // (seq / text / parquet)
"org.apache.hadoop.hive.serde2.avro.AvroSerDe", // (avro)
"org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe", // (rc)
+ "org.apache.hadoop.hive.ql.io.orc.OrcSerde", // (orc)
"parquet.hive.serde.ParquetHiveSerDe", // (parquet - legacy)
// TODO: Verify the following Parquet SerDe works with Impala and add
// support for the new input/output format classes. See IMPALA-4214.
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
index 7735f98..ac67d7d 100644
--- a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
@@ -332,11 +332,12 @@ public class HdfsScanNode extends ScanNode {
Set<HdfsFileFormat> fileFormats = computeScanRangeLocations(analyzer);
// Determine backend scan node implementation to use. The optimized MT implementation
- // is currently only supported for Parquet.
+ // is currently supported for Parquet, ORC and Text.
if (analyzer.getQueryOptions().isSetMt_dop() &&
analyzer.getQueryOptions().mt_dop > 0 &&
fileFormats.size() == 1 &&
(fileFormats.contains(HdfsFileFormat.PARQUET)
+ || fileFormats.contains(HdfsFileFormat.ORC)
|| fileFormats.contains(HdfsFileFormat.TEXT))) {
useMtScanNode_ = true;
} else {
@@ -1191,9 +1192,10 @@ public class HdfsScanNode extends ScanNode {
Preconditions.checkNotNull(desc_.getTable() instanceof HdfsTable);
HdfsTable table = (HdfsTable) desc_.getTable();
int perHostScanRanges;
- if (table.getMajorityFormat() == HdfsFileFormat.PARQUET) {
+ if (table.getMajorityFormat() == HdfsFileFormat.PARQUET
+ || table.getMajorityFormat() == HdfsFileFormat.ORC) {
// For the purpose of this estimation, the number of per-host scan ranges for
- // Parquet files are equal to the number of columns read from the file. I.e.
+ // Parquet/ORC files is equal to the number of columns read from the file, i.e.
// excluding partition columns and columns that are populated from file metadata.
perHostScanRanges = 0;
for (SlotDescriptor slot: desc_.getSlots()) {
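A sketch of the planner effect above, assuming an ORC-only scan (the query is
illustrative): with MT_DOP above zero and every scanned partition in ORC format,
the multithreaded scan node implementation is now chosen, as it already was for
Parquet and text.

  SET MT_DOP=4;
  -- All files scanned here are ORC, so useMtScanNode_ is set.
  SELECT count(*) FROM functional_orc_def.alltypes;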
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/fe/src/main/java/org/apache/impala/service/BackendConfig.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/service/BackendConfig.java b/fe/src/main/java/org/apache/impala/service/BackendConfig.java
index 3833094..a94f46e 100644
--- a/fe/src/main/java/org/apache/impala/service/BackendConfig.java
+++ b/fe/src/main/java/org/apache/impala/service/BackendConfig.java
@@ -23,6 +23,7 @@ import static org.apache.hadoop.fs.CommonConfigurationKeysPublic
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.security.authentication.util.KerberosName;
import org.apache.impala.analysis.SqlScanner;
+import org.apache.impala.catalog.HdfsFileFormat;
import org.apache.impala.thrift.TBackendGflags;
import com.google.common.base.Preconditions;
@@ -45,6 +46,7 @@ public class BackendConfig {
Preconditions.checkNotNull(cfg);
INSTANCE = new BackendConfig(cfg);
SqlScanner.init(cfg.getReserved_words_version());
+ HdfsFileFormat.init(cfg.isEnable_orc_scanner());
initAuthToLocal();
}
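The registration in HdfsFileFormat.init() above is gated on the enable_orc_scanner
backend flag carried in TBackendGflags: when it is false, OrcInputFormat is never
added to VALID_INPUT_FORMATS, so an ORC table is rejected at metadata validation
rather than at scan time. A hedged sketch (the startup flag spelling follows the
thrift field; the exact error text is not shown in this patch):

  -- With the backend started with --enable_orc_scanner=false, a query
  -- against an ORC table is expected to fail because the table's input
  -- format class is no longer recognized as supported.
  SELECT count(*) FROM functional_orc_def.alltypes;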
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/fe/src/main/java/org/apache/impala/service/Frontend.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/service/Frontend.java b/fe/src/main/java/org/apache/impala/service/Frontend.java
index 392c249..348adaf 100644
--- a/fe/src/main/java/org/apache/impala/service/Frontend.java
+++ b/fe/src/main/java/org/apache/impala/service/Frontend.java
@@ -1014,11 +1014,11 @@ public class Frontend {
if (thriftLineageGraph != null && thriftLineageGraph.isSetQuery_text()) {
result.catalog_op_request.setLineage_graph(thriftLineageGraph);
}
- // Set MT_DOP=4 for COMPUTE STATS on Parquet tables, unless the user has already
+ // Set MT_DOP=4 for COMPUTE STATS on Parquet/ORC tables, unless the user has already
// provided another value for MT_DOP.
if (!queryOptions.isSetMt_dop() &&
analysisResult.isComputeStatsStmt() &&
- analysisResult.getComputeStatsStmt().isParquetOnly()) {
+ analysisResult.getComputeStatsStmt().isColumnar()) {
queryOptions.setMt_dop(4);
}
// If unset, set MT_DOP to 0 to simplify the rest of the code.
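The user-visible effect of the isColumnar() check above (table name illustrative):
COMPUTE STATS on a table whose affected partitions are all Parquet or ORC now
defaults MT_DOP to 4, while an explicit user setting still wins.

  -- MT_DOP unset: the frontend plans this with MT_DOP=4.
  COMPUTE STATS orc_demo;

  -- An explicit value is respected and not overridden.
  SET MT_DOP=8;
  COMPUTE STATS orc_demo;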
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/fe/src/main/jflex/sql-scanner.flex
----------------------------------------------------------------------
diff --git a/fe/src/main/jflex/sql-scanner.flex b/fe/src/main/jflex/sql-scanner.flex
index dd1da7c..0512a2a 100644
--- a/fe/src/main/jflex/sql-scanner.flex
+++ b/fe/src/main/jflex/sql-scanner.flex
@@ -176,6 +176,7 @@ import org.apache.impala.thrift.TReservedWordsVersion;
keywordMap.put("on", SqlParserSymbols.KW_ON);
keywordMap.put("||", SqlParserSymbols.KW_OR);
keywordMap.put("or", SqlParserSymbols.KW_OR);
+ keywordMap.put("orc", SqlParserSymbols.KW_ORC);
keywordMap.put("order", SqlParserSymbols.KW_ORDER);
keywordMap.put("outer", SqlParserSymbols.KW_OUTER);
keywordMap.put("over", SqlParserSymbols.KW_OVER);
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/LineItemMultiBlock/README.dox
----------------------------------------------------------------------
diff --git a/testdata/LineItemMultiBlock/README.dox b/testdata/LineItemMultiBlock/README.dox
index 7608067..1d6db46 100755
--- a/testdata/LineItemMultiBlock/README.dox
+++ b/testdata/LineItemMultiBlock/README.dox
@@ -1,6 +1,7 @@
This file was created for:
IMPALA-1881: Maximize data locality when scanning Parquet files with multiple row groups.
IMPALA-2466: Add more tests to the HDFS parquet scanner.
+IMPALA-5717: Add tests for HDFS orc scanner.
The table lineitem_multiblock is a single parquet file with:
- A row group size of approximately 12 KB each.
@@ -31,3 +32,21 @@ blocks.
'lineitem_multiblock_one_row_group' was created similarly but with a much higher
'parquet.block.size' so that everything fit in one row group.
+
+----
+
+The ORC files are created by the following Hive queries:
+
+use functional_orc_def;
+
+set orc.stripe.size=1024;
+set orc.compress=ZLIB;
+create table lineitem_threeblocks like tpch.lineitem stored as orc;
+create table lineitem_sixblocks like tpch.lineitem stored as orc;
+insert overwrite table lineitem_threeblocks select * from tpch.lineitem limit 16000;
+insert overwrite table lineitem_sixblocks select * from tpch.lineitem limit 30000;
+
+set orc.stripe.size=67108864;
+create table lineitem_orc_multiblock_one_stripe like tpch.lineitem stored as orc;
+insert overwrite table lineitem_orc_multiblock_one_stripe select * from
+tpch.lineitem limit 16000;
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/LineItemMultiBlock/lineitem_orc_multiblock_one_stripe.orc
----------------------------------------------------------------------
diff --git a/testdata/LineItemMultiBlock/lineitem_orc_multiblock_one_stripe.orc b/testdata/LineItemMultiBlock/lineitem_orc_multiblock_one_stripe.orc
new file mode 100644
index 0000000..7dbbffb
Binary files /dev/null and b/testdata/LineItemMultiBlock/lineitem_orc_multiblock_one_stripe.orc differ
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/LineItemMultiBlock/lineitem_sixblocks.orc
----------------------------------------------------------------------
diff --git a/testdata/LineItemMultiBlock/lineitem_sixblocks.orc b/testdata/LineItemMultiBlock/lineitem_sixblocks.orc
new file mode 100644
index 0000000..5fa6cfa
Binary files /dev/null and b/testdata/LineItemMultiBlock/lineitem_sixblocks.orc differ
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/LineItemMultiBlock/lineitem_threeblocks.orc
----------------------------------------------------------------------
diff --git a/testdata/LineItemMultiBlock/lineitem_threeblocks.orc b/testdata/LineItemMultiBlock/lineitem_threeblocks.orc
new file mode 100644
index 0000000..9b12540
Binary files /dev/null and b/testdata/LineItemMultiBlock/lineitem_threeblocks.orc differ
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/bin/create-load-data.sh
----------------------------------------------------------------------
diff --git a/testdata/bin/create-load-data.sh b/testdata/bin/create-load-data.sh
index 311029d..e50515b 100755
--- a/testdata/bin/create-load-data.sh
+++ b/testdata/bin/create-load-data.sh
@@ -154,6 +154,9 @@ function load-custom-schemas {
hadoop fs -mkdir -p /test-warehouse/chars_formats_parquet/
hadoop fs -put -f ${IMPALA_HOME}/testdata/data/chars-formats.parquet \
/test-warehouse/chars_formats_parquet
+ hadoop fs -mkdir -p /test-warehouse/chars_formats_orc_def/
+ hadoop fs -put -f ${IMPALA_HOME}/testdata/data/chars-formats.orc \
+ /test-warehouse/chars_formats_orc_def
hadoop fs -mkdir -p /test-warehouse/chars_formats_text/
hadoop fs -put -f ${IMPALA_HOME}/testdata/data/chars-formats.txt \
/test-warehouse/chars_formats_text
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/bin/generate-schema-statements.py
----------------------------------------------------------------------
diff --git a/testdata/bin/generate-schema-statements.py b/testdata/bin/generate-schema-statements.py
index 34c2084..3f730e6 100755
--- a/testdata/bin/generate-schema-statements.py
+++ b/testdata/bin/generate-schema-statements.py
@@ -128,6 +128,7 @@ FILE_FORMAT_MAP = {
'text': 'TEXTFILE',
'seq': 'SEQUENCEFILE',
'rc': 'RCFILE',
+ 'orc': 'ORC',
'parquet': 'PARQUET',
'text_lzo':
"\nINPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'" +
@@ -219,7 +220,7 @@ def build_table_template(file_format, columns, partition_columns, row_format,
else:
tblproperties["avro.schema.url"] = "hdfs://%s/%s/%s/{table_name}.json" \
% (options.hdfs_namenode, options.hive_warehouse_dir, avro_schema_dir)
- elif file_format in 'parquet':
+ elif file_format in ['parquet', 'orc']: # columnar formats don't need row format
row_format_stmt = str()
elif file_format == 'kudu':
# Use partitioned_by to set a trivial hash distribution
@@ -243,7 +244,7 @@ def build_table_template(file_format, columns, partition_columns, row_format,
for table_property in table_properties.split("\n"):
format_prop = table_property.split(":")
if format_prop[0] == file_format:
- key_val = format_prop[1].split("=");
+ key_val = format_prop[1].split("=")
tblproperties[key_val[0]] = key_val[1]
all_tblproperties = []
@@ -658,7 +659,7 @@ def generate_statements(output_name, test_vectors, sections,
# that weren't already added to the table. So, for force reload, manually
# delete the partition directories.
output.create.append(("DFS -rm -R {data_path};").format(
- data_path=data_path));
+ data_path=data_path))
else:
# If this is not a force reload use msck repair to add the partitions
# into the table.
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/bin/run-hive-server.sh
----------------------------------------------------------------------
diff --git a/testdata/bin/run-hive-server.sh b/testdata/bin/run-hive-server.sh
index 49d1de2..3b2c83d 100755
--- a/testdata/bin/run-hive-server.sh
+++ b/testdata/bin/run-hive-server.sh
@@ -73,9 +73,10 @@ ${CLUSTER_BIN}/wait-for-metastore.py --transport=${METASTORE_TRANSPORT}
if [ ${ONLY_METASTORE} -eq 0 ]; then
# Starts a HiveServer2 instance on the port specified by the HIVE_SERVER2_THRIFT_PORT
- # environment variable.
+ # environment variable. HADOOP_HEAPSIZE should be set to at least 2048 to avoid OOM
+ # when loading ORC tables like widerow.
if [[ $IMPALA_MINICLUSTER_PROFILE == 2 ]]; then
- HADOOP_HEAPSIZE="512" hive --service hiveserver2 > ${LOGDIR}/hive-server2.out 2>&1 &
+ HADOOP_HEAPSIZE="2048" hive --service hiveserver2 > ${LOGDIR}/hive-server2.out 2>&1 &
elif [[ $IMPALA_MINICLUSTER_PROFILE == 3 ]]; then
HADOOP_CLIENT_OPTS="-Xmx2048m -Dhive.log.file=hive-server2.log" hive \
--service hiveserver2 > ${LOGDIR}/hive-server2.out 2>&1 &
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/cluster/node_templates/common/etc/hadoop/conf/hdfs-site.xml.tmpl
----------------------------------------------------------------------
diff --git a/testdata/cluster/node_templates/common/etc/hadoop/conf/hdfs-site.xml.tmpl b/testdata/cluster/node_templates/common/etc/hadoop/conf/hdfs-site.xml.tmpl
index f72dd97..c9ee70b 100644
--- a/testdata/cluster/node_templates/common/etc/hadoop/conf/hdfs-site.xml.tmpl
+++ b/testdata/cluster/node_templates/common/etc/hadoop/conf/hdfs-site.xml.tmpl
@@ -82,6 +82,12 @@
<value>134217728</value>
</property>
+ <!-- Decrease this so we can create mini test files across several blocks -->
+ <property>
+ <name>dfs.namenode.fs-limits.min-block-size</name>
+ <value>1024</value>
+ </property>
+
<!-- Set the max cached memory to ~64kb. This must be less than ulimit -l -->
<property>
<name>dfs.datanode.max.locked.memory</name>
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/data/chars-formats.orc
----------------------------------------------------------------------
diff --git a/testdata/data/chars-formats.orc b/testdata/data/chars-formats.orc
new file mode 100644
index 0000000..625c2c8
Binary files /dev/null and b/testdata/data/chars-formats.orc differ
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/datasets/functional/functional_schema_template.sql
----------------------------------------------------------------------
diff --git a/testdata/datasets/functional/functional_schema_template.sql b/testdata/datasets/functional/functional_schema_template.sql
index cede525..a7a5eac 100644
--- a/testdata/datasets/functional/functional_schema_template.sql
+++ b/testdata/datasets/functional/functional_schema_template.sql
@@ -739,6 +739,7 @@ INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} PARTITION(p=1) SELECT i
INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} PARTITION(p=2) SELECT id, named_struct("f1",string_col,"f2",int_col), array(1, 2, 3), map("k", cast(0 as bigint)) FROM functional.alltypestiny;
INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} PARTITION(p=3) SELECT id, named_struct("f1",string_col,"f2",int_col), array(1, 2, 3), map("k", cast(0 as bigint)) FROM functional.alltypestiny;
INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} PARTITION(p=4) SELECT id, named_struct("f1",string_col,"f2",int_col), array(1, 2, 3), map("k", cast(0 as bigint)) FROM functional.alltypestiny;
+INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} PARTITION(p=5) SELECT id, named_struct("f1",string_col,"f2",int_col), array(1, 2, 3), map("k", cast(0 as bigint)) FROM functional.alltypestiny;
-- The order of insertions and alterations is deliberately chose to work around a Hive
-- bug where the format of an altered partition is reverted back to the original format after
-- an insert. So we first do the insert, and then alter the format.
@@ -746,6 +747,7 @@ USE {db_name}{db_suffix};
ALTER TABLE {table_name} PARTITION (p=2) SET FILEFORMAT PARQUET;
ALTER TABLE {table_name} PARTITION (p=3) SET FILEFORMAT AVRO;
ALTER TABLE {table_name} PARTITION (p=4) SET FILEFORMAT RCFILE;
+ALTER TABLE {table_name} PARTITION (p=5) SET FILEFORMAT ORC;
USE default;
====
---- DATASET
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/datasets/functional/schema_constraints.csv
----------------------------------------------------------------------
diff --git a/testdata/datasets/functional/schema_constraints.csv b/testdata/datasets/functional/schema_constraints.csv
index ef65b9a..baf0306 100644
--- a/testdata/datasets/functional/schema_constraints.csv
+++ b/testdata/datasets/functional/schema_constraints.csv
@@ -66,6 +66,7 @@ table_name:complextypes_fileformat, constraint:restrict_to, table_format:parquet
table_name:complextypes_fileformat, constraint:restrict_to, table_format:avro/snap/block
table_name:complextypes_fileformat, constraint:restrict_to, table_format:rc/snap/block
table_name:complextypes_fileformat, constraint:restrict_to, table_format:seq/snap/block
+table_name:complextypes_fileformat, constraint:restrict_to, table_format:orc/def/block
table_name:complextypes_multifileformat, constraint:restrict_to, table_format:text/none/none
# TODO: Avro
@@ -134,6 +135,8 @@ table_name:decimal_tbl, constraint:restrict_to, table_format:parquet/none/none
table_name:decimal_tiny, constraint:restrict_to, table_format:parquet/none/none
table_name:decimal_tbl, constraint:restrict_to, table_format:kudu/none/none
table_name:decimal_tiny, constraint:restrict_to, table_format:kudu/none/none
+table_name:decimal_tbl, constraint:restrict_to, table_format:orc/def/block
+table_name:decimal_tiny, constraint:restrict_to, table_format:orc/def/block
table_name:avro_decimal_tbl, constraint:restrict_to, table_format:avro/snap/block
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/workloads/functional-planner/queries/PlannerTest/complex-types-file-formats.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/complex-types-file-formats.test b/testdata/workloads/functional-planner/queries/PlannerTest/complex-types-file-formats.test
index 9c68c65..1e61b7d 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/complex-types-file-formats.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/complex-types-file-formats.test
@@ -15,6 +15,38 @@ PLAN-ROOT SINK
partitions=1/1 files=1 size=227B
predicates: !empty(t.a)
====
+# Complex types are not supported on ORC.
+select 1 from functional_orc_def.complextypes_fileformat t, t.a
+---- PLAN
+not implemented: Scan of table 't' in format 'ORC' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'.
+Complex types are supported for these file formats: PARQUET.
+====
+select s.f1 from functional_orc_def.complextypes_fileformat t, t.m
+---- PLAN
+not implemented: Scan of table 't' in format 'ORC' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'.
+Complex types are supported for these file formats: PARQUET.
+====
+# Complex types are not supported on ORC; however, queries materializing
+# only scalar-typed columns are allowed.
+select id from functional_orc_def.complextypes_fileformat
+---- PLAN
+PLAN-ROOT SINK
+|
+00:SCAN HDFS [functional_orc_def.complextypes_fileformat]
+ partitions=1/1 files=1 size=624B
+====
+# Complex types are not supported on ORC but count(*) and similar
+# queries should work.
+select count(*) from functional_orc_def.complextypes_fileformat
+---- PLAN
+PLAN-ROOT SINK
+|
+01:AGGREGATE [FINALIZE]
+| output: count(*)
+|
+00:SCAN HDFS [functional_orc_def.complextypes_fileformat]
+ partitions=1/1 files=1 size=624B
+====
# Complex types are not supported on Avro.
select s.f1 from functional_avro_snap.complextypes_fileformat t, t.a
---- PLAN
@@ -111,11 +143,12 @@ select complex_struct_col.f1 from functional_hbase.allcomplextypes
not implemented: Scan of table 'functional_hbase.allcomplextypes.complex_struct_col.f1' is not supported because 'functional_hbase.allcomplextypes' references a nested field/collection.
Complex types are supported for these file formats: PARQUET.
====
-# The complextypes_multifileformat has three partitions with different file formats:
+# The complextypes_multifileformat has five partitions with different file formats:
# p=1 text
# p=2 parquet
# p=3 avro
# p=4 rc
+# p=5 orc
# Scanning a text partition of a multi-format table with complex types fails.
select 1 from functional.complextypes_multifileformat where p = 1
---- PLAN
@@ -136,7 +169,7 @@ PLAN-ROOT SINK
| 03:UNNEST [t.a]
|
00:SCAN HDFS [functional.complextypes_multifileformat t]
- partitions=1/4 files=1 size=128B
+ partitions=1/5 files=1 size=128B
predicates: !empty(t.a)
====
# Scanning an Avro partition of a multi-format table with complex types fails.
@@ -161,5 +194,23 @@ PLAN-ROOT SINK
| output: count(*)
|
00:SCAN HDFS [functional.complextypes_multifileformat]
- partitions=1/4 files=1 size=128B
+ partitions=1/5 files=1 size=128B
+====
+# Scanning an ORC file partition of a multi-format table with complex types fails.
+select id from functional.complextypes_multifileformat t, t.a where p = 5
+---- PLAN
+not implemented: Scan of partition 'p=5' in format 'ORC' of table 't' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'.
+Complex types are supported for these file formats: PARQUET.
+====
+# Complex types are not supported on ORC files but count(*) and similar
+# queries should work.
+select count(*) from functional.complextypes_multifileformat where p = 5
+---- PLAN
+PLAN-ROOT SINK
+|
+01:AGGREGATE [FINALIZE]
+| output: count(*)
+|
+00:SCAN HDFS [functional.complextypes_multifileformat]
+ partitions=1/5 files=1 size=128B
====
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/workloads/functional-query/functional-query_core.csv
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/functional-query_core.csv b/testdata/workloads/functional-query/functional-query_core.csv
index dffca78..7118e3f 100644
--- a/testdata/workloads/functional-query/functional-query_core.csv
+++ b/testdata/workloads/functional-query/functional-query_core.csv
@@ -2,6 +2,7 @@
file_format:text, dataset:functional, compression_codec:none, compression_type:none
file_format:seq, dataset:functional, compression_codec:snap, compression_type:block
file_format:rc, dataset: functional, compression_codec: snap, compression_type: block
+file_format:orc, dataset: functional, compression_codec: def, compression_type: block
file_format:parquet, dataset: functional, compression_codec: none, compression_type: none
file_format:avro, dataset: functional, compression_codec: snap, compression_type: block
file_format:hbase, dataset:functional, compression_codec:none, compression_type:none
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/workloads/functional-query/functional-query_dimensions.csv
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/functional-query_dimensions.csv b/testdata/workloads/functional-query/functional-query_dimensions.csv
index 539122b..bcb4406 100644
--- a/testdata/workloads/functional-query/functional-query_dimensions.csv
+++ b/testdata/workloads/functional-query/functional-query_dimensions.csv
@@ -1,4 +1,4 @@
-file_format: text,seq,rc,avro,parquet,hbase,kudu
+file_format: text,seq,rc,avro,parquet,orc,hbase,kudu
dataset: functional
compression_codec: none,def,gzip,bzip,snap,lzo
compression_type: none,block,record
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/workloads/functional-query/functional-query_exhaustive.csv
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/functional-query_exhaustive.csv b/testdata/workloads/functional-query/functional-query_exhaustive.csv
index 18331c6..a06ab52 100644
--- a/testdata/workloads/functional-query/functional-query_exhaustive.csv
+++ b/testdata/workloads/functional-query/functional-query_exhaustive.csv
@@ -22,5 +22,6 @@ file_format: avro, dataset: functional, compression_codec: none, compression_typ
file_format: avro, dataset: functional, compression_codec: def, compression_type: block
file_format: avro, dataset: functional, compression_codec: snap, compression_type: block
file_format: parquet, dataset: functional, compression_codec: none, compression_type: none
+file_format: orc, dataset: functional, compression_codec: def, compression_type: block
file_format: hbase, dataset: functional, compression_codec: none, compression_type: none
file_format: kudu, dataset: functional, compression_codec: none, compression_type: none
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/workloads/functional-query/functional-query_pairwise.csv
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/functional-query_pairwise.csv b/testdata/workloads/functional-query/functional-query_pairwise.csv
index 0a4ee09..e046a09 100644
--- a/testdata/workloads/functional-query/functional-query_pairwise.csv
+++ b/testdata/workloads/functional-query/functional-query_pairwise.csv
@@ -4,5 +4,6 @@ file_format: seq, dataset: functional, compression_codec: def, compression_type:
file_format: rc, dataset: functional, compression_codec: gzip, compression_type: block
file_format: avro, dataset: functional, compression_codec: snap, compression_type: block
file_format: parquet, dataset: functional, compression_codec: none, compression_type: none
+file_format: orc, dataset: functional, compression_codec: def, compression_type: block
file_format: hbase, dataset: functional, compression_codec: none, compression_type: none
file_format: kudu, dataset: functional, compression_codec: none, compression_type: none
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/workloads/functional-query/queries/DataErrorsTest/orc-type-checks.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/DataErrorsTest/orc-type-checks.test b/testdata/workloads/functional-query/queries/DataErrorsTest/orc-type-checks.test
new file mode 100644
index 0000000..ee06258
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/DataErrorsTest/orc-type-checks.test
@@ -0,0 +1,127 @@
+====
+---- QUERY
+select c1 from illtypes
+---- CATCH
+Type mismatch: table column BOOLEAN is map to column int in ORC file
+====
+---- QUERY
+select c2 from illtypes
+---- CATCH
+Type mismatch: table column FLOAT is map to column boolean in ORC file
+====
+---- QUERY
+select c3 from illtypes
+---- CATCH
+Type mismatch: table column BOOLEAN is map to column tinyint in ORC file
+====
+---- QUERY
+select c4 from illtypes
+---- CATCH
+Type mismatch: table column TINYINT is map to column smallint in ORC file
+====
+---- QUERY
+select c5 from illtypes
+---- CATCH
+Type mismatch: table column SMALLINT is map to column int in ORC file
+====
+---- QUERY
+select c6 from illtypes
+---- CATCH
+Type mismatch: table column INT is map to column bigint in ORC file
+====
+---- QUERY
+select c7 from illtypes
+---- CATCH
+Type mismatch: table column BOOLEAN is map to column float in ORC file
+====
+---- QUERY
+select c8 from illtypes
+---- CATCH
+Type mismatch: table column STRING is map to column double in ORC file
+====
+---- QUERY
+select c9 from illtypes
+---- CATCH
+Type mismatch: table column INT is map to column string in ORC file
+====
+---- QUERY
+select c10 from illtypes
+---- CATCH
+Type mismatch: table column FLOAT is map to column string in ORC file
+====
+---- QUERY
+select c11 from illtypes
+---- CATCH
+Type mismatch: table column BIGINT is map to column timestamp in ORC file
+====
+---- QUERY
+select * from safetypes order by c1
+---- TYPES
+bigint,boolean,smallint,int,bigint,bigint,double,double,char,string,timestamp,int,int
+---- RESULTS
+0,true,0,0,0,0,0,0,'01/','0',2009-01-01 00:00:00,2009,1
+1,false,1,1,1,10,1.100000023841858,10.1,'01/','1',2009-01-01 00:01:00,2009,1
+2,true,0,0,0,0,0,0,'02/','0',2009-02-01 00:00:00,2009,2
+3,false,1,1,1,10,1.100000023841858,10.1,'02/','1',2009-02-01 00:01:00,2009,2
+4,true,0,0,0,0,0,0,'03/','0',2009-03-01 00:00:00,2009,3
+5,false,1,1,1,10,1.100000023841858,10.1,'03/','1',2009-03-01 00:01:00,2009,3
+6,true,0,0,0,0,0,0,'04/','0',2009-04-01 00:00:00,2009,4
+7,false,1,1,1,10,1.100000023841858,10.1,'04/','1',2009-04-01 00:01:00,2009,4
+====
+---- QUERY
+select d1 from mismatch_decimals
+---- TYPES
+decimal
+---- RESULTS
+1234
+2345
+12345
+12345
+132842
+====
+---- QUERY
+select d2 from mismatch_decimals
+---- TYPES
+decimal
+---- RESULTS
+---- CATCH
+It can't be truncated to table column DECIMAL(8,0) for column decimal(10,0) in ORC file
+====
+---- QUERY
+select d3 from mismatch_decimals
+---- TYPES
+decimal
+---- RESULTS
+1.2345678900
+12.3456789000
+123.4567890000
+1234.5678900000
+12345.6789000000
+====
+---- QUERY
+select d4 from mismatch_decimals
+---- TYPES
+decimal
+---- RESULTS
+---- CATCH
+Type mismatch: table column DECIMAL(20,20) is map to column decimal(38,38) in ORC file
+====
+---- QUERY
+select d5 from mismatch_decimals
+---- TYPES
+decimal
+---- RESULTS
+---- CATCH
+Type mismatch: table column DECIMAL(2,0) is map to column decimal(10,5) in ORC file
+====
+---- QUERY
+select d6 from mismatch_decimals
+---- TYPES
+decimal
+---- RESULTS
+1
+1
+1
+1
+1
+====
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/workloads/tpcds/tpcds_core.csv
----------------------------------------------------------------------
diff --git a/testdata/workloads/tpcds/tpcds_core.csv b/testdata/workloads/tpcds/tpcds_core.csv
index 94b4b22..48cc97d 100644
--- a/testdata/workloads/tpcds/tpcds_core.csv
+++ b/testdata/workloads/tpcds/tpcds_core.csv
@@ -2,3 +2,4 @@
file_format: text, dataset: tpcds, compression_codec: none, compression_type: none
file_format: seq, dataset: tpcds, compression_codec: snap, compression_type: block
file_format: parquet, dataset: tpcds, compression_codec: none, compression_type: none
+file_format: orc, dataset: tpcds, compression_codec: def, compression_type: block
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/workloads/tpcds/tpcds_dimensions.csv
----------------------------------------------------------------------
diff --git a/testdata/workloads/tpcds/tpcds_dimensions.csv b/testdata/workloads/tpcds/tpcds_dimensions.csv
index 8137b7a..bae5d90 100644
--- a/testdata/workloads/tpcds/tpcds_dimensions.csv
+++ b/testdata/workloads/tpcds/tpcds_dimensions.csv
@@ -1,4 +1,4 @@
-file_format: text,seq,rc,avro,parquet
+file_format: text,seq,rc,avro,parquet,orc
dataset: tpcds
compression_codec: none,def,gzip,bzip,snap,lzo
compression_type: none,block,record
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/workloads/tpcds/tpcds_exhaustive.csv
----------------------------------------------------------------------
diff --git a/testdata/workloads/tpcds/tpcds_exhaustive.csv b/testdata/workloads/tpcds/tpcds_exhaustive.csv
index c4b4f99..57fcddd 100644
--- a/testdata/workloads/tpcds/tpcds_exhaustive.csv
+++ b/testdata/workloads/tpcds/tpcds_exhaustive.csv
@@ -21,3 +21,6 @@ file_format: avro, dataset: tpcds, compression_codec: snap, compression_type: bl
file_format: parquet, dataset: tpcds, compression_codec: none, compression_type: none
file_format: parquet, dataset: tpcds, compression_codec: def, compression_type: block
file_format: parquet, dataset: tpcds, compression_codec: snap, compression_type: block
+file_format: orc, dataset: tpcds, compression_codec: none, compression_type: none
+file_format: orc, dataset: tpcds, compression_codec: def, compression_type: block
+file_format: orc, dataset: tpcds, compression_codec: snap, compression_type: block
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/workloads/tpcds/tpcds_pairwise.csv
----------------------------------------------------------------------
diff --git a/testdata/workloads/tpcds/tpcds_pairwise.csv b/testdata/workloads/tpcds/tpcds_pairwise.csv
index e643495..61ee66c 100644
--- a/testdata/workloads/tpcds/tpcds_pairwise.csv
+++ b/testdata/workloads/tpcds/tpcds_pairwise.csv
@@ -13,3 +13,6 @@ file_format: rc, dataset: tpcds, compression_codec: def, compression_type: block
file_format: avro, dataset: tpcds, compression_codec: none, compression_type: none
file_format: parquet, dataset: tpcds, compression_codec: none, compression_type: none
file_format: rc, dataset: tpcds, compression_codec: none, compression_type: none
+file_format: orc, dataset: tpcds, compression_codec: none, compression_type: none
+file_format: orc, dataset: tpcds, compression_codec: def, compression_type: block
+file_format: orc, dataset: tpcds, compression_codec: snap, compression_type: block
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/workloads/tpch/tpch_core.csv
----------------------------------------------------------------------
diff --git a/testdata/workloads/tpch/tpch_core.csv b/testdata/workloads/tpch/tpch_core.csv
index 86804ac..024063c 100644
--- a/testdata/workloads/tpch/tpch_core.csv
+++ b/testdata/workloads/tpch/tpch_core.csv
@@ -7,4 +7,5 @@ file_format:rc, dataset:tpch, compression_codec:none, compression_type:none
file_format:avro, dataset:tpch, compression_codec: none, compression_type: none
file_format:avro, dataset:tpch, compression_codec: snap, compression_type: block
file_format:parquet, dataset:tpch, compression_codec: none, compression_type: none
+file_format:orc, dataset:tpch, compression_codec: def, compression_type: block
file_format:kudu, dataset:tpch, compression_codec: none, compression_type: none
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/workloads/tpch/tpch_dimensions.csv
----------------------------------------------------------------------
diff --git a/testdata/workloads/tpch/tpch_dimensions.csv b/testdata/workloads/tpch/tpch_dimensions.csv
index 1de34aa..f1ce5f0 100644
--- a/testdata/workloads/tpch/tpch_dimensions.csv
+++ b/testdata/workloads/tpch/tpch_dimensions.csv
@@ -1,4 +1,4 @@
-file_format: text,seq,rc,avro,parquet,kudu
+file_format: text,seq,rc,avro,parquet,orc,kudu
dataset: tpch
compression_codec: none,def,gzip,bzip,snap,lzo
compression_type: none,block,record
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/workloads/tpch/tpch_exhaustive.csv
----------------------------------------------------------------------
diff --git a/testdata/workloads/tpch/tpch_exhaustive.csv b/testdata/workloads/tpch/tpch_exhaustive.csv
index 32085bf..3513dc5 100644
--- a/testdata/workloads/tpch/tpch_exhaustive.csv
+++ b/testdata/workloads/tpch/tpch_exhaustive.csv
@@ -22,4 +22,7 @@ file_format: avro, dataset: tpch, compression_codec: snap, compression_type: blo
file_format: parquet, dataset: tpch, compression_codec: none, compression_type: none
file_format: parquet, dataset: tpch, compression_codec: def, compression_type: block
file_format: parquet, dataset: tpch, compression_codec: snap, compression_type: block
+file_format: orc, dataset: tpch, compression_codec: none, compression_type: none
+file_format: orc, dataset: tpch, compression_codec: def, compression_type: block
+file_format: orc, dataset: tpch, compression_codec: snap, compression_type: block
file_format: kudu, dataset:tpch, compression_codec: none, compression_type: none
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/workloads/tpch/tpch_pairwise.csv
----------------------------------------------------------------------
diff --git a/testdata/workloads/tpch/tpch_pairwise.csv b/testdata/workloads/tpch/tpch_pairwise.csv
index 0744cf5..2eb4176 100644
--- a/testdata/workloads/tpch/tpch_pairwise.csv
+++ b/testdata/workloads/tpch/tpch_pairwise.csv
@@ -13,4 +13,7 @@ file_format: rc, dataset: tpch, compression_codec: def, compression_type: block
file_format: avro, dataset: tpch, compression_codec: none, compression_type: none
file_format: parquet, dataset: tpch, compression_codec: none, compression_type: none
file_format: rc, dataset: tpch, compression_codec: none, compression_type: none
+file_format: orc, dataset: tpch, compression_codec: none, compression_type: none
+file_format: orc, dataset: tpch, compression_codec: def, compression_type: block
+file_format: orc, dataset: tpch, compression_codec: snap, compression_type: block
file_format: kudu, dataset:tpch, compression_codec: none, compression_type: none
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/tests/common/impala_test_suite.py
----------------------------------------------------------------------
diff --git a/tests/common/impala_test_suite.py b/tests/common/impala_test_suite.py
index d57b1cb..2e35c67 100644
--- a/tests/common/impala_test_suite.py
+++ b/tests/common/impala_test_suite.py
@@ -503,7 +503,7 @@ class ImpalaTestSuite(BaseTestSuite):
Database names are dependent on the input format for table, which the table names
remaining the same. A use database is issued before query execution. As such,
- dabase names need to be build pre execution, this method wraps around the different
+ database names need to be built pre-execution; this method wraps around the different
execute methods and provides a common interface to issue the proper use command.
"""
@wraps(function)
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/tests/common/test_dimensions.py
----------------------------------------------------------------------
diff --git a/tests/common/test_dimensions.py b/tests/common/test_dimensions.py
index 4171e1f..df3f8c2 100644
--- a/tests/common/test_dimensions.py
+++ b/tests/common/test_dimensions.py
@@ -28,7 +28,7 @@ WORKLOAD_DIR = os.environ['IMPALA_WORKLOAD_DIR']
# of what specific table format to target along with the exec options (num_nodes, etc)
# to use when running the query.
class TableFormatInfo(object):
- KNOWN_FILE_FORMATS = ['text', 'seq', 'rc', 'parquet', 'avro', 'hbase']
+ KNOWN_FILE_FORMATS = ['text', 'seq', 'rc', 'parquet', 'orc', 'avro', 'hbase']
if os.environ['KUDU_IS_SUPPORTED'] == 'true':
KNOWN_FILE_FORMATS.append('kudu')
KNOWN_COMPRESSION_CODECS = ['none', 'snap', 'gzip', 'bzip', 'def', 'lzo']
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/tests/common/test_vector.py
----------------------------------------------------------------------
diff --git a/tests/common/test_vector.py b/tests/common/test_vector.py
index 4d22269..0c9cca4 100644
--- a/tests/common/test_vector.py
+++ b/tests/common/test_vector.py
@@ -52,7 +52,7 @@
# otherwise. For example, if we want to make sure 'bool' columns are not used with 'sum':
#
# ImpalaTestMatrix.add_constraint(lambda v:\
-# not (v.get_value('col_type') == 'bool and v.get_value('agg_func') == 'sum'))
+# not (v.get_value('col_type') == 'bool' and v.get_value('agg_func') == 'sum'))
#
# Additional examples of usage can be found within the test suites.
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/tests/comparison/cli_options.py
----------------------------------------------------------------------
diff --git a/tests/comparison/cli_options.py b/tests/comparison/cli_options.py
index 885ef84..1d737cf 100644
--- a/tests/comparison/cli_options.py
+++ b/tests/comparison/cli_options.py
@@ -221,7 +221,7 @@ def create_cluster(args):
def add_storage_format_options(parser):
- storage_formats = ['avro', 'parquet', 'rcfile', 'sequencefile', 'textfile']
+ storage_formats = ['avro', 'parquet', 'orc', 'rcfile', 'sequencefile', 'textfile']
parser.add_argument(
'--storage-file-formats', default=','.join(storage_formats),
help='A comma separated list of storage formats to use.')
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/tests/query_test/test_chars.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_chars.py b/tests/query_test/test_chars.py
index b182b91..4444410 100644
--- a/tests/query_test/test_chars.py
+++ b/tests/query_test/test_chars.py
@@ -57,6 +57,11 @@ class TestCharFormats(ImpalaTestSuite):
STORED AS PARQUET
LOCATION "{0}"'''.format(get_fs_path("/test-warehouse/chars_formats_parquet")))
self.client.execute('''create external table if not exists
+ functional_orc_def.chars_formats
+ (cs CHAR(5), cl CHAR(140), vc VARCHAR(32))
+ STORED AS ORC
+ LOCATION "{0}"'''.format(get_fs_path("/test-warehouse/chars_formats_orc_def")))
+ self.client.execute('''create external table if not exists
functional.chars_formats
(cs CHAR(5), cl CHAR(140), vc VARCHAR(32))
ROW FORMAT delimited fields terminated by ',' escaped by '\\\\'
@@ -84,6 +89,7 @@ class TestCharFormats(ImpalaTestSuite):
(v.get_value('table_format').file_format in ['avro'] and
v.get_value('table_format').compression_codec in ['snap']) or
v.get_value('table_format').file_format in ['parquet'] or
+ v.get_value('table_format').file_format in ['orc'] or
(v.get_value('table_format').file_format in ['text'] and
v.get_value('table_format').compression_codec in ['none']))
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/tests/query_test/test_decimal_queries.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_decimal_queries.py b/tests/query_test/test_decimal_queries.py
index 3a14ed3..45a702d 100644
--- a/tests/query_test/test_decimal_queries.py
+++ b/tests/query_test/test_decimal_queries.py
@@ -43,7 +43,7 @@ class TestDecimalQueries(ImpalaTestSuite):
cls.ImpalaTestMatrix.add_constraint(lambda v:\
(v.get_value('table_format').file_format == 'text' and
v.get_value('table_format').compression_codec == 'none') or
- v.get_value('table_format').file_format in ['parquet', 'kudu'])
+ v.get_value('table_format').file_format in ['parquet', 'orc', 'kudu'])
def test_queries(self, vector):
self.run_test_case('QueryTest/decimal', vector)
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/tests/query_test/test_scanners.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py
index 2dcc213..bae52a6 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -105,7 +105,7 @@ class TestScannersAllTableFormatsWithLimit(ImpalaTestSuite):
query_template = "select * from alltypes limit %s"
for i in range(1, iterations):
# Vary the limit to vary the timing of cancellation
- limit = (iterations * 100) % 1000 + 1
+ limit = (i * 100) % 1001 + 1
query = query_template % limit
result = self.execute_query(query, vector.get_value('exec_option'),
table_format=vector.get_value('table_format'))
@@ -837,7 +837,7 @@ class TestTextScanRangeLengths(ImpalaTestSuite):
@SkipIfLocal.hive
class TestScanTruncatedFiles(ImpalaTestSuite):
@classmethod
- def get_workload(self):
+ def get_workload(cls):
return 'functional-query'
@classmethod
@@ -900,3 +900,101 @@ class TestUncompressedText(ImpalaTestSuite):
check_call(['hdfs', 'dfs', '-copyFromLocal', os.environ['IMPALA_HOME'] +
"/testdata/data/lazy_timestamp.csv", tbl_loc])
self.run_test_case('QueryTest/select-lazy-timestamp', vector, unique_database)
+
+class TestOrc(ImpalaTestSuite):
+ @classmethod
+ def get_workload(cls):
+ return 'functional-query'
+
+ @classmethod
+ def add_test_dimensions(cls):
+ super(TestOrc, cls).add_test_dimensions()
+ cls.ImpalaTestMatrix.add_constraint(
+ lambda v: v.get_value('table_format').file_format == 'orc')
+
+ def test_misaligned_orc_stripes(self, vector, unique_database):
+ self._build_lineitem_table_helper(unique_database, 'lineitem_threeblocks',
+ 'lineitem_threeblocks.orc')
+ self._build_lineitem_table_helper(unique_database, 'lineitem_sixblocks',
+ 'lineitem_sixblocks.orc')
+ self._build_lineitem_table_helper(unique_database,
+ 'lineitem_orc_multiblock_one_stripe',
+ 'lineitem_orc_multiblock_one_stripe.orc')
+
+ # functional_orc_def.alltypes is well-formed, so 'NumScannersWithNoReads' counters are
+ # set to 0.
+ table_name = 'functional_orc_def.alltypes'
+ self._misaligned_orc_stripes_helper(table_name, 7300)
+ # lineitem_threeblocks.orc is ill-formatted but every scanner reads some stripes.
+ # 'NumScannersWithNoReads' counters are set to 0.
+ table_name = unique_database + '.lineitem_threeblocks'
+ self._misaligned_orc_stripes_helper(table_name, 16000)
+ # lineitem_sixblocks.orc is ill-formatted but every scanner reads some stripes.
+ # 'NumScannersWithNoReads' counters are set to 0.
+ table_name = unique_database + '.lineitem_sixblocks'
+ self._misaligned_orc_stripes_helper(table_name, 30000)
+ # Scanning lineitem_orc_multiblock_one_stripe.orc finds two scan ranges that end up
+ # doing no reads because the file is poorly formatted.
+ table_name = unique_database + '.lineitem_orc_multiblock_one_stripe'
+ self._misaligned_orc_stripes_helper(
+ table_name, 16000, num_scanners_with_no_reads=2)
+
+ def _build_lineitem_table_helper(self, db, tbl, file):
+ self.client.execute("create table %s.%s like tpch.lineitem stored as orc" % (db, tbl))
+ tbl_loc = get_fs_path("/test-warehouse/%s.db/%s" % (db, tbl))
+ # set block size to 156672 so lineitem_threeblocks.orc occupies 3 blocks,
+ # lineitem_sixblocks.orc occupies 6 blocks.
+ check_call(['hdfs', 'dfs', '-Ddfs.block.size=156672', '-copyFromLocal',
+ os.environ['IMPALA_HOME'] + "/testdata/LineItemMultiBlock/" + file, tbl_loc])
+
+ def _misaligned_orc_stripes_helper(
+ self, table_name, rows_in_table, num_scanners_with_no_reads=0):
+ """Checks if 'num_scanners_with_no_reads' indicates the expected number of scanners
+ that don't read anything because the underlying file is poorly formatted.
+ """
+ query = 'select * from %s' % table_name
+ result = self.client.execute(query)
+ assert len(result.data) == rows_in_table
+
+ runtime_profile = str(result.runtime_profile)
+ num_scanners_with_no_reads_list = re.findall(
+ 'NumScannersWithNoReads: ([0-9]*)', runtime_profile)
+
+ # This will fail if the number of impalads != 3
+ # The fourth fragment is the "Averaged Fragment"
+ assert len(num_scanners_with_no_reads_list) == 4
+
+ # Calculate the total number of scan ranges that ended up not reading anything because
+ # an underlying file was poorly formatted.
+ # Skip the Averaged Fragment; it comes first in the runtime profile.
+ total = 0
+ for n in num_scanners_with_no_reads_list[1:]:
+ total += int(n)
+ assert total == num_scanners_with_no_reads
+
+ def test_type_conversions(self, vector, unique_database):
+ # Create an "illtypes" table whose column types can't match the underlying ORC file's.
+ # Create a "safetypes" table like the one above, but whose columns the ORC values can still safely fit into.
+ # Both reuse the data files of functional_orc_def.alltypestiny.
+ tbl_loc = get_fs_path("/test-warehouse/alltypestiny_orc_def")
+ self.client.execute("""create external table %s.illtypes (c1 boolean, c2 float,
+ c3 boolean, c4 tinyint, c5 smallint, c6 int, c7 boolean, c8 string, c9 int,
+ c10 float, c11 bigint) partitioned by (year int, month int) stored as ORC
+ location '%s';""" % (unique_database, tbl_loc))
+ self.client.execute("""create external table %s.safetypes (c1 bigint, c2 boolean,
+ c3 smallint, c4 int, c5 bigint, c6 bigint, c7 double, c8 double, c9 char(3),
+ c10 varchar(3), c11 timestamp) partitioned by (year int, month int) stored as ORC
+ location '%s';""" % (unique_database, tbl_loc))
+ self.client.execute("alter table %s.illtypes recover partitions" % unique_database)
+ self.client.execute("alter table %s.safetypes recover partitions" % unique_database)
+
+ # Create a decimal table whose precisions don't match the underlying orc files.
+ # Reuse the data files of functional_orc_def.decimal_tbl.
+ decimal_loc = get_fs_path("/test-warehouse/decimal_tbl_orc_def")
+ self.client.execute("""create external table %s.mismatch_decimals (d1 decimal(8,0),
+ d2 decimal(8,0), d3 decimal(19,10), d4 decimal(20,20), d5 decimal(2,0))
+ partitioned by (d6 decimal(9,0)) stored as orc location '%s'"""
+ % (unique_database, decimal_loc))
+ self.client.execute("alter table %s.mismatch_decimals recover partitions" % unique_database)
+
+ self.run_test_case('DataErrorsTest/orc-type-checks', vector, unique_database)
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/tests/query_test/test_scanners_fuzz.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_scanners_fuzz.py b/tests/query_test/test_scanners_fuzz.py
index c336a17..791c343 100644
--- a/tests/query_test/test_scanners_fuzz.py
+++ b/tests/query_test/test_scanners_fuzz.py
@@ -61,6 +61,8 @@ class TestScannersFuzzing(ImpalaTestSuite):
'num_nodes' : cls.NUM_NODES_VALUES,
'mem_limit' : cls.MEM_LIMITS}))
# TODO: enable for more table formats once they consistently pass the fuzz test.
+ # TODO(IMPALA-6772): enable for ORC once a new release of the ORC library
+ # (later than release-1.4.3) is available.
cls.ImpalaTestMatrix.add_constraint(lambda v:
v.get_value('table_format').file_format in ('avro', 'parquet') or
(v.get_value('table_format').file_format == 'text' and
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/tests/query_test/test_tpch_queries.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_tpch_queries.py b/tests/query_test/test_tpch_queries.py
index ece8347..68a2984 100644
--- a/tests/query_test/test_tpch_queries.py
+++ b/tests/query_test/test_tpch_queries.py
@@ -36,7 +36,7 @@ class TestTpchQuery(ImpalaTestSuite):
# TODO: the planner tests are based on text and need this.
if cls.exploration_strategy() == 'core':
cls.ImpalaTestMatrix.add_constraint(lambda v:\
- v.get_value('table_format').file_format in ['text', 'parquet', 'kudu'])
+ v.get_value('table_format').file_format in ['text', 'parquet', 'kudu', 'orc'])
def idfn(val):
return "TPC-H: Q{0}".format(val)