Posted to commits@impala.apache.org by ta...@apache.org on 2018/04/11 05:34:49 UTC
[5/6] impala git commit: IMPALA-5717: Support for reading ORC data files
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/common/thrift/CatalogObjects.thrift
----------------------------------------------------------------------
diff --git a/common/thrift/CatalogObjects.thrift b/common/thrift/CatalogObjects.thrift
index ecd27dc..0f71f5f 100644
--- a/common/thrift/CatalogObjects.thrift
+++ b/common/thrift/CatalogObjects.thrift
@@ -58,7 +58,8 @@ enum THdfsFileFormat {
SEQUENCE_FILE,
AVRO,
PARQUET,
- KUDU
+ KUDU,
+ ORC
}
// TODO: Since compression is also enabled for Kudu columns, we should
@@ -73,7 +74,8 @@ enum THdfsCompression {
SNAPPY_BLOCKED,
LZO,
LZ4,
- ZLIB
+ ZLIB,
+ ZSTD
}
enum TColumnEncoding {
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/fe/src/main/cup/sql-parser.cup
----------------------------------------------------------------------
diff --git a/fe/src/main/cup/sql-parser.cup b/fe/src/main/cup/sql-parser.cup
index 999ed16..f2d7cef 100644
--- a/fe/src/main/cup/sql-parser.cup
+++ b/fe/src/main/cup/sql-parser.cup
@@ -263,8 +263,8 @@ terminal
KW_IN, KW_INCREMENTAL, KW_INIT_FN, KW_INNER, KW_INPATH, KW_INSERT, KW_INT,
KW_INTERMEDIATE, KW_INTERVAL, KW_INTO, KW_INVALIDATE, KW_IREGEXP, KW_IS, KW_JOIN,
KW_KUDU, KW_LAST, KW_LEFT, KW_LIKE, KW_LIMIT, KW_LINES, KW_LOAD, KW_LOCATION, KW_MAP,
- KW_MERGE_FN, KW_METADATA, KW_NOT, KW_NULL, KW_NULLS, KW_OFFSET, KW_ON, KW_OR, KW_ORDER,
- KW_OUTER, KW_OVER, KW_OVERWRITE, KW_PARQUET, KW_PARQUETFILE, KW_PARTITION,
+ KW_MERGE_FN, KW_METADATA, KW_NOT, KW_NULL, KW_NULLS, KW_OFFSET, KW_ON, KW_OR, KW_ORC,
+ KW_ORDER, KW_OUTER, KW_OVER, KW_OVERWRITE, KW_PARQUET, KW_PARQUETFILE, KW_PARTITION,
KW_PARTITIONED, KW_PARTITIONS, KW_PRECEDING, KW_PREPARE_FN, KW_PRIMARY, KW_PRODUCED,
KW_PURGE, KW_RANGE, KW_RCFILE, KW_RECOVER, KW_REFRESH, KW_REGEXP, KW_RENAME,
KW_REPEATABLE, KW_REPLACE, KW_REPLICATION, KW_RESTRICT, KW_RETURNS, KW_REVOKE,
@@ -1562,6 +1562,8 @@ file_format_val ::=
{: RESULT = THdfsFileFormat.PARQUET; :}
| KW_PARQUETFILE
{: RESULT = THdfsFileFormat.PARQUET; :}
+ | KW_ORC
+ {: RESULT = THdfsFileFormat.ORC; :}
| KW_TEXTFILE
{: RESULT = THdfsFileFormat.TEXT; :}
| KW_SEQUENCEFILE
@@ -3487,6 +3489,8 @@ word ::=
{: RESULT = r.toString(); :}
| KW_OR:r
{: RESULT = r.toString(); :}
+ | KW_ORC:r
+ {: RESULT = r.toString(); :}
| KW_ORDER:r
{: RESULT = r.toString(); :}
| KW_OUTER:r
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/fe/src/main/java/org/apache/impala/analysis/ComputeStatsStmt.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/analysis/ComputeStatsStmt.java b/fe/src/main/java/org/apache/impala/analysis/ComputeStatsStmt.java
index 5df3dfa..e442d66 100644
--- a/fe/src/main/java/org/apache/impala/analysis/ComputeStatsStmt.java
+++ b/fe/src/main/java/org/apache/impala/analysis/ComputeStatsStmt.java
@@ -743,10 +743,10 @@ public class ComputeStatsStmt extends StatementBase {
public Set<Column> getValidatedColumnWhitelist() { return validatedColumnWhitelist_; }
/**
- * Returns true if this statement computes stats on Parquet partitions only,
+ * Returns true if this statement computes stats on Parquet/ORC partitions only,
* false otherwise.
*/
- public boolean isParquetOnly() {
+ public boolean isColumnar() {
if (!(table_ instanceof HdfsTable)) return false;
Collection<HdfsPartition> affectedPartitions = null;
if (partitionSet_ != null) {
@@ -755,7 +755,9 @@ public class ComputeStatsStmt extends StatementBase {
affectedPartitions = ((HdfsTable) table_).getPartitions();
}
for (HdfsPartition partition: affectedPartitions) {
- if (partition.getFileFormat() != HdfsFileFormat.PARQUET) return false;
+ if (partition.getFileFormat() != HdfsFileFormat.PARQUET
+ && partition.getFileFormat() != HdfsFileFormat.ORC)
+ return false;
}
return true;
}
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/fe/src/main/java/org/apache/impala/catalog/HdfsFileFormat.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/catalog/HdfsFileFormat.java b/fe/src/main/java/org/apache/impala/catalog/HdfsFileFormat.java
index e4fce60..32cae72 100644
--- a/fe/src/main/java/org/apache/impala/catalog/HdfsFileFormat.java
+++ b/fe/src/main/java/org/apache/impala/catalog/HdfsFileFormat.java
@@ -62,6 +62,10 @@ public enum HdfsFileFormat {
"org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
"org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
true, true),
+ ORC("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat",
+ "org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat",
+ "org.apache.hadoop.hive.ql.io.orc.OrcSerde",
+ false, true),
KUDU("org.apache.kudu.mapreduce.KuduTableInputFormat",
"org.apache.kudu.mapreduce.KuduTableOutputFormat",
"", false, false);
@@ -99,19 +103,23 @@ public enum HdfsFileFormat {
"parquet.hive.MapredParquetInputFormat"
};
- private static final Map<String, HdfsFileFormat> VALID_INPUT_FORMATS =
- ImmutableMap.<String, HdfsFileFormat>builder()
- .put(RC_FILE.inputFormat(), RC_FILE)
- .put(TEXT.inputFormat(), TEXT)
- .put(LZO_TEXT.inputFormat(), TEXT)
- .put(SEQUENCE_FILE.inputFormat(), SEQUENCE_FILE)
- .put(AVRO.inputFormat(), AVRO)
- .put(PARQUET.inputFormat(), PARQUET)
- .put(PARQUET_LEGACY_INPUT_FORMATS[0], PARQUET)
- .put(PARQUET_LEGACY_INPUT_FORMATS[1], PARQUET)
- .put(PARQUET_LEGACY_INPUT_FORMATS[2], PARQUET)
- .put(KUDU.inputFormat(), KUDU)
- .build();
+ private static Map<String, HdfsFileFormat> VALID_INPUT_FORMATS;
+ public static void init(boolean enableOrcScanner) {
+ ImmutableMap.Builder<String, HdfsFileFormat> builder =
+ ImmutableMap.<String, HdfsFileFormat>builder()
+ .put(RC_FILE.inputFormat(), RC_FILE)
+ .put(TEXT.inputFormat(), TEXT)
+ .put(LZO_TEXT.inputFormat(), TEXT)
+ .put(SEQUENCE_FILE.inputFormat(), SEQUENCE_FILE)
+ .put(AVRO.inputFormat(), AVRO)
+ .put(PARQUET.inputFormat(), PARQUET)
+ .put(PARQUET_LEGACY_INPUT_FORMATS[0], PARQUET)
+ .put(PARQUET_LEGACY_INPUT_FORMATS[1], PARQUET)
+ .put(PARQUET_LEGACY_INPUT_FORMATS[2], PARQUET)
+ .put(KUDU.inputFormat(), KUDU);
+ if (enableOrcScanner) builder.put(ORC.inputFormat(), ORC);
+ VALID_INPUT_FORMATS = builder.build();
+ }
/**
* Returns true if the string describes an input format class that we support.
@@ -145,6 +153,7 @@ public enum HdfsFileFormat {
case TEXT: return HdfsFileFormat.TEXT;
case SEQUENCE_FILE: return HdfsFileFormat.SEQUENCE_FILE;
case AVRO: return HdfsFileFormat.AVRO;
+ case ORC: return HdfsFileFormat.ORC;
case PARQUET: return HdfsFileFormat.PARQUET;
case KUDU: return HdfsFileFormat.KUDU;
default:
@@ -159,6 +168,7 @@ public enum HdfsFileFormat {
case TEXT: return THdfsFileFormat.TEXT;
case SEQUENCE_FILE: return THdfsFileFormat.SEQUENCE_FILE;
case AVRO: return THdfsFileFormat.AVRO;
+ case ORC: return THdfsFileFormat.ORC;
case PARQUET: return THdfsFileFormat.PARQUET;
case KUDU: return THdfsFileFormat.KUDU;
default:
@@ -170,6 +180,7 @@ public enum HdfsFileFormat {
public String toSql(HdfsCompression compressionType) {
switch (this) {
case RC_FILE: return "RCFILE";
+ case ORC: return "ORC";
case TEXT:
if (compressionType == HdfsCompression.LZO ||
compressionType == HdfsCompression.LZO_INDEX) {
@@ -240,6 +251,7 @@ public enum HdfsFileFormat {
case SEQUENCE_FILE:
case AVRO:
case PARQUET:
+ case ORC:
return true;
case KUDU:
return false;
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/fe/src/main/java/org/apache/impala/catalog/HdfsStorageDescriptor.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/catalog/HdfsStorageDescriptor.java b/fe/src/main/java/org/apache/impala/catalog/HdfsStorageDescriptor.java
index b4e2564..f51b10e 100644
--- a/fe/src/main/java/org/apache/impala/catalog/HdfsStorageDescriptor.java
+++ b/fe/src/main/java/org/apache/impala/catalog/HdfsStorageDescriptor.java
@@ -57,6 +57,7 @@ public class HdfsStorageDescriptor {
"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", // (seq / text / parquet)
"org.apache.hadoop.hive.serde2.avro.AvroSerDe", // (avro)
"org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe", // (rc)
+ "org.apache.hadoop.hive.ql.io.orc.OrcSerde", // (orc)
"parquet.hive.serde.ParquetHiveSerDe", // (parquet - legacy)
// TODO: Verify the following Parquet SerDe works with Impala and add
// support for the new input/output format classes. See IMPALA-4214.
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
index 7735f98..ac67d7d 100644
--- a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
@@ -332,11 +332,12 @@ public class HdfsScanNode extends ScanNode {
Set<HdfsFileFormat> fileFormats = computeScanRangeLocations(analyzer);
// Determine backend scan node implementation to use. The optimized MT implementation
- // is currently only supported for Parquet.
+ // is currently supported for Parquet, ORC and Text.
if (analyzer.getQueryOptions().isSetMt_dop() &&
analyzer.getQueryOptions().mt_dop > 0 &&
fileFormats.size() == 1 &&
(fileFormats.contains(HdfsFileFormat.PARQUET)
+ || fileFormats.contains(HdfsFileFormat.ORC)
|| fileFormats.contains(HdfsFileFormat.TEXT))) {
useMtScanNode_ = true;
} else {
@@ -1191,9 +1192,10 @@ public class HdfsScanNode extends ScanNode {
Preconditions.checkNotNull(desc_.getTable() instanceof HdfsTable);
HdfsTable table = (HdfsTable) desc_.getTable();
int perHostScanRanges;
- if (table.getMajorityFormat() == HdfsFileFormat.PARQUET) {
+ if (table.getMajorityFormat() == HdfsFileFormat.PARQUET
+ || table.getMajorityFormat() == HdfsFileFormat.ORC) {
// For the purpose of this estimation, the number of per-host scan ranges for
- // Parquet files are equal to the number of columns read from the file. I.e.
+ // Parquet/ORC files is equal to the number of columns read from the file, i.e.
// excluding partition columns and columns that are populated from file metadata.
perHostScanRanges = 0;
for (SlotDescriptor slot: desc_.getSlots()) {
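A sketch of the planner effect above, assuming an ORC-only scan (the query is
illustrative): with MT_DOP above zero and every scanned partition in ORC format,
the multithreaded scan node implementation is now chosen, as it already was for
Parquet and text.

  SET MT_DOP=4;
  -- All files scanned here are ORC, so useMtScanNode_ is set.
  SELECT count(*) FROM functional_orc_def.alltypes;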
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/fe/src/main/java/org/apache/impala/service/BackendConfig.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/service/BackendConfig.java b/fe/src/main/java/org/apache/impala/service/BackendConfig.java
index 3833094..a94f46e 100644
--- a/fe/src/main/java/org/apache/impala/service/BackendConfig.java
+++ b/fe/src/main/java/org/apache/impala/service/BackendConfig.java
@@ -23,6 +23,7 @@ import static org.apache.hadoop.fs.CommonConfigurationKeysPublic
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.security.authentication.util.KerberosName;
import org.apache.impala.analysis.SqlScanner;
+import org.apache.impala.catalog.HdfsFileFormat;
import org.apache.impala.thrift.TBackendGflags;
import com.google.common.base.Preconditions;
@@ -45,6 +46,7 @@ public class BackendConfig {
Preconditions.checkNotNull(cfg);
INSTANCE = new BackendConfig(cfg);
SqlScanner.init(cfg.getReserved_words_version());
+ HdfsFileFormat.init(cfg.isEnable_orc_scanner());
initAuthToLocal();
}
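The registration in HdfsFileFormat.init() above is gated on the enable_orc_scanner
backend flag carried in TBackendGflags: when it is false, OrcInputFormat is never
added to VALID_INPUT_FORMATS, so an ORC table is rejected at metadata validation
rather than at scan time. A hedged sketch (the startup flag spelling follows the
thrift field; the exact error text is not shown in this patch):

  -- With the backend started with --enable_orc_scanner=false, a query
  -- against an ORC table is expected to fail because the table's input
  -- format class is no longer recognized as supported.
  SELECT count(*) FROM functional_orc_def.alltypes;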
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/fe/src/main/java/org/apache/impala/service/Frontend.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/service/Frontend.java b/fe/src/main/java/org/apache/impala/service/Frontend.java
index 392c249..348adaf 100644
--- a/fe/src/main/java/org/apache/impala/service/Frontend.java
+++ b/fe/src/main/java/org/apache/impala/service/Frontend.java
@@ -1014,11 +1014,11 @@ public class Frontend {
if (thriftLineageGraph != null && thriftLineageGraph.isSetQuery_text()) {
result.catalog_op_request.setLineage_graph(thriftLineageGraph);
}
- // Set MT_DOP=4 for COMPUTE STATS on Parquet tables, unless the user has already
+ // Set MT_DOP=4 for COMPUTE STATS on Parquet/ORC tables, unless the user has already
// provided another value for MT_DOP.
if (!queryOptions.isSetMt_dop() &&
analysisResult.isComputeStatsStmt() &&
- analysisResult.getComputeStatsStmt().isParquetOnly()) {
+ analysisResult.getComputeStatsStmt().isColumnar()) {
queryOptions.setMt_dop(4);
}
// If unset, set MT_DOP to 0 to simplify the rest of the code.
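The user-visible effect of the isColumnar() check above (table name illustrative):
COMPUTE STATS on a table whose affected partitions are all Parquet or ORC now
defaults MT_DOP to 4, while an explicit user setting still wins.

  -- MT_DOP unset: the frontend plans this with MT_DOP=4.
  COMPUTE STATS orc_demo;

  -- An explicit value is respected and not overridden.
  SET MT_DOP=8;
  COMPUTE STATS orc_demo;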
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/fe/src/main/jflex/sql-scanner.flex
----------------------------------------------------------------------
diff --git a/fe/src/main/jflex/sql-scanner.flex b/fe/src/main/jflex/sql-scanner.flex
index dd1da7c..0512a2a 100644
--- a/fe/src/main/jflex/sql-scanner.flex
+++ b/fe/src/main/jflex/sql-scanner.flex
@@ -176,6 +176,7 @@ import org.apache.impala.thrift.TReservedWordsVersion;
keywordMap.put("on", SqlParserSymbols.KW_ON);
keywordMap.put("||", SqlParserSymbols.KW_OR);
keywordMap.put("or", SqlParserSymbols.KW_OR);
+ keywordMap.put("orc", SqlParserSymbols.KW_ORC);
keywordMap.put("order", SqlParserSymbols.KW_ORDER);
keywordMap.put("outer", SqlParserSymbols.KW_OUTER);
keywordMap.put("over", SqlParserSymbols.KW_OVER);
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/LineItemMultiBlock/README.dox
----------------------------------------------------------------------
diff --git a/testdata/LineItemMultiBlock/README.dox b/testdata/LineItemMultiBlock/README.dox
index 7608067..1d6db46 100755
--- a/testdata/LineItemMultiBlock/README.dox
+++ b/testdata/LineItemMultiBlock/README.dox
@@ -1,6 +1,7 @@
This file was created for:
IMPALA-1881: Maximize data locality when scanning Parquet files with multiple row groups.
IMPALA-2466: Add more tests to the HDFS parquet scanner.
+IMPALA-5717: Add tests for HDFS orc scanner.
The table lineitem_multiblock is a single parquet file with:
- A row group size of approximately 12 KB each.
@@ -31,3 +32,21 @@ blocks.
'lineitem_multiblock_one_row_group' was created similarly but with a much higher
'parquet.block.size' so that everything fit in one row group.
+
+----
+
+The ORC files are created by the following Hive queries:
+
+use functional_orc_def;
+
+set orc.stripe.size=1024;
+set orc.compress=ZLIB;
+create table lineitem_threeblocks like tpch.lineitem stored as orc;
+create table lineitem_sixblocks like tpch.lineitem stored as orc;
+insert overwrite table lineitem_threeblocks select * from tpch.lineitem limit 16000;
+insert overwrite table lineitem_sixblocks select * from tpch.lineitem limit 30000;
+
+set orc.stripe.size=67108864;
+create table lineitem_orc_multiblock_one_stripe like tpch.lineitem stored as orc;
+insert overwrite table lineitem_orc_multiblock_one_stripe select * from
+tpch.lineitem limit 16000;
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/LineItemMultiBlock/lineitem_orc_multiblock_one_stripe.orc
----------------------------------------------------------------------
diff --git a/testdata/LineItemMultiBlock/lineitem_orc_multiblock_one_stripe.orc b/testdata/LineItemMultiBlock/lineitem_orc_multiblock_one_stripe.orc
new file mode 100644
index 0000000..7dbbffb
Binary files /dev/null and b/testdata/LineItemMultiBlock/lineitem_orc_multiblock_one_stripe.orc differ
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/LineItemMultiBlock/lineitem_sixblocks.orc
----------------------------------------------------------------------
diff --git a/testdata/LineItemMultiBlock/lineitem_sixblocks.orc b/testdata/LineItemMultiBlock/lineitem_sixblocks.orc
new file mode 100644
index 0000000..5fa6cfa
Binary files /dev/null and b/testdata/LineItemMultiBlock/lineitem_sixblocks.orc differ
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/LineItemMultiBlock/lineitem_threeblocks.orc
----------------------------------------------------------------------
diff --git a/testdata/LineItemMultiBlock/lineitem_threeblocks.orc b/testdata/LineItemMultiBlock/lineitem_threeblocks.orc
new file mode 100644
index 0000000..9b12540
Binary files /dev/null and b/testdata/LineItemMultiBlock/lineitem_threeblocks.orc differ
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/bin/create-load-data.sh
----------------------------------------------------------------------
diff --git a/testdata/bin/create-load-data.sh b/testdata/bin/create-load-data.sh
index 311029d..e50515b 100755
--- a/testdata/bin/create-load-data.sh
+++ b/testdata/bin/create-load-data.sh
@@ -154,6 +154,9 @@ function load-custom-schemas {
hadoop fs -mkdir -p /test-warehouse/chars_formats_parquet/
hadoop fs -put -f ${IMPALA_HOME}/testdata/data/chars-formats.parquet \
/test-warehouse/chars_formats_parquet
+ hadoop fs -mkdir -p /test-warehouse/chars_formats_orc_def/
+ hadoop fs -put -f ${IMPALA_HOME}/testdata/data/chars-formats.orc \
+ /test-warehouse/chars_formats_orc_def
hadoop fs -mkdir -p /test-warehouse/chars_formats_text/
hadoop fs -put -f ${IMPALA_HOME}/testdata/data/chars-formats.txt \
/test-warehouse/chars_formats_text
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/bin/generate-schema-statements.py
----------------------------------------------------------------------
diff --git a/testdata/bin/generate-schema-statements.py b/testdata/bin/generate-schema-statements.py
index 34c2084..3f730e6 100755
--- a/testdata/bin/generate-schema-statements.py
+++ b/testdata/bin/generate-schema-statements.py
@@ -128,6 +128,7 @@ FILE_FORMAT_MAP = {
'text': 'TEXTFILE',
'seq': 'SEQUENCEFILE',
'rc': 'RCFILE',
+ 'orc': 'ORC',
'parquet': 'PARQUET',
'text_lzo':
"\nINPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'" +
@@ -219,7 +220,7 @@ def build_table_template(file_format, columns, partition_columns, row_format,
else:
tblproperties["avro.schema.url"] = "hdfs://%s/%s/%s/{table_name}.json" \
% (options.hdfs_namenode, options.hive_warehouse_dir, avro_schema_dir)
- elif file_format in 'parquet':
+ elif file_format in ['parquet', 'orc']: # columnar formats don't need row format
row_format_stmt = str()
elif file_format == 'kudu':
# Use partitioned_by to set a trivial hash distribution
@@ -243,7 +244,7 @@ def build_table_template(file_format, columns, partition_columns, row_format,
for table_property in table_properties.split("\n"):
format_prop = table_property.split(":")
if format_prop[0] == file_format:
- key_val = format_prop[1].split("=");
+ key_val = format_prop[1].split("=")
tblproperties[key_val[0]] = key_val[1]
all_tblproperties = []
@@ -658,7 +659,7 @@ def generate_statements(output_name, test_vectors, sections,
# that weren't already added to the table. So, for force reload, manually
# delete the partition directories.
output.create.append(("DFS -rm -R {data_path};").format(
- data_path=data_path));
+ data_path=data_path))
else:
# If this is not a force reload use msck repair to add the partitions
# into the table.
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/bin/run-hive-server.sh
----------------------------------------------------------------------
diff --git a/testdata/bin/run-hive-server.sh b/testdata/bin/run-hive-server.sh
index 49d1de2..3b2c83d 100755
--- a/testdata/bin/run-hive-server.sh
+++ b/testdata/bin/run-hive-server.sh
@@ -73,9 +73,10 @@ ${CLUSTER_BIN}/wait-for-metastore.py --transport=${METASTORE_TRANSPORT}
if [ ${ONLY_METASTORE} -eq 0 ]; then
# Starts a HiveServer2 instance on the port specified by the HIVE_SERVER2_THRIFT_PORT
- # environment variable.
+ # environment variable. HADOOP_HEAPSIZE should be set to at least 2048 to avoid OOM
+ # when loading ORC tables like widerow.
if [[ $IMPALA_MINICLUSTER_PROFILE == 2 ]]; then
- HADOOP_HEAPSIZE="512" hive --service hiveserver2 > ${LOGDIR}/hive-server2.out 2>&1 &
+ HADOOP_HEAPSIZE="2048" hive --service hiveserver2 > ${LOGDIR}/hive-server2.out 2>&1 &
elif [[ $IMPALA_MINICLUSTER_PROFILE == 3 ]]; then
HADOOP_CLIENT_OPTS="-Xmx2048m -Dhive.log.file=hive-server2.log" hive \
--service hiveserver2 > ${LOGDIR}/hive-server2.out 2>&1 &
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/cluster/node_templates/common/etc/hadoop/conf/hdfs-site.xml.tmpl
----------------------------------------------------------------------
diff --git a/testdata/cluster/node_templates/common/etc/hadoop/conf/hdfs-site.xml.tmpl b/testdata/cluster/node_templates/common/etc/hadoop/conf/hdfs-site.xml.tmpl
index f72dd97..c9ee70b 100644
--- a/testdata/cluster/node_templates/common/etc/hadoop/conf/hdfs-site.xml.tmpl
+++ b/testdata/cluster/node_templates/common/etc/hadoop/conf/hdfs-site.xml.tmpl
@@ -82,6 +82,12 @@
<value>134217728</value>
</property>
+ <!-- Decrease this so we can create mini test files across several blocks -->
+ <property>
+ <name>dfs.namenode.fs-limits.min-block-size</name>
+ <value>1024</value>
+ </property>
+
<!-- Set the max cached memory to ~64kb. This must be less than ulimit -l -->
<property>
<name>dfs.datanode.max.locked.memory</name>
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/data/chars-formats.orc
----------------------------------------------------------------------
diff --git a/testdata/data/chars-formats.orc b/testdata/data/chars-formats.orc
new file mode 100644
index 0000000..625c2c8
Binary files /dev/null and b/testdata/data/chars-formats.orc differ
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/datasets/functional/functional_schema_template.sql
----------------------------------------------------------------------
diff --git a/testdata/datasets/functional/functional_schema_template.sql b/testdata/datasets/functional/functional_schema_template.sql
index cede525..a7a5eac 100644
--- a/testdata/datasets/functional/functional_schema_template.sql
+++ b/testdata/datasets/functional/functional_schema_template.sql
@@ -739,6 +739,7 @@ INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} PARTITION(p=1) SELECT i
INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} PARTITION(p=2) SELECT id, named_struct("f1",string_col,"f2",int_col), array(1, 2, 3), map("k", cast(0 as bigint)) FROM functional.alltypestiny;
INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} PARTITION(p=3) SELECT id, named_struct("f1",string_col,"f2",int_col), array(1, 2, 3), map("k", cast(0 as bigint)) FROM functional.alltypestiny;
INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} PARTITION(p=4) SELECT id, named_struct("f1",string_col,"f2",int_col), array(1, 2, 3), map("k", cast(0 as bigint)) FROM functional.alltypestiny;
+INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} PARTITION(p=5) SELECT id, named_struct("f1",string_col,"f2",int_col), array(1, 2, 3), map("k", cast(0 as bigint)) FROM functional.alltypestiny;
-- The order of insertions and alterations is deliberately chose to work around a Hive
-- bug where the format of an altered partition is reverted back to the original format after
-- an insert. So we first do the insert, and then alter the format.
@@ -746,6 +747,7 @@ USE {db_name}{db_suffix};
ALTER TABLE {table_name} PARTITION (p=2) SET FILEFORMAT PARQUET;
ALTER TABLE {table_name} PARTITION (p=3) SET FILEFORMAT AVRO;
ALTER TABLE {table_name} PARTITION (p=4) SET FILEFORMAT RCFILE;
+ALTER TABLE {table_name} PARTITION (p=5) SET FILEFORMAT ORC;
USE default;
====
---- DATASET
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/datasets/functional/schema_constraints.csv
----------------------------------------------------------------------
diff --git a/testdata/datasets/functional/schema_constraints.csv b/testdata/datasets/functional/schema_constraints.csv
index ef65b9a..baf0306 100644
--- a/testdata/datasets/functional/schema_constraints.csv
+++ b/testdata/datasets/functional/schema_constraints.csv
@@ -66,6 +66,7 @@ table_name:complextypes_fileformat, constraint:restrict_to, table_format:parquet
table_name:complextypes_fileformat, constraint:restrict_to, table_format:avro/snap/block
table_name:complextypes_fileformat, constraint:restrict_to, table_format:rc/snap/block
table_name:complextypes_fileformat, constraint:restrict_to, table_format:seq/snap/block
+table_name:complextypes_fileformat, constraint:restrict_to, table_format:orc/def/block
table_name:complextypes_multifileformat, constraint:restrict_to, table_format:text/none/none
# TODO: Avro
@@ -134,6 +135,8 @@ table_name:decimal_tbl, constraint:restrict_to, table_format:parquet/none/none
table_name:decimal_tiny, constraint:restrict_to, table_format:parquet/none/none
table_name:decimal_tbl, constraint:restrict_to, table_format:kudu/none/none
table_name:decimal_tiny, constraint:restrict_to, table_format:kudu/none/none
+table_name:decimal_tbl, constraint:restrict_to, table_format:orc/def/block
+table_name:decimal_tiny, constraint:restrict_to, table_format:orc/def/block
table_name:avro_decimal_tbl, constraint:restrict_to, table_format:avro/snap/block
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/workloads/functional-planner/queries/PlannerTest/complex-types-file-formats.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/complex-types-file-formats.test b/testdata/workloads/functional-planner/queries/PlannerTest/complex-types-file-formats.test
index 9c68c65..1e61b7d 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/complex-types-file-formats.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/complex-types-file-formats.test
@@ -15,6 +15,38 @@ PLAN-ROOT SINK
partitions=1/1 files=1 size=227B
predicates: !empty(t.a)
====
+# Complex types are not supported on ORC.
+select 1 from functional_orc_def.complextypes_fileformat t, t.a
+---- PLAN
+not implemented: Scan of table 't' in format 'ORC' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'.
+Complex types are supported for these file formats: PARQUET.
+====
+select s.f1 from functional_orc_def.complextypes_fileformat t, t.m
+---- PLAN
+not implemented: Scan of table 't' in format 'ORC' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'.
+Complex types are supported for these file formats: PARQUET.
+====
+# Complex types are not supported on ORC; however, queries materializing
+# only scalar-typed columns are allowed.
+select id from functional_orc_def.complextypes_fileformat
+---- PLAN
+PLAN-ROOT SINK
+|
+00:SCAN HDFS [functional_orc_def.complextypes_fileformat]
+ partitions=1/1 files=1 size=624B
+====
+# Complex types are not supported on ORC but count(*) and similar
+# queries should work.
+select count(*) from functional_orc_def.complextypes_fileformat
+---- PLAN
+PLAN-ROOT SINK
+|
+01:AGGREGATE [FINALIZE]
+| output: count(*)
+|
+00:SCAN HDFS [functional_orc_def.complextypes_fileformat]
+ partitions=1/1 files=1 size=624B
+====
# Complex types are not supported on Avro.
select s.f1 from functional_avro_snap.complextypes_fileformat t, t.a
---- PLAN
@@ -111,11 +143,12 @@ select complex_struct_col.f1 from functional_hbase.allcomplextypes
not implemented: Scan of table 'functional_hbase.allcomplextypes.complex_struct_col.f1' is not supported because 'functional_hbase.allcomplextypes' references a nested field/collection.
Complex types are supported for these file formats: PARQUET.
====
-# The complextypes_multifileformat has three partitions with different file formats:
+# The complextypes_multifileformat has five partitions with different file formats:
# p=1 text
# p=2 parquet
# p=3 avro
# p=4 rc
+# p=5 orc
# Scanning a text partition of a multi-format table with complex types fails.
select 1 from functional.complextypes_multifileformat where p = 1
---- PLAN
@@ -136,7 +169,7 @@ PLAN-ROOT SINK
| 03:UNNEST [t.a]
|
00:SCAN HDFS [functional.complextypes_multifileformat t]
- partitions=1/4 files=1 size=128B
+ partitions=1/5 files=1 size=128B
predicates: !empty(t.a)
====
# Scanning an Avro partition of a multi-format table with complex types fails.
@@ -161,5 +194,23 @@ PLAN-ROOT SINK
| output: count(*)
|
00:SCAN HDFS [functional.complextypes_multifileformat]
- partitions=1/4 files=1 size=128B
+ partitions=1/5 files=1 size=128B
+====
+# Scanning an ORC file partition of a multi-format table with complex types fails.
+select id from functional.complextypes_multifileformat t, t.a where p = 5
+---- PLAN
+not implemented: Scan of partition 'p=5' in format 'ORC' of table 't' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'.
+Complex types are supported for these file formats: PARQUET.
+====
+# Complex types are not supported on ORC files but count(*) and similar
+# queries should work.
+select count(*) from functional.complextypes_multifileformat where p = 5
+---- PLAN
+PLAN-ROOT SINK
+|
+01:AGGREGATE [FINALIZE]
+| output: count(*)
+|
+00:SCAN HDFS [functional.complextypes_multifileformat]
+ partitions=1/5 files=1 size=128B
====
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/workloads/functional-query/functional-query_core.csv
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/functional-query_core.csv b/testdata/workloads/functional-query/functional-query_core.csv
index dffca78..7118e3f 100644
--- a/testdata/workloads/functional-query/functional-query_core.csv
+++ b/testdata/workloads/functional-query/functional-query_core.csv
@@ -2,6 +2,7 @@
file_format:text, dataset:functional, compression_codec:none, compression_type:none
file_format:seq, dataset:functional, compression_codec:snap, compression_type:block
file_format:rc, dataset: functional, compression_codec: snap, compression_type: block
+file_format:orc, dataset: functional, compression_codec: def, compression_type: block
file_format:parquet, dataset: functional, compression_codec: none, compression_type: none
file_format:avro, dataset: functional, compression_codec: snap, compression_type: block
file_format:hbase, dataset:functional, compression_codec:none, compression_type:none
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/workloads/functional-query/functional-query_dimensions.csv
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/functional-query_dimensions.csv b/testdata/workloads/functional-query/functional-query_dimensions.csv
index 539122b..bcb4406 100644
--- a/testdata/workloads/functional-query/functional-query_dimensions.csv
+++ b/testdata/workloads/functional-query/functional-query_dimensions.csv
@@ -1,4 +1,4 @@
-file_format: text,seq,rc,avro,parquet,hbase,kudu
+file_format: text,seq,rc,avro,parquet,orc,hbase,kudu
dataset: functional
compression_codec: none,def,gzip,bzip,snap,lzo
compression_type: none,block,record
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/workloads/functional-query/functional-query_exhaustive.csv
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/functional-query_exhaustive.csv b/testdata/workloads/functional-query/functional-query_exhaustive.csv
index 18331c6..a06ab52 100644
--- a/testdata/workloads/functional-query/functional-query_exhaustive.csv
+++ b/testdata/workloads/functional-query/functional-query_exhaustive.csv
@@ -22,5 +22,6 @@ file_format: avro, dataset: functional, compression_codec: none, compression_typ
file_format: avro, dataset: functional, compression_codec: def, compression_type: block
file_format: avro, dataset: functional, compression_codec: snap, compression_type: block
file_format: parquet, dataset: functional, compression_codec: none, compression_type: none
+file_format: orc, dataset: functional, compression_codec: def, compression_type: block
file_format: hbase, dataset: functional, compression_codec: none, compression_type: none
file_format: kudu, dataset: functional, compression_codec: none, compression_type: none
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/workloads/functional-query/functional-query_pairwise.csv
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/functional-query_pairwise.csv b/testdata/workloads/functional-query/functional-query_pairwise.csv
index 0a4ee09..e046a09 100644
--- a/testdata/workloads/functional-query/functional-query_pairwise.csv
+++ b/testdata/workloads/functional-query/functional-query_pairwise.csv
@@ -4,5 +4,6 @@ file_format: seq, dataset: functional, compression_codec: def, compression_type:
file_format: rc, dataset: functional, compression_codec: gzip, compression_type: block
file_format: avro, dataset: functional, compression_codec: snap, compression_type: block
file_format: parquet, dataset: functional, compression_codec: none, compression_type: none
+file_format: orc, dataset: functional, compression_codec: def, compression_type: block
file_format: hbase, dataset: functional, compression_codec: none, compression_type: none
file_format: kudu, dataset: functional, compression_codec: none, compression_type: none
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/workloads/functional-query/queries/DataErrorsTest/orc-type-checks.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/DataErrorsTest/orc-type-checks.test b/testdata/workloads/functional-query/queries/DataErrorsTest/orc-type-checks.test
new file mode 100644
index 0000000..ee06258
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/DataErrorsTest/orc-type-checks.test
@@ -0,0 +1,127 @@
+====
+---- QUERY
+select c1 from illtypes
+---- CATCH
+Type mismatch: table column BOOLEAN is map to column int in ORC file
+====
+---- QUERY
+select c2 from illtypes
+---- CATCH
+Type mismatch: table column FLOAT is map to column boolean in ORC file
+====
+---- QUERY
+select c3 from illtypes
+---- CATCH
+Type mismatch: table column BOOLEAN is map to column tinyint in ORC file
+====
+---- QUERY
+select c4 from illtypes
+---- CATCH
+Type mismatch: table column TINYINT is map to column smallint in ORC file
+====
+---- QUERY
+select c5 from illtypes
+---- CATCH
+Type mismatch: table column SMALLINT is map to column int in ORC file
+====
+---- QUERY
+select c6 from illtypes
+---- CATCH
+Type mismatch: table column INT is map to column bigint in ORC file
+====
+---- QUERY
+select c7 from illtypes
+---- CATCH
+Type mismatch: table column BOOLEAN is map to column float in ORC file
+====
+---- QUERY
+select c8 from illtypes
+---- CATCH
+Type mismatch: table column STRING is map to column double in ORC file
+====
+---- QUERY
+select c9 from illtypes
+---- CATCH
+Type mismatch: table column INT is map to column string in ORC file
+====
+---- QUERY
+select c10 from illtypes
+---- CATCH
+Type mismatch: table column FLOAT is map to column string in ORC file
+====
+---- QUERY
+select c11 from illtypes
+---- CATCH
+Type mismatch: table column BIGINT is map to column timestamp in ORC file
+====
+---- QUERY
+select * from safetypes order by c1
+---- TYPES
+bigint,boolean,smallint,int,bigint,bigint,double,double,char,string,timestamp,int,int
+---- RESULTS
+0,true,0,0,0,0,0,0,'01/','0',2009-01-01 00:00:00,2009,1
+1,false,1,1,1,10,1.100000023841858,10.1,'01/','1',2009-01-01 00:01:00,2009,1
+2,true,0,0,0,0,0,0,'02/','0',2009-02-01 00:00:00,2009,2
+3,false,1,1,1,10,1.100000023841858,10.1,'02/','1',2009-02-01 00:01:00,2009,2
+4,true,0,0,0,0,0,0,'03/','0',2009-03-01 00:00:00,2009,3
+5,false,1,1,1,10,1.100000023841858,10.1,'03/','1',2009-03-01 00:01:00,2009,3
+6,true,0,0,0,0,0,0,'04/','0',2009-04-01 00:00:00,2009,4
+7,false,1,1,1,10,1.100000023841858,10.1,'04/','1',2009-04-01 00:01:00,2009,4
+====
+---- QUERY
+select d1 from mismatch_decimals
+---- TYPES
+decimal
+---- RESULTS
+1234
+2345
+12345
+12345
+132842
+====
+---- QUERY
+select d2 from mismatch_decimals
+---- TYPES
+decimal
+---- RESULTS
+---- CATCH
+It can't be truncated to table column DECIMAL(8,0) for column decimal(10,0) in ORC file
+====
+---- QUERY
+select d3 from mismatch_decimals
+---- TYPES
+decimal
+---- RESULTS
+1.2345678900
+12.3456789000
+123.4567890000
+1234.5678900000
+12345.6789000000
+====
+---- QUERY
+select d4 from mismatch_decimals
+---- TYPES
+decimal
+---- RESULTS
+---- CATCH
+Type mismatch: table column DECIMAL(20,20) is map to column decimal(38,38) in ORC file
+====
+---- QUERY
+select d5 from mismatch_decimals
+---- TYPES
+decimal
+---- RESULTS
+---- CATCH
+Type mismatch: table column DECIMAL(2,0) is map to column decimal(10,5) in ORC file
+====
+---- QUERY
+select d6 from mismatch_decimals
+---- TYPES
+decimal
+---- RESULTS
+1
+1
+1
+1
+1
+====
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/workloads/tpcds/tpcds_core.csv
----------------------------------------------------------------------
diff --git a/testdata/workloads/tpcds/tpcds_core.csv b/testdata/workloads/tpcds/tpcds_core.csv
index 94b4b22..48cc97d 100644
--- a/testdata/workloads/tpcds/tpcds_core.csv
+++ b/testdata/workloads/tpcds/tpcds_core.csv
@@ -2,3 +2,4 @@
file_format: text, dataset: tpcds, compression_codec: none, compression_type: none
file_format: seq, dataset: tpcds, compression_codec: snap, compression_type: block
file_format: parquet, dataset: tpcds, compression_codec: none, compression_type: none
+file_format: orc, dataset: tpcds, compression_codec: def, compression_type: block
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/workloads/tpcds/tpcds_dimensions.csv
----------------------------------------------------------------------
diff --git a/testdata/workloads/tpcds/tpcds_dimensions.csv b/testdata/workloads/tpcds/tpcds_dimensions.csv
index 8137b7a..bae5d90 100644
--- a/testdata/workloads/tpcds/tpcds_dimensions.csv
+++ b/testdata/workloads/tpcds/tpcds_dimensions.csv
@@ -1,4 +1,4 @@
-file_format: text,seq,rc,avro,parquet
+file_format: text,seq,rc,avro,parquet,orc
dataset: tpcds
compression_codec: none,def,gzip,bzip,snap,lzo
compression_type: none,block,record
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/workloads/tpcds/tpcds_exhaustive.csv
----------------------------------------------------------------------
diff --git a/testdata/workloads/tpcds/tpcds_exhaustive.csv b/testdata/workloads/tpcds/tpcds_exhaustive.csv
index c4b4f99..57fcddd 100644
--- a/testdata/workloads/tpcds/tpcds_exhaustive.csv
+++ b/testdata/workloads/tpcds/tpcds_exhaustive.csv
@@ -21,3 +21,6 @@ file_format: avro, dataset: tpcds, compression_codec: snap, compression_type: bl
file_format: parquet, dataset: tpcds, compression_codec: none, compression_type: none
file_format: parquet, dataset: tpcds, compression_codec: def, compression_type: block
file_format: parquet, dataset: tpcds, compression_codec: snap, compression_type: block
+file_format: orc, dataset: tpcds, compression_codec: none, compression_type: none
+file_format: orc, dataset: tpcds, compression_codec: def, compression_type: block
+file_format: orc, dataset: tpcds, compression_codec: snap, compression_type: block
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/workloads/tpcds/tpcds_pairwise.csv
----------------------------------------------------------------------
diff --git a/testdata/workloads/tpcds/tpcds_pairwise.csv b/testdata/workloads/tpcds/tpcds_pairwise.csv
index e643495..61ee66c 100644
--- a/testdata/workloads/tpcds/tpcds_pairwise.csv
+++ b/testdata/workloads/tpcds/tpcds_pairwise.csv
@@ -13,3 +13,6 @@ file_format: rc, dataset: tpcds, compression_codec: def, compression_type: block
file_format: avro, dataset: tpcds, compression_codec: none, compression_type: none
file_format: parquet, dataset: tpcds, compression_codec: none, compression_type: none
file_format: rc, dataset: tpcds, compression_codec: none, compression_type: none
+file_format: orc, dataset: tpcds, compression_codec: none, compression_type: none
+file_format: orc, dataset: tpcds, compression_codec: def, compression_type: block
+file_format: orc, dataset: tpcds, compression_codec: snap, compression_type: block
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/workloads/tpch/tpch_core.csv
----------------------------------------------------------------------
diff --git a/testdata/workloads/tpch/tpch_core.csv b/testdata/workloads/tpch/tpch_core.csv
index 86804ac..024063c 100644
--- a/testdata/workloads/tpch/tpch_core.csv
+++ b/testdata/workloads/tpch/tpch_core.csv
@@ -7,4 +7,5 @@ file_format:rc, dataset:tpch, compression_codec:none, compression_type:none
file_format:avro, dataset:tpch, compression_codec: none, compression_type: none
file_format:avro, dataset:tpch, compression_codec: snap, compression_type: block
file_format:parquet, dataset:tpch, compression_codec: none, compression_type: none
+file_format:orc, dataset:tpch, compression_codec: def, compression_type: block
file_format:kudu, dataset:tpch, compression_codec: none, compression_type: none
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/workloads/tpch/tpch_dimensions.csv
----------------------------------------------------------------------
diff --git a/testdata/workloads/tpch/tpch_dimensions.csv b/testdata/workloads/tpch/tpch_dimensions.csv
index 1de34aa..f1ce5f0 100644
--- a/testdata/workloads/tpch/tpch_dimensions.csv
+++ b/testdata/workloads/tpch/tpch_dimensions.csv
@@ -1,4 +1,4 @@
-file_format: text,seq,rc,avro,parquet,kudu
+file_format: text,seq,rc,avro,parquet,orc,kudu
dataset: tpch
compression_codec: none,def,gzip,bzip,snap,lzo
compression_type: none,block,record
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/workloads/tpch/tpch_exhaustive.csv
----------------------------------------------------------------------
diff --git a/testdata/workloads/tpch/tpch_exhaustive.csv b/testdata/workloads/tpch/tpch_exhaustive.csv
index 32085bf..3513dc5 100644
--- a/testdata/workloads/tpch/tpch_exhaustive.csv
+++ b/testdata/workloads/tpch/tpch_exhaustive.csv
@@ -22,4 +22,7 @@ file_format: avro, dataset: tpch, compression_codec: snap, compression_type: blo
file_format: parquet, dataset: tpch, compression_codec: none, compression_type: none
file_format: parquet, dataset: tpch, compression_codec: def, compression_type: block
file_format: parquet, dataset: tpch, compression_codec: snap, compression_type: block
+file_format: orc, dataset: tpch, compression_codec: none, compression_type: none
+file_format: orc, dataset: tpch, compression_codec: def, compression_type: block
+file_format: orc, dataset: tpch, compression_codec: snap, compression_type: block
file_format: kudu, dataset:tpch, compression_codec: none, compression_type: none
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/testdata/workloads/tpch/tpch_pairwise.csv
----------------------------------------------------------------------
diff --git a/testdata/workloads/tpch/tpch_pairwise.csv b/testdata/workloads/tpch/tpch_pairwise.csv
index 0744cf5..2eb4176 100644
--- a/testdata/workloads/tpch/tpch_pairwise.csv
+++ b/testdata/workloads/tpch/tpch_pairwise.csv
@@ -13,4 +13,7 @@ file_format: rc, dataset: tpch, compression_codec: def, compression_type: block
file_format: avro, dataset: tpch, compression_codec: none, compression_type: none
file_format: parquet, dataset: tpch, compression_codec: none, compression_type: none
file_format: rc, dataset: tpch, compression_codec: none, compression_type: none
+file_format: orc, dataset: tpch, compression_codec: none, compression_type: none
+file_format: orc, dataset: tpch, compression_codec: def, compression_type: block
+file_format: orc, dataset: tpch, compression_codec: snap, compression_type: block
file_format: kudu, dataset:tpch, compression_codec: none, compression_type: none
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/tests/common/impala_test_suite.py
----------------------------------------------------------------------
diff --git a/tests/common/impala_test_suite.py b/tests/common/impala_test_suite.py
index d57b1cb..2e35c67 100644
--- a/tests/common/impala_test_suite.py
+++ b/tests/common/impala_test_suite.py
@@ -503,7 +503,7 @@ class ImpalaTestSuite(BaseTestSuite):
Database names are dependent on the input format for table, which the table names
remaining the same. A use database is issued before query execution. As such,
- dabase names need to be build pre execution, this method wraps around the different
+ database names need to be built pre-execution; this method wraps around the different
execute methods and provides a common interface to issue the proper use command.
"""
@wraps(function)
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/tests/common/test_dimensions.py
----------------------------------------------------------------------
diff --git a/tests/common/test_dimensions.py b/tests/common/test_dimensions.py
index 4171e1f..df3f8c2 100644
--- a/tests/common/test_dimensions.py
+++ b/tests/common/test_dimensions.py
@@ -28,7 +28,7 @@ WORKLOAD_DIR = os.environ['IMPALA_WORKLOAD_DIR']
# of what specific table format to target along with the exec options (num_nodes, etc)
# to use when running the query.
class TableFormatInfo(object):
- KNOWN_FILE_FORMATS = ['text', 'seq', 'rc', 'parquet', 'avro', 'hbase']
+ KNOWN_FILE_FORMATS = ['text', 'seq', 'rc', 'parquet', 'orc', 'avro', 'hbase']
if os.environ['KUDU_IS_SUPPORTED'] == 'true':
KNOWN_FILE_FORMATS.append('kudu')
KNOWN_COMPRESSION_CODECS = ['none', 'snap', 'gzip', 'bzip', 'def', 'lzo']
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/tests/common/test_vector.py
----------------------------------------------------------------------
diff --git a/tests/common/test_vector.py b/tests/common/test_vector.py
index 4d22269..0c9cca4 100644
--- a/tests/common/test_vector.py
+++ b/tests/common/test_vector.py
@@ -52,7 +52,7 @@
# otherwise. For example, if we want to make sure 'bool' columns are not used with 'sum':
#
# ImpalaTestMatrix.add_constraint(lambda v:\
-# not (v.get_value('col_type') == 'bool and v.get_value('agg_func') == 'sum'))
+# not (v.get_value('col_type') == 'bool' and v.get_value('agg_func') == 'sum'))
#
# Additional examples of usage can be found within the test suites.
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/tests/comparison/cli_options.py
----------------------------------------------------------------------
diff --git a/tests/comparison/cli_options.py b/tests/comparison/cli_options.py
index 885ef84..1d737cf 100644
--- a/tests/comparison/cli_options.py
+++ b/tests/comparison/cli_options.py
@@ -221,7 +221,7 @@ def create_cluster(args):
def add_storage_format_options(parser):
- storage_formats = ['avro', 'parquet', 'rcfile', 'sequencefile', 'textfile']
+ storage_formats = ['avro', 'parquet', 'orc', 'rcfile', 'sequencefile', 'textfile']
parser.add_argument(
'--storage-file-formats', default=','.join(storage_formats),
help='A comma separated list of storage formats to use.')
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/tests/query_test/test_chars.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_chars.py b/tests/query_test/test_chars.py
index b182b91..4444410 100644
--- a/tests/query_test/test_chars.py
+++ b/tests/query_test/test_chars.py
@@ -57,6 +57,11 @@ class TestCharFormats(ImpalaTestSuite):
STORED AS PARQUET
LOCATION "{0}"'''.format(get_fs_path("/test-warehouse/chars_formats_parquet")))
self.client.execute('''create external table if not exists
+ functional_orc_def.chars_formats
+ (cs CHAR(5), cl CHAR(140), vc VARCHAR(32))
+ STORED AS ORC
+ LOCATION "{0}"'''.format(get_fs_path("/test-warehouse/chars_formats_orc_def")))
+ self.client.execute('''create external table if not exists
functional.chars_formats
(cs CHAR(5), cl CHAR(140), vc VARCHAR(32))
ROW FORMAT delimited fields terminated by ',' escaped by '\\\\'
@@ -84,6 +89,7 @@ class TestCharFormats(ImpalaTestSuite):
(v.get_value('table_format').file_format in ['avro'] and
v.get_value('table_format').compression_codec in ['snap']) or
v.get_value('table_format').file_format in ['parquet'] or
+ v.get_value('table_format').file_format in ['orc'] or
(v.get_value('table_format').file_format in ['text'] and
v.get_value('table_format').compression_codec in ['none']))
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/tests/query_test/test_decimal_queries.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_decimal_queries.py b/tests/query_test/test_decimal_queries.py
index 3a14ed3..45a702d 100644
--- a/tests/query_test/test_decimal_queries.py
+++ b/tests/query_test/test_decimal_queries.py
@@ -43,7 +43,7 @@ class TestDecimalQueries(ImpalaTestSuite):
cls.ImpalaTestMatrix.add_constraint(lambda v:\
(v.get_value('table_format').file_format == 'text' and
v.get_value('table_format').compression_codec == 'none') or
- v.get_value('table_format').file_format in ['parquet', 'kudu'])
+ v.get_value('table_format').file_format in ['parquet', 'orc', 'kudu'])
def test_queries(self, vector):
self.run_test_case('QueryTest/decimal', vector)
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/tests/query_test/test_scanners.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py
index 2dcc213..bae52a6 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -105,7 +105,7 @@ class TestScannersAllTableFormatsWithLimit(ImpalaTestSuite):
query_template = "select * from alltypes limit %s"
for i in range(1, iterations):
# Vary the limit to vary the timing of cancellation
- limit = (iterations * 100) % 1000 + 1
+ limit = (i * 100) % 1001 + 1
query = query_template % limit
result = self.execute_query(query, vector.get_value('exec_option'),
table_format=vector.get_value('table_format'))
@@ -837,7 +837,7 @@ class TestTextScanRangeLengths(ImpalaTestSuite):
@SkipIfLocal.hive
class TestScanTruncatedFiles(ImpalaTestSuite):
@classmethod
- def get_workload(self):
+ def get_workload(cls):
return 'functional-query'
@classmethod
@@ -900,3 +900,101 @@ class TestUncompressedText(ImpalaTestSuite):
check_call(['hdfs', 'dfs', '-copyFromLocal', os.environ['IMPALA_HOME'] +
"/testdata/data/lazy_timestamp.csv", tbl_loc])
self.run_test_case('QueryTest/select-lazy-timestamp', vector, unique_database)
+
+class TestOrc(ImpalaTestSuite):
+ @classmethod
+ def get_workload(cls):
+ return 'functional-query'
+
+ @classmethod
+ def add_test_dimensions(cls):
+ super(TestOrc, cls).add_test_dimensions()
+ cls.ImpalaTestMatrix.add_constraint(
+ lambda v: v.get_value('table_format').file_format == 'orc')
+
+ def test_misaligned_orc_stripes(self, vector, unique_database):
+ self._build_lineitem_table_helper(unique_database, 'lineitem_threeblocks',
+ 'lineitem_threeblocks.orc')
+ self._build_lineitem_table_helper(unique_database, 'lineitem_sixblocks',
+ 'lineitem_sixblocks.orc')
+ self._build_lineitem_table_helper(unique_database,
+ 'lineitem_orc_multiblock_one_stripe',
+ 'lineitem_orc_multiblock_one_stripe.orc')
+
+ # functional_orc_def.alltypes is well-formed, so 'NumScannersWithNoReads' counters are
+ # set to 0.
+ table_name = 'functional_orc_def.alltypes'
+ self._misaligned_orc_stripes_helper(table_name, 7300)
+ # lineitem_threeblocks.orc is ill-formatted but every scanner reads some stripes.
+ # 'NumScannersWithNoReads' counters are set to 0.
+ table_name = unique_database + '.lineitem_threeblocks'
+ self._misaligned_orc_stripes_helper(table_name, 16000)
+ # lineitem_sixblocks.orc is ill-formatted but every scanner reads some stripes.
+ # 'NumScannersWithNoReads' counters are set to 0.
+ table_name = unique_database + '.lineitem_sixblocks'
+ self._misaligned_orc_stripes_helper(table_name, 30000)
+ # Scanning lineitem_orc_multiblock_one_stripe.orc finds two scan ranges that end up
+ # doing no reads because the file is poorly formatted.
+ table_name = unique_database + '.lineitem_orc_multiblock_one_stripe'
+ self._misaligned_orc_stripes_helper(
+ table_name, 16000, num_scanners_with_no_reads=2)
+
+ def _build_lineitem_table_helper(self, db, tbl, file):
+ self.client.execute("create table %s.%s like tpch.lineitem stored as orc" % (db, tbl))
+ tbl_loc = get_fs_path("/test-warehouse/%s.db/%s" % (db, tbl))
+ # set block size to 156672 so lineitem_threeblocks.orc occupies 3 blocks,
+ # lineitem_sixblocks.orc occupies 6 blocks.
+ check_call(['hdfs', 'dfs', '-Ddfs.block.size=156672', '-copyFromLocal',
+ os.environ['IMPALA_HOME'] + "/testdata/LineItemMultiBlock/" + file, tbl_loc])
+
+ def _misaligned_orc_stripes_helper(
+ self, table_name, rows_in_table, num_scanners_with_no_reads=0):
+ """Checks if 'num_scanners_with_no_reads' indicates the expected number of scanners
+ that don't read anything because the underlying file is poorly formatted.
+ """
+ query = 'select * from %s' % table_name
+ result = self.client.execute(query)
+ assert len(result.data) == rows_in_table
+
+ runtime_profile = str(result.runtime_profile)
+ num_scanners_with_no_reads_list = re.findall(
+ 'NumScannersWithNoReads: ([0-9]*)', runtime_profile)
+
+ # This will fail if the number of impalads != 3
+ # The fourth fragment is the "Averaged Fragment"
+ assert len(num_scanners_with_no_reads_list) == 4
+
+ # Calculate the total number of scan ranges that ended up not reading anything because
+ # an underlying file was poorly formatted.
+ # Skip the Averaged Fragment; it comes first in the runtime profile.
+ total = 0
+ for n in num_scanners_with_no_reads_list[1:]:
+ total += int(n)
+ assert total == num_scanners_with_no_reads
+
+ def test_type_conversions(self, vector, unique_database):
+ # Create an "illtypes" table whose column types can't match the underlying ORC file's.
+ # Create a "safetypes" table like the one above, but whose columns the ORC values can still safely fit into.
+ # Both reuse the data files of functional_orc_def.alltypestiny.
+ tbl_loc = get_fs_path("/test-warehouse/alltypestiny_orc_def")
+ self.client.execute("""create external table %s.illtypes (c1 boolean, c2 float,
+ c3 boolean, c4 tinyint, c5 smallint, c6 int, c7 boolean, c8 string, c9 int,
+ c10 float, c11 bigint) partitioned by (year int, month int) stored as ORC
+ location '%s';""" % (unique_database, tbl_loc))
+ self.client.execute("""create external table %s.safetypes (c1 bigint, c2 boolean,
+ c3 smallint, c4 int, c5 bigint, c6 bigint, c7 double, c8 double, c9 char(3),
+ c10 varchar(3), c11 timestamp) partitioned by (year int, month int) stored as ORC
+ location '%s';""" % (unique_database, tbl_loc))
+ self.client.execute("alter table %s.illtypes recover partitions" % unique_database)
+ self.client.execute("alter table %s.safetypes recover partitions" % unique_database)
+
+ # Create a decimal table whose precisions don't match the underlying orc files.
+ # Reuse the data files of functional_orc_def.decimal_tbl.
+ decimal_loc = get_fs_path("/test-warehouse/decimal_tbl_orc_def")
+ self.client.execute("""create external table %s.mismatch_decimals (d1 decimal(8,0),
+ d2 decimal(8,0), d3 decimal(19,10), d4 decimal(20,20), d5 decimal(2,0))
+ partitioned by (d6 decimal(9,0)) stored as orc location '%s'"""
+ % (unique_database, decimal_loc))
+ self.client.execute("alter table %s.mismatch_decimals recover partitions" % unique_database)
+
+ self.run_test_case('DataErrorsTest/orc-type-checks', vector, unique_database)
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/tests/query_test/test_scanners_fuzz.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_scanners_fuzz.py b/tests/query_test/test_scanners_fuzz.py
index c336a17..791c343 100644
--- a/tests/query_test/test_scanners_fuzz.py
+++ b/tests/query_test/test_scanners_fuzz.py
@@ -61,6 +61,8 @@ class TestScannersFuzzing(ImpalaTestSuite):
'num_nodes' : cls.NUM_NODES_VALUES,
'mem_limit' : cls.MEM_LIMITS}))
# TODO: enable for more table formats once they consistently pass the fuzz test.
+ # TODO(IMPALA-6772): enable for ORC once a new release of the ORC library
+ # (later than release-1.4.3) is available.
cls.ImpalaTestMatrix.add_constraint(lambda v:
v.get_value('table_format').file_format in ('avro', 'parquet') or
(v.get_value('table_format').file_format == 'text' and
http://git-wip-us.apache.org/repos/asf/impala/blob/818cd8fa/tests/query_test/test_tpch_queries.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_tpch_queries.py b/tests/query_test/test_tpch_queries.py
index ece8347..68a2984 100644
--- a/tests/query_test/test_tpch_queries.py
+++ b/tests/query_test/test_tpch_queries.py
@@ -36,7 +36,7 @@ class TestTpchQuery(ImpalaTestSuite):
# TODO: the planner tests are based on text and need this.
if cls.exploration_strategy() == 'core':
cls.ImpalaTestMatrix.add_constraint(lambda v:\
- v.get_value('table_format').file_format in ['text', 'parquet', 'kudu'])
+ v.get_value('table_format').file_format in ['text', 'parquet', 'kudu', 'orc'])
def idfn(val):
return "TPC-H: Q{0}".format(val)