You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2016/04/12 23:18:56 UTC
[22/50] incubator-impala git commit: IMPALA-3194: Allow queries materializing scalar type columns in RC/sequence files
IMPALA-3194: Allow queries materializing scalar type columns in RC/sequence files
This commit unblocks queries that materialize only scalar-typed
columns on tables backed by RC/sequence files that also contain
complex-typed columns. Such queries worked prior to the 2.3.0 release.
Change-Id: I3a89b211bdc01f7e07497e293fafd75ccf0500fe
Reviewed-on: http://gerrit.cloudera.org:8080/2580
Reviewed-by: Alex Behm <al...@cloudera.com>
Tested-by: Internal Jenkins
Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/5cd7ada7
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/5cd7ada7
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/5cd7ada7
Branch: refs/heads/master
Commit: 5cd7ada727d04fe56d62ced2e8bfa56f4448ea57
Parents: 2809746
Author: Bharath Vissapragada <bh...@cloudera.com>
Authored: Sun Mar 13 06:17:06 2016 -0700
Committer: Internal Jenkins <cl...@gerrit.cloudera.org>
Committed: Thu Mar 31 12:06:57 2016 +0000
----------------------------------------------------------------------
.../cloudera/impala/catalog/HdfsFileFormat.java | 28 ++++++++++++++------
.../cloudera/impala/planner/HdfsScanNode.java | 14 ++++++----
.../PlannerTest/complex-types-file-formats.test | 23 +++++++++-------
3 files changed, 43 insertions(+), 22 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/5cd7ada7/fe/src/main/java/com/cloudera/impala/catalog/HdfsFileFormat.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/com/cloudera/impala/catalog/HdfsFileFormat.java b/fe/src/main/java/com/cloudera/impala/catalog/HdfsFileFormat.java
index 9c883fc..3670aa5 100644
--- a/fe/src/main/java/com/cloudera/impala/catalog/HdfsFileFormat.java
+++ b/fe/src/main/java/com/cloudera/impala/catalog/HdfsFileFormat.java
@@ -35,26 +35,26 @@ public enum HdfsFileFormat {
RC_FILE("org.apache.hadoop.hive.ql.io.RCFileInputFormat",
"org.apache.hadoop.hive.ql.io.RCFileOutputFormat",
"org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe",
- false),
+ false, true),
TEXT("org.apache.hadoop.mapred.TextInputFormat",
"org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
- false),
+ false, false),
LZO_TEXT("com.hadoop.mapred.DeprecatedLzoTextInputFormat",
"org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
- "",
- false),
+ "", false, false),
SEQUENCE_FILE("org.apache.hadoop.mapred.SequenceFileInputFormat",
"org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat",
- "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", false),
+ "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", false,
+ true),
AVRO("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat",
"org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat",
"org.apache.hadoop.hive.serde2.avro.AvroSerDe",
- false),
+ false, false),
PARQUET("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
"org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
"org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
- true);
+ true, true);
private final String inputFormat_;
private final String outputFormat_;
@@ -63,12 +63,18 @@ public enum HdfsFileFormat {
// Indicates whether we support scanning complex types for this file format.
private final boolean isComplexTypesSupported_;
+ // Indicates whether the file format can skip complex columns in scans and just
+ // materialize scalar typed columns. Ignored if isComplexTypesSupported_ is true.
+ // TODO: Remove this once we support complex types for all file formats.
+ private final boolean canSkipColumnTypes_;
+
HdfsFileFormat(String inputFormat, String outputFormat, String serializationLib,
- boolean isComplexTypesSupported) {
+ boolean isComplexTypesSupported, boolean canSkipColumnTypes) {
inputFormat_ = inputFormat;
outputFormat_ = outputFormat;
serializationLib_ = serializationLib;
isComplexTypesSupported_ = isComplexTypesSupported;
+ canSkipColumnTypes_ = canSkipColumnTypes;
}
public String inputFormat() { return inputFormat_; }
@@ -235,6 +241,12 @@ public enum HdfsFileFormat {
public boolean isComplexTypesSupported() { return isComplexTypesSupported_; }
/**
+ * Returns true if this file format can skip complex typed columns and materialize
+ * only scalar typed columns.
+ */
+ public boolean canSkipComplexTypes() { return canSkipColumnTypes_; }
+
+ /**
* Returns a list with all formats for which isComplexTypesSupported() is true.
*/
public static List<HdfsFileFormat> complexTypesFormats() {
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/5cd7ada7/fe/src/main/java/com/cloudera/impala/planner/HdfsScanNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/com/cloudera/impala/planner/HdfsScanNode.java b/fe/src/main/java/com/cloudera/impala/planner/HdfsScanNode.java
index c6f7722..5edc0dc 100644
--- a/fe/src/main/java/com/cloudera/impala/planner/HdfsScanNode.java
+++ b/fe/src/main/java/com/cloudera/impala/planner/HdfsScanNode.java
@@ -185,10 +185,11 @@ public class HdfsScanNode extends ScanNode {
}
if (firstComplexTypedCol == null) return;
- boolean hasMaterializedSlots = false;
+ boolean referencesComplexTypedCol = false;
for (SlotDescriptor slotDesc: desc_.getSlots()) {
- if (slotDesc.isMaterialized()) {
- hasMaterializedSlots = true;
+ if (!slotDesc.isMaterialized()) continue;
+ if (slotDesc.getType().isComplexType() || slotDesc.getColumn() == null) {
+ referencesComplexTypedCol = true;
break;
}
}
@@ -196,8 +197,11 @@ public class HdfsScanNode extends ScanNode {
for (HdfsPartition part: partitions_) {
HdfsFileFormat format = part.getInputFormatDescriptor().getFileFormat();
if (format.isComplexTypesSupported()) continue;
- // Allow count(*) and similar queries on RC_FILE with complex types.
- if (format == HdfsFileFormat.RC_FILE && !hasMaterializedSlots) continue;
+ // If the file format allows querying just scalar typed columns and the query
+ // doesn't materialize any complex typed columns, it is allowed.
+ if (format.canSkipComplexTypes() && !referencesComplexTypedCol) {
+ continue;
+ }
String errSuffix = String.format(
"Complex types are supported for these file formats: %s",
Joiner.on(", ").join(HdfsFileFormat.complexTypesFormats()));
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/5cd7ada7/testdata/workloads/functional-planner/queries/PlannerTest/complex-types-file-formats.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/complex-types-file-formats.test b/testdata/workloads/functional-planner/queries/PlannerTest/complex-types-file-formats.test
index 487bb3b..f0431a2 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/complex-types-file-formats.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/complex-types-file-formats.test
@@ -38,13 +38,18 @@ select 1 from functional_rc_snap.complextypes_fileformat t, t.a
not implemented: Scan of table 't' in format 'RC_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'.
Complex types are supported for these file formats: PARQUET.
====
-# Complex types are not supported on RC files, even if no complex-typed
-# columns are selected.
-select id from functional_rc_snap.complextypes_fileformat
+select s.f1 from functional_rc_snap.complextypes_fileformat t, t.m
---- PLAN
-not implemented: Scan of table 'functional_rc_snap.complextypes_fileformat' in format 'RC_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'.
+not implemented: Scan of table 't' in format 'RC_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'.
Complex types are supported for these file formats: PARQUET.
====
+# Complex types are not supported on RC files, however queries materializing
+# only scalar type columns are allowed.
+select id from functional_rc_snap.complextypes_fileformat
+---- PLAN
+00:SCAN HDFS [functional_rc_snap.complextypes_fileformat]
+ partitions=1/1 files=1 size=56B
+====
# Complex types are not supported on RC files but count(*) and similar
# queries should work.
select count(*) from functional_rc_snap.complextypes_fileformat
@@ -61,12 +66,12 @@ select s.f1 from functional_seq_snap.complextypes_fileformat t, t.a
not implemented: Scan of table 't' in format 'SEQUENCE_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'.
Complex types are supported for these file formats: PARQUET.
====
-# Complex types are not supported on sequence files, even if no complex-typed
-# columns are selected.
-select 1 from functional_seq_snap.complextypes_fileformat
+# Queries referencing only scalar typed columns on sequence files
+# are allowed.
+select id from functional_seq_snap.complextypes_fileformat
---- PLAN
-not implemented: Scan of table 'functional_seq_snap.complextypes_fileformat' in format 'SEQUENCE_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'.
-Complex types are supported for these file formats: PARQUET.
+00:SCAN HDFS [functional_seq_snap.complextypes_fileformat]
+ partitions=1/1 files=1 size=87B
====
# Scanning all partitions fails because there are partitions with a file format for which
# complex types are not supported. The error message is abbreviated because it is