You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by bo...@apache.org on 2022/03/31 21:39:30 UTC
[impala] 02/02: IMPALA-11210: Impala can only handle lowercase schema elements of Iceberg table
This is an automated email from the ASF dual-hosted git repository.
boroknagyz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
commit 952f2af0ca5afd48bac828b66db467502da76ff2
Author: Zoltan Borok-Nagy <bo...@cloudera.com>
AuthorDate: Wed Mar 30 16:18:46 2022 +0200
IMPALA-11210: Impala can only handle lowercase schema elements of Iceberg table
When Impala or Hive creates a table, it lowercases the schema elements.
When Spark creates an Iceberg table, it doesn't lowercase the names
of the columns in the Iceberg metadata. This triggers a precondition
check in Impala, which makes such Iceberg tables unloadable.
This patch converts column names to lowercase when converting Iceberg
schemas to Hive/Impala schemas.
Testing:
* added e2e test
Change-Id: Iffd910f76844fbf34db805dda6c3053c5ad1cf79
Reviewed-on: http://gerrit.cloudera.org:8080/18368
Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
.../org/apache/impala/catalog/IcebergColumn.java | 2 +-
.../apache/impala/util/IcebergSchemaConverter.java | 2 +-
testdata/data/README | 3 +
.../metadata/v1.metadata.json | 237 +++++++++++++++++++++
.../metadata/version-hint.txt | 1 +
.../functional/functional_schema_template.sql | 14 ++
.../datasets/functional/schema_constraints.csv | 1 +
.../queries/QueryTest/iceberg-query.test | 14 ++
8 files changed, 272 insertions(+), 2 deletions(-)
diff --git a/fe/src/main/java/org/apache/impala/catalog/IcebergColumn.java b/fe/src/main/java/org/apache/impala/catalog/IcebergColumn.java
index a9be0d9..1b6f953 100644
--- a/fe/src/main/java/org/apache/impala/catalog/IcebergColumn.java
+++ b/fe/src/main/java/org/apache/impala/catalog/IcebergColumn.java
@@ -41,7 +41,7 @@ public class IcebergColumn extends Column {
public IcebergColumn(String name, Type type, String comment, int position,
int fieldId, int fieldMapKeyId, int fieldMapValueId, boolean isNullable) {
- super(name, type, comment, position);
+ super(name.toLowerCase(), type, comment, position);
fieldId_ = fieldId;
fieldMapKeyId_ = fieldMapKeyId;
fieldMapValueId_ = fieldMapValueId;
diff --git a/fe/src/main/java/org/apache/impala/util/IcebergSchemaConverter.java b/fe/src/main/java/org/apache/impala/util/IcebergSchemaConverter.java
index 131a7f0..9757210 100644
--- a/fe/src/main/java/org/apache/impala/util/IcebergSchemaConverter.java
+++ b/fe/src/main/java/org/apache/impala/util/IcebergSchemaConverter.java
@@ -115,7 +115,7 @@ public class IcebergSchemaConverter {
for (Types.NestedField column : schema.columns()) {
Type colType = toImpalaType(column.type());
// Update sd cols by iceberg NestedField
- ret.add(new FieldSchema(column.name(), colType.toSql().toLowerCase(),
+ ret.add(new FieldSchema(column.name().toLowerCase(), colType.toSql().toLowerCase(),
column.doc()));
}
return ret;
diff --git a/testdata/data/README b/testdata/data/README
index 804c5a1..af49e93 100644
--- a/testdata/data/README
+++ b/testdata/data/README
@@ -715,3 +715,6 @@ The tables that have the following schema changes since table migration:
* Partition FLOAT column to DOUBLE
* Partition DECIMAL(5,3) column to DECIMAL(8,3)
* Non-partition column has been moved to end of the schema
+
+iceberg_test/hadoop_catalog/ice/iceberg_uppercase_col:
+Generated by Impala; the metadata.json file was then modified to contain uppercase characters.
diff --git a/testdata/data/iceberg_test/hadoop_catalog/ice/iceberg_uppercase_col/metadata/v1.metadata.json b/testdata/data/iceberg_test/hadoop_catalog/ice/iceberg_uppercase_col/metadata/v1.metadata.json
new file mode 100644
index 0000000..5de1476
--- /dev/null
+++ b/testdata/data/iceberg_test/hadoop_catalog/ice/iceberg_uppercase_col/metadata/v1.metadata.json
@@ -0,0 +1,237 @@
+{
+ "format-version" : 1,
+ "table-uuid" : "3a93e4c0-5357-4203-a7e1-242168207af8",
+ "location" : "/test-warehouse/iceberg_test/hadoop_catalog/ice/iceberg_uppercase_col",
+ "last-updated-ms" : 1648649057966,
+ "last-column-id" : 1,
+ "schema" : {
+ "type" : "struct",
+ "schema-id" : 0,
+ "fields" : [ {
+ "id" : 1,
+ "name" : "Region",
+ "required" : false,
+ "type" : "string",
+ "doc" : "from deserializer"
+ }, {
+ "id" : 2,
+ "name" : "Nested_Struct",
+ "required" : false,
+ "type" : {
+ "type" : "struct",
+ "fields" : [ {
+ "id" : 3,
+ "name" : "A",
+ "required" : true,
+ "type" : "int"
+ }, {
+ "id" : 4,
+ "name" : "B",
+ "required" : true,
+ "type" : {
+ "type" : "list",
+ "element-id" : 7,
+ "element" : "int",
+ "element-required" : true
+ }
+ }, {
+ "id" : 5,
+ "name" : "C",
+ "required" : true,
+ "type" : {
+ "type" : "struct",
+ "fields" : [ {
+ "id" : 8,
+ "name" : "D",
+ "required" : true,
+ "type" : {
+ "type" : "list",
+ "element-id" : 9,
+ "element" : {
+ "type" : "list",
+ "element-id" : 10,
+ "element" : {
+ "type" : "struct",
+ "fields" : [ {
+ "id" : 11,
+ "name" : "E",
+ "required" : true,
+ "type" : "int"
+ }, {
+ "id" : 12,
+ "name" : "F",
+ "required" : true,
+ "type" : "string"
+ } ]
+ },
+ "element-required" : true
+ },
+ "element-required" : true
+ }
+ } ]
+ }
+ }, {
+ "id" : 6,
+ "name" : "G",
+ "required" : true,
+ "type" : {
+ "type" : "map",
+ "key-id" : 13,
+ "key" : "string",
+ "value-id" : 14,
+ "value" : {
+ "type" : "struct",
+ "fields" : [ {
+ "id" : 15,
+ "name" : "H",
+ "required" : true,
+ "type" : {
+ "type" : "struct",
+ "fields" : [ {
+ "id" : 16,
+ "name" : "I",
+ "required" : true,
+ "type" : {
+ "type" : "list",
+ "element-id" : 17,
+ "element" : "double",
+ "element-required" : true
+ }
+ } ]
+ }
+ } ]
+ },
+ "value-required" : true
+ }
+ } ]
+ }
+ } ]
+ },
+ "current-schema-id" : 0,
+ "schemas" : [ {
+ "type" : "struct",
+ "schema-id" : 0,
+ "fields" : [ {
+ "id" : 1,
+ "name" : "Region",
+ "required" : false,
+ "type" : "string",
+ "doc" : "from deserializer"
+ }, {
+ "id" : 2,
+ "name" : "Nested_Struct",
+ "required" : false,
+ "type" : {
+ "type" : "struct",
+ "fields" : [ {
+ "id" : 3,
+ "name" : "A",
+ "required" : true,
+ "type" : "int"
+ }, {
+ "id" : 4,
+ "name" : "B",
+ "required" : true,
+ "type" : {
+ "type" : "list",
+ "element-id" : 7,
+ "element" : "int",
+ "element-required" : true
+ }
+ }, {
+ "id" : 5,
+ "name" : "C",
+ "required" : true,
+ "type" : {
+ "type" : "struct",
+ "fields" : [ {
+ "id" : 8,
+ "name" : "D",
+ "required" : true,
+ "type" : {
+ "type" : "list",
+ "element-id" : 9,
+ "element" : {
+ "type" : "list",
+ "element-id" : 10,
+ "element" : {
+ "type" : "struct",
+ "fields" : [ {
+ "id" : 11,
+ "name" : "E",
+ "required" : true,
+ "type" : "int"
+ }, {
+ "id" : 12,
+ "name" : "F",
+ "required" : true,
+ "type" : "string"
+ } ]
+ },
+ "element-required" : true
+ },
+ "element-required" : true
+ }
+ } ]
+ }
+ }, {
+ "id" : 6,
+ "name" : "G",
+ "required" : true,
+ "type" : {
+ "type" : "map",
+ "key-id" : 13,
+ "key" : "string",
+ "value-id" : 14,
+ "value" : {
+ "type" : "struct",
+ "fields" : [ {
+ "id" : 15,
+ "name" : "H",
+ "required" : true,
+ "type" : {
+ "type" : "struct",
+ "fields" : [ {
+ "id" : 16,
+ "name" : "I",
+ "required" : true,
+ "type" : {
+ "type" : "list",
+ "element-id" : 17,
+ "element" : "double",
+ "element-required" : true
+ }
+ } ]
+ }
+ } ]
+ },
+ "value-required" : true
+ }
+ } ]
+ }
+ } ]
+ } ],
+ "partition-spec" : [ ],
+ "default-spec-id" : 0,
+ "partition-specs" : [ {
+ "spec-id" : 0,
+ "fields" : [ ]
+ } ],
+ "last-partition-id" : 999,
+ "default-sort-order-id" : 0,
+ "sort-orders" : [ {
+ "order-id" : 0,
+ "fields" : [ ]
+ } ],
+ "properties" : {
+ "engine.hive.enabled" : "true",
+ "external.table.purge" : "TRUE",
+ "write.format.default" : "parquet",
+ "OBJCAPABILITIES" : "EXTREAD,EXTWRITE",
+ "storage_handler" : "org.apache.iceberg.mr.hive.HiveIcebergStorageHandler"
+ },
+ "current-snapshot-id" : -1,
+ "snapshots" : [ ],
+ "snapshot-log" : [ ],
+ "metadata-log" : [ ]
+}
diff --git a/testdata/data/iceberg_test/hadoop_catalog/ice/iceberg_uppercase_col/metadata/version-hint.txt b/testdata/data/iceberg_test/hadoop_catalog/ice/iceberg_uppercase_col/metadata/version-hint.txt
new file mode 100644
index 0000000..d00491f
--- /dev/null
+++ b/testdata/data/iceberg_test/hadoop_catalog/ice/iceberg_uppercase_col/metadata/version-hint.txt
@@ -0,0 +1 @@
+1
diff --git a/testdata/datasets/functional/functional_schema_template.sql b/testdata/datasets/functional/functional_schema_template.sql
index 21d5c16..a81f76d 100644
--- a/testdata/datasets/functional/functional_schema_template.sql
+++ b/testdata/datasets/functional/functional_schema_template.sql
@@ -3297,6 +3297,20 @@ STORED AS ICEBERG;
---- DATASET
functional
---- BASE_TABLE_NAME
+iceberg_uppercase_col
+---- CREATE
+CREATE EXTERNAL TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name}
+STORED AS ICEBERG
+TBLPROPERTIES('write.format.default'='parquet', 'iceberg.catalog'='hadoop.catalog',
+ 'iceberg.catalog_location'='/test-warehouse/iceberg_test/hadoop_catalog',
+ 'iceberg.table_identifier'='ice.iceberg_uppercase_col');
+---- DEPENDENT_LOAD
+`hadoop fs -mkdir -p /test-warehouse/iceberg_test/hadoop_catalog/ice && \
+hadoop fs -put -f ${IMPALA_HOME}/testdata/data/iceberg_test/hadoop_catalog/ice/iceberg_uppercase_col /test-warehouse/iceberg_test/hadoop_catalog/ice
+====
+---- DATASET
+functional
+---- BASE_TABLE_NAME
alltypes_date_partition_2
---- PARTITION_COLUMNS
date_col date
diff --git a/testdata/datasets/functional/schema_constraints.csv b/testdata/datasets/functional/schema_constraints.csv
index 0bb7711..c462d4a 100644
--- a/testdata/datasets/functional/schema_constraints.csv
+++ b/testdata/datasets/functional/schema_constraints.csv
@@ -77,6 +77,7 @@ table_name:iceberg_alltypes_part, constraint:restrict_to, table_format:parquet/n
table_name:iceberg_alltypes_part_orc, constraint:restrict_to, table_format:parquet/none/none
table_name:iceberg_legacy_partition_schema_evolution, constraint:restrict_to, table_format:parquet/none/none
table_name:iceberg_legacy_partition_schema_evolution_orc, constraint:restrict_to, table_format:parquet/none/none
+table_name:iceberg_uppercase_col, constraint:restrict_to, table_format:parquet/none/none
# TODO: Support Avro. Data loading currently fails for Avro because complex types
# cannot be converted to the corresponding Avro types yet.
diff --git a/testdata/workloads/functional-query/queries/QueryTest/iceberg-query.test b/testdata/workloads/functional-query/queries/QueryTest/iceberg-query.test
index 4d2b590..8adb1d8 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/iceberg-query.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/iceberg-query.test
@@ -507,3 +507,17 @@ Path,Size,Partition
---- TYPES
STRING,STRING,STRING
====
+---- QUERY
+describe formatted iceberg_uppercase_col;
+---- RESULTS: VERIFY_IS_SUBSET
+'region','string','from deserializer'
+'nested_struct','struct<a:int,b:array<int>,c:struct<d:array<array<struct<e:int,f:string>>>>,g:map<string,struct<h:struct<i:array<double>>>>>','NULL'
+---- TYPES
+string, string, string
+====
+---- QUERY
+SELECT * FROM iceberg_uppercase_col;
+---- RESULTS
+---- TYPES
+STRING
+====