You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by bo...@apache.org on 2022/03/31 21:39:30 UTC
[impala] 02/02: IMPALA-11210: Impala can only handle lowercase schema elements of Iceberg table

This is an automated email from the ASF dual-hosted git repository.

boroknagyz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 952f2af0ca5afd48bac828b66db467502da76ff2
Author: Zoltan Borok-Nagy <bo...@cloudera.com>
AuthorDate: Wed Mar 30 16:18:46 2022 +0200

    IMPALA-11210: Impala can only handle lowercase schema elements of Iceberg table
    
    When Impala or Hive creates a table, it lowercases the schema elements.
    When Spark creates an Iceberg table it doesn't lowercase the names
    of the columns in the Iceberg metadata. This triggers a precondition
    check in Impala which makes such Iceberg tables unloadable.
    
    This patch converts column names to lowercase when converting Iceberg
    schemas to Hive/Impala schemas.
    
    Testing:
     * added e2e test
    
    Change-Id: Iffd910f76844fbf34db805dda6c3053c5ad1cf79
    Reviewed-on: http://gerrit.cloudera.org:8080/18368
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 .../org/apache/impala/catalog/IcebergColumn.java   |   2 +-
 .../apache/impala/util/IcebergSchemaConverter.java |   2 +-
 testdata/data/README                               |   3 +
 .../metadata/v1.metadata.json                      | 237 +++++++++++++++++++++
 .../metadata/version-hint.txt                      |   1 +
 .../functional/functional_schema_template.sql      |  14 ++
 .../datasets/functional/schema_constraints.csv     |   1 +
 .../queries/QueryTest/iceberg-query.test           |  14 ++
 8 files changed, 272 insertions(+), 2 deletions(-)

diff --git a/fe/src/main/java/org/apache/impala/catalog/IcebergColumn.java b/fe/src/main/java/org/apache/impala/catalog/IcebergColumn.java
index a9be0d9..1b6f953 100644
--- a/fe/src/main/java/org/apache/impala/catalog/IcebergColumn.java
+++ b/fe/src/main/java/org/apache/impala/catalog/IcebergColumn.java
@@ -41,7 +41,7 @@ public class IcebergColumn extends Column {
 
   public IcebergColumn(String name, Type type, String comment, int position,
       int fieldId, int fieldMapKeyId, int fieldMapValueId, boolean isNullable) {
-    super(name, type, comment, position);
+    super(name.toLowerCase(), type, comment, position);
     fieldId_ = fieldId;
     fieldMapKeyId_ = fieldMapKeyId;
     fieldMapValueId_ = fieldMapValueId;
diff --git a/fe/src/main/java/org/apache/impala/util/IcebergSchemaConverter.java b/fe/src/main/java/org/apache/impala/util/IcebergSchemaConverter.java
index 131a7f0..9757210 100644
--- a/fe/src/main/java/org/apache/impala/util/IcebergSchemaConverter.java
+++ b/fe/src/main/java/org/apache/impala/util/IcebergSchemaConverter.java
@@ -115,7 +115,7 @@ public class IcebergSchemaConverter {
     for (Types.NestedField column : schema.columns()) {
       Type colType = toImpalaType(column.type());
       // Update sd cols by iceberg NestedField
-      ret.add(new FieldSchema(column.name(), colType.toSql().toLowerCase(),
+      ret.add(new FieldSchema(column.name().toLowerCase(), colType.toSql().toLowerCase(),
           column.doc()));
     }
     return ret;
diff --git a/testdata/data/README b/testdata/data/README
index 804c5a1..af49e93 100644
--- a/testdata/data/README
+++ b/testdata/data/README
@@ -715,3 +715,6 @@ The tables that have the following schema changes since table migration:
 * Partition FLOAT column to DOUBLE
 * Partition DECIMAL(5,3) column to DECIMAL(8,3)
 * Non-partition column has been moved to end of the schema
+
+iceberg_test/hadoop_catalog/ice/iceberg_uppercase_col:
+Generated by Impala; the metadata.json file was then modified to contain uppercase characters.
diff --git a/testdata/data/iceberg_test/hadoop_catalog/ice/iceberg_uppercase_col/metadata/v1.metadata.json b/testdata/data/iceberg_test/hadoop_catalog/ice/iceberg_uppercase_col/metadata/v1.metadata.json
new file mode 100644
index 0000000..5de1476
--- /dev/null
+++ b/testdata/data/iceberg_test/hadoop_catalog/ice/iceberg_uppercase_col/metadata/v1.metadata.json
@@ -0,0 +1,237 @@
+{
+  "format-version" : 1,
+  "table-uuid" : "3a93e4c0-5357-4203-a7e1-242168207af8",
+  "location" : "/test-warehouse/iceberg_test/hadoop_catalog/ice/iceberg_uppercase_col",
+  "last-updated-ms" : 1648649057966,
+  "last-column-id" : 1,
+  "schema" : {
+    "type" : "struct",
+    "schema-id" : 0,
+    "fields" : [ {
+      "id" : 1,
+      "name" : "Region",
+      "required" : false,
+      "type" : "string",
+      "doc" : "from deserializer"
+    }, {
+      "id" : 2,
+      "name" : "Nested_Struct",
+      "required" : false,
+      "type" : {
+        "type" : "struct",
+        "fields" : [ {
+          "id" : 3,
+          "name" : "A",
+          "required" : true,
+          "type" : "int"
+        }, {
+          "id" : 4,
+          "name" : "B",
+          "required" : true,
+          "type" : {
+            "type" : "list",
+            "element-id" : 7,
+            "element" : "int",
+            "element-required" : true
+          }
+        }, {
+          "id" : 5,
+          "name" : "C",
+          "required" : true,
+          "type" : {
+            "type" : "struct",
+            "fields" : [ {
+              "id" : 8,
+              "name" : "D",
+              "required" : true,
+              "type" : {
+                "type" : "list",
+                "element-id" : 9,
+                "element" : {
+                  "type" : "list",
+                  "element-id" : 10,
+                  "element" : {
+                    "type" : "struct",
+                    "fields" : [ {
+                      "id" : 11,
+                      "name" : "E",
+                      "required" : true,
+                      "type" : "int"
+                    }, {
+                      "id" : 12,
+                      "name" : "F",
+                      "required" : true,
+                      "type" : "string"
+                    } ]
+                  },
+                  "element-required" : true
+                },
+                "element-required" : true
+              }
+            } ]
+          }
+        }, {
+          "id" : 6,
+          "name" : "G",
+          "required" : true,
+          "type" : {
+            "type" : "map",
+            "key-id" : 13,
+            "key" : "string",
+            "value-id" : 14,
+            "value" : {
+              "type" : "struct",
+              "fields" : [ {
+                "id" : 15,
+                "name" : "H",
+                "required" : true,
+                "type" : {
+                  "type" : "struct",
+                  "fields" : [ {
+                    "id" : 16,
+                    "name" : "I",
+                    "required" : true,
+                    "type" : {
+                      "type" : "list",
+                      "element-id" : 17,
+                      "element" : "double",
+                      "element-required" : true
+                    }
+                  } ]
+                }
+              } ]
+            },
+            "value-required" : true
+          }
+        } ]
+      }
+    } ]
+  },
+  "current-schema-id" : 0,
+  "schemas" : [ {
+    "type" : "struct",
+    "schema-id" : 0,
+    "fields" : [ {
+      "id" : 1,
+      "name" : "region",
+      "required" : false,
+      "type" : "string",
+      "doc" : "from deserializer"
+    }, {
+      "id" : 2,
+      "name" : "Nested_Struct",
+      "required" : false,
+      "type" : {
+        "type" : "struct",
+        "fields" : [ {
+          "id" : 3,
+          "name" : "A",
+          "required" : true,
+          "type" : "int"
+        }, {
+          "id" : 4,
+          "name" : "B",
+          "required" : true,
+          "type" : {
+            "type" : "list",
+            "element-id" : 7,
+            "element" : "int",
+            "element-required" : true
+          }
+        }, {
+          "id" : 5,
+          "name" : "C",
+          "required" : true,
+          "type" : {
+            "type" : "struct",
+            "fields" : [ {
+              "id" : 8,
+              "name" : "D",
+              "required" : true,
+              "type" : {
+                "type" : "list",
+                "element-id" : 9,
+                "element" : {
+                  "type" : "list",
+                  "element-id" : 10,
+                  "element" : {
+                    "type" : "struct",
+                    "fields" : [ {
+                      "id" : 11,
+                      "name" : "E",
+                      "required" : true,
+                      "type" : "int"
+                    }, {
+                      "id" : 12,
+                      "name" : "F",
+                      "required" : true,
+                      "type" : "string"
+                    } ]
+                  },
+                  "element-required" : true
+                },
+                "element-required" : true
+              }
+            } ]
+          }
+        }, {
+          "id" : 6,
+          "name" : "G",
+          "required" : true,
+          "type" : {
+            "type" : "map",
+            "key-id" : 13,
+            "key" : "string",
+            "value-id" : 14,
+            "value" : {
+              "type" : "struct",
+              "fields" : [ {
+                "id" : 15,
+                "name" : "H",
+                "required" : true,
+                "type" : {
+                  "type" : "struct",
+                  "fields" : [ {
+                    "id" : 16,
+                    "name" : "I",
+                    "required" : true,
+                    "type" : {
+                      "type" : "list",
+                      "element-id" : 17,
+                      "element" : "double",
+                      "element-required" : true
+                    }
+                  } ]
+                }
+              } ]
+            },
+            "value-required" : true
+          }
+        } ]
+      }
+    } ]
+  } ],
+  "partition-spec" : [ ],
+  "default-spec-id" : 0,
+  "partition-specs" : [ {
+    "spec-id" : 0,
+    "fields" : [ ]
+  } ],
+  "last-partition-id" : 999,
+  "default-sort-order-id" : 0,
+  "sort-orders" : [ {
+    "order-id" : 0,
+    "fields" : [ ]
+  } ],
+  "properties" : {
+    "engine.hive.enabled" : "true",
+    "external.table.purge" : "TRUE",
+    "write.format.default" : "parquet",
+    "OBJCAPABILITIES" : "EXTREAD,EXTWRITE",
+    "storage_handler" : "org.apache.iceberg.mr.hive.HiveIcebergStorageHandler"
+  },
+  "current-snapshot-id" : -1,
+  "snapshots" : [ ],
+  "snapshot-log" : [ ],
+  "metadata-log" : [ ]
+}
diff --git a/testdata/data/iceberg_test/hadoop_catalog/ice/iceberg_uppercase_col/metadata/version-hint.txt b/testdata/data/iceberg_test/hadoop_catalog/ice/iceberg_uppercase_col/metadata/version-hint.txt
new file mode 100644
index 0000000..d00491f
--- /dev/null
+++ b/testdata/data/iceberg_test/hadoop_catalog/ice/iceberg_uppercase_col/metadata/version-hint.txt
@@ -0,0 +1 @@
+1
diff --git a/testdata/datasets/functional/functional_schema_template.sql b/testdata/datasets/functional/functional_schema_template.sql
index 21d5c16..a81f76d 100644
--- a/testdata/datasets/functional/functional_schema_template.sql
+++ b/testdata/datasets/functional/functional_schema_template.sql
@@ -3297,6 +3297,20 @@ STORED AS ICEBERG;
 ---- DATASET
 functional
 ---- BASE_TABLE_NAME
+iceberg_uppercase_col
+---- CREATE
+CREATE EXTERNAL TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name}
+STORED AS ICEBERG
+TBLPROPERTIES('write.format.default'='parquet', 'iceberg.catalog'='hadoop.catalog',
+              'iceberg.catalog_location'='/test-warehouse/iceberg_test/hadoop_catalog',
+              'iceberg.table_identifier'='ice.iceberg_uppercase_col');
+---- DEPENDENT_LOAD
+`hadoop fs -mkdir -p /test-warehouse/iceberg_test/hadoop_catalog/ice && \
+hadoop fs -put -f ${IMPALA_HOME}/testdata/data/iceberg_test/hadoop_catalog/ice/iceberg_uppercase_col /test-warehouse/iceberg_test/hadoop_catalog/ice
+====
+---- DATASET
+functional
+---- BASE_TABLE_NAME
 alltypes_date_partition_2
 ---- PARTITION_COLUMNS
 date_col date
diff --git a/testdata/datasets/functional/schema_constraints.csv b/testdata/datasets/functional/schema_constraints.csv
index 0bb7711..c462d4a 100644
--- a/testdata/datasets/functional/schema_constraints.csv
+++ b/testdata/datasets/functional/schema_constraints.csv
@@ -77,6 +77,7 @@ table_name:iceberg_alltypes_part, constraint:restrict_to, table_format:parquet/n
 table_name:iceberg_alltypes_part_orc, constraint:restrict_to, table_format:parquet/none/none
 table_name:iceberg_legacy_partition_schema_evolution, constraint:restrict_to, table_format:parquet/none/none
 table_name:iceberg_legacy_partition_schema_evolution_orc, constraint:restrict_to, table_format:parquet/none/none
+table_name:iceberg_uppercase_col, constraint:restrict_to, table_format:parquet/none/none
 
 # TODO: Support Avro. Data loading currently fails for Avro because complex types
 # cannot be converted to the corresponding Avro types yet.
diff --git a/testdata/workloads/functional-query/queries/QueryTest/iceberg-query.test b/testdata/workloads/functional-query/queries/QueryTest/iceberg-query.test
index 4d2b590..8adb1d8 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/iceberg-query.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/iceberg-query.test
@@ -507,3 +507,17 @@ Path,Size,Partition
 ---- TYPES
 STRING,STRING,STRING
 ====
+---- QUERY
+describe formatted iceberg_uppercase_col;
+---- RESULTS: VERIFY_IS_SUBSET
+'region','string','from deserializer'
+'nested_struct','struct<a:int,b:array<int>,c:struct<d:array<array<struct<e:int,f:string>>>>,g:map<string,struct<h:struct<i:array<double>>>>>','NULL'
+---- TYPES
+string, string, string
+====
+---- QUERY
+SELECT * FROM iceberg_uppercase_col;
+---- RESULTS
+---- TYPES
+STRING
+====