You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by mi...@apache.org on 2023/12/15 22:25:47 UTC

(impala) branch master updated: IMPALA-12465: Unicode column name support

This is an automated email from the ASF dual-hosted git repository.

michaelsmith pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


The following commit(s) were added to refs/heads/master by this push:
     new c86582cfe IMPALA-12465: Unicode column name support
c86582cfe is described below

commit c86582cfe034c19a8ea80cdda518e0a720a2d313
Author: pranav.lodha <pr...@cloudera.com>
AuthorDate: Tue Nov 7 00:41:18 2023 +0530

    IMPALA-12465: Unicode column name support
    
    Impala depends on Hive functions for column name validation and uses
    validateName() function for the same. Since Hive already supports
    unicode column names, the patch just updates the column name validation
    function to validateColumnName(). validateName() checks for a certain
    conformance based on pattern matching standards while
    validateColumnName() places no restrictions on column names at the
    Metadata level.
    
    Testing: The support is tested and cross-checked with Hive. The tests
    can be found in unicode-column-name.test.
    
    Change-Id: I1ad9d63ac1b9631a0f4a433798bd5109aa2ed718
    Reviewed-on: http://gerrit.cloudera.org:8080/20506
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
    Reviewed-by: Michael Smith <mi...@cloudera.com>
---
 .../java/org/apache/impala/analysis/ColumnDef.java |   2 +-
 .../impala/catalog/Hive3MetastoreShimBase.java     |   8 +
 .../org/apache/impala/analysis/AnalyzeDDLTest.java |  38 +--
 .../queries/QueryTest/unicode-column-name.test     | 289 +++++++++++++++++++++
 tests/metadata/test_column_unicode.py              |  44 ++++
 tests/shell/test_shell_interactive.py              |  24 ++
 6 files changed, 380 insertions(+), 25 deletions(-)

diff --git a/fe/src/main/java/org/apache/impala/analysis/ColumnDef.java b/fe/src/main/java/org/apache/impala/analysis/ColumnDef.java
index 1e14595e6..5ba145f30 100644
--- a/fe/src/main/java/org/apache/impala/analysis/ColumnDef.java
+++ b/fe/src/main/java/org/apache/impala/analysis/ColumnDef.java
@@ -203,7 +203,7 @@ public class ColumnDef {
 
   public void analyze(Analyzer analyzer) throws AnalysisException {
     // Check whether the column name meets the Metastore's requirements.
-    if (!MetastoreShim.validateName(colName_)) {
+    if (!MetastoreShim.validateColumnName(colName_)) {
       throw new AnalysisException("Invalid column/field name: " + colName_);
     }
     if (typeDef_ != null) {
diff --git a/fe/src/main/java/org/apache/impala/catalog/Hive3MetastoreShimBase.java b/fe/src/main/java/org/apache/impala/catalog/Hive3MetastoreShimBase.java
index 4f4bca324..cdd72aad7 100644
--- a/fe/src/main/java/org/apache/impala/catalog/Hive3MetastoreShimBase.java
+++ b/fe/src/main/java/org/apache/impala/catalog/Hive3MetastoreShimBase.java
@@ -79,6 +79,7 @@ import org.apache.impala.util.HiveMetadataFormatUtils;
 import org.apache.thrift.TException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils;
 
 /**
  * Base class for Hive 3 MetastoreShim.
@@ -823,4 +824,11 @@ public class Hive3MetastoreShimBase {
     return wh.getDefaultTablePath(db, tbl.getTableName().toLowerCase(), isExternal)
         .toString();
   }
+
+  /**
+   * At the Metadata level there are no restrictions on column names.
+   */
+  public static boolean validateColumnName(String name) {
+    return MetaStoreUtils.validateColumnName(name);
+  }
 }
diff --git a/fe/src/test/java/org/apache/impala/analysis/AnalyzeDDLTest.java b/fe/src/test/java/org/apache/impala/analysis/AnalyzeDDLTest.java
index 88f9dbf09..1976a418c 100644
--- a/fe/src/test/java/org/apache/impala/analysis/AnalyzeDDLTest.java
+++ b/fe/src/test/java/org/apache/impala/analysis/AnalyzeDDLTest.java
@@ -447,6 +447,8 @@ public class AnalyzeDDLTest extends FrontendTestBase {
     AnalyzesOk("alter table functional.alltypes add column NEW_COL int");
     AnalyzesOk("alter table functional.alltypes add column if not exists int_col int");
     AnalyzesOk("alter table functional.alltypes add column if not exists INT_COL int");
+    // Valid unicode column name.
+    AnalyzesOk("alter table functional.alltypes add column `???` int");
 
     // Column name must be unique for add.
     AnalysisError("alter table functional.alltypes add column int_col int",
@@ -462,9 +464,6 @@ public class AnalyzeDDLTest extends FrontendTestBase {
         "Column name conflicts with existing partition column: year");
     AnalysisError("alter table functional.alltypes add column if not exists YEAR int",
         "Column name conflicts with existing partition column: year");
-    // Invalid column name.
-    AnalysisError("alter table functional.alltypes add column `???` int",
-        "Invalid column/field name: ???");
 
     // Table/Db does not exist.
     AnalysisError("alter table db_does_not_exist.alltypes add column i int",
@@ -520,6 +519,9 @@ public class AnalyzeDDLTest extends FrontendTestBase {
     AnalyzesOk("alter table functional.alltypes add columns (c struct<f1:int>)");
     AnalyzesOk("alter table functional.alltypes add if not exists columns (int_col int)");
     AnalyzesOk("alter table functional.alltypes add if not exists columns (INT_COL int)");
+    // Valid unicode column name.
+    AnalyzesOk("alter table functional.alltypes add columns" +
+        "(`시스???पताED` int)");
 
     // Column name must be unique for add.
     AnalysisError("alter table functional.alltypes add columns (int_col int)",
@@ -529,9 +531,6 @@ public class AnalyzeDDLTest extends FrontendTestBase {
         "Column name conflicts with existing partition column: year");
     AnalysisError("alter table functional.alltypes add if not exists columns (year int)",
         "Column name conflicts with existing partition column: year");
-    // Invalid column name.
-    AnalysisError("alter table functional.alltypes add columns (`???` int)",
-        "Invalid column/field name: ???");
 
     // Duplicate column names.
     AnalysisError("alter table functional.alltypes add columns (c1 int, c1 int)",
@@ -600,9 +599,8 @@ public class AnalyzeDDLTest extends FrontendTestBase {
     AnalyzesOk("alter table functional.alltypes replace columns " +
         "(C1 int comment 'c', C2 int)");
     AnalyzesOk("alter table functional.alltypes replace columns (c array<string>)");
-    // Invalid column name.
-    AnalysisError("alter table functional.alltypes replace columns (`???` int)",
-        "Invalid column/field name: ???");
+    AnalyzesOk("alter table functional.alltypes replace columns" +
+        "(`?최종हिंदी` int)");
 
     // Replace should not throw an error if the column already exists.
     AnalyzesOk("alter table functional.alltypes replace columns (int_col int)");
@@ -699,6 +697,7 @@ public class AnalyzeDDLTest extends FrontendTestBase {
     AnalyzesOk("alter table functional.alltypes change column int_col int_col tinyint");
     // Add a comment to a column.
     AnalyzesOk("alter table functional.alltypes change int_col int_col int comment 'c'");
+    AnalyzesOk("alter table functional.alltypes change column int_col `汉字` int");
 
     AnalysisError("alter table functional.alltypes change column no_col c1 int",
         "Column 'no_col' does not exist in table: functional.alltypes");
@@ -710,9 +709,6 @@ public class AnalyzeDDLTest extends FrontendTestBase {
         "alter table functional.alltypes change column int_col Tinyint_col int",
         "Column already exists: tinyint_col");
 
-    // Invalid column name.
-    AnalysisError("alter table functional.alltypes change column int_col `???` int",
-        "Invalid column/field name: ???");
 
     // Table/Db does not exist
     AnalysisError("alter table db_does_not_exist.alltypes change c1 c2 int",
@@ -1481,9 +1477,7 @@ public class AnalyzeDDLTest extends FrontendTestBase {
         "select * from functional.alltypessmall a inner join " +
         "functional.alltypessmall b on a.id = b.id",
         "Duplicate column name: id");
-    // Invalid column name.
-    AnalysisError("alter view functional.alltypes_view as select 'abc' as `???`",
-        "Invalid column/field name: ???");
+    AnalyzesOk("alter view functional.alltypes_view as select 'abc' as `ㅛㅜ`");
     // Change the view definition to contain a subquery (IMPALA-1797)
     AnalyzesOk("alter view functional.alltypes_view as " +
         "select * from functional.alltypestiny where id in " +
@@ -2831,11 +2825,9 @@ public class AnalyzeDDLTest extends FrontendTestBase {
     // Invalid table/view name.
     AnalysisError("create table functional.`^&*` (x int) PARTITIONED BY (y int)",
         "Invalid table/view name: ^&*");
-    // Invalid column names.
-    AnalysisError("create table new_table (`???` int) PARTITIONED BY (i int)",
-        "Invalid column/field name: ???");
-    AnalysisError("create table new_table (i int) PARTITIONED BY (`^&*` int)",
-        "Invalid column/field name: ^&*");
+    // Valid unicode column names.
+    AnalyzesOk("create table new_table (`???` int) PARTITIONED BY (i int)");
+    AnalyzesOk("create table new_table (i int) PARTITIONED BY (`^&*` int)");
     // Test HMS constraint on comment length.
     AnalyzesOk(String.format("create table t (i int comment '%s')",
         StringUtils.repeat("c", MetaStoreUtil.CREATE_MAX_COMMENT_LENGTH)));
@@ -3236,10 +3228,8 @@ public class AnalyzeDDLTest extends FrontendTestBase {
         "Invalid database name: ???");
     AnalysisError("create view `^%&` as select 1, 2, 3",
         "Invalid table/view name: ^%&");
-    AnalysisError("create view foo as select 1 as `???`",
-        "Invalid column/field name: ???");
-    AnalysisError("create view foo(`%^&`) as select 1",
-        "Invalid column/field name: %^&");
+    AnalyzesOk("create view foo as select 1 as `???`");
+    AnalyzesOk("create view foo(`%^&`) as select 1");
 
     // Table/view already exists.
     AnalysisError("create view functional.alltypes as " +
diff --git a/testdata/workloads/functional-query/queries/QueryTest/unicode-column-name.test b/testdata/workloads/functional-query/queries/QueryTest/unicode-column-name.test
new file mode 100644
index 000000000..66b5bda94
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/QueryTest/unicode-column-name.test
@@ -0,0 +1,289 @@
+====
+---- QUERY
+create table testtbl1(`세율대분류구분코드` int, s string COMMENT 'String col') stored as TEXTFILE;
+---- RESULTS
+'Table has been created.'
+====
+---- QUERY
+# Make sure creating a table with the same name doesn't throw an error when
+# IF NOT EXISTS is specified.
+create table if not exists testtbl1(`세율대분류구분코드` int, s string)
+STORED AS TEXTFILE;
+---- RESULTS
+'Table already exists.'
+====
+---- QUERY
+show tables;
+---- RESULTS
+'testtbl1'
+---- TYPES
+STRING
+====
+---- QUERY
+describe testtbl1;
+---- RESULTS: RAW_STRING
+'세율대분류구분코드','int',''
+'s','string','String col'
+---- TYPES
+STRING, STRING, STRING
+====
+---- QUERY
+insert into table testtbl1 values(1, 'Alice');
+====
+---- QUERY
+select * from testtbl1;
+---- RESULTS
+1,'Alice'
+---- TYPES
+INT, STRING
+====
+---- QUERY
+drop table testtbl1;
+---- RESULTS
+'Table has been dropped.'
+====
+---- QUERY
+create table testtbl_kudu(`시스템처리최종사용자번호a` int, s string COMMENT 'String col',
+ Primary key(`시스템처리최종사용자번호a`)) stored as KUDU;
+---- RESULTS
+'Table has been created.'
+====
+---- QUERY
+show tables;
+---- RESULTS
+'testtbl_kudu'
+---- TYPES
+STRING
+====
+---- QUERY
+describe testtbl_kudu;
+---- RESULTS: RAW_STRING
+'시스템처리최종사용자번호a','int','','true','true','false','','AUTO_ENCODING','DEFAULT_COMPRESSION','0'
+'s','string','String col','false','','true','','AUTO_ENCODING','DEFAULT_COMPRESSION','0'
+---- TYPES
+STRING, STRING, STRING, STRING, STRING, STRING, STRING, STRING, STRING, STRING
+====
+---- QUERY
+insert into table testtbl_kudu values(1, 'Alice');
+====
+---- QUERY
+select * from testtbl_kudu;
+---- RESULTS
+1,'Alice'
+---- TYPES
+INT, STRING
+====
+---- QUERY
+drop table testtbl_kudu;
+---- RESULTS
+'Table has been dropped.'
+====
+---- QUERY
+create table testtbl_iceberg(`我` int, s string COMMENT 'String col') stored as ICEBERG;
+---- RESULTS
+'Table has been created.'
+====
+---- QUERY
+# Make sure creating a table with the same name doesn't throw an error when
+# IF NOT EXISTS is specified.
+create table if not exists testtbl_iceberg(`我` int, s string)
+STORED AS ICEBERG;
+---- RESULTS
+'Table already exists.'
+====
+---- QUERY
+show tables;
+---- RESULTS
+'testtbl_iceberg'
+---- TYPES
+STRING
+====
+---- QUERY
+describe testtbl_iceberg;
+---- RESULTS: RAW_STRING
+'我','int','','true'
+'s','string','String col','true'
+---- TYPES
+STRING, STRING, STRING, STRING
+====
+---- QUERY
+insert into table testtbl_iceberg values(1, 'Alice');
+====
+---- QUERY
+select * from testtbl_iceberg;
+---- RESULTS
+1,'Alice'
+---- TYPES
+INT, STRING
+====
+---- QUERY
+drop table testtbl_iceberg;
+---- RESULTS
+'Table has been dropped.'
+====
+---- QUERY
+create table $DATABASE.testtbl_orc(`我` int, s string COMMENT 'String col') stored as ORC;
+---- RESULTS
+'Table has been created.'
+====
+---- QUERY
+# Make sure creating a table with the same name doesn't throw an error when
+# IF NOT EXISTS is specified.
+create table if not exists $DATABASE.testtbl_orc(`我` int, s string)
+STORED AS ORC;
+---- RESULTS
+'Table already exists.'
+====
+---- QUERY
+show tables;
+---- RESULTS
+'testtbl_orc'
+---- TYPES
+STRING
+====
+---- QUERY
+describe $DATABASE.testtbl_orc;
+---- RESULTS: RAW_STRING
+'我','int',''
+'s','string','String col'
+---- TYPES
+STRING, STRING, STRING
+====
+---- HIVE_QUERY
+insert into table $DATABASE.testtbl_orc values(1, 'Alice');
+====
+---- HIVE_QUERY
+select * from $DATABASE.testtbl_orc;
+---- RESULTS
+1,'Alice'
+---- TYPES
+INT, STRING
+====
+---- QUERY
+drop table $DATABASE.testtbl_orc;
+---- RESULTS
+'Table has been dropped.'
+====
+---- QUERY
+create table $DATABASE.testtbl_avro(`我` int, s string COMMENT 'String col') stored as AVRO;
+---- RESULTS
+'Table has been created.'
+====
+---- QUERY
+# Make sure creating a table with the same name doesn't throw an error when
+# IF NOT EXISTS is specified.
+create table if not exists $DATABASE.testtbl_avro(`我` int, s string)
+STORED AS AVRO;
+---- RESULTS
+'Table already exists.'
+====
+---- QUERY
+show tables;
+---- RESULTS
+'testtbl_avro'
+---- TYPES
+STRING
+====
+---- QUERY
+describe $DATABASE.testtbl_avro;
+---- RESULTS: RAW_STRING
+'我','int','from deserializer'
+'s','string','String col'
+---- TYPES
+STRING, STRING, STRING
+====
+---- HIVE_QUERY
+insert into table $DATABASE.testtbl_avro values(1, 'Alice');
+====
+---- HIVE_QUERY
+select * from $DATABASE.testtbl_avro;
+---- RESULTS
+1,'Alice'
+---- TYPES
+INT, STRING
+====
+---- QUERY
+drop table $DATABASE.testtbl_avro;
+---- RESULTS
+'Table has been dropped.'
+====
+---- QUERY
+create table testtbl_part(`我` int, s string) PARTITIONED BY (`고객` int comment 'C') stored as TEXTFILE;
+---- RESULTS
+'Table has been created.'
+====
+---- QUERY
+# Partition columns are displayed as part of DESCRIBE <table>
+describe testtbl_part;
+---- RESULTS: RAW_STRING
+'我','int',''
+'s','string',''
+'고객','int','C'
+---- TYPES
+STRING, STRING, STRING
+====
+---- QUERY
+insert into table testtbl_part partition(`고객`=24) values(1, 'Alice');
+====
+---- QUERY
+insert into table testtbl_part partition(`고객`=20) values(2, 'Alison');
+====
+---- QUERY
+insert into table testtbl_part partition(`고객`=23) values(3, 'Zack');
+====
+---- QUERY
+select * from testtbl_part;
+---- RESULTS
+1,'Alice',24
+2,'Alison',20
+3,'Zack',23
+---- TYPES
+INT, STRING, INT
+====
+---- QUERY
+drop table testtbl_part;
+---- RESULTS
+'Table has been dropped.'
+====
+---- QUERY
+create table testtbl_parquet(`我` int, s string COMMENT 'String col') stored as PARQUET;
+---- RESULTS
+'Table has been created.'
+====
+---- QUERY
+# Make sure creating a table with the same name doesn't throw an error when
+# IF NOT EXISTS is specified.
+create table if not exists testtbl_parquet(`我` int, s string)
+STORED AS PARQUET;
+---- RESULTS
+'Table already exists.'
+====
+---- QUERY
+show tables;
+---- RESULTS
+'testtbl_parquet'
+---- TYPES
+STRING
+====
+---- QUERY
+describe testtbl_parquet;
+---- RESULTS: RAW_STRING
+'我','int',''
+'s','string','String col'
+---- TYPES
+STRING, STRING, STRING
+====
+---- QUERY
+insert into table testtbl_parquet values(1, 'Alice');
+====
+---- QUERY
+select * from testtbl_parquet;
+---- RESULTS
+1,'Alice'
+---- TYPES
+INT, STRING
+====
+---- QUERY
+drop table testtbl_parquet;
+---- RESULTS
+'Table has been dropped.'
diff --git a/tests/metadata/test_column_unicode.py b/tests/metadata/test_column_unicode.py
new file mode 100644
index 000000000..223d1c239
--- /dev/null
+++ b/tests/metadata/test_column_unicode.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env impala-python
+# -*- coding: utf-8 -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+from tests.common.impala_test_suite import ImpalaTestSuite
+from tests.common.test_dimensions import create_client_protocol_dimension
+
+
+class TestColumnUnicode(ImpalaTestSuite):
+
+    @classmethod
+    def get_workload(cls):
+        return 'functional-query'
+
+    @classmethod
+    def add_test_dimensions(cls):
+        super(TestColumnUnicode, cls).add_test_dimensions()
+        cls.ImpalaTestMatrix.add_dimension(create_client_protocol_dimension())
+        # Since the test file already covers different file formats,
+        # a constraint is added to avoid redundancy.
+        cls.ImpalaTestMatrix.add_constraint(
+            lambda v: v.get_value('table_format').file_format == 'parquet'
+            and v.get_value('table_format').compression_codec == 'none')
+
+    def test_create_unicode_table(self, vector, unique_database):
+        self.run_test_case('QueryTest/unicode-column-name', vector,
+                           use_db=unique_database)
diff --git a/tests/shell/test_shell_interactive.py b/tests/shell/test_shell_interactive.py
index d0920cdcf..52b56af27 100755
--- a/tests/shell/test_shell_interactive.py
+++ b/tests/shell/test_shell_interactive.py
@@ -401,6 +401,30 @@ class TestImpalaShellInteractive(ImpalaTestSuite):
     child_proc.sendline('quit;')
     child_proc.wait()
 
+  def test_unicode_column(self, vector, unique_database):
+    """Tests unicode column name support"""
+    if vector.get_value('strict_hs2_protocol'):
+      pytest.skip("IMPALA-10827: Failed, need to investigate.")
+    args = ("create table {0}.test_tbl(`세율중분류구분코드` int, s string COMMENT 'String col')"
+            " STORED AS TEXTFILE;".format(unique_database))
+    result = run_impala_shell_interactive(vector, args)
+    assert "Fetched 1 row(s)" in result.stderr, result.stderr
+    args = ("describe {0}.test_tbl;"
+            .format(unique_database))
+    result = run_impala_shell_interactive(vector, args)
+    assert "Fetched 2 row(s)" in result.stderr, result.stderr
+    assert "세율중분류구분코드" in result.stdout
+    args = ("insert into table {0}.test_tbl values(1, 'Alice');"
+            .format(unique_database))
+    result = run_impala_shell_interactive(vector, args)
+    assert "Modified 1 row(s)" in result.stderr, result.stderr
+    args = ("select * from {0}.test_tbl;"
+            .format(unique_database))
+    result = run_impala_shell_interactive(vector, args)
+    assert "Fetched 1 row(s)" in result.stderr, result.stderr
+    assert "세율중분류구분코드" in result.stdout
+
+
   def test_welcome_string(self, vector):
     """Test that the shell's welcome message is only printed once
     when the shell is started. Ensure it is not reprinted on errors.