Posted to commits@impala.apache.org by jo...@apache.org on 2022/01/06 23:19:43 UTC

[impala] 02/02: IMPALA-10934 (Part 2): Enable table definition over a single file

This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 351e0374727be7ccef6843cf1036fd6168220d3e
Author: Fang-Yu Rao <fa...@cloudera.com>
AuthorDate: Wed Nov 17 12:48:40 2021 -0800

    IMPALA-10934 (Part 2): Enable table definition over a single file
    
    This patch adds an end-to-end test to validate and characterize the
    behavior of HMS with respect to external table creation after
    HIVE-25569, which allows a user to create an external table backed by
    a single file.
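    
    For example (adapted from the new sfs.test below), a table can be
    defined over a single file by pointing its LOCATION at an "sfs+" URI
    whose path is the file itself followed by "#SINGLEFILE#"; host, port,
    and path below are placeholders:
    
        CREATE EXTERNAL TABLE tbl (s STRING, i INT) STORED AS PARQUET
        LOCATION 'sfs+hdfs://<host>:<port>/path/to/file.parq/#SINGLEFILE#';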
    
    Change-Id: Ia4f57f07a9f543c660b102ebf307a6cf590a6784
    Reviewed-on: http://gerrit.cloudera.org:8080/18033
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
    Reviewed-by: Aman Sinha <am...@cloudera.com>
---
 bin/rat_exclude_files.txt                          |   2 +
 testdata/data/sfs_d1.parq                          | Bin 0 -> 544 bytes
 testdata/data/sfs_d2.txt                           |   5 +
 testdata/data/sfs_d3.parq                          | Bin 0 -> 544 bytes
 testdata/data/sfs_d4.txt                           |   5 +
 .../functional-query/queries/QueryTest/sfs.test    | 286 +++++++++++++++++++++
 tests/common/file_utils.py                         |  16 +-
 tests/query_test/test_sfs.py                       |  52 ++++
 8 files changed, 360 insertions(+), 6 deletions(-)

diff --git a/bin/rat_exclude_files.txt b/bin/rat_exclude_files.txt
index aef14f3..32c12fd 100644
--- a/bin/rat_exclude_files.txt
+++ b/bin/rat_exclude_files.txt
@@ -154,6 +154,8 @@ testdata/data/widerow.txt
 testdata/data/local_tbl/00000.txt
 testdata/data/hudi_parquet/*
 testdata/data/iceberg_test/*
+testdata/data/sfs_d2.txt
+testdata/data/sfs_d4.txt
 testdata/datasets/functional/functional_schema_template.sql
 testdata/impala-profiles/README
 testdata/impala-profiles/impala_profile_log_tpcds_compute_stats
diff --git a/testdata/data/sfs_d1.parq b/testdata/data/sfs_d1.parq
new file mode 100644
index 0000000..5ff07b6
Binary files /dev/null and b/testdata/data/sfs_d1.parq differ
diff --git a/testdata/data/sfs_d2.txt b/testdata/data/sfs_d2.txt
new file mode 100644
index 0000000..0eb236c
--- /dev/null
+++ b/testdata/data/sfs_d2.txt
@@ -0,0 +1,5 @@
+u,1
+v,2
+w,3
+x,4
+y,5
diff --git a/testdata/data/sfs_d3.parq b/testdata/data/sfs_d3.parq
new file mode 100644
index 0000000..5ff07b6
Binary files /dev/null and b/testdata/data/sfs_d3.parq differ
diff --git a/testdata/data/sfs_d4.txt b/testdata/data/sfs_d4.txt
new file mode 100644
index 0000000..0eb236c
--- /dev/null
+++ b/testdata/data/sfs_d4.txt
@@ -0,0 +1,5 @@
+u,1
+v,2
+w,3
+x,4
+y,5
diff --git a/testdata/workloads/functional-query/queries/QueryTest/sfs.test b/testdata/workloads/functional-query/queries/QueryTest/sfs.test
new file mode 100644
index 0000000..20ba813
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/QueryTest/sfs.test
@@ -0,0 +1,286 @@
+====
+---- QUERY
+# We do not hardcode the host name to something like "localhost" since the host name may
+# be an IP address in a test environment.
+CREATE EXTERNAL TABLE test_tbl_01 (s STRING, i INT) STORED AS PARQUET
+LOCATION 'sfs+hdfs://$INTERNAL_LISTEN_HOST:20500/test-warehouse/$DATABASE.db/sfs_d1.parq/#SINGLEFILE#'
+---- RESULTS
+'Table has been created.'
+====
+---- QUERY
+CREATE EXTERNAL TABLE test_tbl_02 (s STRING, i INT)
+ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
+STORED AS TEXTFILE
+LOCATION 'sfs+hdfs://$INTERNAL_LISTEN_HOST:20500/test-warehouse/$DATABASE.db/sfs_d2.txt/#SINGLEFILE#'
+---- RESULTS
+'Table has been created.'
+====
+---- QUERY
+SELECT * FROM test_tbl_01;
+---- RESULTS
+'a',1
+'b',2
+'c',3
+'d',4
+'e',5
+---- TYPES
+STRING, INT
+====
+---- QUERY
+SELECT * FROM test_tbl_02;
+---- RESULTS
+'u',1
+'v',2
+'w',3
+'x',4
+'y',5
+---- TYPES
+STRING, INT
+====
+---- QUERY
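+# INSERT into an SFS-backed table is expected to fail since "sfs+" is not a
+# supported filesystem for writes.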
+INSERT INTO TABLE test_tbl_01 VALUES ('x', 100);
+---- CATCH
+row_regex: .*Unable to INSERT into target table .+ because .+ is not a supported filesystem.*
+====
+---- QUERY
+INSERT INTO TABLE test_tbl_02 VALUES ('x', 100);
+---- CATCH
+row_regex: .*Unable to INSERT into target table .+ because .+ is not a supported filesystem.*
+====
+---- QUERY
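+# LOAD DATA into an SFS-backed table is rejected as an unsupported SFS filesystem
+# operation.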
+LOAD DATA INPATH 'hdfs://$INTERNAL_LISTEN_HOST:20500/test-warehouse/$DATABASE.db/sfs_d3.parq' INTO TABLE test_tbl_01
+---- CATCH
+Unsupported SFS filesystem operation!
+====
+---- QUERY
+LOAD DATA INPATH 'hdfs://$INTERNAL_LISTEN_HOST:20500/test-warehouse/$DATABASE.db/sfs_d4.txt' INTO TABLE test_tbl_02
+---- CATCH
+Unsupported SFS filesystem operation!
+====
+---- QUERY
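+# An "sfs+" URI is likewise rejected as the INPATH source of a LOAD DATA statement.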
+LOAD DATA INPATH 'sfs+hdfs://$INTERNAL_LISTEN_HOST:20500/test-warehouse/$DATABASE.db/sfs_d3.parq/#SINGLEFILE#' INTO TABLE test_tbl_01
+---- CATCH
+row_regex: .*INPATH location .+ must point to one of the supported filesystem URI scheme.*
+====
+---- QUERY
+LOAD DATA INPATH 'sfs+hdfs://$INTERNAL_LISTEN_HOST:20500/test-warehouse/$DATABASE.db/sfs_d4.txt/#SINGLEFILE#' INTO TABLE test_tbl_02
+---- CATCH
+row_regex: .*INPATH location .+ must point to one of the supported filesystem URI scheme.*
+====
+---- QUERY
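+# TRUNCATE is another unsupported SFS filesystem operation.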
+TRUNCATE TABLE $DATABASE.test_tbl_01
+---- CATCH
+Unsupported SFS filesystem operation!
+====
+---- QUERY
+TRUNCATE TABLE $DATABASE.test_tbl_02
+---- CATCH
+Unsupported SFS filesystem operation!
+====
+---- QUERY
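+# COMPUTE STATS, on the other hand, works on SFS-backed tables as usual.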
+COMPUTE STATS $DATABASE.test_tbl_01
+---- RESULTS
+'Updated 1 partition(s) and 2 column(s).'
+====
+---- QUERY
+COMPUTE STATS $DATABASE.test_tbl_02
+---- RESULTS
+'Updated 1 partition(s) and 2 column(s).'
+====
+---- QUERY
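+# An external table may not be located under the managed warehouse root directory.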
+CREATE EXTERNAL TABLE test_tbl_03 (s STRING, i INT) STORED AS PARQUET
+LOCATION 'sfs+hdfs://$INTERNAL_LISTEN_HOST:20500/test-warehouse/managed/$DATABASE.db/sfs_d3.parq/#SINGLEFILE#'
+---- CATCH
+An external table's location should not be located within managed warehouse root directory
+====
+---- QUERY
+CREATE EXTERNAL TABLE test_tbl_04 (s STRING, i INT)
+ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
+STORED AS TEXTFILE
+LOCATION 'sfs+hdfs://$INTERNAL_LISTEN_HOST:20500/test-warehouse/managed/$DATABASE.db/sfs_d4.txt/#SINGLEFILE#'
+---- CATCH
+An external table's location should not be located within managed warehouse root directory
+====
+---- QUERY
+# Without the EXTERNAL keyword (and without DEFAULT_TRANSACTIONAL_TYPE set), the
+# table can actually be created at this location.
+CREATE TABLE test_tbl_03 (s STRING, i INT)
+STORED AS PARQUET
+LOCATION 'sfs+hdfs://$INTERNAL_LISTEN_HOST:20500/test-warehouse/managed/$DATABASE.db/sfs_d3.parq/#SINGLEFILE#'
+---- RESULTS
+'Table has been created.'
+====
+---- QUERY
+# Similarly, this table can actually be created without the EXTERNAL keyword.
+CREATE TABLE test_tbl_04 (s STRING, i INT)
+ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
+STORED AS TEXTFILE
+LOCATION 'sfs+hdfs://$INTERNAL_LISTEN_HOST:20500/test-warehouse/managed/$DATABASE.db/sfs_d4.txt/#SINGLEFILE#'
+---- RESULTS
+'Table has been created.'
+====
+---- QUERY
+# test_tbl_03 is considered an external table with the parameter
+# 'external.table.purge' set to true, but the associated single file will not be
+# deleted after the table is dropped.
+DESCRIBE FORMATTED test_tbl_03
+---- RESULTS: VERIFY_IS_SUBSET
+'Table Type:         ','EXTERNAL_TABLE      ','NULL'
+'','external.table.purge','TRUE                '
+'','EXTERNAL            ','TRUE                '
+---- TYPES
+STRING,STRING,STRING
+====
+---- QUERY
+# test_tbl_04 is considered an external table with the parameter
+# 'external.table.purge' set to true, but the associated single file will not be
+# deleted after the table is dropped.
+DESCRIBE FORMATTED test_tbl_04
+---- RESULTS: VERIFY_IS_SUBSET
+'Table Type:         ','EXTERNAL_TABLE      ','NULL'
+'','external.table.purge','TRUE                '
+'','EXTERNAL            ','TRUE                '
+---- TYPES
+STRING,STRING,STRING
+====
+---- QUERY
+# The contents of the table can be correctly retrieved.
+SELECT * FROM test_tbl_03;
+---- RESULTS
+'a',1
+'b',2
+'c',3
+'d',4
+'e',5
+---- TYPES
+STRING, INT
+====
+---- QUERY
+# The contents of the table can be correctly retrieved.
+SELECT * FROM test_tbl_04;
+---- RESULTS
+'u',1
+'v',2
+'w',3
+'x',4
+'y',5
+---- TYPES
+STRING, INT
+====
+---- QUERY
+INSERT INTO TABLE test_tbl_03 VALUES ('x', 100);
+---- CATCH
+row_regex: .*Unable to INSERT into target table .+ because .+ is not a supported filesystem.*
+====
+---- QUERY
+INSERT INTO TABLE test_tbl_04 VALUES ('x', 100);
+---- CATCH
+row_regex: .*Unable to INSERT into target table .+ because .+ is not a supported filesystem.*
+====
+---- QUERY
+DROP TABLE test_tbl_03;
+DROP TABLE test_tbl_04
+====
+---- QUERY
+# test_tbl_05 can be created, which shows that sfs_d3.parq has not been deleted after
+# test_tbl_03 was dropped.
+CREATE TABLE test_tbl_05 (s STRING, i INT) STORED AS PARQUET
+LOCATION 'sfs+hdfs://$INTERNAL_LISTEN_HOST:20500/test-warehouse/managed/$DATABASE.db/sfs_d3.parq/#SINGLEFILE#'
+---- RESULTS
+'Table has been created.'
+====
+---- QUERY
+# test_tbl_06 can be created, which shows that sfs_d4.txt has not been deleted after
+# test_tbl_04 was dropped.
+CREATE TABLE test_tbl_06 (s STRING, i INT)
+ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
+STORED AS TEXTFILE
+LOCATION 'sfs+hdfs://$INTERNAL_LISTEN_HOST:20500/test-warehouse/managed/$DATABASE.db/sfs_d4.txt/#SINGLEFILE#'
+---- RESULTS
+'Table has been created.'
+====
+---- QUERY
+# The contents of the table can still be correctly retrieved.
+SELECT * FROM test_tbl_05;
+---- RESULTS
+'a',1
+'b',2
+'c',3
+'d',4
+'e',5
+---- TYPES
+STRING, INT
+====
+---- QUERY
+# The contents of the table can still be correctly retrieved.
+SELECT * FROM test_tbl_06;
+---- RESULTS
+'u',1
+'v',2
+'w',3
+'x',4
+'y',5
+---- TYPES
+STRING, INT
+====
+---- QUERY
+# We need to set the following query option so that the request submitted to HMS
+# creates a managed table.
+SET DEFAULT_TRANSACTIONAL_TYPE=INSERT_ONLY;
+CREATE TABLE test_tbl_03 (s STRING, i INT)
+STORED AS PARQUET
+LOCATION 'sfs+hdfs://$INTERNAL_LISTEN_HOST:20500/test-warehouse/managed/$DATABASE.db/sfs_d3.parq/#SINGLEFILE#'
+---- CATCH
+A managed table's location should be located within managed warehouse root directory or within its database's managedLocationUri.
+====
+---- QUERY
+# We need to set the following query option so that the request submitted to HMS
+# creates a managed table.
+SET DEFAULT_TRANSACTIONAL_TYPE=INSERT_ONLY;
+CREATE TABLE test_tbl_04 (s STRING, i INT)
+ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
+STORED AS TEXTFILE
+LOCATION 'sfs+hdfs://$INTERNAL_LISTEN_HOST:20500/test-warehouse/managed/$DATABASE.db/sfs_d4.txt/#SINGLEFILE#'
+---- CATCH
+A managed table's location should be located within managed warehouse root directory or within its database's managedLocationUri.
+====
+---- QUERY
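+# Pointing a plain "hdfs" LOCATION at a single file (without the "#SINGLEFILE#"
+# suffix) fails because a table location must be a directory.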
+SET DEFAULT_TRANSACTIONAL_TYPE=INSERT_ONLY;
+CREATE TABLE test_tbl_03 (s STRING, i INT)
+STORED AS PARQUET
+LOCATION 'hdfs://$INTERNAL_LISTEN_HOST:20500/test-warehouse/managed/$DATABASE.db/sfs_d3.parq'
+---- CATCH
+Path is not a directory
+====
+---- QUERY
+SET DEFAULT_TRANSACTIONAL_TYPE=INSERT_ONLY;
+CREATE TABLE test_tbl_04 (s STRING, i INT)
+ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
+STORED AS TEXTFILE
+LOCATION 'hdfs://$INTERNAL_LISTEN_HOST:20500/test-warehouse/managed/$DATABASE.db/sfs_d4.txt'
+---- CATCH
+Path is not a directory
+====
+---- QUERY
+# The table can actually be created, but the contents cannot be retrieved: a
+# partitioned table expects partition subdirectories, which a single file cannot
+# provide.
+CREATE EXTERNAL TABLE test_tbl_03 (s STRING) PARTITIONED BY (i INT) STORED AS PARQUET
+LOCATION 'sfs+hdfs://$INTERNAL_LISTEN_HOST:20500/test-warehouse/$DATABASE.db/sfs_d3.parq/#SINGLEFILE#'
+---- RESULTS
+'Table has been created.'
+====
+---- QUERY
+# As above, the table can actually be created but the contents cannot be retrieved.
+CREATE EXTERNAL TABLE test_tbl_04 (s STRING) PARTITIONED BY (i INT)
+ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
+STORED AS TEXTFILE
+LOCATION 'sfs+hdfs://$INTERNAL_LISTEN_HOST:20500/test-warehouse/$DATABASE.db/sfs_d4.txt/#SINGLEFILE#'
+---- RESULTS
+'Table has been created.'
+====
+---- QUERY
+SELECT * FROM test_tbl_03
+---- RESULTS
+====
+---- QUERY
+SELECT * FROM test_tbl_04
+---- RESULTS
+====
diff --git a/tests/common/file_utils.py b/tests/common/file_utils.py
index 529f72d..2de5549 100644
--- a/tests/common/file_utils.py
+++ b/tests/common/file_utils.py
@@ -65,12 +65,20 @@ def create_table_and_copy_files(impala_client, create_stmt, unique_database, tab
   create_stmt = create_stmt.format(db=unique_database, tbl=table_name)
   impala_client.execute(create_stmt)
 
+  hdfs_dir = get_fs_path(
+      os.path.join("/test-warehouse", unique_database + ".db", table_name))
+  copy_files_to_hdfs_dir(files, hdfs_dir)
+
+  # Refresh the table metadata to see the new files
+  refresh_stmt = "refresh {0}.{1}".format(unique_database, table_name)
+  impala_client.execute(refresh_stmt)
+
+
+def copy_files_to_hdfs_dir(files, hdfs_dir):
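+  """Copies the given local files to hdfs_dir with a single 'hdfs dfs -put' call."""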
   # Copy the files
   #  - build a list of source files
   #  - issue a single put to the hdfs_dir ( -d skips a staging copy)
   source_files = []
-  hdfs_dir = get_fs_path(
-      os.path.join("/test-warehouse", unique_database + ".db", table_name))
   for local_file in files:
     # Cut off leading '/' to make os.path.join() happy
     local_file = local_file if local_file[0] != '/' else local_file[1:]
@@ -79,10 +87,6 @@ def create_table_and_copy_files(impala_client, create_stmt, unique_database, tab
     source_files.append(local_file)
   check_call(['hdfs', 'dfs', '-put', '-f', '-d'] + source_files + [hdfs_dir])
 
-  # Refresh the table metadata to see the new files
-  refresh_stmt = "refresh {0}.{1}".format(unique_database, table_name)
-  impala_client.execute(refresh_stmt)
-
 
 def grep_dir(dir, search, filename_search=""):
   '''Recursively search for files that contain 'search' and have a filename that matches
diff --git a/tests/query_test/test_sfs.py b/tests/query_test/test_sfs.py
new file mode 100644
index 0000000..3973a1c
--- /dev/null
+++ b/tests/query_test/test_sfs.py
@@ -0,0 +1,52 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This test suite validates the functionality that allows users to create an external
+# table associated with a single file.
+
+from tests.common.file_utils import copy_files_to_hdfs_dir
+from tests.common.impala_test_suite import ImpalaTestSuite
+from tests.common.skip import SkipIf
+
+
+class TestSFS(ImpalaTestSuite):
+  @classmethod
+  def get_workload(cls):
+    return 'functional-query'
+
+  @classmethod
+  def add_test_dimensions(cls):
+    super(TestSFS, cls).add_test_dimensions()
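+    # The data files under test have fixed formats, so a single table_format
+    # dimension (uncompressed text, codegen enabled) is sufficient.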
+    cls.ImpalaTestMatrix.add_constraint(lambda v:
+        v.get_value('table_format').file_format == 'text' and
+        v.get_value('table_format').compression_codec == 'none')
+    cls.ImpalaTestMatrix.add_constraint(lambda v:
+        v.get_value('exec_option')['disable_codegen'] is False)
+
+  @SkipIf.not_hdfs
+  def test_sfs(self, vector, unique_database):
+    files_for_external_tables = ["testdata/data/sfs_d1.parq", "testdata/data/sfs_d2.txt",
+                                 "testdata/data/sfs_d3.parq", "testdata/data/sfs_d4.txt"]
+    files_for_managed_tables = ["testdata/data/sfs_d3.parq", "testdata/data/sfs_d4.txt"]
+    hdfs_dir_for_external_tables = "/test-warehouse/{0}.db/".format(unique_database)
+    hdfs_dir_for_managed_tables =\
+        "/test-warehouse/managed/{0}.db/".format(unique_database)
+
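+    # Stage the data files under both the external-table directory and the managed
+    # warehouse root so that QueryTest/sfs.test can define tables over them.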
+    copy_files_to_hdfs_dir(files_for_external_tables, hdfs_dir_for_external_tables)
+    copy_files_to_hdfs_dir(files_for_managed_tables, hdfs_dir_for_managed_tables)
+
+    self.run_test_case('QueryTest/sfs', vector, unique_database)