You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@impala.apache.org by st...@apache.org on 2022/08/23 22:24:22 UTC

[impala] branch branch-4.1.1 updated (22745b562 -> f6ee249ac)

This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a change to branch branch-4.1.1
in repository https://gitbox.apache.org/repos/asf/impala.git


    from 22745b562 IMPALA-11489: Fix int overflow in >2GB ORC files
     new c90d088c6 IMPALA-11447: Fix crash when fetching arrays/structs with result caching
     new 635753492 IMPALA-11295: Deflake TestParquet.test_multiple_blocks_mt_dop
     new a1a13c60d IMPALA-11391: Fixed race condition in test_drop_managed_kudu_table
     new f6ee249ac IMPALA-11317/IMPALA-11316/IMPALA-11315: impala-shell Python 3 fixes

The 4 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 be/src/service/query-result-set.cc    |  8 ++++-
 shell/ImpalaHttpClient.py             |  2 +-
 shell/impala_shell.py                 |  5 ++-
 tests/custom_cluster/test_kudu.py     |  9 +++--
 tests/hs2/test_fetch_first.py         | 66 ++++++++++++++++++++++-------------
 tests/query_test/test_scanners.py     | 18 +++++++---
 tests/shell/test_shell_commandline.py | 11 ++++--
 7 files changed, 83 insertions(+), 36 deletions(-)

[impala] 03/04: IMPALA-11391: Fixed race condition in test_drop_managed_kudu_table

Posted by st...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch branch-4.1.1
in repository https://gitbox.apache.org/repos/asf/impala.git

commit a1a13c60d3365f5e87d798e1c18b4875ebde2405
Author: Gergely Fürnstáhl <gf...@cloudera.com>
AuthorDate: Fri Jun 24 15:25:34 2022 +0200

    IMPALA-11391: Fixed race condition in test_drop_managed_kudu_table
    
    test_drop_managed_kudu_table uses exception to verify the deleted table
    is really missing. Depending on timing, this exception could have been
    raised in several control paths with different content. Now the test
    waits for event processing, meaning Analyzer will consistently catch
    the missing table and raise the same exception.
    
    Change-Id: I857098c87fcd44d945dd33108bcfdfaa2ca939df
    Reviewed-on: http://gerrit.cloudera.org:8080/18667
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
    Reviewed-on: http://gerrit.cloudera.org:8080/18893
    Tested-by: Quanlong Huang <hu...@gmail.com>
    Reviewed-by: Quanlong Huang <hu...@gmail.com>
---
 tests/custom_cluster/test_kudu.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tests/custom_cluster/test_kudu.py b/tests/custom_cluster/test_kudu.py
index 8fbc6c938..c706c984f 100644
--- a/tests/custom_cluster/test_kudu.py
+++ b/tests/custom_cluster/test_kudu.py
@@ -24,8 +24,9 @@ from time import sleep
 from tests.beeswax.impala_beeswax import ImpalaBeeswaxException
 from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
 from tests.common.kudu_test_suite import KuduTestSuite
-from tests.common.skip import SkipIfKudu, SkipIfHive3, SkipIfBuildType
+from tests.common.skip import SkipIfKudu, SkipIfBuildType
 from tests.common.test_dimensions import add_exec_option_dimension
+from tests.util.event_processor_utils import EventProcessorUtils
 
 KUDU_MASTER_HOSTS = pytest.config.option.kudu_master_hosts
 LOG = logging.getLogger(__name__)
@@ -291,12 +292,16 @@ class TestKuduHMSIntegration(CustomKuduTest):
     assert kudu_client.table_exists(kudu_tbl_name)
     kudu_client.delete_table(kudu_tbl_name)
     assert not kudu_client.table_exists(kudu_tbl_name)
+
+    # Wait for events to prevent race condition
+    EventProcessorUtils.wait_for_event_processing(self)
+
     try:
       cursor.execute("DROP TABLE %s" % kudu_tbl_name)
       assert False
     except Exception as e:
       LOG.info(str(e))
-      assert "Table %s no longer exists in the Hive MetaStore." % kudu_tbl_name in str(e)
+      assert "Table does not exist: %s" % kudu_tbl_name in str(e)
 
   @pytest.mark.execute_serially
   def test_drop_external_kudu_table(self, cursor, kudu_client, unique_database):

[impala] 01/04: IMPALA-11447: Fix crash when fetching arrays/structs with result caching

Posted by st...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch branch-4.1.1
in repository https://gitbox.apache.org/repos/asf/impala.git

commit c90d088c6f9cd37b3db463b038ad32cc130c02e9
Author: Csaba Ringhofer <cs...@cloudera.com>
AuthorDate: Thu Jul 21 14:53:42 2022 +0200

    IMPALA-11447: Fix crash when fetching arrays/structs with result caching
    
    Some parts of HS2ColumnarResultSet were not prepared for returning
    non-scalar types. This code only runs if impala.resultset.cache.size
    is set, which is not the case in most of tests. The issue was caught
    with Hue, which uses result caching.
    
    Testing:
    - Added a regression test in test_fetch_first.py, which contained
      other tests that used result caching.
    - It turned out that some tests in the file did not run at all,
      as @needs_session() needs the parentheses at the end. For this
      reason some test fixes were added to run them correctly, though
      these changes are totally unrelated to the current issue.
    
    Backport issue:
    - Test fails due to STRUCT in SelectList not supported on Parquet.
      Changed to use the corresponding ORC table.
    
    Change-Id: Ia4dd8f76187dc3555207e2d30d46d811e0a7a126
    Reviewed-on: http://gerrit.cloudera.org:8080/18768
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Reviewed-by: Wenzhe Zhou <wz...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
    Reviewed-on: http://gerrit.cloudera.org:8080/18889
    Tested-by: Quanlong Huang <hu...@gmail.com>
---
 be/src/service/query-result-set.cc |  8 ++++-
 tests/hs2/test_fetch_first.py      | 66 ++++++++++++++++++++++++--------------
 2 files changed, 49 insertions(+), 25 deletions(-)

diff --git a/be/src/service/query-result-set.cc b/be/src/service/query-result-set.cc
index 7ae6dc613..9d0300954 100644
--- a/be/src/service/query-result-set.cc
+++ b/be/src/service/query-result-set.cc
@@ -351,7 +351,13 @@ int HS2ColumnarResultSet::AddRows(
   for (int j = 0; j < metadata_.columns.size(); ++j) {
     ThriftTColumn* from = &o->result_set_->columns[j];
     ThriftTColumn* to = &result_set_->columns[j];
-    switch (metadata_.columns[j].columnType.types[0].scalar_type.type) {
+    const TColumnType& colType = metadata_.columns[j].columnType;
+    TPrimitiveType::type primitiveType = colType.types[0].scalar_type.type;
+    if (colType.types[0].type != TTypeNodeType::SCALAR) {
+      DCHECK(from->__isset.stringVal);
+      primitiveType = TPrimitiveType::STRING;
+    }
+    switch (primitiveType) {
       case TPrimitiveType::NULL_TYPE:
       case TPrimitiveType::BOOLEAN:
         StitchNulls(
diff --git a/tests/hs2/test_fetch_first.py b/tests/hs2/test_fetch_first.py
index b925c66cb..3bd1b3835 100644
--- a/tests/hs2/test_fetch_first.py
+++ b/tests/hs2/test_fetch_first.py
@@ -128,8 +128,18 @@ class TestFetchFirst(HS2TestSuite):
   def test_query_stmts_v1_with_result_spooling(self):
     self.run_query_stmts_test({'spool_query_results': 'true'})
 
+  def run_query_expect_success(self, query, options):
+    """Executes a query and returns its handle."""
+    execute_statement_req = TCLIService.TExecuteStatementReq()
+    execute_statement_req.sessionHandle = self.session_handle
+    execute_statement_req.confOverlay = options
+    execute_statement_req.statement = query
+    execute_statement_resp = self.hs2_client.ExecuteStatement(execute_statement_req)
+    HS2TestSuite.check_response(execute_statement_resp)
+    return execute_statement_resp.operationHandle
+
   @pytest.mark.execute_serially
-  @needs_session
+  @needs_session()
   def test_rows_materialized_counters(self):
     """Test that NumRowsFetched is updated even when a fetch request is served by the
     results cache, and that RowsMaterialized is only updated when rows are first created
@@ -140,29 +150,25 @@ class TestFetchFirst(HS2TestSuite):
     num_rows_fetched_from_cache = "NumRowsFetchedFromCache: {0} ({0})"
 
     # Execute the query with the results cache enabled.
-    execute_statement_req = TCLIService.TExecuteStatementReq()
-    execute_statement_req.confOverlay[self.IMPALA_RESULT_CACHING_OPT] = str(num_rows)
-    execute_statement_req.statement = statement
-    execute_statement_resp = self.hs2_client.ExecuteStatement(execute_statement_req)
-    HS2TestSuite.check_response(execute_statement_resp)
+    options = {self.IMPALA_RESULT_CACHING_OPT: str(num_rows)}
+    handle = self.run_query_expect_success(statement, options)
 
     # Fetch all rows from the query and verify they have been cached.
-    self.fetch_until(execute_statement_resp.operationHandle,
-        TCLIService.TFetchOrientation.FETCH_NEXT, num_rows)
+    self.fetch_until(handle, TCLIService.TFetchOrientation.FETCH_NEXT, num_rows)
     self.__verify_num_cached_rows(num_rows)
 
     # Get the runtime profile and validate that NumRowsFetched and RowsMaterialized both
     # equal the number of rows fetched by the query.
-    profile = self.__get_runtime_profile(execute_statement_resp.operationHandle)
+    profile = self.__get_runtime_profile(handle)
     assert num_rows_fetched.format(num_rows) in profile
 
     # Fetch all rows again and confirm that RowsMaterialized is unchanged, but
     # NumRowsFetched is double the number of rows returned by the query.
-    self.fetch_until(execute_statement_resp.operationHandle,
-        TCLIService.TFetchOrientation.FETCH_FIRST, num_rows)
-    profile = self.__get_runtime_profile(execute_statement_resp.operationHandle)
+    self.fetch_until(handle, TCLIService.TFetchOrientation.FETCH_FIRST, num_rows)
+    profile = self.__get_runtime_profile(handle)
     assert num_rows_fetched.format(num_rows) in profile
     assert num_rows_fetched_from_cache.format(num_rows) in profile
+    self.close(handle)
 
   def __get_runtime_profile(self, op_handle):
     """Helper method to get the runtime profile from a given operation handle."""
@@ -319,7 +325,7 @@ class TestFetchFirst(HS2TestSuite):
     self.close(execute_statement_resp.operationHandle)
 
   @pytest.mark.execute_serially
-  @needs_session
+  @needs_session()
   def test_constant_query_stmts(self):
     """Tests query stmts that return a constant result set. These queries are handled
     somewhat specially by Impala, therefore, we test them separately. We expect
@@ -363,7 +369,7 @@ class TestFetchFirst(HS2TestSuite):
     self.close(execute_statement_resp.operationHandle)
 
   @pytest.mark.execute_serially
-  @needs_session
+  @needs_session()
   def test_non_query_stmts(self):
     """Tests Impala's limited support for the FETCH_FIRST fetch orientation for
     non-query stmts that return a result set, such as SHOW, COMPUTE STATS, etc.
@@ -434,9 +440,10 @@ class TestFetchFirst(HS2TestSuite):
     # FETCH_NEXT asking for 100 rows. There are only 20 remaining rows.
     self.fetch_until(execute_statement_resp.operationHandle,
                      TCLIService.TFetchOrientation.FETCH_NEXT, 100, 20)
+    self.close(execute_statement_resp.operationHandle)
 
   @pytest.mark.execute_serially
-  @needs_session
+  @needs_session()
   def test_parallel_insert(self):
     """Tests parallel inserts with result set caching on.
     Parallel inserts have a coordinator instance but no coordinator
@@ -447,12 +454,23 @@ class TestFetchFirst(HS2TestSuite):
     self.client.set_configuration({'sync_ddl': 1})
     self.client.execute("create database %s" % self.TEST_DB)
     self.client.execute("create table %s.orderclone like tpch.orders" % self.TEST_DB)
-    execute_statement_req = TCLIService.TExecuteStatementReq()
-    execute_statement_req.sessionHandle = self.session_handle
-    execute_statement_req.confOverlay = dict()
-    execute_statement_req.confOverlay[self.IMPALA_RESULT_CACHING_OPT] = "10"
-    execute_statement_req.statement = ("insert overwrite %s.orderclone "
-                                      "select * from tpch.orders "
-                                      "where o_orderkey < 0" % self.TEST_DB)
-    execute_statement_resp = self.hs2_client.ExecuteStatement(execute_statement_req)
-    HS2TestSuite.check_response(execute_statement_resp)
+    options = {self.IMPALA_RESULT_CACHING_OPT: "10"}
+    handle = self.run_query_expect_success("insert overwrite %s.orderclone "
+                                 "select * from tpch.orders "
+                                 "where o_orderkey < 0" % self.TEST_DB, options)
+    self.close(handle)
+
+  @pytest.mark.execute_serially
+  @needs_session()
+  def test_complex_types_result_caching(self):
+    """Regression test for IMPALA-11447. Returning complex types in select list
+    was crashing in hs2 if result caching was enabled.
+    """
+    options = {self.IMPALA_RESULT_CACHING_OPT: "1024", "disable_codegen": "true"}
+    handle = self.run_query_expect_success(
+        "select int_array from functional_orc_def.complextypestbl", options)
+    self.fetch_until(handle, TCLIService.TFetchOrientation.FETCH_NEXT, 10, 8)
+    handle = self.run_query_expect_success(
+        "select alltypes from functional_orc_def.complextypes_structs", options)
+    self.fetch_until(handle, TCLIService.TFetchOrientation.FETCH_NEXT, 10, 6)
+    self.close(handle)

[impala] 04/04: IMPALA-11317/IMPALA-11316/IMPALA-11315: impala-shell Python 3 fixes

Posted by st...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch branch-4.1.1
in repository https://gitbox.apache.org/repos/asf/impala.git

commit f6ee249ac96fb09293cd3f9f1c0adeafb923cd5d
Author: Joe McDonnell <jo...@cloudera.com>
AuthorDate: Sat May 21 19:04:45 2022 -0700

    IMPALA-11317/IMPALA-11316/IMPALA-11315: impala-shell Python 3 fixes
    
    This fixes a few impala-shell Python 3 issues:
    1. In ImpalaShell's do_history(), the decode() call needs to be
       avoided in Python 3, because in Python 3 the cmd is already
       a string and doesn't need further decoding. (IMPALA-11315)
    2. TestImpalaShell.test_http_socket_timeout() gets a different
       error message in Python 3. It throws the "BlockingIOError"
       rather than "socket.error". (IMPALA-11316)
    3. ImpalaHttpClient.py's code to retrieve the body when
       handling an HTTP error needs to have a decode() call
       for the body. Otherwise, the body remains bytes and
       causes TestImpalaShellInteractive.test_http_interactions_extra()
       to fail. (IMPALA-11317)
    
    Testing:
     - Ran shell tests in the standard way
     - Ran shell tests with the impala-shell executable coming from
       a Python 3 virtualenv using the PyPi package
    
    Change-Id: Ie58380a17d7e011f4ce96b27d34717509a0b80a6
    Reviewed-on: http://gerrit.cloudera.org:8080/18556
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Reviewed-by: Wenzhe Zhou <wz...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
    Reviewed-on: http://gerrit.cloudera.org:8080/18885
    Reviewed-by: Quanlong Huang <hu...@gmail.com>
---
 shell/ImpalaHttpClient.py             |  2 +-
 shell/impala_shell.py                 |  5 ++++-
 tests/shell/test_shell_commandline.py | 11 ++++++++---
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/shell/ImpalaHttpClient.py b/shell/ImpalaHttpClient.py
index 947cecfc1..bbf1e4d6f 100644
--- a/shell/ImpalaHttpClient.py
+++ b/shell/ImpalaHttpClient.py
@@ -271,5 +271,5 @@ class ImpalaHttpClient(TTransportBase):
     if self.code >= 300:
       # Report any http response code that is not 1XX (informational response) or
       # 2XX (successful).
-      body = self.readBody()
+      body = self.readBody().decode('utf-8')
       raise HttpError(self.code, self.message, body, self.headers)
diff --git a/shell/impala_shell.py b/shell/impala_shell.py
index acaa02b88..8a061c736 100755
--- a/shell/impala_shell.py
+++ b/shell/impala_shell.py
@@ -1508,7 +1508,10 @@ class ImpalaShell(cmd.Cmd, object):
     if self.readline and self.readline.get_current_history_length() > 0:
       for index in xrange(1, self.readline.get_current_history_length() + 1):
         cmd = self.readline.get_history_item(index)
-        print('[%d]: %s' % (index, cmd.decode('utf-8', 'replace')), file=sys.stderr)
+        if sys.version_info.major == 2:
+          print('[%d]: %s' % (index, cmd.decode('utf-8', 'replace')), file=sys.stderr)
+        else:
+          print('[%d]: %s' % (index, cmd), file=sys.stderr)
     else:
       print(READLINE_UNAVAILABLE_ERROR, file=sys.stderr)
 
diff --git a/tests/shell/test_shell_commandline.py b/tests/shell/test_shell_commandline.py
index ef6536e26..cd53404c2 100644
--- a/tests/shell/test_shell_commandline.py
+++ b/tests/shell/test_shell_commandline.py
@@ -1220,9 +1220,14 @@ class TestImpalaShell(ImpalaTestSuite):
     args = ['--quiet', '-B', '--query', 'select 0;']
     result = run_impala_shell_cmd(vector, args + ['--http_socket_timeout_s=0'],
                                   expect_success=False)
-    expected_err = ("Caught exception [Errno 115] Operation now in progress, "
-                   "type=<class 'socket.error'> in OpenSession. Num remaining tries: 3")
-    assert result.stderr.splitlines()[0] == expected_err
+    expected_err_py2 = (
+      "Caught exception [Errno 115] Operation now in progress, "
+      "type=<class 'socket.error'> in OpenSession. Num remaining tries: 3")
+    expected_err_py3 = (
+      "Caught exception [Errno 115] Operation now in progress, "
+      "type=<class 'BlockingIOError'> in OpenSession. Num remaining tries: 3")
+    actual_err = result.stderr.splitlines()[0]
+    assert actual_err == expected_err_py2 or actual_err == expected_err_py3
 
     # Test http_socket_timeout_s=-1, expect errors
     result = run_impala_shell_cmd(vector, args + ['--http_socket_timeout_s=-1'],

[impala] 02/04: IMPALA-11295: Deflake TestParquet.test_multiple_blocks_mt_dop

Posted by st...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch branch-4.1.1
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 6357534926a24c1ec8e95c37e7b23f5ac29571e6
Author: stiga-huang <hu...@gmail.com>
AuthorDate: Tue May 17 20:20:55 2022 +0800

    IMPALA-11295: Deflake TestParquet.test_multiple_blocks_mt_dop
    
    TestParquet.test_multiple_blocks_mt_dop runs a query on 6 scan ranges
    using mt_dop=2. It then verifies the sum of ranges read on a backend is
    2 (6/3). The test assumes that counters of the 2 instances on the same
    host are printed consecutively. However, this is not always true. They
    could be interleaving.
    
    This patch makes the test more robust by grouping the counters based on
    the host.
    
    Test
     - I can't reproduce the issue locally. But I'm able to run the new test
       100 times without any error.
    
    Change-Id: I16c576c41a212f83dda82a83931ab336a78a41e4
    Reviewed-on: http://gerrit.cloudera.org:8080/18533
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
    Reviewed-on: http://gerrit.cloudera.org:8080/18892
    Reviewed-by: Csaba Ringhofer <cs...@cloudera.com>
    Tested-by: Quanlong Huang <hu...@gmail.com>
---
 tests/query_test/test_scanners.py | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py
index 9a8553349..53a15f0ae 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -688,6 +688,11 @@ class TestParquet(ImpalaTestSuite):
       assert len(num_rows_read_list) == 7
       assert len(ranges_complete_list) == 7
 
+      # Extract the host for each fragment instance. The first is the coordinator
+      # fragment instance.
+      host_list = re.findall(r'host=(\S+:[0-9]*)', result.runtime_profile)
+      assert len(host_list) == 7
+
       total_rows_read = 0
       # Skip the Averaged Fragment; it comes first in the runtime profile.
       for num_row_read in num_rows_read_list[1:]:
@@ -695,10 +700,15 @@ class TestParquet(ImpalaTestSuite):
       assert total_rows_read == TOTAL_ROWS
 
       # Again skip the Averaged Fragment; it comes first in the runtime profile.
-      # With mt_dop 2, every backend will have 2 instances which are printed consecutively
-      # in the profile.
-      for i in range(1, len(ranges_complete_list), 2):
-        assert int(ranges_complete_list[i]) + int(ranges_complete_list[i + 1]) == 2
+      # With mt_dop 2, every backend will have 2 instances.
+      ranges_per_host = {}
+      for i in range(1, 7):
+        host = host_list[i]
+        if host not in ranges_per_host:
+          ranges_per_host[host] = 0
+        ranges_per_host[host] += int(ranges_complete_list[i])
+      for host in ranges_per_host:
+        assert ranges_per_host[host] == 2
     finally:
       self.client.clear_configuration()