You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by st...@apache.org on 2021/04/01 05:34:30 UTC

[impala] branch master updated (ede22a6 -> d8ef956)

This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git.


    from ede22a6  IMPALA-10608 followup: Detect the virtualenv tarball version
     new 4ef67c2  IMPALA-10607: Fixed test_ctas_exprs failure for S3 build
     new 553af0a  IMPALA-10582: Fix wrong summary numbers in the webpage of catalogd operations
     new d8ef956  Only fetch needed branches in compare_branches.py

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 be/src/catalog/catalog-server.cc         |  2 +-
 be/src/exec/hdfs-table-sink.cc           | 14 ++++++++--
 bin/compare_branches.py                  |  9 ++++--
 tests/query_test/test_decimal_queries.py | 47 ++++++++++++++++++++++++--------
 4 files changed, 54 insertions(+), 18 deletions(-)

[impala] 02/03: IMPALA-10582: Fix wrong summary numbers in the webpage of catalogd operations

Posted by st...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 553af0a1a18e38a854f0b15c4148188bfc9ba82e
Author: stiga-huang <hu...@gmail.com>
AuthorDate: Fri Mar 12 15:56:54 2021 +0800

    IMPALA-10582: Fix wrong summary numbers in the webpage of catalogd operations
    
    Webpage of catalogd operations doesn't sum up requests correctly.
    Instead, the current meaning is summing by tables. As the column name
    is "Number of requests", we should sum up by requests.
    
    Tests:
     - Manually run test_concurrent_inserts and verify the number is
       correct.
    
    Change-Id: I1c5361d981832d6f28db5f203a2c2538fe8ebb5e
    Reviewed-on: http://gerrit.cloudera.org:8080/17177
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/catalog/catalog-server.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/be/src/catalog/catalog-server.cc b/be/src/catalog/catalog-server.cc
index 9731dd3..fde90ba 100644
--- a/be/src/catalog/catalog-server.cc
+++ b/be/src/catalog/catalog-server.cc
@@ -707,7 +707,7 @@ void CatalogServer::OperationUsageUrlCallback(
   // Create a summary and add it to the document
   map<string, int> aggregated_operations;
   for (const auto& catalog_op : opeartion_usage.catalog_op_counters) {
-    ++aggregated_operations[catalog_op.catalog_op_name];
+    aggregated_operations[catalog_op.catalog_op_name] += catalog_op.op_counter;
   }
   Value catalog_op_summary(kArrayType);
   for (const auto& catalog_op : aggregated_operations) {

[impala] 03/03: Only fetch needed branches in compare_branches.py

Posted by st...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit d8ef9562cc6491f5d4de1a2229d0abc35dd44662
Author: stiga-huang <hu...@gmail.com>
AuthorDate: Wed Mar 31 16:44:42 2021 +0800

    Only fetch needed branches in compare_branches.py
    
    Fetching all branches of a remote repo will be time-consuming. This
    changes compare_branches.py to only fetch the needed branches, i.e.
    source_branch and target_branch.
    
    Tests:
     - Before this change, it can't finish in 30 mins when comparing two
       downstream branches. After this change, it finishes in one minute.
    
    Change-Id: Ia0c70ad4de1fa79498ca32853b6ea99aee2d40a7
    Reviewed-on: http://gerrit.cloudera.org:8080/17246
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 bin/compare_branches.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/bin/compare_branches.py b/bin/compare_branches.py
index a3ae299..f57aad1 100755
--- a/bin/compare_branches.py
+++ b/bin/compare_branches.py
@@ -213,13 +213,16 @@ def main():
   # Ensure all branches are up to date, unless remotes are disabled
   # by specifying them with an empty string.
   if options.source_remote_name != "":
-    subprocess.check_call(['git', 'fetch', options.source_remote_name])
+    subprocess.check_call(['git', 'fetch', options.source_remote_name,
+        options.source_branch])
     full_source_branch_name = options.source_remote_name + '/' + options.source_branch
   else:
     full_source_branch_name = options.source_branch
   if options.target_remote_name != "":
-    if options.source_remote_name != options.target_remote_name:
-      subprocess.check_call(['git', 'fetch', options.target_remote_name])
+    if options.source_remote_name != options.target_remote_name\
+        or options.source_branch != options.target_branch:
+      subprocess.check_call(['git', 'fetch', options.target_remote_name,
+          options.target_branch])
     full_target_branch_name = options.target_remote_name + '/' + options.target_branch
   else:
     full_target_branch_name = options.target_branch

[impala] 01/03: IMPALA-10607: Fixed test_ctas_exprs failure for S3 build

Posted by st...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 4ef67c21153157d552a6d5db09f4c1e15cbe8ac0
Author: wzhou-code <wz...@cloudera.com>
AuthorDate: Thu Mar 25 01:21:01 2021 -0700

    IMPALA-10607: Fixed test_ctas_exprs failure for S3 build
    
    New test case TestDecimalOverflowExprs::test_ctas_exprs was added
    in the patch for IMPALA-10564. But it failed in S3 build with
    Parquet format and complained the Parquet file had an invalid file
    length when accessing a table. The table was created by CTAS which
    finished with error "decimal expression overflowed". Verified this
    issue does not happen if query option s3_skip_insert_staging is set
    as false.
    When s3_skip_insert_staging is set true by default, INSERT writing
    to S3 goes directly to their final location rather than being
    copied there by the coordinator. If CTAS finishs with error during
    INSERT, the parquet partition file is left in un-finalized without
    file footer.  This causes subsequent query failed with error like
    "have an invalid file length on S3" when the query attemps to
    access the same table.
    
    This patch fixed the issue by deleting the un-finalized file in
    its final location when AppendRows() return error and staging has
    been skipped.
    
    Testing:
     - Reproduced the test failure in local box with defaultFS as s3.
       Verified the fixing by running test_ctas_exprs with defaultFS
       as s3.
     - Passed core tests.
    
    Change-Id: Ic2f64ab987aeada2cda41502e8c5dbbc229daefd
    Reviewed-on: http://gerrit.cloudera.org:8080/17234
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/exec/hdfs-table-sink.cc           | 14 ++++++++--
 tests/query_test/test_decimal_queries.py | 47 ++++++++++++++++++++++++--------
 2 files changed, 47 insertions(+), 14 deletions(-)

diff --git a/be/src/exec/hdfs-table-sink.cc b/be/src/exec/hdfs-table-sink.cc
index 7ff2419..ce028ae 100644
--- a/be/src/exec/hdfs-table-sink.cc
+++ b/be/src/exec/hdfs-table-sink.cc
@@ -313,8 +313,18 @@ Status HdfsTableSink::WriteRowsToPartition(
   bool new_file;
   while (true) {
     OutputPartition* output_partition = partition_pair->first.get();
-    RETURN_IF_ERROR(
-        output_partition->writer->AppendRows(batch, partition_pair->second, &new_file));
+    Status status =
+        output_partition->writer->AppendRows(batch, partition_pair->second, &new_file);
+    if (!status.ok()) {
+      // IMPALA-10607: Deletes partition file if staging is skipped when appending rows
+      // fails. Otherwise, it leaves the file in un-finalized state.
+      if (ShouldSkipStaging(state, output_partition)) {
+        status.MergeStatus(ClosePartitionFile(state, output_partition));
+        hdfsDelete(output_partition->hdfs_connection,
+            output_partition->current_file_name.c_str(), 0);
+      }
+      return status;
+    }
     if (!new_file) break;
     RETURN_IF_ERROR(FinalizePartitionFile(state, output_partition));
     RETURN_IF_ERROR(CreateNewTmpFile(state, output_partition));
diff --git a/tests/query_test/test_decimal_queries.py b/tests/query_test/test_decimal_queries.py
index 62b5f77..13c64e3 100644
--- a/tests/query_test/test_decimal_queries.py
+++ b/tests/query_test/test_decimal_queries.py
@@ -162,9 +162,27 @@ class TestDecimalOverflowExprs(ImpalaTestSuite):
     query_2 = stmt_2.format(TBL_NAME_2, TBL_NAME_3)
     query_3 = stmt_3.format(TBL_NAME_3)
 
-    # Query_1 is aborted with error message "Decimal expression overflowed" and NULL is
-    # not inserted into table.
     self.execute_query_expect_success(self.client, "SET decimal_v2=true")
+    # Verify the table on s3a could be accessed after CTAS is finished with error and
+    # NULL is not inserted into table if s3_skip_insert_staging is set as false.
+    if IS_S3:
+      self.execute_query_expect_success(self.client, "SET s3_skip_insert_staging=false")
+      self.execute_query_expect_success(self.client, "DROP TABLE IF EXISTS %s"
+          % TBL_NAME_1)
+      try:
+        self.execute_query_using_client(self.client, query_1, vector)
+        assert False, "Query was expected to fail"
+      except ImpalaBeeswaxException, e:
+        assert "Decimal expression overflowed" in str(e)
+
+      result = self.execute_query_expect_success(self.client,
+          "SELECT count(*) FROM %s WHERE d_28 is null" % TBL_NAME_1)
+      assert int(result.get_data()) == 0
+      # Set s3_skip_insert_staging as default value.
+      self.execute_query_expect_success(self.client, "SET s3_skip_insert_staging=true")
+
+    # Verify query_1 is aborted with error message "Decimal expression overflowed" and
+    # NULL is not inserted into table.
     self.execute_query_expect_success(self.client, "DROP TABLE IF EXISTS %s" % TBL_NAME_1)
     try:
       self.execute_query_using_client(self.client, query_1, vector)
@@ -172,11 +190,18 @@ class TestDecimalOverflowExprs(ImpalaTestSuite):
     except ImpalaBeeswaxException, e:
       assert "Decimal expression overflowed" in str(e)
 
-    # TODO (IMPALA-10607): Following query failed for S3 build with Parquet table format.
-    if not ('parquet' in str(vector.get_value('table_format')) and IS_S3):
-      result = self.execute_query_expect_success(self.client,
-          "SELECT count(*) FROM %s" % TBL_NAME_1)
-      assert int(result.get_data()) == 0
+    result = self.execute_query_expect_success(self.client,
+        "SELECT count(*) FROM %s WHERE d_28 is null" % TBL_NAME_1)
+    assert int(result.get_data()) == 0
+
+    # Verify that valid data could be inserted into the new table which is created by
+    # CTAS and the CTAS finished with an error.
+    self.execute_query_expect_success(self.client,
+        "INSERT INTO TABLE %s VALUES(100, cast(654964569154.9565 as decimal (28,10)))" %
+        TBL_NAME_1)
+    result = self.execute_query_expect_success(self.client,
+        "SELECT count(*) FROM %s WHERE d_28 is not null" % TBL_NAME_1)
+    assert int(result.get_data()) == 1
 
     # Create table 3 and insert data to table 3.
     self.execute_query_expect_success(self.client, "DROP TABLE IF EXISTS %s" % TBL_NAME_3)
@@ -193,8 +218,6 @@ class TestDecimalOverflowExprs(ImpalaTestSuite):
     except ImpalaBeeswaxException, e:
       assert "Decimal expression overflowed" in str(e)
 
-    # TODO (IMPALA-10607): Following query failed for S3 build with Parquet table format.
-    if not ('parquet' in str(vector.get_value('table_format')) and IS_S3):
-      result = self.execute_query_expect_success(self.client,
-          "SELECT count(*) FROM %s" % TBL_NAME_2)
-      assert int(result.get_data()) == 0
+    result = self.execute_query_expect_success(self.client,
+        "SELECT count(*) FROM %s" % TBL_NAME_2)
+    assert int(result.get_data()) == 0