You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jo...@apache.org on 2021/03/24 19:00:45 UTC
[impala] 01/02: IMPALA-10564: Return error when inserting an invalid decimal value

This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 410c3e79e4eeba0a3f1ad62f6bf2f11b2de48819
Author: wzhou-code <wz...@cloudera.com>
AuthorDate: Mon Mar 22 17:40:22 2021 -0700

    IMPALA-10564: Return error when inserting an invalid decimal value
    
    When using CTAS statements or INSERT-SELECT statements to insert rows to
    table with decimal columns, Impala insert NULL for overflowed decimal
    values, instead of returning error. This issue happens when the data
    expression for the decimal column in SELECT sub-query consists at least
    one alias.
    This issue is similar as IMPALA-6340, but IMPALA-6340 only fixed the
    issue for the cases with the data expression for the decimal columns as
    constants.
    
    This patch fixed the issue by calling RuntimeState::CheckQueryState()
    in the end of HdfsTableWriter::AppendRows() and KuduTableSink::Send().
    If there is an invalid decimal error, the query will be failed without
    inserting NULL for decimal column.
    We did not change the behaviour for decimal_v1. NULL will be inserted
    to the table for invalid decimal values with warning message.
    
    Tests:
     - Added unit-tests for INSERT-SELECT and CTAS statements with
       overflowed decimal values to be inserted into tables. The
       overflowed decimal values are expressed as a constant expression,
       or as an expression with aliases.
       Also added cases to verify behaviour of decimal_v1 is unchanged.
     - Passed exhaustive tests.
    
    Change-Id: I64ce4ed194af81ef06401ffc1124e12f05b8da98
    Reviewed-on: http://gerrit.cloudera.org:8080/17168
    Reviewed-by: Thomas Tauber-Marshall <tm...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/exec/hdfs-text-table-writer.cc              |   1 +
 be/src/exec/kudu-table-sink.cc                     |   1 +
 be/src/exec/parquet/hdfs-parquet-table-writer.cc   |   1 +
 .../QueryTest/decimal-insert-overflow-exprs.test   | 116 +++++++++++++++++++++
 tests/query_test/test_decimal_queries.py           |  99 ++++++++++++++++++
 5 files changed, 218 insertions(+)

diff --git a/be/src/exec/hdfs-text-table-writer.cc b/be/src/exec/hdfs-text-table-writer.cc
index 6b77852..7c6effc 100644
--- a/be/src/exec/hdfs-text-table-writer.cc
+++ b/be/src/exec/hdfs-text-table-writer.cc
@@ -121,6 +121,7 @@ Status HdfsTextTableWriter::AppendRows(
   *new_file = false;
   if (rowbatch_stringstream_.tellp() >= flush_size_) RETURN_IF_ERROR(Flush());
 
+  RETURN_IF_ERROR(state_->CheckQueryState());
   return Status::OK();
 }
 
diff --git a/be/src/exec/kudu-table-sink.cc b/be/src/exec/kudu-table-sink.cc
index 17969fa..d5f0d77 100644
--- a/be/src/exec/kudu-table-sink.cc
+++ b/be/src/exec/kudu-table-sink.cc
@@ -291,6 +291,7 @@ Status KuduTableSink::Send(RuntimeState* state, RowBatch* batch) {
 }
 
 Status KuduTableSink::CheckForErrors(RuntimeState* state) {
+  RETURN_IF_ERROR(state->CheckQueryState());
   if (session_->CountPendingErrors() == 0) return Status::OK();
 
   vector<KuduError*> errors;
diff --git a/be/src/exec/parquet/hdfs-parquet-table-writer.cc b/be/src/exec/parquet/hdfs-parquet-table-writer.cc
index 9a3c8bf..84f7785 100644
--- a/be/src/exec/parquet/hdfs-parquet-table-writer.cc
+++ b/be/src/exec/parquet/hdfs-parquet-table-writer.cc
@@ -1337,6 +1337,7 @@ Status HdfsParquetTableWriter::AppendRows(
   // Reset the row_idx_ when we exhaust the batch.  We can exit before exhausting
   // the batch if we run out of file space and will continue from the last index.
   row_idx_ = 0;
+  RETURN_IF_ERROR(state_->CheckQueryState());
   return Status::OK();
 }
 
diff --git a/testdata/workloads/functional-query/queries/QueryTest/decimal-insert-overflow-exprs.test b/testdata/workloads/functional-query/queries/QueryTest/decimal-insert-overflow-exprs.test
new file mode 100644
index 0000000..2851f37
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/QueryTest/decimal-insert-overflow-exprs.test
@@ -0,0 +1,116 @@
+====
+---- QUERY
+# Verify that decimal value, which is expressed with constant decimal expression and its
+# value is not overflowed, is inserted into the table.
+set decimal_v2=true;
+insert into table overflowed_decimal_tbl_1
+select 1, cast(cast(65496456343.9565 as decimal (28,7))*44658*2.111
+as decimal (28,10));
+---- RESULTS
+: 1
+====
+---- QUERY
+# Verify that decimal value, which is expressed with decimal expression with one
+# alias and its value is not overflowed, is inserted into the table.
+set decimal_v2=true;
+insert into table overflowed_decimal_tbl_1
+select 2, cast(a*44658*2.111 as decimal (28,10)) from
+(select cast(65496456.9565 as decimal (28,7)) as a) q;
+---- RESULTS
+: 1
+====
+---- QUERY
+# Verify that decimal value, which is expressed with decimal expression with three
+# aliases and its value is not overflowed, is inserted into the table.
+set decimal_v2=true;
+insert into table overflowed_decimal_tbl_1
+select 3, cast(a*b*c as decimal (28,10)) from
+(select cast(65496456.9565 as decimal (28,7)) as a,
+ cast(44658 as decimal (28,7)) as b,
+ cast(2.111 as decimal (28,7)) as c) q;
+---- RESULTS
+: 1
+====
+---- QUERY
+# Verify that overflow behaviour of decimal_v1 is unchanged.
+# Verify decimal value, which is expressed with constant decimal expression
+# and its value is overflowed, is inserted into the table as NULL when
+# decimal_v2 is set as false.
+set decimal_v2=false;
+insert into table overflowed_decimal_tbl_1
+select 4, cast(cast(654964569154.9565 as decimal (28,7))*44658554984*2.111
+as decimal (28,10));
+select count(*) from overflowed_decimal_tbl_1 where d_28 is null;
+---- RESULTS
+1
+====
+---- QUERY
+# Verify that overflow behaviour of decimal_v1 is unchanged.
+# Verify that decimal value, which is expressed with decimal expression with one
+# alias and its value is overflowed, is inserted into the table as NULL when
+# decimal_v2 is set as false.
+set decimal_v2=false;
+insert into table overflowed_decimal_tbl_1
+select 5, cast(a*44658554984*2.111 as decimal (28,10)) from
+(select cast(654964569154.9565 as decimal (28,7)) as a) q;
+select count(*) from overflowed_decimal_tbl_1 where d_28 is null;
+---- RESULTS
+2
+====
+---- QUERY
+# Verify that overflow behaviour of decimal_v1 is unchanged.
+# Verify that decimal value, which is expressed with decimal expression with three
+# alias and its value is overflowed, is inserted into the table as NULL when
+# decimal_v2 is set as false.
+set decimal_v2=false;
+insert into table overflowed_decimal_tbl_1
+select 6, cast(a*b*c as decimal (28,10)) from
+(select cast(654964569154.9565 as decimal (28,7)) as a,
+ cast(44658554984 as decimal (28,7)) as b,
+ cast(2.111 as decimal (28,7)) as c) q;
+select count(*) from overflowed_decimal_tbl_1 where d_28 is null;
+---- RESULTS
+3
+====
+---- QUERY
+# Verify that decimal value, which is expressed with constant decimal expression
+# and its value is overflowed, cause query aborted with an error.
+set decimal_v2=true;
+insert into table overflowed_decimal_tbl_1
+select 7, cast(cast(654964569154.9565 as decimal (28,7))*44658554984*2.111
+as decimal (28,10));
+---- CATCH
+Decimal expression overflowed
+====
+---- QUERY
+# Verify that decimal value, which is expressed with decimal expression with one
+# alias and its value is overflowed, cause query aborted with an error.
+set decimal_v2=true;
+insert into table overflowed_decimal_tbl_1
+select 8, cast(a*44658554984*2.111 as decimal (28,10)) from
+(select cast(654964569154.9565 as decimal (28,7)) as a) q;
+---- CATCH
+Decimal expression overflowed
+====
+---- QUERY
+# Verify that decimal value, which is expressed with decimal expression with three
+# aliases and its value is overflowed, cause query aborted with an error.
+set decimal_v2=true;
+insert into table overflowed_decimal_tbl_1
+select 9, cast(a*b*c as decimal (28,10)) from
+(select cast(654964569154.9565 as decimal (28,7)) as a,
+ cast(44658554984 as decimal (28,7)) as b,
+ cast(2.111 as decimal (28,7)) as c) q;
+---- CATCH
+Decimal expression overflowed
+====
+---- QUERY
+# Verify that decimal value, which is expressed with selection from another table
+# and its value is overflowed, cause query aborted with an error.
+set decimal_v2=true;
+insert into table overflowed_decimal_tbl_2
+select i, cast(d_28*d_28*d_28 as decimal (28,10))
+from overflowed_decimal_tbl_1 where d_28 is not null;
+---- CATCH
+Decimal expression overflowed
+====
diff --git a/tests/query_test/test_decimal_queries.py b/tests/query_test/test_decimal_queries.py
index 9659062..2dde7a4 100644
--- a/tests/query_test/test_decimal_queries.py
+++ b/tests/query_test/test_decimal_queries.py
@@ -18,7 +18,9 @@
 # Targeted tests for decimal type.
 
 from copy import copy
+import pytest
 
+from tests.beeswax.impala_beeswax import ImpalaBeeswaxException
 from tests.common.impala_test_suite import ImpalaTestSuite
 from tests.common.test_dimensions import (create_exec_option_dimension_from_dict,
     create_client_protocol_dimension, hs2_parquet_constraint)
@@ -87,3 +89,100 @@ class TestAvroDecimalQueries(ImpalaTestSuite):
 
   def test_avro_queries(self, vector):
     self.run_test_case('QueryTest/decimal_avro', vector)
+
+
+# Tests involving DECIMAL typed expressions with data overflow. The results depend on
+# whether DECIMAL version 2 is enabled, so the .test file itself toggles the DECIMAL_V2
+# query option.
+@pytest.mark.execute_serially
+class TestDecimalOverflowExprs(ImpalaTestSuite):
+  @classmethod
+  def get_workload(cls):
+    return 'functional-query'
+
+  @classmethod
+  def add_test_dimensions(cls):
+    super(TestDecimalOverflowExprs, cls).add_test_dimensions()
+    cls.ImpalaTestMatrix.add_constraint(lambda v:
+        (v.get_value('table_format').file_format in ['kudu', 'parquet', 'text']))
+
+  def test_insert_select_exprs(self, vector, unique_database):
+    TBL_NAME_1 = '`{0}`.`overflowed_decimal_tbl_1`'.format(unique_database)
+    TBL_NAME_2 = '`{0}`.`overflowed_decimal_tbl_2`'.format(unique_database)
+    # Create table with decimal data type of column.
+    if 'parquet' in str(vector.get_value('table_format')):
+      stmt = "CREATE TABLE {0} (i int, d_28 decimal(28,10)) STORED AS PARQUET"
+    elif 'kudu' in str(vector.get_value('table_format')):
+      stmt = "CREATE TABLE {0} (i int primary key, d_28 decimal(28,10)) STORED AS KUDU"
+    else:
+      stmt = "CREATE TABLE {0} (i int, d_28 decimal(28,10))"
+    query_1 = stmt.format(TBL_NAME_1)
+    query_2 = stmt.format(TBL_NAME_2)
+
+    self.execute_query_expect_success(self.client, "DROP TABLE IF EXISTS %s" % TBL_NAME_1)
+    self.execute_query_expect_success(self.client, "DROP TABLE IF EXISTS %s" % TBL_NAME_2)
+    self.execute_query_expect_success(self.client, query_1)
+    self.execute_query_expect_success(self.client, query_2)
+
+    # Run INSERT-SELECT queries.
+    self.run_test_case('QueryTest/decimal-insert-overflow-exprs', vector,
+        use_db=unique_database)
+
+  def test_ctas_exprs(self, vector, unique_database):
+    TBL_NAME_1 = '`{0}`.`overflowed_decimal_tbl_1`'.format(unique_database)
+    TBL_NAME_2 = '`{0}`.`overflowed_decimal_tbl_2`'.format(unique_database)
+    if 'parquet' in str(vector.get_value('table_format')):
+      stmt_1 = "CREATE TABLE {0} STORED AS PARQUET " \
+          "AS SELECT 1 as i, cast(a*a*a as decimal (28,10)) as d_28 FROM " \
+          "(SELECT cast(654964569154.9565 as decimal (28,7)) as a) q"
+      stmt_2 = "CREATE TABLE {0} STORED AS PARQUET " \
+          "AS SELECT i, cast(d_28*d_28*d_28 as decimal (28,10)) as d_28 FROM {1} " \
+          "WHERE d_28 is not null"
+    elif 'kudu' in str(vector.get_value('table_format')):
+      stmt_1 = "CREATE TABLE {0} PRIMARY KEY (i) STORED AS KUDU " \
+          "AS SELECT 1 as i, cast(a*a*a as decimal (28,10)) as d_28 FROM " \
+          "(SELECT cast(654964569154.9565 as decimal (28,7)) as a) q"
+      stmt_2 = "CREATE TABLE {0} PRIMARY KEY (i) STORED AS KUDU " \
+          "AS SELECT i, cast(d_28*d_28*d_28 as decimal (28,10)) as d_28 FROM {1} " \
+          "WHERE d_28 is not null"
+    else:
+      stmt_1 = "CREATE TABLE {0} " \
+          "AS SELECT 1 as i, cast(a*a*a as decimal (28,10)) as d_28 FROM " \
+          "(SELECT cast(654964569154.9565 as decimal (28,7)) as a) q"
+      stmt_2 = "CREATE TABLE {0} " \
+          "AS SELECT i, cast(d_28*d_28*d_28 as decimal (28,10)) as d_28 FROM {1} " \
+          "WHERE d_28 is not null"
+    query_1 = stmt_1.format(TBL_NAME_1)
+    # CTAS with selection from another table.
+    query_2 = stmt_2.format(TBL_NAME_2, TBL_NAME_1)
+
+    # Query_1 is aborted with error message "Decimal expression overflowed" and NULL is
+    # not inserted into table.
+    self.execute_query_expect_success(self.client, "SET decimal_v2=true")
+    self.execute_query_expect_success(self.client, "DROP TABLE IF EXISTS %s" % TBL_NAME_1)
+    try:
+      self.execute_query_using_client(self.client, query_1, vector)
+      assert False, "Query was expected to fail"
+    except ImpalaBeeswaxException, e:
+      assert "Decimal expression overflowed" in str(e)
+
+    result = self.execute_query_expect_success(self.client,
+        "SELECT count(*) FROM %s" % TBL_NAME_1)
+    assert int(result.get_data()) == 0
+
+    # Insert data to table 1.
+    self.execute_query_expect_success(self.client,
+        "INSERT INTO TABLE %s VALUES(100, cast(654964569154.9565 as decimal (28,10)))" %
+        TBL_NAME_1)
+    # Query_2 is aborted with error message "Decimal expression overflowed" and NULL is
+    # not inserted into table.
+    self.execute_query_expect_success(self.client, "DROP TABLE IF EXISTS %s" % TBL_NAME_2)
+    try:
+      self.execute_query_using_client(self.client, query_2, vector)
+      assert False, "Query was expected to fail"
+    except ImpalaBeeswaxException, e:
+      assert "Decimal expression overflowed" in str(e)
+
+    result = self.execute_query_expect_success(self.client,
+        "SELECT count(*) FROM %s" % TBL_NAME_2)
+    assert int(result.get_data()) == 0