You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2016/05/23 15:40:39 UTC

[13/17] incubator-impala git commit: IMPALA-3579: Strict handling of numeric overflow in text parsing

IMPALA-3579: Strict handling of numeric overflow in text parsing

Adds a query option 'strict_mode' which treats integer and
floating pt overflows as parse errors. In the past,
overflows were ignored and the max value was returned. When
this query option is set, overflowing values are treated as if
they were completely invalid data, i.e. NULL is returned.
When abort_on_error is enabled, this means the query is
aborted.

Notes:
* DECIMAL overflow/underflow is already treated as an error.
* The handling in text-converter treats underflows the same
  as overflows, so they would result in the same behavior.
  However, floating point parsing never returns an underflow
  today.
* We may also want to handle numeric values that are truncated
  when parsing to integer types, e.g. 10.5 -> 10.

Change-Id: I7409c31ec0cb6fe0b2d9842b9f58fe1670914836
Reviewed-on: http://gerrit.cloudera.org:8080/3150
Reviewed-by: Matthew Jacobs <mj...@cloudera.com>
Tested-by: Internal Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/f413e236
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/f413e236
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/f413e236

Branch: refs/heads/master
Commit: f413e236a8474f355426d0612862e5e8c8cb5c57
Parents: fa30a0c
Author: Matthew Jacobs <mj...@cloudera.com>
Authored: Thu May 19 16:23:52 2016 -0700
Committer: Tim Armstrong <ta...@cloudera.com>
Committed: Mon May 23 08:40:20 2016 -0700

----------------------------------------------------------------------
 be/src/exec/hdfs-scanner.cc                     |  2 +-
 be/src/exec/hdfs-text-scanner.cc                |  3 +-
 be/src/exec/text-converter.cc                   | 29 ++++++++++++++----
 be/src/exec/text-converter.h                    | 11 +++++--
 be/src/exec/text-converter.inline.h             | 10 ++++---
 be/src/runtime/runtime-state.h                  |  3 ++
 be/src/service/query-options.cc                 |  5 ++++
 be/src/service/query-options.h                  |  6 ++--
 common/thrift/ImpalaInternalService.thrift      |  3 ++
 common/thrift/ImpalaService.thrift              |  5 +++-
 .../functional/functional_schema_template.sql   |  2 --
 .../datasets/functional/schema_constraints.csv  |  7 +++--
 .../queries/QueryTest/show.test                 |  1 -
 .../queries/QueryTest/strict-mode-abort.test    | 31 ++++++++++++++++++++
 .../queries/QueryTest/strict-mode.test          | 28 ++++++++++++++++++
 tests/query_test/test_queries.py                |  8 +++++
 16 files changed, 132 insertions(+), 22 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f413e236/be/src/exec/hdfs-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-scanner.cc b/be/src/exec/hdfs-scanner.cc
index af6ddf7..6e9eb61 100644
--- a/be/src/exec/hdfs-scanner.cc
+++ b/be/src/exec/hdfs-scanner.cc
@@ -354,7 +354,7 @@ Function* HdfsScanner::CodegenWriteCompleteTuple(
     SlotDescriptor* slot_desc = node->materialized_slots()[i];
     Function* fn = TextConverter::CodegenWriteSlot(codegen, tuple_desc, slot_desc,
         node->hdfs_table()->null_column_value().data(),
-        node->hdfs_table()->null_column_value().size(), true);
+        node->hdfs_table()->null_column_value().size(), true, state->strict_mode());
     if (fn == NULL) return NULL;
     slot_fns.push_back(fn);
   }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f413e236/be/src/exec/hdfs-text-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-text-scanner.cc b/be/src/exec/hdfs-text-scanner.cc
index dd0d524..f3b669d 100644
--- a/be/src/exec/hdfs-text-scanner.cc
+++ b/be/src/exec/hdfs-text-scanner.cc
@@ -215,7 +215,8 @@ Status HdfsTextScanner::InitNewRange() {
       scan_node_->is_materialized_col(), hdfs_partition->line_delim(),
       field_delim, collection_delim, hdfs_partition->escape_char()));
   text_converter_.reset(new TextConverter(hdfs_partition->escape_char(),
-      scan_node_->hdfs_table()->null_column_value()));
+      scan_node_->hdfs_table()->null_column_value(), true,
+      state_->strict_mode()));
 
   RETURN_IF_ERROR(ResetScanner());
   return Status::OK();

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f413e236/be/src/exec/text-converter.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/text-converter.cc b/be/src/exec/text-converter.cc
index 03cfc07..8e71ef4 100644
--- a/be/src/exec/text-converter.cc
+++ b/be/src/exec/text-converter.cc
@@ -30,10 +30,11 @@ using namespace impala;
 using namespace llvm;
 
 TextConverter::TextConverter(char escape_char, const string& null_col_val,
-    bool check_null)
+    bool check_null, bool strict_mode)
   : escape_char_(escape_char),
     null_col_val_(null_col_val),
-    check_null_(check_null) {
+    check_null_(check_null),
+    strict_mode_(strict_mode) {
 }
 
 void TextConverter::UnescapeString(const char* src, char* dest, int* len,
@@ -90,9 +91,19 @@ void TextConverter::UnescapeString(const char* src, char* dest, int* len,
 //   call void @SetNull({ i8, i32 }* %tuple_arg)
 //   ret i1 false
 // }
+//
+// If strict_mode = true, then 'parse_slot' also treats overflows errors, e.g.:
+// parse_slot:                                       ; preds = %check_zero
+//   %slot = getelementptr inbounds { i8, i32 }* %tuple_arg, i32 0, i32 1
+//   %1 = call i32 @IrStringToInt32(i8* %data, i32 %len, i32* %parse_result)
+//   %parse_result1 = load i32, i32* %parse_result
+//   %failed = icmp eq i32 %parse_result1, 1
+//   %overflowed = icmp eq i32 %parse_result1, 2
+//   %failed_or = or i1 %failed, %overflowed
+//   br i1 %failed_or, label %parse_fail, label %parse_success
 Function* TextConverter::CodegenWriteSlot(LlvmCodeGen* codegen,
     TupleDescriptor* tuple_desc, SlotDescriptor* slot_desc,
-    const char* null_col_val, int len, bool check_null) {
+    const char* null_col_val, int len, bool check_null, bool strict_mode) {
   if (slot_desc->type().type == TYPE_CHAR) {
     LOG(INFO) << "Char isn't supported for CodegenWriteSlot";
     return NULL;
@@ -226,15 +237,23 @@ Function* TextConverter::CodegenWriteSlot(LlvmCodeGen* codegen,
         &parse_success_block, &parse_failed_block);
     LlvmCodeGen::NamedVariable parse_result("parse_result", codegen->GetType(TYPE_INT));
     Value* parse_result_ptr = codegen->CreateEntryBlockAlloca(fn, parse_result);
-    Value* failed_value = codegen->GetIntConstant(TYPE_INT, StringParser::PARSE_FAILURE);
 
     // Call Impala's StringTo* function
     Value* result = builder.CreateCall(parse_fn,
         ArrayRef<Value*>({args[1], args[2], parse_result_ptr}));
     Value* parse_result_val = builder.CreateLoad(parse_result_ptr, "parse_result");
+    Value* failed_value = codegen->GetIntConstant(TYPE_INT, StringParser::PARSE_FAILURE);
 
-    // Check for parse error.  TODO: handle overflow
+    // Check for parse error.
     Value* parse_failed = builder.CreateICmpEQ(parse_result_val, failed_value, "failed");
+    if (strict_mode) {
+      // In strict_mode, also check if parse_result is PARSE_OVERFLOW.
+      Value* overflow_value = codegen->GetIntConstant(TYPE_INT,
+          StringParser::PARSE_OVERFLOW);
+      Value* parse_overflow = builder.CreateICmpEQ(parse_result_val, overflow_value,
+          "overflowed");
+      parse_failed = builder.CreateOr(parse_failed, parse_overflow, "failed_or");
+    }
     builder.CreateCondBr(parse_failed, parse_failed_block, parse_success_block);
 
     // Parse succeeded

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f413e236/be/src/exec/text-converter.h
----------------------------------------------------------------------
diff --git a/be/src/exec/text-converter.h b/be/src/exec/text-converter.h
index 3651146..9f533d5 100644
--- a/be/src/exec/text-converter.h
+++ b/be/src/exec/text-converter.h
@@ -42,8 +42,10 @@ class TextConverter {
   /// null_col_val: Special string to indicate NULL column values.
   /// check_null: If set, then the WriteSlot() functions set the target slot to NULL
   /// if their input string matches null_vol_val.
+  /// strict_mode: If set, numerical overflow/underflow are considered to be parse
+  /// errors.
   TextConverter(char escape_char, const std::string& null_col_val,
-      bool check_null = true);
+      bool check_null = true, bool strict_mode = false);
 
   /// Converts slot data, of length 'len',  into type of slot_desc,
   /// and writes the result into the tuples's slot.
@@ -74,9 +76,11 @@ class TextConverter {
   /// if its input string matches null_vol_val.
   /// The codegenerated function does not support escape characters and should not
   /// be used for partitions that contain escapes.
+  /// strict_mode: If set, numerical overflow/underflow are considered to be parse
+  /// errors.
   static llvm::Function* CodegenWriteSlot(LlvmCodeGen* codegen,
       TupleDescriptor* tuple_desc, SlotDescriptor* slot_desc,
-      const char* null_col_val, int len, bool check_null);
+      const char* null_col_val, int len, bool check_null, bool strict_mode = false);
 
  private:
   char escape_char_;
@@ -84,6 +88,9 @@ class TextConverter {
   std::string null_col_val_;
   /// Indicates whether we should check for null_col_val_ and set slots to NULL.
   bool check_null_;
+  /// Indicates whether numerical overflow/underflow are considered to be parse
+  /// errors.
+  bool strict_mode_;
 };
 
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f413e236/be/src/exec/text-converter.inline.h
----------------------------------------------------------------------
diff --git a/be/src/exec/text-converter.inline.h b/be/src/exec/text-converter.inline.h
index d50b66f..052f1b3 100644
--- a/be/src/exec/text-converter.inline.h
+++ b/be/src/exec/text-converter.inline.h
@@ -167,10 +167,12 @@ inline bool TextConverter::WriteSlot(const SlotDescriptor* slot_desc, Tuple* tup
       break;
   }
 
-  // TODO: add warning for overflow case
-  if (parse_result == StringParser::PARSE_FAILURE) {
-    tuple->SetNull(slot_desc->null_indicator_offset());
-    return false;
+  if (UNLIKELY(parse_result != StringParser::PARSE_SUCCESS)) {
+    if (parse_result == StringParser::PARSE_FAILURE ||
+        (strict_mode_ && parse_result == StringParser::PARSE_OVERFLOW)) {
+      tuple->SetNull(slot_desc->null_indicator_offset());
+      return false;
+    }
   }
 
   return true;

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f413e236/be/src/runtime/runtime-state.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/runtime-state.h b/be/src/runtime/runtime-state.h
index f7c0248..860692f 100644
--- a/be/src/runtime/runtime-state.h
+++ b/be/src/runtime/runtime-state.h
@@ -100,6 +100,9 @@ class RuntimeState {
   bool abort_on_error() const {
     return query_ctx().request.query_options.abort_on_error;
   }
+  bool strict_mode() const {
+    return query_ctx().request.query_options.strict_mode;
+  }
   bool abort_on_default_limit_exceeded() const {
     return query_ctx().request.query_options.abort_on_default_limit_exceeded;
   }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f413e236/be/src/service/query-options.cc
----------------------------------------------------------------------
diff --git a/be/src/service/query-options.cc b/be/src/service/query-options.cc
index 83e3548..b82b5c3 100644
--- a/be/src/service/query-options.cc
+++ b/be/src/service/query-options.cc
@@ -401,6 +401,11 @@ Status impala::SetQueryOption(const string& key, const string& value,
         }
         break;
       }
+      case TImpalaQueryOptions::STRICT_MODE: {
+        query_options->__set_strict_mode(
+            iequals(value, "true") || iequals(value, "1"));
+        break;
+      }
       default:
         // We hit this DCHECK(false) if we forgot to add the corresponding entry here
         // when we add a new query option.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f413e236/be/src/service/query-options.h
----------------------------------------------------------------------
diff --git a/be/src/service/query-options.h b/be/src/service/query-options.h
index c86bc6b..4104f2b 100644
--- a/be/src/service/query-options.h
+++ b/be/src/service/query-options.h
@@ -32,7 +32,7 @@ class TQueryOptions;
 // the DCHECK.
 #define QUERY_OPTS_TABLE\
   DCHECK_EQ(_TImpalaQueryOptions_VALUES_TO_NAMES.size(),\
-      TImpalaQueryOptions::PREFETCH_MODE + 1);\
+      TImpalaQueryOptions::STRICT_MODE + 1);\
   QUERY_OPT_FN(abort_on_default_limit_exceeded, ABORT_ON_DEFAULT_LIMIT_EXCEEDED)\
   QUERY_OPT_FN(abort_on_error, ABORT_ON_ERROR)\
   QUERY_OPT_FN(allow_unsupported_formats, ALLOW_UNSUPPORTED_FORMATS)\
@@ -79,7 +79,9 @@ class TQueryOptions;
   QUERY_OPT_FN(s3_skip_insert_staging, S3_SKIP_INSERT_STAGING)\
   QUERY_OPT_FN(runtime_filter_min_size, RUNTIME_FILTER_MIN_SIZE)\
   QUERY_OPT_FN(runtime_filter_max_size, RUNTIME_FILTER_MAX_SIZE)\
-  QUERY_OPT_FN(prefetch_mode, PREFETCH_MODE);
+  QUERY_OPT_FN(prefetch_mode, PREFETCH_MODE)\
+  QUERY_OPT_FN(strict_mode, STRICT_MODE);
+
 
 /// Converts a TQueryOptions struct into a map of key, value pairs.
 void TQueryOptionsToMap(const TQueryOptions& query_options,

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f413e236/common/thrift/ImpalaInternalService.thrift
----------------------------------------------------------------------
diff --git a/common/thrift/ImpalaInternalService.thrift b/common/thrift/ImpalaInternalService.thrift
index ada1a20..55dd838 100644
--- a/common/thrift/ImpalaInternalService.thrift
+++ b/common/thrift/ImpalaInternalService.thrift
@@ -196,6 +196,9 @@ struct TQueryOptions {
 
   // Prefetching behavior during hash tables' building and probing.
   48: optional Types.TPrefetchMode prefetch_mode = Types.TPrefetchMode.HT_BUCKET
+
+  // Additional strict handling of invalid data parsing and type conversions.
+  49: optional bool strict_mode = false
 }
 
 // Impala currently has two types of sessions: Beeswax and HiveServer2

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f413e236/common/thrift/ImpalaService.thrift
----------------------------------------------------------------------
diff --git a/common/thrift/ImpalaService.thrift b/common/thrift/ImpalaService.thrift
index 29ae4cc..0777c32 100644
--- a/common/thrift/ImpalaService.thrift
+++ b/common/thrift/ImpalaService.thrift
@@ -222,7 +222,10 @@ enum TImpalaQueryOptions {
   RUNTIME_FILTER_MIN_SIZE,
 
   // Prefetching behavior during hash tables' building and probing.
-  PREFETCH_MODE
+  PREFETCH_MODE,
+
+  // Additional strict handling of invalid data parsing and type conversions.
+  STRICT_MODE
 }
 
 // The summary of an insert.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f413e236/testdata/datasets/functional/functional_schema_template.sql
----------------------------------------------------------------------
diff --git a/testdata/datasets/functional/functional_schema_template.sql b/testdata/datasets/functional/functional_schema_template.sql
index 4a54d5f..894fa8b 100644
--- a/testdata/datasets/functional/functional_schema_template.sql
+++ b/testdata/datasets/functional/functional_schema_template.sql
@@ -927,8 +927,6 @@ float_col float
 double_col double
 ---- ROW_FORMAT
 delimited fields terminated by ','  escaped by '\\'
----- DEPENDENT_LOAD
-INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT * FROM {db_name}.{table_name};
 ---- LOAD
 LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/overflow.txt' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
 ====

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f413e236/testdata/datasets/functional/schema_constraints.csv
----------------------------------------------------------------------
diff --git a/testdata/datasets/functional/schema_constraints.csv b/testdata/datasets/functional/schema_constraints.csv
index fb787ac..d95f3c4 100644
--- a/testdata/datasets/functional/schema_constraints.csv
+++ b/testdata/datasets/functional/schema_constraints.csv
@@ -102,9 +102,10 @@ table_name:tblwithraggedcolumns, constraint:exclude, table_format:hbase/none/non
 table_name:greptiny, constraint:exclude, table_format:hbase/none/none
 table_name:tinyinttable, constraint:exclude, table_format:hbase/none/none
 
-# overflow has a bigint that's too big. hbase may lose precision, hence this
-# table cannot be loaded.
-table_name:overflow, constraint:exclude, table_format:hbase/none/none
+# overflow uses a manually constructed text file which doesn't make sense to write to
+# other table formats since the values that would be written are different (e.g. already
+# truncated.)
+table_name:overflow, constraint:restrict_to, table_format:text/none/none
 
 # widerow has a single column with a single row containing a 10MB string. hbase doesn't
 # seem to like this.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f413e236/testdata/workloads/functional-query/queries/QueryTest/show.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/show.test b/testdata/workloads/functional-query/queries/QueryTest/show.test
index 6579f9b..af54c74 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/show.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/show.test
@@ -77,7 +77,6 @@ show tables '*'
 'nullinsert'
 'nullinsert_alt'
 'nulltable'
-'overflow'
 'rankingssmall'
 'stringpartitionkey'
 'tblwithraggedcolumns'

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f413e236/testdata/workloads/functional-query/queries/QueryTest/strict-mode-abort.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/strict-mode-abort.test b/testdata/workloads/functional-query/queries/QueryTest/strict-mode-abort.test
new file mode 100644
index 0000000..8dccc07
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/QueryTest/strict-mode-abort.test
@@ -0,0 +1,31 @@
+====
+---- QUERY
+select tinyint_col from overflow
+---- CATCH
+Error converting column: 0 TO TINYINT (Data is: 1000)
+====
+---- QUERY
+select smallint_col from overflow
+---- CATCH
+Error converting column: 1 TO SMALLINT (Data is: 100000)
+====
+---- QUERY
+select int_col from overflow
+---- CATCH
+Error converting column: 2 TO INT (Data is: 10000000000000000)
+====
+---- QUERY
+select bigint_col from overflow
+---- CATCH
+Error converting column: 3 TO BIGINT (Data is: 10000000000000000000)
+====
+---- QUERY
+select float_col from overflow
+---- CATCH
+Error converting column: 4 TO FLOAT (Data is: 1e1000000)
+====
+---- QUERY
+select double_col from overflow
+---- CATCH
+Error converting column: 5 TO DOUBLE (Data is: 1e10000)
+====

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f413e236/testdata/workloads/functional-query/queries/QueryTest/strict-mode.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/strict-mode.test b/testdata/workloads/functional-query/queries/QueryTest/strict-mode.test
new file mode 100644
index 0000000..2d85a74
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/QueryTest/strict-mode.test
@@ -0,0 +1,28 @@
+====
+---- QUERY
+select * from overflow
+---- ERRORS
+Error converting column: 0 TO TINYINT (Data is: 1000)
+Error converting column: 1 TO SMALLINT (Data is: 100000)
+Error converting column: 2 TO INT (Data is: 10000000000000000)
+Error converting column: 3 TO BIGINT (Data is: 10000000000000000000)
+Error converting column: 4 TO FLOAT (Data is: 1e1000000)
+Error converting column: 5 TO DOUBLE (Data is: 1e10000)
+file: hdfs://regex:.$
+record: 1000,100000,10000000000000000,10000000000000000000,1e1000000,1e10000
+Error converting column: 0 TO TINYINT (Data is: -1000)
+Error converting column: 1 TO SMALLINT (Data is: -100000)
+Error converting column: 2 TO INT (Data is: -10000000000000000)
+Error converting column: 3 TO BIGINT (Data is: -10000000000000000000)
+Error converting column: 4 TO FLOAT (Data is: -1e1000000)
+Error converting column: 5 TO DOUBLE (Data is: -1e10000)
+file: hdfs://regex:.$
+record: -1000,-100000,-10000000000000000,-10000000000000000000,-1e1000000,-1e10000
+
+---- RESULTS
+1,2,3,4,5.5,6.6
+NULL,NULL,NULL,NULL,NULL,NULL
+NULL,NULL,NULL,NULL,NULL,NULL
+---- TYPES
+tinyint, smallint, int, bigint, float, double
+====

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f413e236/tests/query_test/test_queries.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_queries.py b/tests/query_test/test_queries.py
index 71fd21c..f69c9b7 100644
--- a/tests/query_test/test_queries.py
+++ b/tests/query_test/test_queries.py
@@ -134,6 +134,14 @@ class TestQueriesTextTables(ImpalaTestSuite):
   def test_overflow(self, vector):
     self.run_test_case('QueryTest/overflow', vector)
 
+  def test_strict_mode(self, vector):
+    vector.get_value('exec_option')['strict_mode'] = 1
+    vector.get_value('exec_option')['abort_on_error'] = 0
+    self.run_test_case('QueryTest/strict-mode', vector)
+
+    vector.get_value('exec_option')['abort_on_error'] = 1
+    self.run_test_case('QueryTest/strict-mode-abort', vector)
+
   def test_data_source_tables(self, vector):
     self.run_test_case('QueryTest/data-source-tables', vector)