You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2018/01/04 06:48:11 UTC
[2/3] impala git commit: IMPALA-6295: Fix mix/max handling of 'nan' and 'inf'

IMPALA-6295: Fix mix/max handling of 'nan' and 'inf'

This patch fixes several issues related to the min/max aggregate
functions and their handling of 'nan' and 'inf':
- Previously, if 'inf' or '-inf' was the only value for the min/max
  and codegen was being used, the result would be incorrect. This
  occurred, for example in the case of 'inf' and 'min', because we
  set an initial value of numeric_limits::max, which is less than
  'inf', so the returned min was numeric_limits::max when it should be
  'inf'. The fix is to set the initial value to
  numeric_limits::infinity.
- Previously, if one of the values was 'nan', the result of min/max
  was non-deterministic depending on the order the values were
  evaluated in. This occurs because 'nan' < or > 'any value' is always
  false, so if the first value added was 'nan', all other comparisons
  would be false and 'nan' would be returned, whereas if the first
  value wasn't 'nan' then the 'nan' wouldn't be returned. The fix is
  to treat 'nan' specially and to always return 'nan' if there is a
  single 'nan' value.

Testing:
- Added e2e tests for both scenarios, as well as adding a little extra
  nan/inf coverage for other aggregate functions.

Change-Id: Ia1e206105937ce5afc75ca5044597d39b3dc6a81
Reviewed-on: http://gerrit.cloudera.org:8080/8854
Reviewed-by: Bikramjeet Vig <bi...@cloudera.com>
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/96b976af
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/96b976af
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/96b976af

Branch: refs/heads/master
Commit: 96b976aff38de29619ca97dabc47566382e90bf8
Parents: 7810d1f
Author: Thomas Tauber-Marshall <tm...@cloudera.com>
Authored: Thu Dec 14 16:40:55 2017 -0800
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Thu Jan 4 01:23:43 2018 +0000

----------------------------------------------------------------------
 be/src/codegen/llvm-codegen.cc                  | 43 +++++++++-----------
 be/src/codegen/llvm-codegen.h                   |  6 ++-
 be/src/exec/partitioned-aggregation-node.cc     |  9 ++--
 be/src/exprs/aggregate-functions-ir.cc          | 27 ++++++++++++
 be/src/exprs/expr-value.h                       |  8 ++--
 .../queries/QueryTest/aggregation.test          | 39 ++++++++++++++++++
 6 files changed, 97 insertions(+), 35 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/96b976af/be/src/codegen/llvm-codegen.cc
----------------------------------------------------------------------
diff --git a/be/src/codegen/llvm-codegen.cc b/be/src/codegen/llvm-codegen.cc
index 600d709..e1a606c 100644
--- a/be/src/codegen/llvm-codegen.cc
+++ b/be/src/codegen/llvm-codegen.cc
@@ -1334,14 +1334,9 @@ Status LlvmCodeGen::GetSymbols(const string& file, const string& module_id,
 // ret_v2:                                           ; preds = %entry
 //   ret i32 %v2
 // }
-llvm::Function* LlvmCodeGen::CodegenMinMax(const ColumnType& type, bool min) {
-  LlvmCodeGen::FnPrototype prototype(this, min ? "Min" : "Max", GetType(type));
-  prototype.AddArgument(LlvmCodeGen::NamedVariable("v1", GetType(type)));
-  prototype.AddArgument(LlvmCodeGen::NamedVariable("v2", GetType(type)));
-
-  llvm::Value* params[2];
-  LlvmBuilder builder(context());
-  llvm::Function* fn = prototype.GeneratePrototype(&builder, &params[0]);
+void LlvmCodeGen::CodegenMinMax(LlvmBuilder* builder, const ColumnType& type,
+    llvm::Value* src, llvm::Value* dst_slot_ptr, bool min, llvm::Function* fn) {
+  llvm::Value* dst = builder->CreateLoad(dst_slot_ptr, "dst_val");
 
   llvm::Value* compare = NULL;
   switch (type.type) {
@@ -1351,10 +1346,10 @@ llvm::Function* LlvmCodeGen::CodegenMinMax(const ColumnType& type, bool min) {
     case TYPE_BOOLEAN:
       if (min) {
         // For min, return x && y
-        compare = builder.CreateAnd(params[0], params[1]);
+        compare = builder->CreateAnd(src, dst);
       } else {
         // For max, return x || y
-        compare = builder.CreateOr(params[0], params[1]);
+        compare = builder->CreateOr(src, dst);
       }
       break;
     case TYPE_TINYINT:
@@ -1363,38 +1358,40 @@ llvm::Function* LlvmCodeGen::CodegenMinMax(const ColumnType& type, bool min) {
     case TYPE_BIGINT:
     case TYPE_DECIMAL:
       if (min) {
-        compare = builder.CreateICmpSLT(params[0], params[1]);
+        compare = builder->CreateICmpSLT(src, dst);
       } else {
-        compare = builder.CreateICmpSGT(params[0], params[1]);
+        compare = builder->CreateICmpSGT(src, dst);
       }
       break;
     case TYPE_FLOAT:
     case TYPE_DOUBLE:
       if (min) {
-        compare = builder.CreateFCmpULT(params[0], params[1]);
+        // OLT is true if 'src' < 'dst' and neither 'src' nor 'dst' is 'nan'.
+        compare = builder->CreateFCmpOLT(src, dst);
       } else {
-        compare = builder.CreateFCmpUGT(params[0], params[1]);
+        // OGT is true if 'src' > 'dst' and neither 'src' nor 'dst' is 'nan'.
+        compare = builder->CreateFCmpOGT(src, dst);
       }
+      // UNE is true if the operands are not equal or if either operand is a 'nan'. Since
+      // we're comparing 'src' to itself, the UNE will only be true if 'src' is 'nan'.
+      compare = builder->CreateOr(compare, builder->CreateFCmpUNE(src, src));
       break;
     default:
       DCHECK(false);
   }
 
   if (type.type == TYPE_BOOLEAN) {
-    builder.CreateRet(compare);
+    builder->CreateStore(compare, dst_slot_ptr);
   } else {
     llvm::BasicBlock *ret_v1, *ret_v2;
     CreateIfElseBlocks(fn, "ret_v1", "ret_v2", &ret_v1, &ret_v2);
 
-    builder.CreateCondBr(compare, ret_v1, ret_v2);
-    builder.SetInsertPoint(ret_v1);
-    builder.CreateRet(params[0]);
-    builder.SetInsertPoint(ret_v2);
-    builder.CreateRet(params[1]);
+    builder->CreateCondBr(compare, ret_v1, ret_v2);
+    builder->SetInsertPoint(ret_v1);
+    builder->CreateStore(src, dst_slot_ptr);
+    builder->CreateBr(ret_v2);
+    builder->SetInsertPoint(ret_v2);
   }
-
-  fn = FinalizeFunction(fn);
-  return fn;
 }
 
 // Intrinsics are loaded one by one.  Some are overloaded (e.g. memcpy) and the types must

http://git-wip-us.apache.org/repos/asf/impala/blob/96b976af/be/src/codegen/llvm-codegen.h
----------------------------------------------------------------------
diff --git a/be/src/codegen/llvm-codegen.h b/be/src/codegen/llvm-codegen.h
index da5d965..33fcdba 100644
--- a/be/src/codegen/llvm-codegen.h
+++ b/be/src/codegen/llvm-codegen.h
@@ -508,8 +508,10 @@ class LlvmCodeGen {
   static Status GetSymbols(const string& file, const string& module_id,
       boost::unordered_set<std::string>* symbols);
 
-  /// Generates function to return min/max(v1, v2)
-  llvm::Function* CodegenMinMax(const ColumnType& type, bool min);
+  /// Codegen at the current builder location in function 'fn' to store the
+  /// max/min('src', value in 'dst_slot_ptr') in 'dst_slot_ptr'
+  void CodegenMinMax(LlvmBuilder* builder, const ColumnType& type,
+      llvm::Value* dst_slot_ptr, llvm::Value* src, bool min, llvm::Function* fn);
 
   /// Codegen to call llvm memcpy intrinsic at the current builder location
   /// dst & src must be pointer types. size is the number of bytes to copy.

http://git-wip-us.apache.org/repos/asf/impala/blob/96b976af/be/src/exec/partitioned-aggregation-node.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-aggregation-node.cc b/be/src/exec/partitioned-aggregation-node.cc
index 623fe7f..c90613b 100644
--- a/be/src/exec/partitioned-aggregation-node.cc
+++ b/be/src/exec/partitioned-aggregation-node.cc
@@ -1554,12 +1554,9 @@ Status PartitionedAggregationNode::CodegenUpdateSlot(LlvmCodeGen* codegen, int a
   } else if ((agg_op == AggFn::MIN || agg_op == AggFn::MAX) && dst_is_numeric_or_bool) {
     bool is_min = agg_op == AggFn::MIN;
     src.CodegenBranchIfNull(&builder, ret_block);
-    llvm::Function* min_max_fn = codegen->CodegenMinMax(slot_desc->type(), is_min);
-    llvm::Value* dst_value = builder.CreateLoad(dst_slot_ptr, "dst_val");
-    llvm::Value* min_max_args[] = {dst_value, src.GetVal()};
-    llvm::Value* result =
-        builder.CreateCall(min_max_fn, min_max_args, is_min ? "min_value" : "max_value");
-    builder.CreateStore(result, dst_slot_ptr);
+    codegen->CodegenMinMax(
+        &builder, slot_desc->type(), src.GetVal(), dst_slot_ptr, is_min, *fn);
+
     // Dst may have been NULL, make sure to unset the NULL bit.
     DCHECK(slot_desc->is_nullable());
     slot_desc->CodegenSetNullIndicator(

http://git-wip-us.apache.org/repos/asf/impala/blob/96b976af/be/src/exprs/aggregate-functions-ir.cc
----------------------------------------------------------------------
diff --git a/be/src/exprs/aggregate-functions-ir.cc b/be/src/exprs/aggregate-functions-ir.cc
index 40c0c16..a8fbe57 100644
--- a/be/src/exprs/aggregate-functions-ir.cc
+++ b/be/src/exprs/aggregate-functions-ir.cc
@@ -602,6 +602,33 @@ void AggregateFunctions::InitNullString(FunctionContext* c, StringVal* dst) {
   dst->len = 0;
 }
 
+// For DoubleVal and FloatVal, we have to handle NaN specially.  If 'val' != 'val', then
+// 'val' must be NaN, and if any of the values that are inserted are NaN, then we return
+// NaN. So, if 'src.val != src.val', set 'dst' to it.
+template <>
+void AggregateFunctions::Min(FunctionContext*, const FloatVal& src, FloatVal* dst) {
+  if (src.is_null) return;
+  if (dst->is_null || src.val < dst->val || src.val != src.val) *dst = src;
+}
+
+template <>
+void AggregateFunctions::Max(FunctionContext*, const FloatVal& src, FloatVal* dst) {
+  if (src.is_null) return;
+  if (dst->is_null || src.val > dst->val || src.val != src.val) *dst = src;
+}
+
+template <>
+void AggregateFunctions::Min(FunctionContext*, const DoubleVal& src, DoubleVal* dst) {
+  if (src.is_null) return;
+  if (dst->is_null || src.val < dst->val || src.val != src.val) *dst = src;
+}
+
+template <>
+void AggregateFunctions::Max(FunctionContext*, const DoubleVal& src, DoubleVal* dst) {
+  if (src.is_null) return;
+  if (dst->is_null || src.val > dst->val || src.val != src.val) *dst = src;
+}
+
 template<>
 void AggregateFunctions::Min(FunctionContext* ctx, const StringVal& src, StringVal* dst) {
   if (src.is_null) return;

http://git-wip-us.apache.org/repos/asf/impala/blob/96b976af/be/src/exprs/expr-value.h
----------------------------------------------------------------------
diff --git a/be/src/exprs/expr-value.h b/be/src/exprs/expr-value.h
index d53f6c4..e47438f 100644
--- a/be/src/exprs/expr-value.h
+++ b/be/src/exprs/expr-value.h
@@ -139,10 +139,10 @@ struct ExprValue {
       case TYPE_FLOAT:
         // For floats and doubles, numeric_limits::min() is the smallest positive
         // representable value.
-        float_val = -std::numeric_limits<float>::max();
+        float_val = -std::numeric_limits<float>::infinity();
         return &float_val;
       case TYPE_DOUBLE:
-        double_val = -std::numeric_limits<double>::max();
+        double_val = -std::numeric_limits<double>::infinity();
         return &double_val;
       default:
         DCHECK(false);
@@ -183,10 +183,10 @@ struct ExprValue {
             return &decimal16_val;
         }
       case TYPE_FLOAT:
-        float_val = std::numeric_limits<float>::max();
+        float_val = std::numeric_limits<float>::infinity();
         return &float_val;
       case TYPE_DOUBLE:
-        double_val = std::numeric_limits<double>::max();
+        double_val = std::numeric_limits<double>::infinity();
         return &double_val;
       default:
         DCHECK(false);

http://git-wip-us.apache.org/repos/asf/impala/blob/96b976af/testdata/workloads/functional-query/queries/QueryTest/aggregation.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/aggregation.test b/testdata/workloads/functional-query/queries/QueryTest/aggregation.test
index 3cc2f20..7bb510b 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/aggregation.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/aggregation.test
@@ -1331,3 +1331,42 @@ select count(*) from (
 ---- TYPES
 bigint
 ====
+---- QUERY
+# IMPALA-6295: min/max where nan/inf/-inf are the only values
+with x as (select cast('nan' as float) a, cast('inf' as float) b, cast('-inf' as float) c)
+select min(a), min(b), min(c), max(a), max(b), max(c) from x
+---- RESULTS
+NaN,Infinity,-Infinity,NaN,Infinity,-Infinity
+---- TYPES
+FLOAT,FLOAT,FLOAT,FLOAT,FLOAT,FLOAT
+====
+---- QUERY
+# IMPALA-6295: min/max/sum/avg should return nan if any of the values is nan
+with x as (values (0), (1), (cast('nan' as double)), (cast('inf' as double)),
+  (cast('-inf' as double)))
+select min(`0`), max(`0`), sum(`0`), avg(`0`) from x
+---- RESULTS
+NaN,NaN,NaN,NaN
+---- TYPES
+DOUBLE,DOUBLE,DOUBLE,DOUBLE
+====
+---- QUERY
+# Test behavior of inf
+with x as (values (0), (cast('inf' as double)), (5.2))
+select min(`0`), max(`0`), sum(`0`), avg(`0`) from x
+---- RESULTS
+0,Infinity,Infinity,Infinity
+---- TYPES
+DOUBLE,DOUBLE,DOUBLE,DOUBLE
+====
+---- QUERY
+# Test behavior of -inf
+with x as (values (cast('-inf' as double)), (0), (-10))
+select min(`cast('-inf' as double)`), max(`cast('-inf' as double)`),
+  sum(`cast('-inf' as double)`), avg(`cast('-inf' as double)`)
+from x
+---- RESULTS
+-Infinity,0,-Infinity,-Infinity
+---- TYPES
+DOUBLE,DOUBLE,DOUBLE,DOUBLE
+====