You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by mo...@apache.org on 2023/01/31 04:16:49 UTC

[doris] branch branch-hive-test updated (39aef0c1a7 -> 18fe70ee74)

This is an automated email from the ASF dual-hosted git repository.

morningman pushed a change to branch branch-hive-test
in repository https://gitbox.apache.org/repos/asf/doris.git


    from 39aef0c1a7 [tmpfix] add table name to file scan node explain string
     new c5c9fa54dd [Refactor](function) opt the exec of function with null column
     new 18fe70ee74 [fix](multi catalog)Collect decimal and date type min max statistic value (#16262)

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 be/src/vec/exprs/vectorized_fn_call.cpp            |   2 +
 be/src/vec/functions/function.cpp                  |  14 +--
 be/src/vec/functions/function_cast.h               |  14 +--
 be/src/vec/functions/function_helpers.cpp          | 118 +++++++++------------
 be/src/vec/functions/function_helpers.h            |  26 ++---
 .../doris/catalog/external/HMSExternalTable.java   |   1 +
 .../apache/doris/statistics/HiveAnalysisTask.java  |  45 ++++++--
 7 files changed, 116 insertions(+), 104 deletions(-)


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org


[doris] 02/02: [fix](multi catalog)Collect decimal and date type min max statistic value (#16262)

Posted by mo...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch branch-hive-test
in repository https://gitbox.apache.org/repos/asf/doris.git

commit 18fe70ee74648375d9242358315948c066ff4b8d
Author: Jibing-Li <64...@users.noreply.github.com>
AuthorDate: Tue Jan 31 11:58:56 2023 +0800

    [fix](multi catalog)Collect decimal and date type min max statistic value (#16262)
    
    The min and max value of decimal and date columns in hive external table are incorrect,
    this pr is to parse the min max value in HMS correctly.
---
 .../doris/catalog/external/HMSExternalTable.java   |  1 +
 .../apache/doris/statistics/HiveAnalysisTask.java  | 45 +++++++++++++++++-----
 2 files changed, 36 insertions(+), 10 deletions(-)

diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java
index a0b9535f89..638c1642ec 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java
@@ -255,6 +255,7 @@ public class HMSExternalTable extends ExternalTable {
      * get the dla type for scan node to get right information.
      */
     public DLAType getDlaType() {
+        makeSureInitialized();
         return dlaType;
     }
 
diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/HiveAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/HiveAnalysisTask.java
index 836e3c6ae7..d22e2abe78 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/HiveAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/HiveAnalysisTask.java
@@ -28,6 +28,7 @@ import org.apache.commons.text.StringSubstitutor;
 import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
 import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
 import org.apache.hadoop.hive.metastore.api.DateColumnStatsData;
+import org.apache.hadoop.hive.metastore.api.Decimal;
 import org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData;
 import org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData;
 import org.apache.hadoop.hive.metastore.api.LongColumnStatsData;
@@ -36,7 +37,10 @@ import org.apache.hadoop.hive.metastore.api.StringColumnStatsData;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 
+import java.math.BigDecimal;
+import java.math.BigInteger;
 import java.text.SimpleDateFormat;
+import java.time.LocalDate;
 import java.util.ArrayList;
 import java.util.Date;
 import java.util.HashMap;
@@ -139,8 +143,8 @@ public class HiveAnalysisTask extends HMSAnalysisTask {
     private void getStatData(ColumnStatisticsData data, Map<String, String> params) {
         long ndv = 0;
         long nulls = 0;
-        String min;
-        String max;
+        String min = "";
+        String max = "";
         // Collect ndv, nulls, min and max for different data type.
         if (data.isSetLongStats()) {
             LongColumnStatsData longStats = data.getLongStats();
@@ -152,15 +156,25 @@ public class HiveAnalysisTask extends HMSAnalysisTask {
             StringColumnStatsData stringStats = data.getStringStats();
             ndv = stringStats.getNumDVs();
             nulls = stringStats.getNumNulls();
-            min = "No value";
-            max = String.valueOf(stringStats.getMaxColLen());
         } else if (data.isSetDecimalStats()) {
-            // TODO: Need a more accurate way to collect decimal values.
             DecimalColumnStatsData decimalStats = data.getDecimalStats();
             ndv = decimalStats.getNumDVs();
             nulls = decimalStats.getNumNulls();
-            min = decimalStats.getLowValue().toString();
-            max = decimalStats.getHighValue().toString();
+            if (decimalStats.isSetLowValue()) {
+                Decimal lowValue = decimalStats.getLowValue();
+                if (lowValue != null) {
+                    BigDecimal lowDecimal = new BigDecimal(new BigInteger(lowValue.getUnscaled()), lowValue.getScale());
+                    min = lowDecimal.toString();
+                }
+            }
+            if (decimalStats.isSetHighValue()) {
+                Decimal highValue = decimalStats.getHighValue();
+                if (highValue != null) {
+                    BigDecimal highDecimal = new BigDecimal(
+                            new BigInteger(highValue.getUnscaled()), highValue.getScale());
+                    max = highDecimal.toString();
+                }
+            }
         } else if (data.isSetDoubleStats()) {
             DoubleColumnStatsData doubleStats = data.getDoubleStats();
             ndv = doubleStats.getNumDVs();
@@ -168,12 +182,23 @@ public class HiveAnalysisTask extends HMSAnalysisTask {
             min = String.valueOf(doubleStats.getLowValue());
             max = String.valueOf(doubleStats.getHighValue());
         } else if (data.isSetDateStats()) {
-            // TODO: Need a more accurate way to collect date values.
             DateColumnStatsData dateStats = data.getDateStats();
             ndv = dateStats.getNumDVs();
             nulls = dateStats.getNumNulls();
-            min = dateStats.getLowValue().toString();
-            max = dateStats.getHighValue().toString();
+            if (dateStats.isSetLowValue()) {
+                org.apache.hadoop.hive.metastore.api.Date lowValue = dateStats.getLowValue();
+                if (lowValue != null) {
+                    LocalDate lowDate = LocalDate.ofEpochDay(lowValue.getDaysSinceEpoch());
+                    min = lowDate.toString();
+                }
+            }
+            if (dateStats.isSetHighValue()) {
+                org.apache.hadoop.hive.metastore.api.Date highValue = dateStats.getHighValue();
+                if (highValue != null) {
+                    LocalDate highDate = LocalDate.ofEpochDay(highValue.getDaysSinceEpoch());
+                    max = highDate.toString();
+                }
+            }
         } else {
             throw new RuntimeException("Not supported data type.");
         }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org


[doris] 01/02: [Refactor](function) opt the exec of function with null column

Posted by mo...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch branch-hive-test
in repository https://gitbox.apache.org/repos/asf/doris.git

commit c5c9fa54dd37d77ba7489a4966e959ccaf775391
Author: HappenLee <ha...@hotmail.com>
AuthorDate: Mon Jan 30 18:04:54 2023 +0800

    [Refactor](function) opt the exec of function with null column
---
 be/src/vec/exprs/vectorized_fn_call.cpp   |   2 +
 be/src/vec/functions/function.cpp         |  14 ++--
 be/src/vec/functions/function_cast.h      |  14 ++--
 be/src/vec/functions/function_helpers.cpp | 118 +++++++++++++-----------------
 be/src/vec/functions/function_helpers.h   |  26 +++----
 5 files changed, 80 insertions(+), 94 deletions(-)

diff --git a/be/src/vec/exprs/vectorized_fn_call.cpp b/be/src/vec/exprs/vectorized_fn_call.cpp
index 8a41e332b9..641a5c3b96 100644
--- a/be/src/vec/exprs/vectorized_fn_call.cpp
+++ b/be/src/vec/exprs/vectorized_fn_call.cpp
@@ -42,6 +42,8 @@ doris::Status VectorizedFnCall::prepare(doris::RuntimeState* state,
     argument_template.reserve(_children.size());
     std::vector<std::string_view> child_expr_name;
     for (auto child : _children) {
+        // TODO: rethink we really create column here. maybe only need nullptr just to
+        // get the function
         auto column = child->data_type()->create_column();
         argument_template.emplace_back(std::move(column), child->data_type(), child->expr_name());
         child_expr_name.emplace_back(child->expr_name());
diff --git a/be/src/vec/functions/function.cpp b/be/src/vec/functions/function.cpp
index 41f3141c06..662a2a58af 100644
--- a/be/src/vec/functions/function.cpp
+++ b/be/src/vec/functions/function.cpp
@@ -217,11 +217,12 @@ Status PreparedFunctionImpl::default_implementation_for_nulls(
     }
 
     if (null_presence.has_nullable) {
-        Block temporary_block = create_block_with_nested_columns(block, args, result);
+        auto [temporary_block, new_args, new_result] =
+                create_block_with_nested_columns(block, args, result);
         RETURN_IF_ERROR(execute_without_low_cardinality_columns(
-                context, temporary_block, args, result, temporary_block.rows(), dry_run));
+                context, temporary_block, new_args, new_result, temporary_block.rows(), dry_run));
         block.get_by_position(result).column =
-                wrap_in_nullable(temporary_block.get_by_position(result).column, block, args,
+                wrap_in_nullable(temporary_block.get_by_position(new_result).column, block, args,
                                  result, input_rows_count);
         *executed = true;
         return Status::OK();
@@ -295,10 +296,9 @@ DataTypePtr FunctionBuilderImpl::get_return_type_without_low_cardinality(
         }
         if (null_presence.has_nullable) {
             ColumnNumbers numbers(arguments.size());
-            for (size_t i = 0; i < arguments.size(); i++) {
-                numbers[i] = i;
-            }
-            Block nested_block = create_block_with_nested_columns(Block(arguments), numbers);
+            std::iota(numbers.begin(), numbers.end(), 0);
+            auto [nested_block, _] =
+                    create_block_with_nested_columns(Block(arguments), numbers, false);
             auto return_type = get_return_type_impl(
                     ColumnsWithTypeAndName(nested_block.begin(), nested_block.end()));
             return make_nullable(return_type);
diff --git a/be/src/vec/functions/function_cast.h b/be/src/vec/functions/function_cast.h
index e3baaecdd2..62b716d1a9 100644
--- a/be/src/vec/functions/function_cast.h
+++ b/be/src/vec/functions/function_cast.h
@@ -1592,9 +1592,10 @@ private:
                 Block tmp_block;
                 size_t tmp_res_index = 0;
                 if (source_is_nullable) {
-                    tmp_block = create_block_with_nested_columns_only_args(block, arguments);
-                    tmp_res_index = tmp_block.columns();
-                    tmp_block.insert({nullptr, nested_type, ""});
+                    auto [t_block, tmp_args, tmp_res] =
+                            create_block_with_nested_columns(block, arguments, result);
+                    tmp_block = std::move(t_block);
+                    tmp_res_index = tmp_res;
 
                     /// Perform the requested conversion.
                     RETURN_IF_ERROR(
@@ -1624,7 +1625,8 @@ private:
             return [wrapper, skip_not_null_check](FunctionContext* context, Block& block,
                                                   const ColumnNumbers& arguments,
                                                   const size_t result, size_t input_rows_count) {
-                Block tmp_block = create_block_with_nested_columns(block, arguments, result);
+                auto [tmp_block, tmp_args, tmp_res] =
+                        create_block_with_nested_columns(block, arguments, result);
 
                 /// Check that all values are not-NULL.
                 /// Check can be skipped in case if LowCardinality dictionary is transformed.
@@ -1640,8 +1642,8 @@ private:
                     }
                 }
 
-                RETURN_IF_ERROR(wrapper(context, tmp_block, arguments, result, input_rows_count));
-                block.get_by_position(result).column = tmp_block.get_by_position(result).column;
+                RETURN_IF_ERROR(wrapper(context, tmp_block, tmp_args, tmp_res, input_rows_count));
+                block.get_by_position(result).column = tmp_block.get_by_position(tmp_res).column;
                 return Status::OK();
             };
         } else {
diff --git a/be/src/vec/functions/function_helpers.cpp b/be/src/vec/functions/function_helpers.cpp
index fcfdd4b3a2..560f54e1a5 100644
--- a/be/src/vec/functions/function_helpers.cpp
+++ b/be/src/vec/functions/function_helpers.cpp
@@ -26,82 +26,68 @@
 
 namespace doris::vectorized {
 
-Block create_block_with_nested_columns_only_args(const Block& block, const ColumnNumbers& args) {
-    std::set<size_t> args_set(args.begin(), args.end());
+std::tuple<Block, ColumnNumbers> create_block_with_nested_columns(const Block& block,
+                                                                  const ColumnNumbers& args,
+                                                                  const bool need_check_same) {
     Block res;
-
-    for (auto i : args_set) {
-        const auto& col = block.get_by_position(i);
-
-        if (col.type->is_nullable()) {
-            const DataTypePtr& nested_type =
-                    static_cast<const DataTypeNullable&>(*col.type).get_nested_type();
-
-            if (!col.column) {
-                res.insert({nullptr, nested_type, col.name});
-            } else if (auto* nullable = check_and_get_column<ColumnNullable>(*col.column)) {
-                const auto& nested_col = nullable->get_nested_column_ptr();
-                res.insert({nested_col, nested_type, col.name});
-            } else if (auto* const_column = check_and_get_column<ColumnConst>(*col.column)) {
-                const auto& nested_col =
-                        check_and_get_column<ColumnNullable>(const_column->get_data_column())
-                                ->get_nested_column_ptr();
-                res.insert({ColumnConst::create(nested_col, col.column->size()), nested_type,
-                            col.name});
-            } else {
-                LOG(FATAL) << "Illegal column for DataTypeNullable";
+    ColumnNumbers res_args(args.size());
+
+    // only build temp block by args column, if args[i] == args[j]
+    // just keep one
+    for (size_t i = 0; i < args.size(); ++i) {
+        bool is_in_res = false;
+        size_t pre_loc = 0;
+
+        if (need_check_same) {
+            for (int j = 0; j < i; ++j) {
+                if (args[j] == args[i]) {
+                    is_in_res = true;
+                    pre_loc = j;
+                    break;
+                }
             }
-        } else {
-            res.insert(col);
         }
-    }
-
-    return res;
-}
 
-static Block create_block_with_nested_columns_impl(const Block& block,
-                                                   const std::unordered_set<size_t>& args) {
-    Block res;
-    size_t columns = block.columns();
-
-    for (size_t i = 0; i < columns; ++i) {
-        const auto& col = block.get_by_position(i);
-
-        if (args.count(i) && col.type->is_nullable()) {
-            const DataTypePtr& nested_type =
-                    static_cast<const DataTypeNullable&>(*col.type).get_nested_type();
-
-            if (!col.column) {
-                res.insert({nullptr, nested_type, col.name});
-            } else if (auto* nullable = check_and_get_column<ColumnNullable>(*col.column)) {
-                const auto& nested_col = nullable->get_nested_column_ptr();
-                res.insert({nested_col, nested_type, col.name});
-            } else if (auto* const_column = check_and_get_column<ColumnConst>(*col.column)) {
-                const auto& nested_col =
-                        check_and_get_column<ColumnNullable>(const_column->get_data_column())
-                                ->get_nested_column_ptr();
-                res.insert({ColumnConst::create(nested_col, col.column->size()), nested_type,
-                            col.name});
+        if (!is_in_res) {
+            const auto& col = block.get_by_position(i);
+            if (col.type->is_nullable()) {
+                const DataTypePtr& nested_type =
+                        static_cast<const DataTypeNullable&>(*col.type).get_nested_type();
+
+                if (!col.column) {
+                    res.insert({nullptr, nested_type, col.name});
+                } else if (auto* nullable = check_and_get_column<ColumnNullable>(*col.column)) {
+                    const auto& nested_col = nullable->get_nested_column_ptr();
+                    res.insert({nested_col, nested_type, col.name});
+                } else if (auto* const_column = check_and_get_column<ColumnConst>(*col.column)) {
+                    const auto& nested_col =
+                            check_and_get_column<ColumnNullable>(const_column->get_data_column())
+                                    ->get_nested_column_ptr();
+                    res.insert({ColumnConst::create(nested_col, col.column->size()), nested_type,
+                                col.name});
+                } else {
+                    LOG(FATAL) << "Illegal column for DataTypeNullable";
+                }
             } else {
-                LOG(FATAL) << "Illegal column for DataTypeNullable";
+                res.insert(col);
             }
-        } else
-            res.insert(col);
-    }
 
-    return res;
-}
+            res_args[i] = res.columns() - 1;
+        } else {
+            res_args[i] = pre_loc;
+        }
+    }
 
-Block create_block_with_nested_columns(const Block& block, const ColumnNumbers& args) {
-    std::unordered_set<size_t> args_set(args.begin(), args.end());
-    return create_block_with_nested_columns_impl(block, args_set);
+    return {res, res_args};
 }
 
-Block create_block_with_nested_columns(const Block& block, const ColumnNumbers& args,
-                                       size_t result) {
-    std::unordered_set<size_t> args_set(args.begin(), args.end());
-    args_set.insert(result);
-    return create_block_with_nested_columns_impl(block, args_set);
+std::tuple<Block, ColumnNumbers, size_t> create_block_with_nested_columns(const Block& block,
+                                                                          const ColumnNumbers& args,
+                                                                          size_t result) {
+    auto [res, res_args] = create_block_with_nested_columns(block, args, true);
+    // insert result column in temp block
+    res.insert(block.get_by_position(result));
+    return {res, res_args, res.columns() - 1};
 }
 
 void validate_argument_type(const IFunction& func, const DataTypes& arguments,
diff --git a/be/src/vec/functions/function_helpers.h b/be/src/vec/functions/function_helpers.h
index 777bd415dc..474ac435bf 100644
--- a/be/src/vec/functions/function_helpers.h
+++ b/be/src/vec/functions/function_helpers.h
@@ -86,21 +86,17 @@ inline std::enable_if_t<IsDecimalNumber<T>, Field> to_field(const T& x, UInt32 s
 
 Columns convert_const_tuple_to_constant_elements(const ColumnConst& column);
 
-/// Returns the copy of a given block in which each column specified in
-/// the "arguments" parameter is replaced with its respective nested
-/// column if it is nullable.
-Block create_block_with_nested_columns(const Block& block, const ColumnNumbers& args);
-
-/// Similar function as above. Additionally transform the result type if needed.
-Block create_block_with_nested_columns(const Block& block, const ColumnNumbers& args,
-                                       size_t result);
-
-/// Returns the copy of a given block in only args column specified in
-/// the "arguments" parameter is replaced with its respective nested
-/// column if it is nullable.
-/// TODO: the old funciton `create_block_with_nested_columns` have performance problem, replace all
-/// by the function and delete old one.
-Block create_block_with_nested_columns_only_args(const Block& block, const ColumnNumbers& args);
+/// Returns the copy of a tmp block and temp args order same as args
+/// in which only args column each column specified in the "arguments"
+/// parameter is replaced with its respective nested column if it is nullable.
+std::tuple<Block, ColumnNumbers> create_block_with_nested_columns(const Block& block,
+                                                                  const ColumnNumbers& args,
+                                                                  const bool need_check_same);
+
+// Same as above and return the new_res loc in tuple
+std::tuple<Block, ColumnNumbers, size_t> create_block_with_nested_columns(const Block& block,
+                                                                          const ColumnNumbers& args,
+                                                                          size_t result);
 
 /// Checks argument type at specified index with predicate.
 /// throws if there is no argument at specified index or if predicate returns false.


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org