You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by pa...@apache.org on 2023/04/21 22:12:09 UTC

[doris] branch master updated: [feature](function) Modified cast as time to behave more like MySQL (#18565)

This is an automated email from the ASF dual-hosted git repository.

panxiaolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new de0e89d1b4 [feature](function) Modified cast as time to behave more like MySQL  (#18565)
de0e89d1b4 is described below

commit de0e89d1b4bb2135aa3f20f8b530e85acbb71a89
Author: Mryange <59...@users.noreply.github.com>
AuthorDate: Sat Apr 22 06:11:59 2023 +0800

    [feature](function) Modified cast as time to behave more like MySQL  (#18565)
    
    Because the underlying type of time is float64, select cast("19:22:18" as time) previously returned a null value.
    This change makes the cast produce a time value instead; the results are shown in the regression test below.
---
 be/src/vec/core/call_on_type_index.h               |   4 +-
 be/src/vec/core/types.h                            |   3 +
 be/src/vec/data_types/data_type.cpp                |   2 +
 be/src/vec/data_types/data_type_factory.cpp        |   7 ++
 be/src/vec/data_types/data_type_time.h             |   1 +
 be/src/vec/functions/function_cast.h               | 131 +++++++++++++++++++--
 gensrc/proto/types.proto                           |   1 +
 .../data/correctness/test_cast_as_time.out         |   9 ++
 .../suites/correctness/test_cast_as_time.groovy    |  48 ++++++++
 9 files changed, 197 insertions(+), 9 deletions(-)

diff --git a/be/src/vec/core/call_on_type_index.h b/be/src/vec/core/call_on_type_index.h
index 88e8fca5bf..bc822d9902 100644
--- a/be/src/vec/core/call_on_type_index.h
+++ b/be/src/vec/core/call_on_type_index.h
@@ -23,6 +23,7 @@
 #include <utility>
 
 #include "vec/core/types.h"
+#include "vec/data_types/data_type_time.h"
 
 namespace doris::vectorized {
 
@@ -202,7 +203,8 @@ bool call_on_index_and_data_type(TypeIndex number, F&& f) {
         return f(TypePair<DataTypeNumber<Float32>, T>());
     case TypeIndex::Float64:
         return f(TypePair<DataTypeNumber<Float64>, T>());
-
+    case TypeIndex::Time:
+        return f(TypePair<DataTypeTime, T>());
     case TypeIndex::Decimal32:
         return f(TypePair<DataTypeDecimal<Decimal32>, T>());
     case TypeIndex::Decimal64:
diff --git a/be/src/vec/core/types.h b/be/src/vec/core/types.h
index c88b3f3948..fc1a4332dc 100644
--- a/be/src/vec/core/types.h
+++ b/be/src/vec/core/types.h
@@ -91,6 +91,7 @@ enum class TypeIndex {
     Struct = 40,
     VARIANT = 41,
     QuantileState = 42,
+    Time = 43
 };
 
 struct Consted {
@@ -626,6 +627,8 @@ inline const char* getTypeName(TypeIndex idx) {
         return "Struct";
     case TypeIndex::QuantileState:
         return TypeName<QuantileState<double>>::get();
+    case TypeIndex::Time:
+        return "Time";
     }
 
     __builtin_unreachable();
diff --git a/be/src/vec/data_types/data_type.cpp b/be/src/vec/data_types/data_type.cpp
index 458a204576..a5ebc78eab 100644
--- a/be/src/vec/data_types/data_type.cpp
+++ b/be/src/vec/data_types/data_type.cpp
@@ -171,6 +171,8 @@ PGenericType_TypeId IDataType::get_pdata_type(const IDataType* data_type) {
         return PGenericType::JSONB;
     case TypeIndex::Map:
         return PGenericType::MAP;
+    case TypeIndex::Time:
+        return PGenericType::TIME;
     default:
         return PGenericType::UNKNOWN;
     }
diff --git a/be/src/vec/data_types/data_type_factory.cpp b/be/src/vec/data_types/data_type_factory.cpp
index c1ca9245c7..92e1db8357 100644
--- a/be/src/vec/data_types/data_type_factory.cpp
+++ b/be/src/vec/data_types/data_type_factory.cpp
@@ -286,6 +286,9 @@ DataTypePtr DataTypeFactory::create_data_type(const TypeIndex& type_index, bool
     case TypeIndex::DateV2:
         nested = std::make_shared<vectorized::DataTypeDateV2>();
         break;
+    case TypeIndex::Time:
+        nested = std::make_shared<DataTypeTime>();
+        break;
     case TypeIndex::DateTimeV2:
         nested = std::make_shared<DataTypeDateTimeV2>();
         break;
@@ -522,6 +525,10 @@ DataTypePtr DataTypeFactory::create_data_type(const PColumnMeta& pcolumn) {
         nested = std::make_shared<DataTypeQuantileStateDouble>();
         break;
     }
+    case PGenericType::TIME: {
+        nested = std::make_shared<DataTypeTime>();
+        break;
+    }
     default: {
         LOG(FATAL) << fmt::format("Unknown data type: {}", pcolumn.type());
         return nullptr;
diff --git a/be/src/vec/data_types/data_type_time.h b/be/src/vec/data_types/data_type_time.h
index 5f508f56d4..0bff06c869 100644
--- a/be/src/vec/data_types/data_type_time.h
+++ b/be/src/vec/data_types/data_type_time.h
@@ -76,6 +76,7 @@ public:
     DataTypeSerDeSPtr get_serde() const override {
         return std::make_shared<DataTypeNumberSerDe<Float64>>();
     };
+    TypeIndex get_type_id() const override { return TypeIndex::Time; }
 };
 
 } // namespace doris::vectorized
diff --git a/be/src/vec/functions/function_cast.h b/be/src/vec/functions/function_cast.h
index 7af176395a..6f79522ab8 100644
--- a/be/src/vec/functions/function_cast.h
+++ b/be/src/vec/functions/function_cast.h
@@ -75,6 +75,7 @@
 #include "vec/data_types/data_type_number.h"
 #include "vec/data_types/data_type_string.h"
 #include "vec/data_types/data_type_struct.h"
+#include "vec/data_types/data_type_time.h"
 #include "vec/data_types/data_type_time_v2.h"
 #include "vec/functions/function.h"
 #include "vec/functions/function_helpers.h"
@@ -110,7 +111,92 @@ inline UInt32 extract_to_decimal_scale(const ColumnWithTypeAndName& named_column
     named_column.column->get(0, field);
     return field.get<UInt32>();
 }
+/** Helpers that cast a string or a number to Time; both overloads return false on rejected input.
+  * In Doris, the underlying storage type of the Time class is Float64; `x` receives total seconds.
+  */
+struct TimeCast {
+    // Cast from string: reads the `len` bytes at `s` as "hour:minute:second", "hour:minute" or a number.
+    // Returns true and stores the total seconds in `x`; returns false if a field is rejected.
+    // Examples: '300' -> 00:03:00, '20:23' -> 20:23:00, '20:23:24' -> 20:23:24
+    template <typename T>
+    static bool try_parse_time(char* s, size_t len, T& x) {
+        char* first_char = s;
+        char* end_char = s + len;
+        int hour = 0, minute = 0, second = 0; // fields missing from the input stay 0
+        auto parse_from_str_to_int = [](char* begin, size_t len, auto& num) {
+            StringParser::ParseResult parse_result = StringParser::PARSE_SUCCESS;
+            auto int_value = StringParser::string_to_unsigned_int<uint64_t>(
+                    reinterpret_cast<char*>(begin), len, &parse_result);
+            if (UNLIKELY(parse_result != StringParser::PARSE_SUCCESS)) {
+                return false;
+            }
+            num = int_value; // NOTE(review): no range check before storing into the caller's int/size_t - confirm
+            return true;
+        };
+        if (char* first_colon {nullptr};
+            (first_colon = (char*)memchr(first_char, ':', len)) != nullptr) {
+            if (char* second_colon {nullptr};
+                (second_colon = (char*)memchr(first_colon + 1, ':', end_char - first_colon - 1)) !=
+                nullptr) {
+                // Two colons found: the input has the form hour:minute:second.
+                // Parse the hour, i.e. the text before the first colon.
+                if (!parse_from_str_to_int(first_char, first_colon - first_char, hour)) {
+                    // Parsing the hour field failed.
+                    return false;
+                }
+                // Parse the minute, i.e. the text between the two colons.
+                if (!parse_from_str_to_int(first_colon + 1, second_colon - first_colon - 1,
+                                           minute)) {
+                    return false;
+                }
+                // Parse the second, i.e. the text after the second colon.
+                if (!parse_from_str_to_int(second_colon + 1, end_char - second_colon - 1, second)) {
+                    return false;
+                }
+            } else {
+                // Only one colon found: the input has the form hour:minute (second stays 0).
+                // Parse the hour, i.e. the text before the colon.
+                if (!parse_from_str_to_int(first_char, first_colon - first_char, hour)) {
+                    return false;
+                }
+                // Parse the minute, i.e. the text after the colon.
+                if (!parse_from_str_to_int(first_colon + 1, end_char - first_colon - 1, minute)) {
+                    return false;
+                }
+            }
+        } else {
+            // No colon: parse the whole input as a number and reuse the numeric overload.
+            size_t from {};
+            if (!parse_from_str_to_int(first_char, len, from)) {
+                return false;
+            }
+            return try_parse_time(from, x);
+        }
+        // Minute and second must both be < 60; the hour is not range-checked here.
+        if (minute >= 60 || second >= 60) {
+            return false;
+        }
+        x = hour * 3600 + minute * 60 + second; // total seconds
+        return true;
+    }
 
+    // Cast from number: the last two decimal digits are the second, the next two the minute, the rest the hour.
+    template <typename T, typename S>
+    static bool try_parse_time(T from, S& x) {
+        int64 seconds = from / 100; // quotient after dropping the last two digits (not a seconds count)
+        int64 hour = 0, minute = 0, second = 0;
+        second = from - 100 * seconds; // last two digits
+        from /= 100;
+        seconds = from / 100;
+        minute = from - 100 * seconds; // next two digits
+        hour = seconds; // everything above the minute digits
+        if (minute >= 60 || second >= 60) { // NOTE(review): upper bounds only; a negative `from` is not rejected here - confirm
+            return false;
+        }
+        x = hour * 3600 + minute * 60 + second; // total seconds
+        return true;
+    }
+};
 /** Conversion of number types to each other, enums to numbers, dates and datetimes to numbers and back: done by straight assignment.
   *  (Date is represented internally as number of days from some day; DateTime - as unix timestamp)
   */
@@ -275,11 +361,25 @@ struct ConvertImpl {
                     }
                 }
             } else {
-                for (size_t i = 0; i < size; ++i) {
-                    vec_to[i] = static_cast<ToFieldType>(vec_from[i]);
+                if constexpr (IsDataTypeNumber<FromDataType> &&
+                              std::is_same_v<ToDataType, DataTypeTime>) {
+                    // 300 -> 00:03:00  360 will be parse failed , so value maybe null
+                    ColumnUInt8::MutablePtr col_null_map_to;
+                    ColumnUInt8::Container* vec_null_map_to = nullptr;
+                    col_null_map_to = ColumnUInt8::create(size);
+                    vec_null_map_to = &col_null_map_to->get_data();
+                    for (size_t i = 0; i < size; ++i) {
+                        (*vec_null_map_to)[i] = !TimeCast::try_parse_time(vec_from[i], vec_to[i]);
+                    }
+                    block.get_by_position(result).column =
+                            ColumnNullable::create(std::move(col_to), std::move(col_null_map_to));
+                    return Status::OK();
+                } else {
+                    for (size_t i = 0; i < size; ++i) {
+                        vec_to[i] = static_cast<ToFieldType>(vec_from[i]);
+                    }
                 }
             }
-
             // TODO: support boolean cast more reasonable
             if constexpr (std::is_same_v<uint8_t, ToFieldType>) {
                 for (int i = 0; i < size; ++i) {
@@ -699,7 +799,7 @@ struct NameToDateTime {
     static constexpr auto name = "toDateTime";
 };
 
-template <typename DataType, typename Additions = void*>
+template <typename DataType, typename Additions = void*, typename FromDataType = void*>
 bool try_parse_impl(typename DataType::FieldType& x, ReadBuffer& rb, const DateLUTImpl*,
                     Additions additions [[maybe_unused]] = Additions()) {
     if constexpr (IsDateTimeType<DataType>) {
@@ -719,6 +819,15 @@ bool try_parse_impl(typename DataType::FieldType& x, ReadBuffer& rb, const DateL
         return try_read_datetime_v2_text(x, rb, scale);
     }
 
+    if constexpr (std::is_same_v<DataTypeString, FromDataType> &&
+                  std::is_same_v<DataTypeTime, DataType>) {
+        // cast from string to time(float64)
+        auto len = rb.count();
+        auto s = rb.position();
+        rb.position() = rb.end(); // make is_all_read = true
+        return TimeCast::try_parse_time(s, len, x);
+    }
+
     if constexpr (std::is_floating_point_v<typename DataType::FieldType>) {
         return try_read_float_text(x, rb);
     }
@@ -1002,6 +1111,8 @@ using FunctionToFloat32 =
         FunctionConvert<DataTypeFloat32, NameToFloat32, ToNumberMonotonicity<Float32>>;
 using FunctionToFloat64 =
         FunctionConvert<DataTypeFloat64, NameToFloat64, ToNumberMonotonicity<Float64>>;
+
+using FunctionToTime = FunctionConvert<DataTypeTime, NameToFloat64, ToNumberMonotonicity<Float64>>;
 using FunctionToString = FunctionConvert<DataTypeString, NameToString, ToStringMonotonicity>;
 using FunctionToDecimal32 =
         FunctionConvert<DataTypeDecimal<Decimal32>, NameToDecimal32, UnknownMonotonicity>;
@@ -1096,7 +1207,10 @@ template <>
 struct FunctionTo<DataTypeDateTimeV2> {
     using Type = FunctionToDateTimeV2;
 };
-
+template <> // FunctionTo<DataTypeTime>::Type resolves to FunctionToTime.
+struct FunctionTo<DataTypeTime> {
+    using Type = FunctionToTime;
+};
 class PreparedFunctionCast : public PreparedFunctionImpl {
 public:
     using WrapperType = std::function<Status(FunctionContext* context, Block&, const ColumnNumbers&,
@@ -1186,7 +1300,6 @@ struct ConvertThroughParsing {
         }
 
         size_t current_offset = 0;
-
         for (size_t i = 0; i < size; ++i) {
             size_t next_offset = std::is_same_v<FromDataType, DataTypeString>
                                          ? (*offsets)[i]
@@ -1207,7 +1320,8 @@ struct ConvertThroughParsing {
                 parsed = try_parse_impl<ToDataType>(vec_to[i], read_buffer, local_time_zone,
                                                     type->get_scale());
             } else {
-                parsed = try_parse_impl<ToDataType>(vec_to[i], read_buffer, local_time_zone);
+                parsed = try_parse_impl<ToDataType, void*, FromDataType>(vec_to[i], read_buffer,
+                                                                         local_time_zone);
             }
             (*vec_null_map_to)[i] = !parsed || !is_all_read(read_buffer);
 
@@ -1837,7 +1951,8 @@ private:
                           std::is_same_v<ToDataType, DataTypeDate> ||
                           std::is_same_v<ToDataType, DataTypeDateTime> ||
                           std::is_same_v<ToDataType, DataTypeDateV2> ||
-                          std::is_same_v<ToDataType, DataTypeDateTimeV2>) {
+                          std::is_same_v<ToDataType, DataTypeDateTimeV2> ||
+                          std::is_same_v<ToDataType, DataTypeTime>) {
                 ret = create_wrapper(from_type, check_and_get_data_type<ToDataType>(to_type.get()),
                                      requested_result_is_nullable);
                 return true;
diff --git a/gensrc/proto/types.proto b/gensrc/proto/types.proto
index 12b1e34df9..171a91c4eb 100644
--- a/gensrc/proto/types.proto
+++ b/gensrc/proto/types.proto
@@ -107,6 +107,7 @@ message PGenericType {
         DECIMAL128I = 32;
         VARIANT = 33;
         QUANTILE_STATE = 34;
+        TIME = 35;
         UNKNOWN = 999;
     }
     required TypeId id = 2;
diff --git a/regression-test/data/correctness/test_cast_as_time.out b/regression-test/data/correctness/test_cast_as_time.out
new file mode 100644
index 0000000000..d216f2e72f
--- /dev/null
+++ b/regression-test/data/correctness/test_cast_as_time.out
@@ -0,0 +1,9 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !select1 --
+00:03:00
+\\N
+20:20:20
+-- !select2 --
+19:18:17
+30:20:00
+00:04:00
\ No newline at end of file
diff --git a/regression-test/suites/correctness/test_cast_as_time.groovy b/regression-test/suites/correctness/test_cast_as_time.groovy
new file mode 100644
index 0000000000..a13af65679
--- /dev/null
+++ b/regression-test/suites/correctness/test_cast_as_time.groovy
@@ -0,0 +1,48 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_cast_as_time") { // Regression test for CAST(... AS TIME) on an INT column and a VARCHAR column.
+     sql """ DROP TABLE IF EXISTS tbl_cast_as_time """ // start from a clean table
+      sql """
+        CREATE TABLE tbl_cast_as_time (
+            id INT DEFAULT '10',
+            str VARCHAR(32) DEFAULT ''
+        ) ENGINE=OLAP
+        AGGREGATE KEY(id,str)
+        DISTRIBUTED BY HASH(id) BUCKETS 10
+        PROPERTIES (
+         "replication_allocation" = "tag.location.default: 1",
+         "in_memory" = "false",
+         "storage_format" = "V2"
+        );
+    """ // id feeds the numeric cast, str feeds the string cast
+     sql """
+        insert into tbl_cast_as_time values(300,'19:18:17')
+    """ // expected: id 300 -> 00:03:00, str '19:18:17' -> 19:18:17
+    sql """
+        insert into tbl_cast_as_time values(360,'30:20')
+    """ // expected: id 360 -> NULL (second field is 60), str '30:20' -> 30:20:00
+    sql """
+        insert into tbl_cast_as_time values(202020,'400')
+    """ // expected: id 202020 -> 20:20:20, str '400' -> 00:04:00
+    qt_select1 """
+        select cast(id as time) from tbl_cast_as_time order by id
+    """ // numeric casts, compared with the select1 section of test_cast_as_time.out
+    qt_select2 """
+        select cast(str as time) from tbl_cast_as_time order by id
+    """ // string casts, compared with the select2 section of test_cast_as_time.out
+}
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org