You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by mo...@apache.org on 2023/01/13 06:13:56 UTC

[doris] 05/05: [fix](parquet-reader) fix coredump when loading datetime data into doris from parquet (#15794)

This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch branch-1.2-lts
in repository https://gitbox.apache.org/repos/asf/doris.git

commit 750bf85030e609c8c3dd7433ff1fb5201a9cde5a
Author: Ashin Gau <As...@users.noreply.github.com>
AuthorDate: Fri Jan 13 11:51:11 2023 +0800

    [fix](parquet-reader) fix coredump when loading datetime data into doris from parquet (#15794)
    
    `date_time_v2` checks the scale when a datetimev2 value is constructed:
    ```
    LOG(FATAL) << fmt::format("Scale {} is out of bounds", scale);
    ```
    
    This [PR](https://github.com/apache/doris/pull/15510) fixed this issue, but the parquet reader does not use the constructor to create a `TypeDescriptor`, which leaves `scale = -1` when reading datetimev2 data.
---
 be/src/runtime/types.h                         |  1 +
 be/src/vec/exec/format/parquet/schema_desc.cpp | 63 ++++++++++++++------------
 2 files changed, 34 insertions(+), 30 deletions(-)

diff --git a/be/src/runtime/types.h b/be/src/runtime/types.h
index 06629a655e..aca6336f6b 100644
--- a/be/src/runtime/types.h
+++ b/be/src/runtime/types.h
@@ -74,6 +74,7 @@ struct TypeDescriptor {
             precision = 27;
             scale = 9;
         } else if (type == TYPE_DATETIMEV2) {
+            precision = 18;
             scale = 6;
         }
     }
diff --git a/be/src/vec/exec/format/parquet/schema_desc.cpp b/be/src/vec/exec/format/parquet/schema_desc.cpp
index 2af4d40ea2..b8b9b07184 100644
--- a/be/src/vec/exec/format/parquet/schema_desc.cpp
+++ b/be/src/vec/exec/format/parquet/schema_desc.cpp
@@ -167,24 +167,27 @@ TypeDescriptor FieldDescriptor::get_doris_type(const tparquet::SchemaElement& ph
     if (type.type == INVALID_TYPE) {
         switch (physical_schema.type) {
         case tparquet::Type::BOOLEAN:
-            type.type = TYPE_BOOLEAN;
+            type = TypeDescriptor(TYPE_BOOLEAN);
             break;
         case tparquet::Type::INT32:
-            type.type = TYPE_INT;
+            type = TypeDescriptor(TYPE_INT);
             break;
         case tparquet::Type::INT64:
+            type = TypeDescriptor(TYPE_BIGINT);
+            break;
         case tparquet::Type::INT96:
-            type.type = TYPE_BIGINT;
+            // in most cases, it's a nano timestamp
+            type = TypeDescriptor(TYPE_DATETIMEV2);
             break;
         case tparquet::Type::FLOAT:
-            type.type = TYPE_FLOAT;
+            type = TypeDescriptor(TYPE_FLOAT);
             break;
         case tparquet::Type::DOUBLE:
-            type.type = TYPE_DOUBLE;
+            type = TypeDescriptor(TYPE_DOUBLE);
             break;
         case tparquet::Type::BYTE_ARRAY:
         case tparquet::Type::FIXED_LEN_BYTE_ARRAY:
-            type.type = TYPE_STRING;
+            type = TypeDescriptor(TYPE_STRING);
             break;
         default:
             break;
@@ -196,33 +199,31 @@ TypeDescriptor FieldDescriptor::get_doris_type(const tparquet::SchemaElement& ph
 TypeDescriptor FieldDescriptor::convert_to_doris_type(tparquet::LogicalType logicalType) {
     TypeDescriptor type;
     if (logicalType.__isset.STRING) {
-        type.type = TYPE_STRING;
+        type = TypeDescriptor(TYPE_STRING);
     } else if (logicalType.__isset.DECIMAL) {
-        type.type = TYPE_DECIMALV2;
-        type.precision = 27;
-        type.scale = 9;
+        type = TypeDescriptor(TYPE_DECIMALV2);
     } else if (logicalType.__isset.DATE) {
-        type.type = TYPE_DATEV2;
+        type = TypeDescriptor(TYPE_DATEV2);
     } else if (logicalType.__isset.INTEGER) {
         if (logicalType.INTEGER.isSigned) {
             if (logicalType.INTEGER.bitWidth <= 32) {
-                type.type = TYPE_INT;
+                type = TypeDescriptor(TYPE_INT);
             } else {
-                type.type = TYPE_BIGINT;
+                type = TypeDescriptor(TYPE_BIGINT);
             }
         } else {
             if (logicalType.INTEGER.bitWidth <= 16) {
-                type.type = TYPE_INT;
+                type = TypeDescriptor(TYPE_INT);
             } else {
-                type.type = TYPE_BIGINT;
+                type = TypeDescriptor(TYPE_BIGINT);
             }
         }
     } else if (logicalType.__isset.TIME) {
-        type.type = TYPE_TIMEV2;
+        type = TypeDescriptor(TYPE_TIMEV2);
     } else if (logicalType.__isset.TIMESTAMP) {
-        type.type = TYPE_DATETIMEV2;
+        type = TypeDescriptor(TYPE_DATETIMEV2);
     } else {
-        type.type = INVALID_TYPE;
+        type = TypeDescriptor(INVALID_TYPE);
     }
     return type;
 }
@@ -231,39 +232,41 @@ TypeDescriptor FieldDescriptor::convert_to_doris_type(tparquet::ConvertedType::t
     TypeDescriptor type;
     switch (convertedType) {
     case tparquet::ConvertedType::type::UTF8:
-        type.type = TYPE_STRING;
+        type = TypeDescriptor(TYPE_STRING);
         break;
     case tparquet::ConvertedType::type::DECIMAL:
-        type.type = TYPE_DECIMALV2;
-        type.precision = 27;
-        type.scale = 9;
+        type = TypeDescriptor(TYPE_DECIMALV2);
         break;
     case tparquet::ConvertedType::type::DATE:
-        type.type = TYPE_DATEV2;
+        type = TypeDescriptor(TYPE_DATEV2);
         break;
     case tparquet::ConvertedType::type::TIME_MILLIS:
     case tparquet::ConvertedType::type::TIME_MICROS:
-        type.type = TYPE_TIMEV2;
+        type = TypeDescriptor(TYPE_TIMEV2);
         break;
     case tparquet::ConvertedType::type::TIMESTAMP_MILLIS:
     case tparquet::ConvertedType::type::TIMESTAMP_MICROS:
-        type.type = TYPE_DATETIMEV2;
+        type = TypeDescriptor(TYPE_DATETIMEV2);
         break;
-    case tparquet::ConvertedType::type::UINT_8:
-    case tparquet::ConvertedType::type::UINT_16:
     case tparquet::ConvertedType::type::INT_8:
+        type = TypeDescriptor(TYPE_TINYINT);
+        break;
+    case tparquet::ConvertedType::type::UINT_8:
     case tparquet::ConvertedType::type::INT_16:
+        type = TypeDescriptor(TYPE_SMALLINT);
+        break;
+    case tparquet::ConvertedType::type::UINT_16:
     case tparquet::ConvertedType::type::INT_32:
-        type.type = TYPE_INT;
+        type = TypeDescriptor(TYPE_INT);
         break;
     case tparquet::ConvertedType::type::UINT_32:
     case tparquet::ConvertedType::type::UINT_64:
     case tparquet::ConvertedType::type::INT_64:
-        type.type = TYPE_BIGINT;
+        type = TypeDescriptor(TYPE_BIGINT);
         break;
     default:
         LOG(WARNING) << "Not supported parquet ConvertedType: " << convertedType;
-        type = INVALID_TYPE;
+        type = TypeDescriptor(INVALID_TYPE);
         break;
     }
     return type;


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org