You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by mo...@apache.org on 2023/01/13 06:13:56 UTC
[doris] 05/05: [fix](parquet-reader) fix coredump when loading datetime data into Doris from parquet (#15794)
This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch branch-1.2-lts
in repository https://gitbox.apache.org/repos/asf/doris.git
commit 750bf85030e609c8c3dd7433ff1fb5201a9cde5a
Author: Ashin Gau <As...@users.noreply.github.com>
AuthorDate: Fri Jan 13 11:51:11 2023 +0800
[fix](parquet-reader) fix coredump when loading datetime data into Doris from parquet (#15794)
`date_time_v2` checks the scale when a datetimev2 value is constructed:
```
LOG(FATAL) << fmt::format("Scale {} is out of bounds", scale);
```
This [PR](https://github.com/apache/doris/pull/15510) fixed that issue, but the parquet reader does not use the constructor to create `TypeDescriptor`, which leaves `scale = -1` when reading datetimev2 data.
---
be/src/runtime/types.h | 1 +
be/src/vec/exec/format/parquet/schema_desc.cpp | 63 ++++++++++++++------------
2 files changed, 34 insertions(+), 30 deletions(-)
diff --git a/be/src/runtime/types.h b/be/src/runtime/types.h
index 06629a655e..aca6336f6b 100644
--- a/be/src/runtime/types.h
+++ b/be/src/runtime/types.h
@@ -74,6 +74,7 @@ struct TypeDescriptor {
precision = 27;
scale = 9;
} else if (type == TYPE_DATETIMEV2) {
+ precision = 18;
scale = 6;
}
}
diff --git a/be/src/vec/exec/format/parquet/schema_desc.cpp b/be/src/vec/exec/format/parquet/schema_desc.cpp
index 2af4d40ea2..b8b9b07184 100644
--- a/be/src/vec/exec/format/parquet/schema_desc.cpp
+++ b/be/src/vec/exec/format/parquet/schema_desc.cpp
@@ -167,24 +167,27 @@ TypeDescriptor FieldDescriptor::get_doris_type(const tparquet::SchemaElement& ph
if (type.type == INVALID_TYPE) {
switch (physical_schema.type) {
case tparquet::Type::BOOLEAN:
- type.type = TYPE_BOOLEAN;
+ type = TypeDescriptor(TYPE_BOOLEAN);
break;
case tparquet::Type::INT32:
- type.type = TYPE_INT;
+ type = TypeDescriptor(TYPE_INT);
break;
case tparquet::Type::INT64:
+ type = TypeDescriptor(TYPE_BIGINT);
+ break;
case tparquet::Type::INT96:
- type.type = TYPE_BIGINT;
+ // in most cases, it's a nano timestamp
+ type = TypeDescriptor(TYPE_DATETIMEV2);
break;
case tparquet::Type::FLOAT:
- type.type = TYPE_FLOAT;
+ type = TypeDescriptor(TYPE_FLOAT);
break;
case tparquet::Type::DOUBLE:
- type.type = TYPE_DOUBLE;
+ type = TypeDescriptor(TYPE_DOUBLE);
break;
case tparquet::Type::BYTE_ARRAY:
case tparquet::Type::FIXED_LEN_BYTE_ARRAY:
- type.type = TYPE_STRING;
+ type = TypeDescriptor(TYPE_STRING);
break;
default:
break;
@@ -196,33 +199,31 @@ TypeDescriptor FieldDescriptor::get_doris_type(const tparquet::SchemaElement& ph
TypeDescriptor FieldDescriptor::convert_to_doris_type(tparquet::LogicalType logicalType) {
TypeDescriptor type;
if (logicalType.__isset.STRING) {
- type.type = TYPE_STRING;
+ type = TypeDescriptor(TYPE_STRING);
} else if (logicalType.__isset.DECIMAL) {
- type.type = TYPE_DECIMALV2;
- type.precision = 27;
- type.scale = 9;
+ type = TypeDescriptor(TYPE_DECIMALV2);
} else if (logicalType.__isset.DATE) {
- type.type = TYPE_DATEV2;
+ type = TypeDescriptor(TYPE_DATEV2);
} else if (logicalType.__isset.INTEGER) {
if (logicalType.INTEGER.isSigned) {
if (logicalType.INTEGER.bitWidth <= 32) {
- type.type = TYPE_INT;
+ type = TypeDescriptor(TYPE_INT);
} else {
- type.type = TYPE_BIGINT;
+ type = TypeDescriptor(TYPE_BIGINT);
}
} else {
if (logicalType.INTEGER.bitWidth <= 16) {
- type.type = TYPE_INT;
+ type = TypeDescriptor(TYPE_INT);
} else {
- type.type = TYPE_BIGINT;
+ type = TypeDescriptor(TYPE_BIGINT);
}
}
} else if (logicalType.__isset.TIME) {
- type.type = TYPE_TIMEV2;
+ type = TypeDescriptor(TYPE_TIMEV2);
} else if (logicalType.__isset.TIMESTAMP) {
- type.type = TYPE_DATETIMEV2;
+ type = TypeDescriptor(TYPE_DATETIMEV2);
} else {
- type.type = INVALID_TYPE;
+ type = TypeDescriptor(INVALID_TYPE);
}
return type;
}
@@ -231,39 +232,41 @@ TypeDescriptor FieldDescriptor::convert_to_doris_type(tparquet::ConvertedType::t
TypeDescriptor type;
switch (convertedType) {
case tparquet::ConvertedType::type::UTF8:
- type.type = TYPE_STRING;
+ type = TypeDescriptor(TYPE_STRING);
break;
case tparquet::ConvertedType::type::DECIMAL:
- type.type = TYPE_DECIMALV2;
- type.precision = 27;
- type.scale = 9;
+ type = TypeDescriptor(TYPE_DECIMALV2);
break;
case tparquet::ConvertedType::type::DATE:
- type.type = TYPE_DATEV2;
+ type = TypeDescriptor(TYPE_DATEV2);
break;
case tparquet::ConvertedType::type::TIME_MILLIS:
case tparquet::ConvertedType::type::TIME_MICROS:
- type.type = TYPE_TIMEV2;
+ type = TypeDescriptor(TYPE_TIMEV2);
break;
case tparquet::ConvertedType::type::TIMESTAMP_MILLIS:
case tparquet::ConvertedType::type::TIMESTAMP_MICROS:
- type.type = TYPE_DATETIMEV2;
+ type = TypeDescriptor(TYPE_DATETIMEV2);
break;
- case tparquet::ConvertedType::type::UINT_8:
- case tparquet::ConvertedType::type::UINT_16:
case tparquet::ConvertedType::type::INT_8:
+ type = TypeDescriptor(TYPE_TINYINT);
+ break;
+ case tparquet::ConvertedType::type::UINT_8:
case tparquet::ConvertedType::type::INT_16:
+ type = TypeDescriptor(TYPE_SMALLINT);
+ break;
+ case tparquet::ConvertedType::type::UINT_16:
case tparquet::ConvertedType::type::INT_32:
- type.type = TYPE_INT;
+ type = TypeDescriptor(TYPE_INT);
break;
case tparquet::ConvertedType::type::UINT_32:
case tparquet::ConvertedType::type::UINT_64:
case tparquet::ConvertedType::type::INT_64:
- type.type = TYPE_BIGINT;
+ type = TypeDescriptor(TYPE_BIGINT);
break;
default:
LOG(WARNING) << "Not supported parquet ConvertedType: " << convertedType;
- type = INVALID_TYPE;
+ type = TypeDescriptor(INVALID_TYPE);
break;
}
return type;
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org