You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by bo...@apache.org on 2022/03/04 18:02:32 UTC
[impala] 01/02: IMPALA-10948: Default scale and DecimalType
This is an automated email from the ASF dual-hosted git repository.
boroknagyz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
commit 71c904e5c234087b32a908b9e13c30aa3f878a5d
Author: Gergely Fürnstáhl <gf...@cloudera.com>
AuthorDate: Fri Feb 11 14:14:12 2022 +0100
IMPALA-10948: Default scale and DecimalType
Added default 0 for scale if it is not set to comply with parquet spec.
Wrapped reading scale and precision in a function to support reading
LogicalType.DecimalType if it is set, falling back to old ones if it is
not, for backward compatibility.
Regenerated bad_parquet_decimals table with filled DecimalType, moved
missing scale test, as it is no longer a bad table.
Added no_scale.parquet table to test reading table without set scale.
Checked it with parquet-tools:
message schema {
optional fixed_len_byte_array(2) d1 (DECIMAL(4,0));
}
Change-Id: I003220b6e2ef39d25d1c33df62c8432803fdc6eb
Reviewed-on: http://gerrit.cloudera.org:8080/18224
Reviewed-by: Zoltan Borok-Nagy <bo...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
be/src/exec/parquet/parquet-data-converter.h | 29 +++++++++--
be/src/exec/parquet/parquet-metadata-utils.cc | 55 ++++++++++++++++-----
testdata/bad_parquet_data/README | 37 ++++++++++----
testdata/bad_parquet_data/illegal_decimals.parq | Bin 1957 -> 1722 bytes
testdata/data/README | 11 +++++
testdata/data/no_scale.parquet | Bin 0 -> 335 bytes
.../queries/QueryTest/default-scale.test | 6 +++
.../queries/QueryTest/parquet-abort-on-error.test | 16 ++----
.../QueryTest/parquet-continue-on-error.test | 16 ++----
tests/query_test/test_scanners.py | 4 ++
10 files changed, 126 insertions(+), 48 deletions(-)
diff --git a/be/src/exec/parquet/parquet-data-converter.h b/be/src/exec/parquet/parquet-data-converter.h
index feb0cab..387967b 100644
--- a/be/src/exec/parquet/parquet-data-converter.h
+++ b/be/src/exec/parquet/parquet-data-converter.h
@@ -57,6 +57,26 @@ class ParquetDataConverter {
}
private:
+ int32_t GetScale() const {
+ if (parquet_element_->__isset.logicalType
+ && parquet_element_->logicalType.__isset.DECIMAL) {
+ return parquet_element_->logicalType.DECIMAL.scale;
+ }
+
+ if (parquet_element_->__isset.scale) return parquet_element_->scale;
+
+ // If not specified, the scale is 0
+ return 0;
+ }
+
+ int32_t GetPrecision() const {
+ if (parquet_element_->__isset.logicalType
+ && parquet_element_->logicalType.__isset.DECIMAL) {
+ return parquet_element_->logicalType.DECIMAL.precision;
+ }
+
+ return parquet_element_->precision;
+ }
/// Returns true if we need to do a conversion from the Parquet type to the slot type.
bool CheckIfNeedsConversion() {
if (!MATERIALIZED) return false;
@@ -67,17 +87,16 @@ class ParquetDataConverter {
return true;
}
if (col_type_->type == TYPE_DECIMAL) {
- if (col_type_->precision != parquet_element_->precision) {
+ if (col_type_->precision != GetPrecision()) {
// Decimal values can be stored by Decimal4Value (4 bytes), Decimal8Value, and
// Decimal16Value. We only need to do a conversion for different precision if
// the values require different types (different byte size).
- if (ColumnType::GetDecimalByteSize(parquet_element_->precision)
- != col_type_->GetByteSize()) {
+ if (ColumnType::GetDecimalByteSize(GetPrecision()) != col_type_->GetByteSize()) {
return true;
}
}
// Different scales require decimal conversion.
- if (col_type_->scale != parquet_element_->scale) return true;
+ if (col_type_->scale != GetScale()) return true;
}
return false;
}
@@ -170,7 +189,7 @@ template <typename InternalType, bool MATERIALIZED>
template <typename DecimalType>
inline bool ParquetDataConverter<InternalType, MATERIALIZED>
::ConvertDecimalScale(DecimalType* slot) const {
- int parquet_file_scale = parquet_element_->scale;
+ int parquet_file_scale = GetScale();
int slot_scale = col_type_->scale;
if (LIKELY(parquet_file_scale == slot_scale)) return true;
diff --git a/be/src/exec/parquet/parquet-metadata-utils.cc b/be/src/exec/parquet/parquet-metadata-utils.cc
index 7c8577c..bf905b7 100644
--- a/be/src/exec/parquet/parquet-metadata-utils.cc
+++ b/be/src/exec/parquet/parquet-metadata-utils.cc
@@ -180,6 +180,41 @@ void SetTimestampLogicalType(TParquetTimestampType::type parquet_timestamp_type,
col_schema->__set_logicalType(logical_type);
}
+bool IsScaleSet(const parquet::SchemaElement& schema_element) {
+ // Scale is required in DecimalType
+ return (schema_element.__isset.logicalType
+ && schema_element.logicalType.__isset.DECIMAL)
+ || schema_element.__isset.scale;
+}
+
+bool IsPrecisionSet(const parquet::SchemaElement& schema_element) {
+ // Precision is required in DecimalType
+ return (schema_element.__isset.logicalType
+ && schema_element.logicalType.__isset.DECIMAL)
+ || schema_element.__isset.precision;
+}
+
+int32_t GetScale(const parquet::SchemaElement& schema_element) {
+ if (schema_element.__isset.logicalType && schema_element.logicalType.__isset.DECIMAL) {
+ return schema_element.logicalType.DECIMAL.scale;
+ }
+
+ if (schema_element.__isset.scale) return schema_element.scale;
+
+ // If not specified, the scale is 0
+ return 0;
+}
+
+// Precision is required, this should be called after checking IsPrecisionSet()
+int32_t GetPrecision(const parquet::SchemaElement& schema_element) {
+ DCHECK(IsPrecisionSet(schema_element));
+ if (schema_element.__isset.logicalType && schema_element.logicalType.__isset.DECIMAL) {
+ return schema_element.logicalType.DECIMAL.precision;
+ }
+
+ return schema_element.precision;
+}
+
/// Mapping of impala's internal types to parquet storage types. This is indexed by
/// PrimitiveType enum
const parquet::Type::type INTERNAL_TO_PARQUET_TYPES[] = {
@@ -329,28 +364,24 @@ Status ParquetMetadataUtils::ValidateColumn(const char* filename,
filename, schema_element.name, schema_element.type_length));
}
}
- if (!schema_element.__isset.scale) {
- return Status(Substitute("File '$0' column '$1' does not have the scale set.",
- filename, schema_element.name));
- }
// We require that the precision be a positive value, and not larger than the
// precision in table schema.
- if (!schema_element.__isset.precision) {
+ if (!IsPrecisionSet(schema_element)) {
ErrorMsg msg(TErrorCode::PARQUET_MISSING_PRECISION, filename, schema_element.name);
return Status(msg);
} else {
- if (schema_element.precision > slot_desc->type().precision
- || schema_element.precision <= 0) {
+ int32_t precision = GetPrecision(schema_element);
+ if (precision > slot_desc->type().precision || precision <= 0) {
ErrorMsg msg(TErrorCode::PARQUET_WRONG_PRECISION, filename, schema_element.name,
- schema_element.precision, slot_desc->type().precision);
+ precision, slot_desc->type().precision);
return Status(msg);
}
- if (schema_element.scale < 0 || schema_element.scale > schema_element.precision) {
+ int32_t scale = GetScale(schema_element);
+ if (scale < 0 || scale > precision) {
return Status(
Substitute("File '$0' column '$1' has invalid scale: $2. Precision is $3.",
- filename, schema_element.name, schema_element.scale,
- schema_element.precision));
+ filename, schema_element.name, scale, precision));
}
}
@@ -361,7 +392,7 @@ Status ParquetMetadataUtils::ValidateColumn(const char* filename,
schema_element.name);
RETURN_IF_ERROR(state->LogOrReturnError(msg));
}
- } else if (schema_element.__isset.scale || schema_element.__isset.precision
+ } else if (IsScaleSet(schema_element) || IsPrecisionSet(schema_element)
|| is_converted_type_decimal) {
ErrorMsg msg(TErrorCode::PARQUET_INCOMPATIBLE_DECIMAL, filename, schema_element.name,
slot_desc->type().DebugString());
diff --git a/testdata/bad_parquet_data/README b/testdata/bad_parquet_data/README
index dc2aad8..ae22e85 100644
--- a/testdata/bad_parquet_data/README
+++ b/testdata/bad_parquet_data/README
@@ -17,19 +17,38 @@ Generated by modifying HdfsParquetTableWriter::WriteFileFooter by these:
Status HdfsParquetTableWriter::WriteFileFooter() {
+ file_metadata_.schema[1].__set_precision(0);
+ file_metadata_.schema[1].__isset.precision = false;
++ file_metadata_.schema[1].logicalType.DECIMAL.precision = 0;
++ file_metadata_.schema[1].logicalType.__isset.DECIMAL = false;
++
+ file_metadata_.schema[2].__set_precision(20);
++ file_metadata_.schema[2].logicalType.DECIMAL.precision = 20;
++ file_metadata_.schema[2].logicalType.__isset.DECIMAL = true;
++ file_metadata_.schema[2].__isset.logicalType = true;
++
+ file_metadata_.schema[3].__set_precision(-1);
-+ file_metadata_.schema[4].__set_scale(0);
-+ file_metadata_.schema[4].__isset.scale = false;
++ file_metadata_.schema[3].logicalType.DECIMAL.precision = -1;
++ file_metadata_.schema[3].logicalType.__isset.DECIMAL = true;
++ file_metadata_.schema[3].__isset.logicalType = true;
++
++ file_metadata_.schema[4].__set_type(parquet::Type::FIXED_LEN_BYTE_ARRAY);
++ file_metadata_.schema[4].__isset.type_length = false;
++
+ file_metadata_.schema[5].__set_type(parquet::Type::FIXED_LEN_BYTE_ARRAY);
-+ file_metadata_.schema[5].__isset.type_length = false;
-+ file_metadata_.schema[6].__set_type(parquet::Type::FIXED_LEN_BYTE_ARRAY);
-+ file_metadata_.schema[6].__set_type_length(0);
-+ file_metadata_.schema[7].__set_scale(-1);
-+ file_metadata_.schema[8].__set_scale(4);
-+ file_metadata_.schema[8].__set_precision(2);
++ file_metadata_.schema[5].__set_type_length(0);
++
++ file_metadata_.schema[6].__set_scale(-1);
++ file_metadata_.schema[6].logicalType.DECIMAL.scale = -1;
++ file_metadata_.schema[6].logicalType.__isset.DECIMAL = true;
++ file_metadata_.schema[6].__isset.logicalType = true;
++
++ file_metadata_.schema[7].__set_scale(4);
++ file_metadata_.schema[7].__set_precision(2);
++ file_metadata_.schema[7].logicalType.DECIMAL.scale = 4;
++ file_metadata_.schema[7].logicalType.DECIMAL.precision = 2;
++ file_metadata_.schema[7].logicalType.__isset.DECIMAL = true;
++ file_metadata_.schema[7].__isset.logicalType = true;
Then create the table and insert one row into it:
-create table my_decimal_tbl (d1 decimal(4,2), d2 decimal(4,2), ..., d8 decimal(4,2)) stored as parquet;
+create table my_decimal_tbl (d1 decimal(4,2), d2 decimal(4,2), ..., d7 decimal(4,2)) stored as parquet;
insert into my_decimal_tbl values (cast(0 as decimal(4,2)), cast(0 as decimal(4,2)), ...);
diff --git a/testdata/bad_parquet_data/illegal_decimals.parq b/testdata/bad_parquet_data/illegal_decimals.parq
index f444ab3..4b82f99 100644
Binary files a/testdata/bad_parquet_data/illegal_decimals.parq and b/testdata/bad_parquet_data/illegal_decimals.parq differ
diff --git a/testdata/data/README b/testdata/data/README
index 6b928c0..98c3bf3 100644
--- a/testdata/data/README
+++ b/testdata/data/README
@@ -683,3 +683,14 @@ partition_col_in_parquet.parquet:
Written by Impala 4.0. Parquet file with INT and DATE column. Values in the DATE columns
are identical. There's only a single value per page in the Parquet file (written by
setting query option 'parquet_page_row_count_limit' to 1).
+
+no_scale.parquet
+Generated by code injection, removed scale from written parquet files:
+Status HdfsParquetTableWriter::WriteFileFooter() {
++ file_metadata_.schema[1].__set_scale(1);
++ file_metadata_.schema[1].__isset.scale = false;
++ file_metadata_.schema[1].logicalType.DECIMAL.scale = 1;
++ file_metadata_.schema[1].logicalType.__isset.DECIMAL = false;
++ file_metadata_.schema[1].__isset.logicalType = false;
+create table my_decimal_tbl (d1 decimal(4,2)) stored as parquet;
+insert into my_decimal_tbl values (cast(0 as decimal(4,2)));
\ No newline at end of file
diff --git a/testdata/data/no_scale.parquet b/testdata/data/no_scale.parquet
new file mode 100644
index 0000000..768bc29
Binary files /dev/null and b/testdata/data/no_scale.parquet differ
diff --git a/testdata/workloads/functional-query/queries/QueryTest/default-scale.test b/testdata/workloads/functional-query/queries/QueryTest/default-scale.test
new file mode 100644
index 0000000..03bdc5f
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/QueryTest/default-scale.test
@@ -0,0 +1,6 @@
+====
+---- QUERY
+SELECT * FROM $DATABASE.no_scale;
+---- RESULTS
+0
+====
\ No newline at end of file
diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet-abort-on-error.test b/testdata/workloads/functional-query/queries/QueryTest/parquet-abort-on-error.test
index 8e5ec86..8821fe6 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/parquet-abort-on-error.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/parquet-abort-on-error.test
@@ -39,29 +39,23 @@ File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.par
# IMPALA-10808, IMPALA-10814: Check illegal decimal file schemas
select d4 from functional_parquet.bad_parquet_decimals
---- CATCH
-File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.parq' column 'd4' does not have the scale set.
+File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.parq' column 'd4' does not have type_length set.
====
---- QUERY
# IMPALA-10808, IMPALA-10814: Check illegal decimal file schemas
select d5 from functional_parquet.bad_parquet_decimals
---- CATCH
-File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.parq' column 'd5' does not have type_length set.
+File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.parq' column 'd5' has invalid type length: 0
====
---- QUERY
# IMPALA-10808, IMPALA-10814: Check illegal decimal file schemas
select d6 from functional_parquet.bad_parquet_decimals
---- CATCH
-File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.parq' column 'd6' has invalid type length: 0
+File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.parq' column 'd6' has invalid scale: -1. Precision is 4.
====
---- QUERY
# IMPALA-10808, IMPALA-10814: Check illegal decimal file schemas
-select d7 from functional_parquet.bad_parquet_decimals
+select d7 from functional_parquet.bad_parquet_decimals;
---- CATCH
-File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.parq' column 'd7' has invalid scale: -1. Precision is 4.
-====
----- QUERY
-# IMPALA-10808, IMPALA-10814: Check illegal decimal file schemas
-select d8 from functional_parquet.bad_parquet_decimals;
----- CATCH
-File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.parq' column 'd8' has invalid scale: 4. Precision is 2.
+File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.parq' column 'd7' has invalid scale: 4. Precision is 2.
====
diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet-continue-on-error.test b/testdata/workloads/functional-query/queries/QueryTest/parquet-continue-on-error.test
index 0aa5257..eaaafcf 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/parquet-continue-on-error.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/parquet-continue-on-error.test
@@ -123,29 +123,23 @@ File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.par
# IMPALA-10808, IMPALA-10814: Check illegal decimal file schemas
select d4 from functional_parquet.bad_parquet_decimals
---- CATCH
-File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.parq' column 'd4' does not have the scale set.
+File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.parq' column 'd4' does not have type_length set.
====
---- QUERY
# IMPALA-10808, IMPALA-10814: Check illegal decimal file schemas
select d5 from functional_parquet.bad_parquet_decimals
---- CATCH
-File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.parq' column 'd5' does not have type_length set.
+File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.parq' column 'd5' has invalid type length: 0
====
---- QUERY
# IMPALA-10808, IMPALA-10814: Check illegal decimal file schemas
select d6 from functional_parquet.bad_parquet_decimals
---- CATCH
-File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.parq' column 'd6' has invalid type length: 0
+File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.parq' column 'd6' has invalid scale: -1. Precision is 4.
====
---- QUERY
# IMPALA-10808, IMPALA-10814: Check illegal decimal file schemas
-select d7 from functional_parquet.bad_parquet_decimals
+select d7 from functional_parquet.bad_parquet_decimals;
---- CATCH
-File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.parq' column 'd7' has invalid scale: -1. Precision is 4.
-====
----- QUERY
-# IMPALA-10808, IMPALA-10814: Check illegal decimal file schemas
-select d8 from functional_parquet.bad_parquet_decimals;
----- CATCH
-File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.parq' column 'd8' has invalid scale: 4. Precision is 2.
+File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.parq' column 'd7' has invalid scale: 4. Precision is 2.
====
diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py
index 33b433e..798f4d6 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -388,6 +388,10 @@ class TestParquet(ImpalaTestSuite):
new_vector.get_value('exec_option')['abort_on_error'] = 1
self.run_test_case('QueryTest/parquet-abort-on-error', new_vector)
+ def test_default_scale(self, vector, unique_database):
+ create_table_from_parquet(self.client, unique_database, "no_scale")
+ self.run_test_case('QueryTest/default-scale', vector, unique_database)
+
def test_timestamp_out_of_range(self, vector, unique_database):
"""IMPALA-4363: Test scanning parquet files with an out of range timestamp.
Also tests IMPALA-7595: Test Parquet timestamp columns where the time part