Posted to commits@impala.apache.org by bo...@apache.org on 2022/03/04 18:02:32 UTC

[impala] 01/02: IMPALA-10948: Default scale and DecimalType

This is an automated email from the ASF dual-hosted git repository.

boroknagyz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 71c904e5c234087b32a908b9e13c30aa3f878a5d
Author: Gergely Fürnstáhl <gf...@cloudera.com>
AuthorDate: Fri Feb 11 14:14:12 2022 +0100

    IMPALA-10948: Default scale and DecimalType
    
    Added a default scale of 0 when it is not set, to comply with the
    Parquet spec.
    
    Wrapped the reading of scale and precision in helper functions that
    read LogicalType.DecimalType when it is set and fall back to the
    legacy fields when it is not, for backward compatibility.
    
    Regenerated the bad_parquet_decimals table with DecimalType filled
    in, and moved the missing-scale test out, as a missing scale no
    longer makes the table bad.
    
    Added the no_scale.parquet file to test reading a table whose scale
    is not set.
    
    Checked it with parquet-tools:
    message schema {
      optional fixed_len_byte_array(2) d1 (DECIMAL(4,0));
    }
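
    (For reference: a dump like the one above can be produced with a
    command along the lines of "parquet-tools schema no_scale.parquet";
    the exact invocation depends on the parquet-tools build, so treat it
    as an illustration rather than the precise command used.)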
    
    Change-Id: I003220b6e2ef39d25d1c33df62c8432803fdc6eb
    Reviewed-on: http://gerrit.cloudera.org:8080/18224
    Reviewed-by: Zoltan Borok-Nagy <bo...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/exec/parquet/parquet-data-converter.h       |  29 +++++++++--
 be/src/exec/parquet/parquet-metadata-utils.cc      |  55 ++++++++++++++++-----
 testdata/bad_parquet_data/README                   |  37 ++++++++++----
 testdata/bad_parquet_data/illegal_decimals.parq    | Bin 1957 -> 1722 bytes
 testdata/data/README                               |  11 +++++
 testdata/data/no_scale.parquet                     | Bin 0 -> 335 bytes
 .../queries/QueryTest/default-scale.test           |   6 +++
 .../queries/QueryTest/parquet-abort-on-error.test  |  16 ++----
 .../QueryTest/parquet-continue-on-error.test       |  16 ++----
 tests/query_test/test_scanners.py                  |   4 ++
 10 files changed, 126 insertions(+), 48 deletions(-)
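
As a quick orientation before the diff: the resolution order the commit
message describes, condensed from the GetScale()/GetPrecision() helpers
added below (a sketch only -- the include path for the thrift-generated
parquet types is assumed, not taken from the patch):

  #include "gen-cpp/parquet_types.h"  // assumed location of parquet::SchemaElement

  int32_t ResolveScale(const parquet::SchemaElement& e) {
    // 1. Prefer the modern DECIMAL logical-type annotation when present.
    if (e.__isset.logicalType && e.logicalType.__isset.DECIMAL) {
      return e.logicalType.DECIMAL.scale;
    }
    // 2. Otherwise fall back to the legacy 'scale' field.
    if (e.__isset.scale) return e.scale;
    // 3. Per the Parquet spec, an unset scale defaults to 0.
    return 0;
  }

  int32_t ResolvePrecision(const parquet::SchemaElement& e) {
    // Same precedence as above; precision has no spec default, so callers
    // are expected to have checked that it is set (see IsPrecisionSet()
    // in the patch).
    if (e.__isset.logicalType && e.logicalType.__isset.DECIMAL) {
      return e.logicalType.DECIMAL.precision;
    }
    return e.precision;
  }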

diff --git a/be/src/exec/parquet/parquet-data-converter.h b/be/src/exec/parquet/parquet-data-converter.h
index feb0cab..387967b 100644
--- a/be/src/exec/parquet/parquet-data-converter.h
+++ b/be/src/exec/parquet/parquet-data-converter.h
@@ -57,6 +57,26 @@ class ParquetDataConverter {
   }
 
  private:
+  int32_t GetScale() const {
+    if (parquet_element_->__isset.logicalType
+        && parquet_element_->logicalType.__isset.DECIMAL) {
+      return parquet_element_->logicalType.DECIMAL.scale;
+    }
+
+    if (parquet_element_->__isset.scale) return parquet_element_->scale;
+
+    // If not specified, the scale is 0
+    return 0;
+  }
+
+  int32_t GetPrecision() const {
+    if (parquet_element_->__isset.logicalType
+        && parquet_element_->logicalType.__isset.DECIMAL) {
+      return parquet_element_->logicalType.DECIMAL.precision;
+    }
+
+    return parquet_element_->precision;
+  }
   /// Returns true if we need to do a conversion from the Parquet type to the slot type.
   bool CheckIfNeedsConversion() {
     if (!MATERIALIZED) return false;
@@ -67,17 +87,16 @@ class ParquetDataConverter {
       return true;
     }
     if (col_type_->type == TYPE_DECIMAL) {
-      if (col_type_->precision != parquet_element_->precision) {
+      if (col_type_->precision != GetPrecision()) {
         // Decimal values can be stored by Decimal4Value (4 bytes), Decimal8Value, and
         // Decimal16Value. We only need to do a conversion for different precision if
         // the values require different types (different byte size).
-        if (ColumnType::GetDecimalByteSize(parquet_element_->precision)
-            != col_type_->GetByteSize()) {
+        if (ColumnType::GetDecimalByteSize(GetPrecision()) != col_type_->GetByteSize()) {
           return true;
         }
       }
       // Different scales require decimal conversion.
-      if (col_type_->scale != parquet_element_->scale) return true;
+      if (col_type_->scale != GetScale()) return true;
     }
     return false;
   }
@@ -170,7 +189,7 @@ template <typename InternalType, bool MATERIALIZED>
 template <typename DecimalType>
 inline bool ParquetDataConverter<InternalType, MATERIALIZED>
     ::ConvertDecimalScale(DecimalType* slot) const {
-  int parquet_file_scale = parquet_element_->scale;
+  int parquet_file_scale = GetScale();
   int slot_scale = col_type_->scale;
   if (LIKELY(parquet_file_scale == slot_scale)) return true;
 
diff --git a/be/src/exec/parquet/parquet-metadata-utils.cc b/be/src/exec/parquet/parquet-metadata-utils.cc
index 7c8577c..bf905b7 100644
--- a/be/src/exec/parquet/parquet-metadata-utils.cc
+++ b/be/src/exec/parquet/parquet-metadata-utils.cc
@@ -180,6 +180,41 @@ void SetTimestampLogicalType(TParquetTimestampType::type parquet_timestamp_type,
   col_schema->__set_logicalType(logical_type);
 }
 
+bool IsScaleSet(const parquet::SchemaElement& schema_element) {
+  // Scale is required in DecimalType
+  return (schema_element.__isset.logicalType
+             && schema_element.logicalType.__isset.DECIMAL)
+      || schema_element.__isset.scale;
+}
+
+bool IsPrecisionSet(const parquet::SchemaElement& schema_element) {
+  // Precision is required in DecimalType
+  return (schema_element.__isset.logicalType
+             && schema_element.logicalType.__isset.DECIMAL)
+      || schema_element.__isset.precision;
+}
+
+int32_t GetScale(const parquet::SchemaElement& schema_element) {
+  if (schema_element.__isset.logicalType && schema_element.logicalType.__isset.DECIMAL) {
+    return schema_element.logicalType.DECIMAL.scale;
+  }
+
+  if (schema_element.__isset.scale) return schema_element.scale;
+
+  // If not specified, the scale is 0
+  return 0;
+}
+
+// Precision is required, this should be called after checking IsPrecisionSet()
+int32_t GetPrecision(const parquet::SchemaElement& schema_element) {
+  DCHECK(IsPrecisionSet(schema_element));
+  if (schema_element.__isset.logicalType && schema_element.logicalType.__isset.DECIMAL) {
+    return schema_element.logicalType.DECIMAL.precision;
+  }
+
+  return schema_element.precision;
+}
+
 /// Mapping of impala's internal types to parquet storage types. This is indexed by
 /// PrimitiveType enum
 const parquet::Type::type INTERNAL_TO_PARQUET_TYPES[] = {
@@ -329,28 +364,24 @@ Status ParquetMetadataUtils::ValidateColumn(const char* filename,
             filename, schema_element.name, schema_element.type_length));
       }
     }
-    if (!schema_element.__isset.scale) {
-      return Status(Substitute("File '$0' column '$1' does not have the scale set.",
-          filename, schema_element.name));
-    }
 
     // We require that the precision be a positive value, and not larger than the
     // precision in table schema.
-    if (!schema_element.__isset.precision) {
+    if (!IsPrecisionSet(schema_element)) {
       ErrorMsg msg(TErrorCode::PARQUET_MISSING_PRECISION, filename, schema_element.name);
       return Status(msg);
     } else {
-      if (schema_element.precision > slot_desc->type().precision
-          || schema_element.precision <= 0) {
+      int32_t precision = GetPrecision(schema_element);
+      if (precision > slot_desc->type().precision || precision <= 0) {
         ErrorMsg msg(TErrorCode::PARQUET_WRONG_PRECISION, filename, schema_element.name,
-            schema_element.precision, slot_desc->type().precision);
+            precision, slot_desc->type().precision);
         return Status(msg);
       }
-      if (schema_element.scale < 0 || schema_element.scale > schema_element.precision) {
+      int32_t scale = GetScale(schema_element);
+      if (scale < 0 || scale > precision) {
         return Status(
             Substitute("File '$0' column '$1' has invalid scale: $2. Precision is $3.",
-                filename, schema_element.name, schema_element.scale,
-                schema_element.precision));
+                filename, schema_element.name, scale, precision));
       }
     }
 
@@ -361,7 +392,7 @@ Status ParquetMetadataUtils::ValidateColumn(const char* filename,
           schema_element.name);
       RETURN_IF_ERROR(state->LogOrReturnError(msg));
     }
-  } else if (schema_element.__isset.scale || schema_element.__isset.precision
+  } else if (IsScaleSet(schema_element) || IsPrecisionSet(schema_element)
       || is_converted_type_decimal) {
     ErrorMsg msg(TErrorCode::PARQUET_INCOMPATIBLE_DECIMAL, filename, schema_element.name,
         slot_desc->type().DebugString());
diff --git a/testdata/bad_parquet_data/README b/testdata/bad_parquet_data/README
index dc2aad8..ae22e85 100644
--- a/testdata/bad_parquet_data/README
+++ b/testdata/bad_parquet_data/README
@@ -17,19 +17,38 @@ Generated by modifying HdfsParquetTableWriter::WriteFileFooter by these:
  Status HdfsParquetTableWriter::WriteFileFooter() {
 +  file_metadata_.schema[1].__set_precision(0);
 +  file_metadata_.schema[1].__isset.precision = false;
++  file_metadata_.schema[1].logicalType.DECIMAL.precision = 0;
++  file_metadata_.schema[1].logicalType.__isset.DECIMAL = false;
++
 +  file_metadata_.schema[2].__set_precision(20);
++  file_metadata_.schema[2].logicalType.DECIMAL.precision = 20;
++  file_metadata_.schema[2].logicalType.__isset.DECIMAL = true;
++  file_metadata_.schema[2].__isset.logicalType = true;
++
 +  file_metadata_.schema[3].__set_precision(-1);
-+  file_metadata_.schema[4].__set_scale(0);
-+  file_metadata_.schema[4].__isset.scale = false;
++  file_metadata_.schema[3].logicalType.DECIMAL.precision = -1;
++  file_metadata_.schema[3].logicalType.__isset.DECIMAL = true;
++  file_metadata_.schema[3].__isset.logicalType = true;
++
++  file_metadata_.schema[4].__set_type(parquet::Type::FIXED_LEN_BYTE_ARRAY);
++  file_metadata_.schema[4].__isset.type_length = false;
++
 +  file_metadata_.schema[5].__set_type(parquet::Type::FIXED_LEN_BYTE_ARRAY);
-+  file_metadata_.schema[5].__isset.type_length = false;
-+  file_metadata_.schema[6].__set_type(parquet::Type::FIXED_LEN_BYTE_ARRAY);
-+  file_metadata_.schema[6].__set_type_length(0);
-+  file_metadata_.schema[7].__set_scale(-1);
-+  file_metadata_.schema[8].__set_scale(4);
-+  file_metadata_.schema[8].__set_precision(2);
++  file_metadata_.schema[5].__set_type_length(0);
++
++  file_metadata_.schema[6].__set_scale(-1);
++  file_metadata_.schema[6].logicalType.DECIMAL.scale = -1;
++  file_metadata_.schema[6].logicalType.__isset.DECIMAL = true;
++  file_metadata_.schema[6].__isset.logicalType = true;
++
++  file_metadata_.schema[7].__set_scale(4);
++  file_metadata_.schema[7].__set_precision(2);
++  file_metadata_.schema[7].logicalType.DECIMAL.scale = 4;
++  file_metadata_.schema[7].logicalType.DECIMAL.precision = 2;
++  file_metadata_.schema[7].logicalType.__isset.DECIMAL = true;
++  file_metadata_.schema[7].__isset.logicalType = true;
 
 Then create the table and insert one row into it:
 
-create table my_decimal_tbl (d1 decimal(4,2), d2 decimal(4,2), ..., d8 decimal(4,2)) stored as parquet;
+create table my_decimal_tbl (d1 decimal(4,2), d2 decimal(4,2), ..., d7 decimal(4,2)) stored as parquet;
 insert into my_decimal_tbl values (cast(0 as decimal(4,2)), cast(0 as decimal(4,2)), ...);
diff --git a/testdata/bad_parquet_data/illegal_decimals.parq b/testdata/bad_parquet_data/illegal_decimals.parq
index f444ab3..4b82f99 100644
Binary files a/testdata/bad_parquet_data/illegal_decimals.parq and b/testdata/bad_parquet_data/illegal_decimals.parq differ
diff --git a/testdata/data/README b/testdata/data/README
index 6b928c0..98c3bf3 100644
--- a/testdata/data/README
+++ b/testdata/data/README
@@ -683,3 +683,14 @@ partition_col_in_parquet.parquet:
 Written by Impala 4.0. Parquet file with INT and DATE column. Values in the DATE columns
 are identical. There's only a single value per page in the Parquet file (written by
 setting query option 'parquet_page_row_count_limit' to 1).
+
+no_scale.parquet
+Generated by code injection, removed scale from written parquet files:
+Status HdfsParquetTableWriter::WriteFileFooter() {
++  file_metadata_.schema[1].__set_scale(1);
++  file_metadata_.schema[1].__isset.scale = false;
++  file_metadata_.schema[1].logicalType.DECIMAL.scale = 1;
++  file_metadata_.schema[1].logicalType.__isset.DECIMAL = false;
++  file_metadata_.schema[1].__isset.logicalType = false;
+create table my_decimal_tbl (d1 decimal(4,2)) stored as parquet;
+insert into my_decimal_tbl values (cast(0 as decimal(4,2)));
\ No newline at end of file
diff --git a/testdata/data/no_scale.parquet b/testdata/data/no_scale.parquet
new file mode 100644
index 0000000..768bc29
Binary files /dev/null and b/testdata/data/no_scale.parquet differ
diff --git a/testdata/workloads/functional-query/queries/QueryTest/default-scale.test b/testdata/workloads/functional-query/queries/QueryTest/default-scale.test
new file mode 100644
index 0000000..03bdc5f
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/QueryTest/default-scale.test
@@ -0,0 +1,6 @@
+====
+---- QUERY
+SELECT * FROM $DATABASE.no_scale;
+---- RESULTS
+0
+====
\ No newline at end of file
diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet-abort-on-error.test b/testdata/workloads/functional-query/queries/QueryTest/parquet-abort-on-error.test
index 8e5ec86..8821fe6 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/parquet-abort-on-error.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/parquet-abort-on-error.test
@@ -39,29 +39,23 @@ File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.par
 # IMPALA-10808, IMPALA-10814: Check illegal decimal file schemas
 select d4 from functional_parquet.bad_parquet_decimals
 ---- CATCH
-File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.parq' column 'd4' does not have the scale set.
+File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.parq' column 'd4' does not have type_length set.
 ====
 ---- QUERY
 # IMPALA-10808, IMPALA-10814: Check illegal decimal file schemas
 select d5 from functional_parquet.bad_parquet_decimals
 ---- CATCH
-File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.parq' column 'd5' does not have type_length set.
+File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.parq' column 'd5' has invalid type length: 0
 ====
 ---- QUERY
 # IMPALA-10808, IMPALA-10814: Check illegal decimal file schemas
 select d6 from functional_parquet.bad_parquet_decimals
 ---- CATCH
-File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.parq' column 'd6' has invalid type length: 0
+File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.parq' column 'd6' has invalid scale: -1. Precision is 4.
 ====
 ---- QUERY
 # IMPALA-10808, IMPALA-10814: Check illegal decimal file schemas
-select d7 from functional_parquet.bad_parquet_decimals
+select d7 from functional_parquet.bad_parquet_decimals;
 ---- CATCH
-File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.parq' column 'd7' has invalid scale: -1. Precision is 4.
-====
----- QUERY
-# IMPALA-10808, IMPALA-10814: Check illegal decimal file schemas
-select d8 from functional_parquet.bad_parquet_decimals;
----- CATCH
-File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.parq' column 'd8' has invalid scale: 4. Precision is 2.
+File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.parq' column 'd7' has invalid scale: 4. Precision is 2.
 ====
diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet-continue-on-error.test b/testdata/workloads/functional-query/queries/QueryTest/parquet-continue-on-error.test
index 0aa5257..eaaafcf 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/parquet-continue-on-error.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/parquet-continue-on-error.test
@@ -123,29 +123,23 @@ File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.par
 # IMPALA-10808, IMPALA-10814: Check illegal decimal file schemas
 select d4 from functional_parquet.bad_parquet_decimals
 ---- CATCH
-File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.parq' column 'd4' does not have the scale set.
+File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.parq' column 'd4' does not have type_length set.
 ====
 ---- QUERY
 # IMPALA-10808, IMPALA-10814: Check illegal decimal file schemas
 select d5 from functional_parquet.bad_parquet_decimals
 ---- CATCH
-File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.parq' column 'd5' does not have type_length set.
+File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.parq' column 'd5' has invalid type length: 0
 ====
 ---- QUERY
 # IMPALA-10808, IMPALA-10814: Check illegal decimal file schemas
 select d6 from functional_parquet.bad_parquet_decimals
 ---- CATCH
-File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.parq' column 'd6' has invalid type length: 0
+File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.parq' column 'd6' has invalid scale: -1. Precision is 4.
 ====
 ---- QUERY
 # IMPALA-10808, IMPALA-10814: Check illegal decimal file schemas
-select d7 from functional_parquet.bad_parquet_decimals
+select d7 from functional_parquet.bad_parquet_decimals;
 ---- CATCH
-File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.parq' column 'd7' has invalid scale: -1. Precision is 4.
-====
----- QUERY
-# IMPALA-10808, IMPALA-10814: Check illegal decimal file schemas
-select d8 from functional_parquet.bad_parquet_decimals;
----- CATCH
-File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.parq' column 'd8' has invalid scale: 4. Precision is 2.
+File '$NAMENODE/test-warehouse/bad_parquet_decimals_parquet/illegal_decimals.parq' column 'd7' has invalid scale: 4. Precision is 2.
 ====
diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py
index 33b433e..798f4d6 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -388,6 +388,10 @@ class TestParquet(ImpalaTestSuite):
     new_vector.get_value('exec_option')['abort_on_error'] = 1
     self.run_test_case('QueryTest/parquet-abort-on-error', new_vector)
 
+  def test_default_scale(self, vector, unique_database):
+    create_table_from_parquet(self.client, unique_database, "no_scale")
+    self.run_test_case('QueryTest/default-scale', vector, unique_database)
+
   def test_timestamp_out_of_range(self, vector, unique_database):
     """IMPALA-4363: Test scanning parquet files with an out of range timestamp.
        Also tests IMPALA-7595: Test Parquet timestamp columns where the time part