You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by wj...@apache.org on 2023/01/26 19:52:58 UTC

[arrow] branch master updated: MINOR: [C++][Parquet] Rephrase decimal annotation (#33694)

This is an automated email from the ASF dual-hosted git repository.

wjones127 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 068f49953c MINOR: [C++][Parquet] Rephrase decimal annotation (#33694)
068f49953c is described below

commit 068f49953cd6d5f1355a2885eccf7b6527e0d9be
Author: Gang Wu <us...@gmail.com>
AuthorDate: Fri Jan 27 03:52:51 2023 +0800

    MINOR: [C++][Parquet] Rephrase decimal annotation (#33694)
    
    ### What changes are included in this PR?
    
    - Rename the function in the writer properties to `enable_short_decimal_stored_as_integer`.
    - Rephrase the comment to explain the behavior in detail.
    
    Authored-by: Gang Wu <us...@gmail.com>
    Signed-off-by: Will Jones <wi...@gmail.com>
---
 cpp/src/parquet/arrow/arrow_reader_writer_test.cc |  8 ++--
 cpp/src/parquet/arrow/schema.cc                   |  2 +-
 cpp/src/parquet/properties.h                      | 48 +++++++++++++++--------
 3 files changed, 37 insertions(+), 21 deletions(-)

diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index bdfd0fe07d..6585d286d1 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -3443,8 +3443,8 @@ TEST(ArrowReadWrite, Decimal256AsInt) {
   auto table = ::arrow::Table::Make(::arrow::schema({field("root", type)}), {array});
 
   parquet::WriterProperties::Builder builder;
-  // Enforce integer type to annotate decimal type
-  auto writer_properties = builder.enable_integer_annotate_decimal()->build();
+  // Allow small decimals to be stored as int32 or int64.
+  auto writer_properties = builder.enable_store_decimal_as_integer()->build();
   auto props_store_schema = ArrowWriterProperties::Builder().store_schema()->build();
 
   CheckConfiguredRoundtrip(table, table, writer_properties, props_store_schema);
@@ -4821,8 +4821,8 @@ class TestIntegerAnnotateDecimalTypeParquetIO : public TestParquetIO<TestType> {
     auto arrow_schema = ::arrow::schema({::arrow::field("a", values->type())});
 
     parquet::WriterProperties::Builder builder;
-    // Enforce integer type to annotate decimal type
-    auto writer_properties = builder.enable_integer_annotate_decimal()->build();
+    // Allow small decimals to be stored as int32 or int64.
+    auto writer_properties = builder.enable_store_decimal_as_integer()->build();
     std::shared_ptr<SchemaDescriptor> parquet_schema;
     ASSERT_OK_NO_THROW(ToParquetSchema(arrow_schema.get(), *writer_properties,
                                        *default_arrow_writer_properties(),
diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc
index 267b892e4b..07d481ba8e 100644
--- a/cpp/src/parquet/arrow/schema.cc
+++ b/cpp/src/parquet/arrow/schema.cc
@@ -357,7 +357,7 @@ Status FieldToNode(const std::string& name, const std::shared_ptr<Field>& field,
       const auto& decimal_type = static_cast<const ::arrow::DecimalType&>(*field->type());
       precision = decimal_type.precision();
       scale = decimal_type.scale();
-      if (properties.integer_annotate_decimal() && 1 <= precision && precision <= 18) {
+      if (properties.store_decimal_as_integer() && 1 <= precision && precision <= 18) {
         type = precision <= 9 ? ParquetType ::INT32 : ParquetType ::INT64;
       } else {
         type = ParquetType::FIXED_LEN_BYTE_ARRAY;
diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h
index d45dcfe69f..ef4ae124a2 100644
--- a/cpp/src/parquet/properties.h
+++ b/cpp/src/parquet/properties.h
@@ -201,7 +201,7 @@ class PARQUET_EXPORT WriterProperties {
           version_(ParquetVersion::PARQUET_2_4),
           data_page_version_(ParquetDataPageVersion::V1),
           created_by_(DEFAULT_CREATED_BY),
-          integer_annotate_decimal_(false) {}
+          store_decimal_as_integer_(false) {}
     virtual ~Builder() {}
 
     /// Specify the memory pool for the writer. Default default_memory_pool.
@@ -452,19 +452,35 @@ class PARQUET_EXPORT WriterProperties {
       return this->disable_statistics(path->ToDotString());
     }
 
-    /// Enable integer type to annotate decimal type as below:
-    ///   int32: 1 <= precision <= 9
-    ///   int64: 10 <= precision <= 18
-    /// Default disabled.
-    Builder* enable_integer_annotate_decimal() {
-      integer_annotate_decimal_ = true;
+    /// Allow decimals with 1 <= precision <= 18 to be stored as integers.
+    ///
+    /// In Parquet, DECIMAL can be stored in any of the following physical types:
+    /// - int32: for 1 <= precision <= 9.
+    /// - int64: for 10 <= precision <= 18.
+    /// - fixed_len_byte_array: precision is limited by the array size.
+    ///   Length n can store <= floor(log_10(2^(8*n - 1) - 1)) base-10 digits.
+    /// - binary: precision is unlimited. The minimum number of bytes to store
+    ///   the unscaled value is used.
+    ///
+    /// By default, this is DISABLED and all decimal types annotate fixed_len_byte_array.
+    ///
+    /// When enabled, the C++ writer will use following physical types to store decimals:
+    /// - int32: for 1 <= precision <= 9.
+    /// - int64: for 10 <= precision <= 18.
+    /// - fixed_len_byte_array: for precision > 18.
+    ///
+    /// As a consequence, decimal columns stored in integer types are more compact.
+    Builder* enable_store_decimal_as_integer() {
+      store_decimal_as_integer_ = true;
       return this;
     }
 
-    /// Disable integer type to annotate decimal type.
+    /// Disable decimal logical type with 1 <= precision <= 18 to be stored as
+    /// integer physical type.
+    ///
     /// Default disabled.
-    Builder* disable_integer_annotate_decimal() {
-      integer_annotate_decimal_ = false;
+    Builder* disable_store_decimal_as_integer() {
+      store_decimal_as_integer_ = false;
       return this;
     }
 
@@ -493,7 +509,7 @@ class PARQUET_EXPORT WriterProperties {
           pool_, dictionary_pagesize_limit_, write_batch_size_, max_row_group_length_,
           pagesize_, version_, created_by_, std::move(file_encryption_properties_),
           default_column_properties_, column_properties, data_page_version_,
-          integer_annotate_decimal_));
+          store_decimal_as_integer_));
     }
 
    private:
@@ -505,7 +521,7 @@ class PARQUET_EXPORT WriterProperties {
     ParquetVersion::type version_;
     ParquetDataPageVersion data_page_version_;
     std::string created_by_;
-    bool integer_annotate_decimal_;
+    bool store_decimal_as_integer_;
 
     std::shared_ptr<FileEncryptionProperties> file_encryption_properties_;
 
@@ -536,7 +552,7 @@ class PARQUET_EXPORT WriterProperties {
 
   inline std::string created_by() const { return parquet_created_by_; }
 
-  inline bool integer_annotate_decimal() const { return integer_annotate_decimal_; }
+  inline bool store_decimal_as_integer() const { return store_decimal_as_integer_; }
 
   inline Encoding::type dictionary_index_encoding() const {
     if (parquet_version_ == ParquetVersion::PARQUET_1_0) {
@@ -606,7 +622,7 @@ class PARQUET_EXPORT WriterProperties {
       std::shared_ptr<FileEncryptionProperties> file_encryption_properties,
       const ColumnProperties& default_column_properties,
       const std::unordered_map<std::string, ColumnProperties>& column_properties,
-      ParquetDataPageVersion data_page_version, bool integer_annotate_decimal)
+      ParquetDataPageVersion data_page_version, bool store_short_decimal_as_integer)
       : pool_(pool),
         dictionary_pagesize_limit_(dictionary_pagesize_limit),
         write_batch_size_(write_batch_size),
@@ -615,7 +631,7 @@ class PARQUET_EXPORT WriterProperties {
         parquet_data_page_version_(data_page_version),
         parquet_version_(version),
         parquet_created_by_(created_by),
-        integer_annotate_decimal_(integer_annotate_decimal),
+        store_decimal_as_integer_(store_short_decimal_as_integer),
         file_encryption_properties_(file_encryption_properties),
         default_column_properties_(default_column_properties),
         column_properties_(column_properties) {}
@@ -628,7 +644,7 @@ class PARQUET_EXPORT WriterProperties {
   ParquetDataPageVersion parquet_data_page_version_;
   ParquetVersion::type parquet_version_;
   std::string parquet_created_by_;
-  bool integer_annotate_decimal_;
+  bool store_decimal_as_integer_;
 
   std::shared_ptr<FileEncryptionProperties> file_encryption_properties_;