You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by wj...@apache.org on 2023/01/26 19:52:58 UTC
[arrow] branch master updated: MINOR: [C++][Parquet] Rephrase decimal annotation (#33694)
This is an automated email from the ASF dual-hosted git repository.
wjones127 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 068f49953c MINOR: [C++][Parquet] Rephrase decimal annotation (#33694)
068f49953c is described below
commit 068f49953cd6d5f1355a2885eccf7b6527e0d9be
Author: Gang Wu <us...@gmail.com>
AuthorDate: Fri Jan 27 03:52:51 2023 +0800
MINOR: [C++][Parquet] Rephrase decimal annotation (#33694)
### What changes are included in this PR?
- Rename the function in the writer properties to `enable_short_decimal_stored_as_integer`.
- Rephrase the comment to explain the behavior in detail.
Authored-by: Gang Wu <us...@gmail.com>
Signed-off-by: Will Jones <wi...@gmail.com>
---
cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 8 ++--
cpp/src/parquet/arrow/schema.cc | 2 +-
cpp/src/parquet/properties.h | 48 +++++++++++++++--------
3 files changed, 37 insertions(+), 21 deletions(-)
diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index bdfd0fe07d..6585d286d1 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -3443,8 +3443,8 @@ TEST(ArrowReadWrite, Decimal256AsInt) {
auto table = ::arrow::Table::Make(::arrow::schema({field("root", type)}), {array});
parquet::WriterProperties::Builder builder;
- // Enforce integer type to annotate decimal type
- auto writer_properties = builder.enable_integer_annotate_decimal()->build();
+ // Allow small decimals to be stored as int32 or int64.
+ auto writer_properties = builder.enable_store_decimal_as_integer()->build();
auto props_store_schema = ArrowWriterProperties::Builder().store_schema()->build();
CheckConfiguredRoundtrip(table, table, writer_properties, props_store_schema);
@@ -4821,8 +4821,8 @@ class TestIntegerAnnotateDecimalTypeParquetIO : public TestParquetIO<TestType> {
auto arrow_schema = ::arrow::schema({::arrow::field("a", values->type())});
parquet::WriterProperties::Builder builder;
- // Enforce integer type to annotate decimal type
- auto writer_properties = builder.enable_integer_annotate_decimal()->build();
+ // Allow small decimals to be stored as int32 or int64.
+ auto writer_properties = builder.enable_store_decimal_as_integer()->build();
std::shared_ptr<SchemaDescriptor> parquet_schema;
ASSERT_OK_NO_THROW(ToParquetSchema(arrow_schema.get(), *writer_properties,
*default_arrow_writer_properties(),
diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc
index 267b892e4b..07d481ba8e 100644
--- a/cpp/src/parquet/arrow/schema.cc
+++ b/cpp/src/parquet/arrow/schema.cc
@@ -357,7 +357,7 @@ Status FieldToNode(const std::string& name, const std::shared_ptr<Field>& field,
const auto& decimal_type = static_cast<const ::arrow::DecimalType&>(*field->type());
precision = decimal_type.precision();
scale = decimal_type.scale();
- if (properties.integer_annotate_decimal() && 1 <= precision && precision <= 18) {
+ if (properties.store_decimal_as_integer() && 1 <= precision && precision <= 18) {
type = precision <= 9 ? ParquetType ::INT32 : ParquetType ::INT64;
} else {
type = ParquetType::FIXED_LEN_BYTE_ARRAY;
diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h
index d45dcfe69f..ef4ae124a2 100644
--- a/cpp/src/parquet/properties.h
+++ b/cpp/src/parquet/properties.h
@@ -201,7 +201,7 @@ class PARQUET_EXPORT WriterProperties {
version_(ParquetVersion::PARQUET_2_4),
data_page_version_(ParquetDataPageVersion::V1),
created_by_(DEFAULT_CREATED_BY),
- integer_annotate_decimal_(false) {}
+ store_decimal_as_integer_(false) {}
virtual ~Builder() {}
/// Specify the memory pool for the writer. Default default_memory_pool.
@@ -452,19 +452,35 @@ class PARQUET_EXPORT WriterProperties {
return this->disable_statistics(path->ToDotString());
}
- /// Enable integer type to annotate decimal type as below:
- /// int32: 1 <= precision <= 9
- /// int64: 10 <= precision <= 18
- /// Default disabled.
- Builder* enable_integer_annotate_decimal() {
- integer_annotate_decimal_ = true;
+ /// Allow decimals with 1 <= precision <= 18 to be stored as integers.
+ ///
+ /// In Parquet, DECIMAL can be stored in any of the following physical types:
+ /// - int32: for 1 <= precision <= 9.
+ /// - int64: for 10 <= precision <= 18.
+ /// - fixed_len_byte_array: precision is limited by the array size.
+ /// Length n can store <= floor(log_10(2^(8*n - 1) - 1)) base-10 digits.
+ /// - binary: precision is unlimited. The minimum number of bytes to store
+ /// the unscaled value is used.
+ ///
+ /// By default, this is DISABLED and all decimal types annotate fixed_len_byte_array.
+ ///
+ /// When enabled, the C++ writer will use following physical types to store decimals:
+ /// - int32: for 1 <= precision <= 9.
+ /// - int64: for 10 <= precision <= 18.
+ /// - fixed_len_byte_array: for precision > 18.
+ ///
+ /// As a consequence, decimal columns stored in integer types are more compact.
+ Builder* enable_store_decimal_as_integer() {
+ store_decimal_as_integer_ = true;
return this;
}
- /// Disable integer type to annotate decimal type.
+ /// Disable decimal logical type with 1 <= precision <= 18 to be stored as
+ /// integer physical type.
+ ///
/// Default disabled.
- Builder* disable_integer_annotate_decimal() {
- integer_annotate_decimal_ = false;
+ Builder* disable_store_decimal_as_integer() {
+ store_decimal_as_integer_ = false;
return this;
}
@@ -493,7 +509,7 @@ class PARQUET_EXPORT WriterProperties {
pool_, dictionary_pagesize_limit_, write_batch_size_, max_row_group_length_,
pagesize_, version_, created_by_, std::move(file_encryption_properties_),
default_column_properties_, column_properties, data_page_version_,
- integer_annotate_decimal_));
+ store_decimal_as_integer_));
}
private:
@@ -505,7 +521,7 @@ class PARQUET_EXPORT WriterProperties {
ParquetVersion::type version_;
ParquetDataPageVersion data_page_version_;
std::string created_by_;
- bool integer_annotate_decimal_;
+ bool store_decimal_as_integer_;
std::shared_ptr<FileEncryptionProperties> file_encryption_properties_;
@@ -536,7 +552,7 @@ class PARQUET_EXPORT WriterProperties {
inline std::string created_by() const { return parquet_created_by_; }
- inline bool integer_annotate_decimal() const { return integer_annotate_decimal_; }
+ inline bool store_decimal_as_integer() const { return store_decimal_as_integer_; }
inline Encoding::type dictionary_index_encoding() const {
if (parquet_version_ == ParquetVersion::PARQUET_1_0) {
@@ -606,7 +622,7 @@ class PARQUET_EXPORT WriterProperties {
std::shared_ptr<FileEncryptionProperties> file_encryption_properties,
const ColumnProperties& default_column_properties,
const std::unordered_map<std::string, ColumnProperties>& column_properties,
- ParquetDataPageVersion data_page_version, bool integer_annotate_decimal)
+ ParquetDataPageVersion data_page_version, bool store_short_decimal_as_integer)
: pool_(pool),
dictionary_pagesize_limit_(dictionary_pagesize_limit),
write_batch_size_(write_batch_size),
@@ -615,7 +631,7 @@ class PARQUET_EXPORT WriterProperties {
parquet_data_page_version_(data_page_version),
parquet_version_(version),
parquet_created_by_(created_by),
- integer_annotate_decimal_(integer_annotate_decimal),
+ store_decimal_as_integer_(store_short_decimal_as_integer),
file_encryption_properties_(file_encryption_properties),
default_column_properties_(default_column_properties),
column_properties_(column_properties) {}
@@ -628,7 +644,7 @@ class PARQUET_EXPORT WriterProperties {
ParquetDataPageVersion parquet_data_page_version_;
ParquetVersion::type parquet_version_;
std::string parquet_created_by_;
- bool integer_annotate_decimal_;
+ bool store_decimal_as_integer_;
std::shared_ptr<FileEncryptionProperties> file_encryption_properties_;