You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/11/22 18:58:42 UTC
[arrow-rs] branch master updated: Fix parquet decimal precision (#3164)
This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new ed1d74b71 Fix parquet decimal precision (#3164)
ed1d74b71 is described below
commit ed1d74b718ed9a7e99de452d7fd5794f549273b6
Author: Vrishabh <ps...@gmail.com>
AuthorDate: Wed Nov 23 00:28:36 2022 +0530
Fix parquet decimal precision (#3164)
---
parquet/src/arrow/arrow_reader/mod.rs | 34 ++++++++++++++++++++++++++++++++++
parquet/src/arrow/schema.rs | 9 ++++++++-
2 files changed, 42 insertions(+), 1 deletion(-)
diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs
index a720d439c..da4b56237 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -2518,4 +2518,38 @@ mod tests {
assert_eq!(actual.num_rows(), 1);
assert_eq!(actual.column(0), &expected.column(0).slice(1, 1));
}
+
+ #[test]
+ fn test_arbitary_decimal() {
+ let values = [1, 2, 3, 4, 5, 6, 7, 8];
+ let decimals_19_0 = Decimal128Array::from_iter_values(values)
+ .with_precision_and_scale(19, 0)
+ .unwrap();
+ let decimals_12_0 = Decimal128Array::from_iter_values(values)
+ .with_precision_and_scale(12, 0)
+ .unwrap();
+ let decimals_17_10 = Decimal128Array::from_iter_values(values)
+ .with_precision_and_scale(17, 10)
+ .unwrap();
+
+ let written = RecordBatch::try_from_iter([
+ ("decimal_values_19_0", Arc::new(decimals_19_0) as ArrayRef),
+ ("decimal_values_12_0", Arc::new(decimals_12_0) as ArrayRef),
+ ("decimal_values_17_10", Arc::new(decimals_17_10) as ArrayRef),
+ ])
+ .unwrap();
+
+ let mut buffer = Vec::with_capacity(1024);
+ let mut writer =
+ ArrowWriter::try_new(&mut buffer, written.schema(), None).unwrap();
+ writer.write(&written).unwrap();
+ writer.close().unwrap();
+
+ let read = ParquetRecordBatchReader::try_new(Bytes::from(buffer), 8)
+ .unwrap()
+ .collect::<Result<Vec<_>, _>>()
+ .unwrap();
+
+ assert_eq!(&written.slice(0, 8), &read[0]);
+ }
}
diff --git a/parquet/src/arrow/schema.rs b/parquet/src/arrow/schema.rs
index 07afccdb2..464b86d0c 100644
--- a/parquet/src/arrow/schema.rs
+++ b/parquet/src/arrow/schema.rs
@@ -233,7 +233,14 @@ pub fn parquet_to_arrow_field(parquet_column: &ColumnDescriptor) -> Result<Field
}
pub fn decimal_length_from_precision(precision: u8) -> usize {
- (10.0_f64.powi(precision as i32).log2() / 8.0).ceil() as usize
+ // digits = floor(log_10(2^(8*n - 1) - 1)) // definition in parquet's logical types
+ // ceil(digits) = log10(2^(8*n - 1) - 1)
+ // 10^ceil(digits) = 2^(8*n - 1) - 1
+ // 10^ceil(digits) + 1 = 2^(8*n - 1)
+ // log2(10^ceil(digits) + 1) = (8*n - 1)
+ // log2(10^ceil(digits) + 1) + 1 = 8*n
+ // (log2(10^ceil(digits) + 1) + 1) / 8 = n
+ (((10.0_f64.powi(precision as i32) + 1.0).log2() + 1.0) / 8.0).ceil() as usize
}
/// Convert an arrow field to a parquet `Type`