You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/11/22 18:58:42 UTC

[arrow-rs] branch master updated: Fix parquet decimal precision (#3164)

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new ed1d74b71 Fix parquet decimal precision (#3164)
ed1d74b71 is described below

commit ed1d74b718ed9a7e99de452d7fd5794f549273b6
Author: Vrishabh <ps...@gmail.com>
AuthorDate: Wed Nov 23 00:28:36 2022 +0530

    Fix parquet decimal precision (#3164)
---
 parquet/src/arrow/arrow_reader/mod.rs | 34 ++++++++++++++++++++++++++++++++++
 parquet/src/arrow/schema.rs           |  9 ++++++++-
 2 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs
index a720d439c..da4b56237 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -2518,4 +2518,38 @@ mod tests {
         assert_eq!(actual.num_rows(), 1);
         assert_eq!(actual.column(0), &expected.column(0).slice(1, 1));
     }
+
+    #[test]
+    fn test_arbitary_decimal() {
+        let values = [1, 2, 3, 4, 5, 6, 7, 8];
+        let decimals_19_0 = Decimal128Array::from_iter_values(values)
+            .with_precision_and_scale(19, 0)
+            .unwrap();
+        let decimals_12_0 = Decimal128Array::from_iter_values(values)
+            .with_precision_and_scale(12, 0)
+            .unwrap();
+        let decimals_17_10 = Decimal128Array::from_iter_values(values)
+            .with_precision_and_scale(17, 10)
+            .unwrap();
+
+        let written = RecordBatch::try_from_iter([
+            ("decimal_values_19_0", Arc::new(decimals_19_0) as ArrayRef),
+            ("decimal_values_12_0", Arc::new(decimals_12_0) as ArrayRef),
+            ("decimal_values_17_10", Arc::new(decimals_17_10) as ArrayRef),
+        ])
+        .unwrap();
+
+        let mut buffer = Vec::with_capacity(1024);
+        let mut writer =
+            ArrowWriter::try_new(&mut buffer, written.schema(), None).unwrap();
+        writer.write(&written).unwrap();
+        writer.close().unwrap();
+
+        let read = ParquetRecordBatchReader::try_new(Bytes::from(buffer), 8)
+            .unwrap()
+            .collect::<Result<Vec<_>, _>>()
+            .unwrap();
+
+        assert_eq!(&written.slice(0, 8), &read[0]);
+    }
 }
diff --git a/parquet/src/arrow/schema.rs b/parquet/src/arrow/schema.rs
index 07afccdb2..464b86d0c 100644
--- a/parquet/src/arrow/schema.rs
+++ b/parquet/src/arrow/schema.rs
@@ -233,7 +233,14 @@ pub fn parquet_to_arrow_field(parquet_column: &ColumnDescriptor) -> Result<Field
 }
 
 pub fn decimal_length_from_precision(precision: u8) -> usize {
-    (10.0_f64.powi(precision as i32).log2() / 8.0).ceil() as usize
+    // digits = floor(log_10(2^(8*n - 1) - 1))  // definition in parquet's logical types
+    // ceil(digits) = log10(2^(8*n - 1) - 1)
+    // 10^ceil(digits) = 2^(8*n - 1) - 1
+    // 10^ceil(digits) + 1 = 2^(8*n - 1)
+    // log2(10^ceil(digits) + 1) = (8*n - 1)
+    // log2(10^ceil(digits) + 1) + 1 = 8*n
+    // (log2(10^ceil(digits) + 1) + 1) / 8 = n
+    (((10.0_f64.powi(precision as i32) + 1.0).log2() + 1.0) / 8.0).ceil() as usize
 }
 
 /// Convert an arrow field to a parquet `Type`