You are viewing a plain text version of this content. The canonical link for it is not included in this plain-text copy.
Posted to commits@arrow.apache.org by ne...@apache.org on 2021/08/08 07:57:25 UTC
[arrow-rs] branch master updated: allocate enough bytes when writing booleans (#658)

This is an automated email from the ASF dual-hosted git repository.

nevime pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 75432ed  allocate enough bytes when writing booleans (#658)
75432ed is described below

commit 75432edb05ff001481df728607fc5b9be969c266
Author: Ben Chambers <35...@users.noreply.github.com>
AuthorDate: Sun Aug 8 00:57:17 2021 -0700

    allocate enough bytes when writing booleans (#658)
    
    * allocate enough bytes when writing booleans
    
    * round up to nearest multiple of 256
---
 parquet/src/arrow/arrow_writer.rs | 28 +++++++++++++++++++++++++++-
 parquet/src/data_type.rs          |  8 +++++++-
 2 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/parquet/src/arrow/arrow_writer.rs b/parquet/src/arrow/arrow_writer.rs
index 4726734..7728cd4 100644
--- a/parquet/src/arrow/arrow_writer.rs
+++ b/parquet/src/arrow/arrow_writer.rs
@@ -227,7 +227,7 @@ fn write_leaves(
         ArrowDataType::FixedSizeList(_, _) | ArrowDataType::Union(_) => {
             Err(ParquetError::NYI(
                 format!(
-                    "Attempting to write an Arrow type {:?} to parquet that is not yet implemented", 
+                    "Attempting to write an Arrow type {:?} to parquet that is not yet implemented",
                     array.data_type()
                 )
             ))
@@ -1200,6 +1200,32 @@ mod tests {
     }
 
     #[test]
+    fn bool_large_single_column() {
+        let values = Arc::new(
+            [None, Some(true), Some(false)]
+                .iter()
+                .cycle()
+                .copied()
+                .take(200_000)
+                .collect::<BooleanArray>(),
+        );
+        let schema =
+            Schema::new(vec![Field::new("col", values.data_type().clone(), true)]);
+        let expected_batch =
+            RecordBatch::try_new(Arc::new(schema), vec![values]).unwrap();
+        let file = get_temp_file("bool_large_single_column", &[]);
+
+        let mut writer = ArrowWriter::try_new(
+            file.try_clone().unwrap(),
+            expected_batch.schema(),
+            None,
+        )
+        .expect("Unable to write file");
+        writer.write(&expected_batch).unwrap();
+        writer.close().unwrap();
+    }
+
+    #[test]
     fn i8_single_column() {
         required_and_optional::<Int8Array, _>(0..SMALL_SIZE as i8, "i8_single_column");
     }
diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs
index 127ba95..3573362 100644
--- a/parquet/src/data_type.rs
+++ b/parquet/src/data_type.rs
@@ -588,6 +588,7 @@ pub(crate) mod private {
     use crate::util::bit_util::{BitReader, BitWriter};
     use crate::util::memory::ByteBufferPtr;
 
+    use arrow::util::bit_util::round_upto_power_of_2;
     use byteorder::ByteOrder;
     use std::convert::TryInto;
 
@@ -669,7 +670,12 @@ pub(crate) mod private {
             bit_writer: &mut BitWriter,
         ) -> Result<()> {
             if bit_writer.bytes_written() + values.len() / 8 >= bit_writer.capacity() {
-                bit_writer.extend(256);
+                let bits_available =
+                    (bit_writer.capacity() - bit_writer.bytes_written()) * 8;
+                let bits_needed = values.len() - bits_available;
+                let bytes_needed = (bits_needed + 7) / 8;
+                let bytes_needed = round_upto_power_of_2(bytes_needed, 256);
+                bit_writer.extend(bytes_needed);
             }
             for value in values {
                 if !bit_writer.put_value(*value as u64, 1) {