You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ne...@apache.org on 2021/08/08 07:57:25 UTC
[arrow-rs] branch master updated: allocate enough bytes when writing booleans (#658)
This is an automated email from the ASF dual-hosted git repository.
nevime pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 75432ed allocate enough bytes when writing booleans (#658)
75432ed is described below
commit 75432edb05ff001481df728607fc5b9be969c266
Author: Ben Chambers <35...@users.noreply.github.com>
AuthorDate: Sun Aug 8 00:57:17 2021 -0700
allocate enough bytes when writing booleans (#658)
* allocate enough bytes when writing booleans
* round up to nearest multiple of 256
---
parquet/src/arrow/arrow_writer.rs | 28 +++++++++++++++++++++++++++-
parquet/src/data_type.rs | 8 +++++++-
2 files changed, 34 insertions(+), 2 deletions(-)
diff --git a/parquet/src/arrow/arrow_writer.rs b/parquet/src/arrow/arrow_writer.rs
index 4726734..7728cd4 100644
--- a/parquet/src/arrow/arrow_writer.rs
+++ b/parquet/src/arrow/arrow_writer.rs
@@ -227,7 +227,7 @@ fn write_leaves(
ArrowDataType::FixedSizeList(_, _) | ArrowDataType::Union(_) => {
Err(ParquetError::NYI(
format!(
- "Attempting to write an Arrow type {:?} to parquet that is not yet implemented",
+ "Attempting to write an Arrow type {:?} to parquet that is not yet implemented",
array.data_type()
)
))
@@ -1200,6 +1200,32 @@ mod tests {
}
#[test]
+ fn bool_large_single_column() {
+ let values = Arc::new(
+ [None, Some(true), Some(false)]
+ .iter()
+ .cycle()
+ .copied()
+ .take(200_000)
+ .collect::<BooleanArray>(),
+ );
+ let schema =
+ Schema::new(vec![Field::new("col", values.data_type().clone(), true)]);
+ let expected_batch =
+ RecordBatch::try_new(Arc::new(schema), vec![values]).unwrap();
+ let file = get_temp_file("bool_large_single_column", &[]);
+
+ let mut writer = ArrowWriter::try_new(
+ file.try_clone().unwrap(),
+ expected_batch.schema(),
+ None,
+ )
+ .expect("Unable to write file");
+ writer.write(&expected_batch).unwrap();
+ writer.close().unwrap();
+ }
+
+ #[test]
fn i8_single_column() {
required_and_optional::<Int8Array, _>(0..SMALL_SIZE as i8, "i8_single_column");
}
diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs
index 127ba95..3573362 100644
--- a/parquet/src/data_type.rs
+++ b/parquet/src/data_type.rs
@@ -588,6 +588,7 @@ pub(crate) mod private {
use crate::util::bit_util::{BitReader, BitWriter};
use crate::util::memory::ByteBufferPtr;
+ use arrow::util::bit_util::round_upto_power_of_2;
use byteorder::ByteOrder;
use std::convert::TryInto;
@@ -669,7 +670,12 @@ pub(crate) mod private {
bit_writer: &mut BitWriter,
) -> Result<()> {
if bit_writer.bytes_written() + values.len() / 8 >= bit_writer.capacity() {
- bit_writer.extend(256);
+ let bits_available =
+ (bit_writer.capacity() - bit_writer.bytes_written()) * 8;
+ let bits_needed = values.len() - bits_available;
+ let bytes_needed = (bits_needed + 7) / 8;
+ let bytes_needed = round_upto_power_of_2(bytes_needed, 256);
+ bit_writer.extend(bytes_needed);
}
for value in values {
if !bit_writer.put_value(*value as u64, 1) {