You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by su...@apache.org on 2019/04/14 21:31:59 UTC
[arrow] branch master updated: ARROW-5129: [Rust] Column writer
bug: check dictionary encoder when adding a new data page
This is an automated email from the ASF dual-hosted git repository.
sunchao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 6241a38 ARROW-5129: [Rust] Column writer bug: check dictionary encoder when adding a new data page
6241a38 is described below
commit 6241a3857efa11de48beae8c6a76c0ec6400bf4c
Author: Ivan Sadikov <iv...@gmail.com>
AuthorDate: Sun Apr 14 14:31:49 2019 -0700
ARROW-5129: [Rust] Column writer bug: check dictionary encoder when adding a new data page
This PR fixes the issue when we would encode all data into a single data page, unless we fall back to the secondary encoder. Added a test to verify the correctness of the patch.
The test is a bit awkward and clunky, but that is the only way I could come up with to test the patch, since I need to check the written pages (I could not get "adding buffer to TestPageWriter" to work because of the box + lifetime issues).
Author: Ivan Sadikov <iv...@gmail.com>
Closes #4152 from sadikovi/ARROW-5129 and squashes the following commits:
61d2741b <Ivan Sadikov> fix style
42eb4de8 <Ivan Sadikov> add test
be7f41a8 <Ivan Sadikov> check dict encoder for data pages
---
rust/parquet/src/column/writer.rs | 55 ++++++++++++++++++++++++++++++++++++++-
1 file changed, 54 insertions(+), 1 deletion(-)
diff --git a/rust/parquet/src/column/writer.rs b/rust/parquet/src/column/writer.rs
index b520997..08e6e46 100644
--- a/rust/parquet/src/column/writer.rs
+++ b/rust/parquet/src/column/writer.rs
@@ -421,7 +421,15 @@ impl<T: DataType> ColumnWriterImpl<T> {
/// Returns true if there is enough data for a data page, false otherwise.
#[inline]
fn should_add_data_page(&self) -> bool {
- self.encoder.estimated_data_encoded_size() >= self.props.data_pagesize_limit()
+ match self.dict_encoder {
+ Some(ref encoder) => {
+ encoder.estimated_data_encoded_size() >= self.props.data_pagesize_limit()
+ }
+ None => {
+ self.encoder.estimated_data_encoded_size()
+ >= self.props.data_pagesize_limit()
+ }
+ }
}
/// Performs dictionary fallback.
@@ -1379,6 +1387,51 @@ mod tests {
);
}
+ #[test]
+ fn test_column_writer_add_data_pages_with_dict() {
+ // ARROW-5129: Test verifies that we add data page in case of dictionary encoding
+ // and no fallback occurred so far.
+ let file = get_temp_file("test_column_writer_add_data_pages_with_dict", &[]);
+ let sink = FileSink::new(&file);
+ let page_writer = Box::new(SerializedPageWriter::new(sink));
+ let props = Rc::new(
+ WriterProperties::builder()
+ .set_data_pagesize_limit(15) // actually each page will have size 15-18 bytes
+ .set_write_batch_size(3) // write 3 values at a time
+ .build(),
+ );
+ let data = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
+ let mut writer = get_test_column_writer::<Int32Type>(page_writer, 0, 0, props);
+ writer.write_batch(data, None, None).unwrap();
+ let (bytes_written, _, _) = writer.close().unwrap();
+
+ // Read pages and check the sequence
+ let source = FileSource::new(&file, 0, bytes_written as usize);
+ let mut page_reader = Box::new(
+ SerializedPageReader::new(
+ source,
+ data.len() as i64,
+ Compression::UNCOMPRESSED,
+ Int32Type::get_physical_type(),
+ )
+ .unwrap(),
+ );
+ let mut res = Vec::new();
+ while let Some(page) = page_reader.get_next_page().unwrap() {
+ res.push((page.page_type(), page.num_values()));
+ }
+ assert_eq!(
+ res,
+ vec![
+ (PageType::DICTIONARY_PAGE, 10),
+ (PageType::DATA_PAGE, 3),
+ (PageType::DATA_PAGE, 3),
+ (PageType::DATA_PAGE, 3),
+ (PageType::DATA_PAGE, 1)
+ ]
+ );
+ }
+
/// Performs write-read roundtrip with randomly generated values and levels.
/// `max_size` is maximum number of values or levels (if `max_def_level` > 0) to write
/// for a column.