You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by su...@apache.org on 2019/04/14 21:31:59 UTC

[arrow] branch master updated: ARROW-5129: [Rust] Column writer bug: check dictionary encoder when adding a new data page

This is an automated email from the ASF dual-hosted git repository.

sunchao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 6241a38  ARROW-5129: [Rust] Column writer bug: check dictionary encoder when adding a new data page
6241a38 is described below

commit 6241a3857efa11de48beae8c6a76c0ec6400bf4c
Author: Ivan Sadikov <iv...@gmail.com>
AuthorDate: Sun Apr 14 14:31:49 2019 -0700

    ARROW-5129: [Rust] Column writer bug: check dictionary encoder when adding a new data page
    
    This PR fixes the issue when we would encode all data into a single data page, unless we fall back to secondary encoder. Added test to verify the correctness of the patch.
    
    Test is a bit awkward and clunky, but that is the only way I could come up with to test the patch, since I need to check the written pages (I could not get "adding buffer to TestPageWriter" to work because of the box + lifetime issues).
    
    Author: Ivan Sadikov <iv...@gmail.com>
    
    Closes #4152 from sadikovi/ARROW-5129 and squashes the following commits:
    
    61d2741b <Ivan Sadikov> fix style
    42eb4de8 <Ivan Sadikov> add test
    be7f41a8 <Ivan Sadikov> check dict encoder for data pages
---
 rust/parquet/src/column/writer.rs | 55 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 54 insertions(+), 1 deletion(-)

diff --git a/rust/parquet/src/column/writer.rs b/rust/parquet/src/column/writer.rs
index b520997..08e6e46 100644
--- a/rust/parquet/src/column/writer.rs
+++ b/rust/parquet/src/column/writer.rs
@@ -421,7 +421,15 @@ impl<T: DataType> ColumnWriterImpl<T> {
     /// Returns true if there is enough data for a data page, false otherwise.
     #[inline]
     fn should_add_data_page(&self) -> bool {
-        self.encoder.estimated_data_encoded_size() >= self.props.data_pagesize_limit()
+        match self.dict_encoder {
+            Some(ref encoder) => {
+                encoder.estimated_data_encoded_size() >= self.props.data_pagesize_limit()
+            }
+            None => {
+                self.encoder.estimated_data_encoded_size()
+                    >= self.props.data_pagesize_limit()
+            }
+        }
     }
 
     /// Performs dictionary fallback.
@@ -1379,6 +1387,51 @@ mod tests {
         );
     }
 
+    #[test]
+    fn test_column_writer_add_data_pages_with_dict() {
+        // ARROW-5129: Test verifies that we add data page in case of dictionary encoding
+        // and no fallback occurred so far.
+        let file = get_temp_file("test_column_writer_add_data_pages_with_dict", &[]);
+        let sink = FileSink::new(&file);
+        let page_writer = Box::new(SerializedPageWriter::new(sink));
+        let props = Rc::new(
+            WriterProperties::builder()
+                .set_data_pagesize_limit(15) // actually each page will have size 15-18 bytes
+                .set_write_batch_size(3) // write 3 values at a time
+                .build(),
+        );
+        let data = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
+        let mut writer = get_test_column_writer::<Int32Type>(page_writer, 0, 0, props);
+        writer.write_batch(data, None, None).unwrap();
+        let (bytes_written, _, _) = writer.close().unwrap();
+
+        // Read pages and check the sequence
+        let source = FileSource::new(&file, 0, bytes_written as usize);
+        let mut page_reader = Box::new(
+            SerializedPageReader::new(
+                source,
+                data.len() as i64,
+                Compression::UNCOMPRESSED,
+                Int32Type::get_physical_type(),
+            )
+            .unwrap(),
+        );
+        let mut res = Vec::new();
+        while let Some(page) = page_reader.get_next_page().unwrap() {
+            res.push((page.page_type(), page.num_values()));
+        }
+        assert_eq!(
+            res,
+            vec![
+                (PageType::DICTIONARY_PAGE, 10),
+                (PageType::DATA_PAGE, 3),
+                (PageType::DATA_PAGE, 3),
+                (PageType::DATA_PAGE, 3),
+                (PageType::DATA_PAGE, 1)
+            ]
+        );
+    }
+
     /// Performs write-read roundtrip with randomly generated values and levels.
     /// `max_size` is maximum number of values or levels (if `max_def_level` > 0) to write
     /// for a column.