You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2023/06/28 18:53:56 UTC

[arrow-rs] branch master updated: Fix empty offset index for all null columns (#4459) (#4460)

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 554aebe3b Fix empty offset index for all null columns (#4459) (#4460)
554aebe3b is described below

commit 554aebe3b523737b3aaf6109846f4735110b26f8
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Wed Jun 28 19:53:51 2023 +0100

    Fix empty offset index for all null columns (#4459) (#4460)
---
 parquet/src/arrow/arrow_writer/mod.rs | 27 +++++++++++++++++++++++++++
 parquet/src/column/writer/mod.rs      |  7 +++----
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs
index 0aca77f5b..ccec4ffb2 100644
--- a/parquet/src/arrow/arrow_writer/mod.rs
+++ b/parquet/src/arrow/arrow_writer/mod.rs
@@ -790,6 +790,7 @@ mod tests {
     use arrow::util::pretty::pretty_format_batches;
     use arrow::{array::*, buffer::Buffer};
     use arrow_array::RecordBatch;
+    use arrow_buffer::NullBuffer;
     use arrow_schema::Fields;
 
     use crate::basic::Encoding;
@@ -2609,4 +2610,30 @@ mod tests {
 
         writer.close().unwrap();
     }
+
+    #[test]
+    fn test_writer_all_null() {
+        let a = Int32Array::from(vec![1, 2, 3, 4, 5]);
+        let b = Int32Array::new(vec![0; 5].into(), Some(NullBuffer::new_null(5)));
+        let batch = RecordBatch::try_from_iter(vec![
+            ("a", Arc::new(a) as ArrayRef),
+            ("b", Arc::new(b) as ArrayRef),
+        ])
+        .unwrap();
+
+        let mut buf = Vec::with_capacity(1024);
+        let mut writer = ArrowWriter::try_new(&mut buf, batch.schema(), None).unwrap();
+        writer.write(&batch).unwrap();
+        writer.close().unwrap();
+
+        let bytes = Bytes::from(buf);
+        let options = ReadOptionsBuilder::new().with_page_index().build();
+        let reader = SerializedFileReader::new_with_options(bytes, options).unwrap();
+        let index = reader.metadata().offset_index().unwrap();
+
+        assert_eq!(index.len(), 1);
+        assert_eq!(index[0].len(), 2); // 2 columns
+        assert_eq!(index[0][0].len(), 1); // 1 page
+        assert_eq!(index[0][1].len(), 1); // 1 page
+    }
 }
diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs
index 4aefef98f..1cacfe793 100644
--- a/parquet/src/column/writer/mod.rs
+++ b/parquet/src/column/writer/mod.rs
@@ -690,11 +690,10 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
                     }
                 }
             }
-
-            // update the offset index
-            self.offset_index_builder
-                .append_row_count(self.page_metrics.num_buffered_rows as i64);
         }
+        // update the offset index
+        self.offset_index_builder
+            .append_row_count(self.page_metrics.num_buffered_rows as i64);
     }
 
     fn truncate_min_value(&self, data: &[u8]) -> Vec<u8> {