You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2022/07/29 15:38:00 UTC

[arrow-datafusion] branch master updated: fix `RowWriter` index out of bounds error (#2968)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
     new 811bad406 fix `RowWriter`  index out of bounds error (#2968)
811bad406 is described below

commit 811bad406f5ab4d58352cedc64e28f2bcacba9a9
Author: comphead <co...@users.noreply.github.com>
AuthorDate: Fri Jul 29 08:37:55 2022 -0700

    fix `RowWriter`  index out of bounds error (#2968)
    
    * fix writer index out of bounds
    
    * fix comments
    
    * removed obsolete comments
---
 datafusion/core/src/dataframe.rs | 37 +++++++++++++++++++++++++++++++++++++
 datafusion/row/src/writer.rs     | 28 +++++++++++++---------------
 2 files changed, 50 insertions(+), 15 deletions(-)

diff --git a/datafusion/core/src/dataframe.rs b/datafusion/core/src/dataframe.rs
index 2c99f5be1..9084619e4 100644
--- a/datafusion/core/src/dataframe.rs
+++ b/datafusion/core/src/dataframe.rs
@@ -1266,4 +1266,41 @@ mod tests {
 
         Ok(())
     }
+
+    #[tokio::test]
+    async fn row_writer_resize_test() -> Result<()> {
+        let schema = Arc::new(Schema::new(vec![arrow::datatypes::Field::new(
+            "column_1",
+            DataType::Utf8,
+            false,
+        )]));
+
+        let data = RecordBatch::try_new(
+            schema.clone(),
+            vec![
+                Arc::new(arrow::array::StringArray::from(vec![
+                    Some("2a0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"),
+                    Some("3a0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000800"),
+                ]))
+            ],
+        )?;
+
+        let table = crate::datasource::MemTable::try_new(schema, vec![vec![data]])?;
+
+        let ctx = SessionContext::new();
+        ctx.register_table("test", Arc::new(table))?;
+
+        let sql = r#"
+        SELECT 
+            COUNT(1)
+        FROM 
+            test
+        GROUP BY
+            column_1"#;
+
+        let df = ctx.sql(sql).await.unwrap();
+        df.show_limit(10).await.unwrap();
+
+        Ok(())
+    }
 }
diff --git a/datafusion/row/src/writer.rs b/datafusion/row/src/writer.rs
index 2992ec175..fcfaae976 100644
--- a/datafusion/row/src/writer.rs
+++ b/datafusion/row/src/writer.rs
@@ -256,7 +256,7 @@ impl RowWriter {
     pub(crate) fn end_padding(&mut self) {
         let payload_width = self.current_width();
         self.row_width = round_upto_power_of_2(payload_width, 8);
-        if self.data.capacity() < self.row_width {
+        if self.data.len() < self.row_width {
             self.data.resize(self.row_width, 0);
         }
     }
@@ -269,29 +269,29 @@ impl RowWriter {
 
 /// Stitch attributes of tuple in `batch` at `row_idx` and returns the tuple width
 pub fn write_row(
-    row: &mut RowWriter,
+    row_writer: &mut RowWriter,
     row_idx: usize,
     schema: &Schema,
     columns: &[ArrayRef],
 ) -> usize {
     // Get the row from the batch denoted by row_idx
-    if row.null_free() {
+    if row_writer.null_free() {
         for ((i, f), col) in schema.fields().iter().enumerate().zip(columns.iter()) {
-            write_field(i, row_idx, col, f.data_type(), row);
+            write_field(i, row_idx, col, f.data_type(), row_writer);
         }
     } else {
         for ((i, f), col) in schema.fields().iter().enumerate().zip(columns.iter()) {
             if !col.is_null(row_idx) {
-                row.set_non_null_at(i);
-                write_field(i, row_idx, col, f.data_type(), row);
+                row_writer.set_non_null_at(i);
+                write_field(i, row_idx, col, f.data_type(), row_writer);
             } else {
-                row.set_null_at(i);
+                row_writer.set_null_at(i);
             }
         }
     }
 
-    row.end_padding();
-    row.row_width
+    row_writer.end_padding();
+    row_writer.row_width
 }
 
 macro_rules! fn_write_field {
@@ -349,9 +349,8 @@ pub(crate) fn write_field_utf8(
     let from = from.as_any().downcast_ref::<StringArray>().unwrap();
     let s = from.value(row_idx);
     let new_width = to.current_width() + s.as_bytes().len();
-    if new_width > to.data.capacity() {
-        // double the capacity to avoid repeated resize
-        to.data.resize(max(to.data.capacity() * 2, new_width), 0);
+    if new_width > to.data.len() {
+        to.data.resize(max(to.data.capacity(), new_width), 0);
     }
     to.set_utf8(col_idx, s);
 }
@@ -365,9 +364,8 @@ pub(crate) fn write_field_binary(
     let from = from.as_any().downcast_ref::<BinaryArray>().unwrap();
     let s = from.value(row_idx);
     let new_width = to.current_width() + s.len();
-    if new_width > to.data.capacity() {
-        // double the capacity to avoid repeated resize
-        to.data.resize(max(to.data.capacity() * 2, new_width), 0);
+    if new_width > to.data.len() {
+        to.data.resize(max(to.data.capacity(), new_width), 0);
     }
     to.set_binary(col_idx, s);
 }