You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ne...@apache.org on 2021/06/25 16:36:53 UTC

[arrow-rs] branch master updated: Implement function slice for RecordBatch (#490)

This is an automated email from the ASF dual-hosted git repository.

nevime pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new de62168  Implement function slice for RecordBatch (#490)
de62168 is described below

commit de62168a4f428e3c334e1cfa5c5db23272f313d7
Author: baishen <ba...@gmail.com>
AuthorDate: Fri Jun 25 11:36:44 2021 -0500

    Implement function slice for RecordBatch (#490)
    
    * Implement RecordBatch::slice()
    
    * optimize
    
    * optimize
    
    * add test case
    
    * fix clippy
---
 arrow/src/record_batch.rs | 91 +++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 84 insertions(+), 7 deletions(-)

diff --git a/arrow/src/record_batch.rs b/arrow/src/record_batch.rs
index f1fd867..4d2abc3 100644
--- a/arrow/src/record_batch.rs
+++ b/arrow/src/record_batch.rs
@@ -244,6 +244,31 @@ impl RecordBatch {
         &self.columns[..]
     }
 
+    /// Return a new RecordBatch where each column is sliced
+    /// according to `offset` and `length`
+    ///
+    /// # Panics
+    ///
+    /// Panics if `offset` with `length` is greater than column length.
+    pub fn slice(&self, offset: usize, length: usize) -> RecordBatch {
+        if self.schema.fields().is_empty() {
+            assert!((offset + length) == 0);
+            return RecordBatch::new_empty(self.schema.clone());
+        }
+        assert!((offset + length) <= self.num_rows());
+
+        let columns = self
+            .columns()
+            .iter()
+            .map(|column| column.slice(offset, length))
+            .collect();
+
+        Self {
+            schema: self.schema.clone(),
+            columns,
+        }
+    }
+
     /// Create a `RecordBatch` from an iterable list of pairs of the
     /// form `(field_name, array)`, with the same requirements on
     /// fields and arrays as [`RecordBatch::try_new`]. This method is
@@ -414,16 +439,68 @@ mod tests {
         let record_batch =
             RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)])
                 .unwrap();
-        check_batch(record_batch)
+        check_batch(record_batch, 5)
     }
 
-    fn check_batch(record_batch: RecordBatch) {
-        assert_eq!(5, record_batch.num_rows());
+    fn check_batch(record_batch: RecordBatch, num_rows: usize) {
+        assert_eq!(num_rows, record_batch.num_rows());
         assert_eq!(2, record_batch.num_columns());
         assert_eq!(&DataType::Int32, record_batch.schema().field(0).data_type());
         assert_eq!(&DataType::Utf8, record_batch.schema().field(1).data_type());
-        assert_eq!(5, record_batch.column(0).data().len());
-        assert_eq!(5, record_batch.column(1).data().len());
+        assert_eq!(num_rows, record_batch.column(0).data().len());
+        assert_eq!(num_rows, record_batch.column(1).data().len());
+    }
+
+    #[test]
+    #[should_panic(expected = "assertion failed: (offset + length) <= self.num_rows()")]
+    fn create_record_batch_slice() {
+        let schema = Schema::new(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Utf8, false),
+        ]);
+        let expected_schema = schema.clone();
+
+        let a = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8]);
+        let b = StringArray::from(vec!["a", "b", "c", "d", "e", "f", "h", "i"]);
+
+        let record_batch =
+            RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)])
+                .unwrap();
+
+        let offset = 2;
+        let length = 5;
+        let record_batch_slice = record_batch.slice(offset, length);
+
+        assert_eq!(record_batch_slice.schema().as_ref(), &expected_schema);
+        check_batch(record_batch_slice, 5);
+
+        let offset = 2;
+        let length = 0;
+        let record_batch_slice = record_batch.slice(offset, length);
+
+        assert_eq!(record_batch_slice.schema().as_ref(), &expected_schema);
+        check_batch(record_batch_slice, 0);
+
+        let offset = 2;
+        let length = 10;
+        let _record_batch_slice = record_batch.slice(offset, length);
+    }
+
+    #[test]
+    #[should_panic(expected = "assertion failed: (offset + length) == 0")]
+    fn create_record_batch_slice_empty_batch() {
+        let schema = Schema::new(vec![]);
+
+        let record_batch = RecordBatch::new_empty(Arc::new(schema));
+
+        let offset = 0;
+        let length = 0;
+        let record_batch_slice = record_batch.slice(offset, length);
+        assert_eq!(0, record_batch_slice.schema().fields().len());
+
+        let offset = 1;
+        let length = 2;
+        let _record_batch_slice = record_batch.slice(offset, length);
     }
 
     #[test]
@@ -445,7 +522,7 @@ mod tests {
             Field::new("b", DataType::Utf8, false),
         ]);
         assert_eq!(record_batch.schema().as_ref(), &expected_schema);
-        check_batch(record_batch);
+        check_batch(record_batch, 5);
     }
 
     #[test]
@@ -465,7 +542,7 @@ mod tests {
             Field::new("b", DataType::Utf8, true),
         ]);
         assert_eq!(record_batch.schema().as_ref(), &expected_schema);
-        check_batch(record_batch);
+        check_batch(record_batch, 5);
     }
 
     #[test]