You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ke...@apache.org on 2023/09/08 14:55:17 UTC

[arrow] branch main updated: GH-37570: [MATLAB] Implement `isequal` for the `arrow.tabular.RecordBatch` MATLAB class (#37627)

This is an automated email from the ASF dual-hosted git repository.

kevingurney pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 0e6b8c5315 GH-37570: [MATLAB] Implement `isequal` for the `arrow.tabular.RecordBatch` MATLAB class (#37627)
0e6b8c5315 is described below

commit 0e6b8c53158be291c75c0b71cc72e7301171cff0
Author: sgilmore10 <74...@users.noreply.github.com>
AuthorDate: Fri Sep 8 10:55:10 2023 -0400

    GH-37570: [MATLAB] Implement `isequal` for the `arrow.tabular.RecordBatch` MATLAB class (#37627)
    
    
    
    ### Rationale for this change
    
    Following on to https://github.com/apache/arrow/pull/37474, https://github.com/apache/arrow/pull/37446, and https://github.com/apache/arrow/pull/37525, we should implement `isequal` for the `arrow.tabular.RecordBatch` MATLAB class.
    
    ### What changes are included in this PR?
    
    1. Implemented `isequal` method for `arrow.tabular.RecordBatch`
    
    ### Are these changes tested?
    
    Yes. Added `isequal` unit tests to `tRecordBatch.m`.
    
    ### Are there any user-facing changes?
    
    Yes, users can now use `isequal` to compare `arrow.tabular.RecordBatch`es.
    
    **Example**
    
    ```matlab
    >> t1 = table(1, "A", false, VariableNames=["Number",  "String", "Logical"]);
    >> t2 = table([1; 2], ["A"; "B"], [false; false], VariableNames=["Number",  "String", "Logical"]);
    >> rb1 = arrow.recordBatch(t1);
    >> rb2 = arrow.recordBatch(t2);
    >> rb3 = arrow.recordBatch(t1);
    
    >> isequal(rb1, rb2)
    
    ans =
    
      logical
    
       0
    
    >> isequal(rb1, rb3)
    
    ans =
    
      logical
    
       1
    ```
    
    ### Future Directions
    1. #37628
    
    * Closes: #37570
    
    Authored-by: Sarah Gilmore <sg...@mathworks.com>
    Signed-off-by: Kevin Gurney <kg...@mathworks.com>
---
 matlab/src/matlab/+arrow/+tabular/RecordBatch.m | 46 +++++++++++++++-
 matlab/test/arrow/tabular/tRecordBatch.m        | 72 +++++++++++++++++++++++++
 2 files changed, 116 insertions(+), 2 deletions(-)

diff --git a/matlab/src/matlab/+arrow/+tabular/RecordBatch.m b/matlab/src/matlab/+arrow/+tabular/RecordBatch.m
index f8a670d095..32269e9114 100644
--- a/matlab/src/matlab/+arrow/+tabular/RecordBatch.m
+++ b/matlab/src/matlab/+arrow/+tabular/RecordBatch.m
@@ -1,3 +1,7 @@
+%RECORDBATCH A tabular data structure representing a set of 
+%arrow.array.Array objects with a fixed schema.
+
+
 % Licensed to the Apache Software Foundation (ASF) under one or more
 % contributor license agreements.  See the NOTICE file distributed with
 % this work for additional information regarding copyright ownership.
@@ -15,8 +19,6 @@
 
 classdef RecordBatch < matlab.mixin.CustomDisplay & ...
                        matlab.mixin.Scalar
-%arrow.tabular.RecordBatch A tabular data structure representing
-% a set of arrow.array.Array objects with a fixed schema.
 
     properties (Dependent, SetAccess=private, GetAccess=public)
         NumColumns
@@ -91,6 +93,46 @@ classdef RecordBatch < matlab.mixin.CustomDisplay & ...
         function T = toMATLAB(obj)
             T = obj.table();
         end
+
+        function tf = isequal(obj, varargin)
+        %ISEQUAL True if all inputs are arrow.tabular.RecordBatch
+        % objects with equal schemas and equal corresponding columns.
+        %
+        %   TF = ISEQUAL(A, B, ...) requires at least two inputs and
+        %   returns a logical scalar. It returns false if any input is
+        %   not a RecordBatch.
+            narginchk(2, inf);
+            tf = false;
+
+            % MATLAB dispatches to this method when ANY input is a
+            % RecordBatch, so obj itself can be another type, e.g.
+            % isequal(1, rb). Validate obj along with varargin.
+            inputs = [{obj} varargin];
+            isRecordBatch = @(x) isa(x, "arrow.tabular.RecordBatch");
+            if ~all(cellfun(isRecordBatch, inputs))
+                % A non-RecordBatch input cannot be equal to a
+                % RecordBatch. Return false early.
+                return;
+            end
+
+            getSchemaFcn = @(rb) rb.Schema;
+            schemasToCompare = cellfun(getSchemaFcn, varargin, UniformOutput=false);
+            if ~isequal(obj.Schema, schemasToCompare{:})
+                % If the schemas are not equal, the record batches are not
+                % equal. Return false early.
+                return;
+            end
+
+            for ii = 1:obj.NumColumns
+                % Gather column ii from every record batch in varargin
+                % and compare them all with column ii of obj.
+                columnsToCompare = cellfun(@(rb) rb.column(ii), varargin, UniformOutput=false);
+                if ~isequal(obj.column(ii), columnsToCompare{:})
+                    return;
+                end
+            end
+            tf = true;
+        end
     end
 
     methods (Access = private)
diff --git a/matlab/test/arrow/tabular/tRecordBatch.m b/matlab/test/arrow/tabular/tRecordBatch.m
index f4b156a377..b26729012a 100644
--- a/matlab/test/arrow/tabular/tRecordBatch.m
+++ b/matlab/test/arrow/tabular/tRecordBatch.m
@@ -386,6 +386,78 @@ classdef tRecordBatch < matlab.unittest.TestCase
             testCase.verifyError(@() recordBatch.column(name), "arrow:badsubscript:NonScalar");
         end
 
+        function TestIsEqualTrue(testCase)
+            % Verify isequal returns true for record batches that:
+            %   1. Have the same schema
+            %   2. Have equal corresponding columns
+            import arrow.tabular.RecordBatch
+
+            a1 = arrow.array([1 2 3]);
+            a2 = arrow.array(["A" "B" "C"]);
+            a3 = arrow.array([true true false]);
+
+            rb1 = RecordBatch.fromArrays(a1, a2, a3, ...
+                ColumnNames=["A", "B", "C"]);
+            rb2 = RecordBatch.fromArrays(a1, a2, a3, ...
+                ColumnNames=["A", "B", "C"]);
+            testCase.verifyTrue(isequal(rb1, rb2));
+
+            % Compare zero-column record batches
+            rb3 = RecordBatch.fromArrays();
+            rb4 = RecordBatch.fromArrays();
+            testCase.verifyTrue(isequal(rb3, rb4));
+
+            % Compare zero-row record batches (built from empty arrays)
+            a4 = arrow.array([]);
+            a5 = arrow.array(strings(0, 0));
+            rb5 = RecordBatch.fromArrays(a4, a5, ColumnNames=["D" "E"]);
+            rb6 = RecordBatch.fromArrays(a4, a5, ColumnNames=["D" "E"]);
+            testCase.verifyTrue(isequal(rb5, rb6));
+
+            % Call isequal with more than two (zero-column) record batches
+            testCase.verifyTrue(isequal(rb3, rb4, rb3, rb4));
+        end
+
+        function TestIsEqualFalse(testCase)
+            % Verify isequal returns false for unequal record batches.
+            import arrow.tabular.RecordBatch
+
+            a1 = arrow.array([1 2 3]);
+            a2 = arrow.array(["A" "B" "C"]);
+            a3 = arrow.array([true true false]);
+            a4 = arrow.array(["A" missing "C"]); 
+            a5 = arrow.array([1 2]);
+            a6 = arrow.array(["A" "B"]);
+            a7 = arrow.array([true true]);
+
+            rb1 = RecordBatch.fromArrays(a1, a2, a3, ...
+                ColumnNames=["A", "B", "C"]);
+            rb2 = RecordBatch.fromArrays(a1, a2, a3, ...
+                ColumnNames=["D", "E", "F"]);
+            rb3 = RecordBatch.fromArrays(a1, a4, a3, ...
+                ColumnNames=["A", "B", "C"]);
+            rb4 = RecordBatch.fromArrays(a5, a6, a7, ...
+                ColumnNames=["A", "B", "C"]);
+            rb5 = RecordBatch.fromArrays(a1, a2, a3, a1, ...
+                ColumnNames=["A", "B", "C", "D"]);
+
+            % The column names are not equal
+            testCase.verifyFalse(isequal(rb1, rb2));
+
+            % The second columns are not equal (a2 versus a4)
+            testCase.verifyFalse(isequal(rb1, rb3));
+
+            % The number of rows is not equal
+            testCase.verifyFalse(isequal(rb1, rb4));
+
+            % The number of columns is not equal
+            testCase.verifyFalse(isequal(rb1, rb5));
+
+            % Call isequal with more than two record batches
+            testCase.verifyFalse(isequal(rb1, rb2, rb3, rb4));
+        end
+
+
     end
 
     methods