You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ko...@apache.org on 2023/05/09 15:34:07 UTC

[arrow] branch main updated: GH-35411: [MATLAB] Create a templated C++ Proxy Class for Numeric Arrays (#35479)

This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new b372242b68 GH-35411: [MATLAB] Create a templated C++ Proxy Class for Numeric Arrays (#35479)
b372242b68 is described below

commit b372242b686d0cc6eace5bd912dbf460edfc36e1
Author: sgilmore10 <74...@users.noreply.github.com>
AuthorDate: Tue May 9 11:33:59 2023 -0400

    GH-35411: [MATLAB] Create a templated C++ Proxy Class for Numeric Arrays (#35479)
    
    ### Rationale for this change
    
    This pull request is a followup to #34563. To facilitate the implementation of future array types, we would like to first create a C++ template class for numeric arrays. We also want to start adding basic tests for the array functionality in the MATLAB interface.
    
    ### What changes are included in this PR?
    
    1. Added a C++ template Class called `NumericArray` templated on `CType`.
    2. Re-implemented the `Float64Array` C++ proxy class in terms of the new template class, i.e. `NumericArray<double>`.
    3. Added a method called `double()` on the MATLAB Float64Array class to convert the arrow.[Type]Array to a MATLAB `double` array.
    4. Added basic tests for round-tripping float64 arrays.
    5. Created a base C++ proxy `Array` class that all proxy array classes will inherit from.
    6. Renamed `Print()` to `ToString()` and made it return a string instead of printing to the screen.
    
    ### Are these changes tested?
    
    Yes, we added automated test cases to the test class `tFloat64Array.m`. In addition, we manually qualified these changes on macOS.
    
    ### Are there any user-facing changes?
    Yes, the `Print()` method is no longer public and there is now a method called `double()` on `arrow.array.Float64Array`.
    
    Included below is a simple example of using the `double()` method:
    
    ```matlab
    >> arrowArray = arrow.array.Float64Array([1, 2, 3])
    
    arrowArray =
    
    [
      1,
      2,
      3
    ]
    
    >> matlabArray = double(arrowArray)
    
    matlabArray =
    
         1
         2
         3
    
    >> class(arrowArray)
    
    ans =
    
        'arrow.array.Float64Array'
    
    >> class(matlabArray)
    
    ans =
    
        'double'
    ```
    
    ### Future Directions
    
    1. Support the rest of the numeric types.
    2. Add an abstract MATLAB base class called `arrow.array.Array`.
    3. Continue building out the methods (e.g. `length()`)
    4.  Support `null` values (validity bitmap).
    5. Handle converting non-ascii characters from `UTF-8` to `UTF-16`.
    6. Handle errors in the C++ layer.
    
    * Closes: #35411
    
    Lead-authored-by: Sarah Gilmore <sg...@mathworks.com>
    Co-authored-by: sgilmore10 <74...@users.noreply.github.com>
    Co-authored-by: Kevin Gurney <kg...@mathworks.com>
    Co-authored-by: Sutou Kouhei <ko...@cozmixng.org>
    Signed-off-by: Sutou Kouhei <ko...@clear-code.com>
---
 .../array/proxy/{float64_array.cc => array.cc}     | 22 +++--
 .../array/proxy/{float64_array.cc => array.h}      | 26 ++++--
 .../cpp/arrow/matlab/array/proxy/float64_array.h   | 60 -------------
 .../cpp/arrow/matlab/array/proxy/numeric_array.h   | 97 ++++++++++++++++++++++
 matlab/src/cpp/arrow/matlab/proxy/factory.cc       |  4 +-
 matlab/src/matlab/+arrow/+array/Float64Array.m     | 27 ++++--
 matlab/test/arrow/array/tFloat64Array.m            | 81 ++++++++++++++++--
 matlab/tools/cmake/BuildMatlabArrowInterface.cmake |  3 +-
 8 files changed, 231 insertions(+), 89 deletions(-)

diff --git a/matlab/src/cpp/arrow/matlab/array/proxy/float64_array.cc b/matlab/src/cpp/arrow/matlab/array/proxy/array.cc
similarity index 61%
copy from matlab/src/cpp/arrow/matlab/array/proxy/float64_array.cc
copy to matlab/src/cpp/arrow/matlab/array/proxy/array.cc
index 4f25912b9e..865dd694b7 100644
--- a/matlab/src/cpp/arrow/matlab/array/proxy/float64_array.cc
+++ b/matlab/src/cpp/arrow/matlab/array/proxy/array.cc
@@ -15,11 +15,23 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include "float64_array.h"
+#include "arrow/matlab/array/proxy/array.h"
 
 namespace arrow::matlab::array::proxy {
-void Float64Array::Print(libmexclass::proxy::method::Context& context) {
-    // TODO: Return an MDA string representation of the Arrow array. 
-    std::cout << array->ToString() << std::endl;
+
+    Array::Array(const libmexclass::proxy::FunctionArguments& constructor_arguments) {
+
+        // Register Proxy methods.
+        REGISTER_METHOD(Array, ToString);
+        REGISTER_METHOD(Array, ToMatlab);
+
+    }
+
+    void Array::ToString(libmexclass::proxy::method::Context& context) {
+        ::matlab::data::ArrayFactory factory;
+
+        // TODO: handle non-ascii characters
+        auto str_mda = factory.createScalar(array->ToString());
+        context.outputs[0] = str_mda;
+    }
 }
-} // namespace arrow::matlab::array::proxy
diff --git a/matlab/src/cpp/arrow/matlab/array/proxy/float64_array.cc b/matlab/src/cpp/arrow/matlab/array/proxy/array.h
similarity index 64%
rename from matlab/src/cpp/arrow/matlab/array/proxy/float64_array.cc
rename to matlab/src/cpp/arrow/matlab/array/proxy/array.h
index 4f25912b9e..8a16630348 100644
--- a/matlab/src/cpp/arrow/matlab/array/proxy/float64_array.cc
+++ b/matlab/src/cpp/arrow/matlab/array/proxy/array.h
@@ -15,11 +15,27 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include "float64_array.h"
+#pragma once
+
+#include "arrow/array.h"
+
+#include "libmexclass/proxy/Proxy.h"
 
 namespace arrow::matlab::array::proxy {
-void Float64Array::Print(libmexclass::proxy::method::Context& context) {
-    // TODO: Return an MDA string representation of the Arrow array. 
-    std::cout << array->ToString() << std::endl;
+
+class Array : public libmexclass::proxy::Proxy {
+    public:
+        Array(const libmexclass::proxy::FunctionArguments& constructor_arguments);
+    
+        virtual ~Array() {}
+
+    protected:
+
+        void ToString(libmexclass::proxy::method::Context& context);
+
+        virtual void ToMatlab(libmexclass::proxy::method::Context& context) = 0;
+
+        std::shared_ptr<arrow::Array> array;
+};
+
 }
-} // namespace arrow::matlab::array::proxy
diff --git a/matlab/src/cpp/arrow/matlab/array/proxy/float64_array.h b/matlab/src/cpp/arrow/matlab/array/proxy/float64_array.h
deleted file mode 100644
index 3d6e449326..0000000000
--- a/matlab/src/cpp/arrow/matlab/array/proxy/float64_array.h
+++ /dev/null
@@ -1,60 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include "arrow/array.h"
-#include "arrow/builder.h"
-
-#include "libmexclass/proxy/Proxy.h"
-
-namespace arrow::matlab::array::proxy {
-class Float64Array : public libmexclass::proxy::Proxy {
-    public:
-        Float64Array(const libmexclass::proxy::FunctionArguments& constructor_arguments) {
-            // Get the mxArray from constructor arguments
-            const ::matlab::data::TypedArray<double> double_mda = constructor_arguments[0];
-
-            // Get raw pointer of mxArray
-            auto it(double_mda.cbegin());
-            auto dt = it.operator->();
-
-            // Pass pointer to Arrow array constructor that takes a buffer
-            // Do not make a copy when creating arrow::Buffer
-            std::shared_ptr<arrow::Buffer> buffer(
-                  new arrow::Buffer(reinterpret_cast<const uint8_t*>(dt), 
-                                    sizeof(double) * double_mda.getNumberOfElements()));
-            
-            // Construct arrow::NumericArray specialization using arrow::Buffer.
-            // pass in nulls information...we could compute and provide the number of nulls here too
-            std::shared_ptr<arrow::Array> array_wrapper(
-                new arrow::NumericArray<arrow::DoubleType>(double_mda.getNumberOfElements(), buffer,
-                                                       nullptr, // TODO: fill validity bitmap with data
-                                                       -1));
-
-            array = array_wrapper;
-
-            // Register Proxy methods.
-            REGISTER_METHOD(Float64Array, Print);
-        }
-    private:
-        void Print(libmexclass::proxy::method::Context& context);
-
-        // "Raw" arrow::Array
-        std::shared_ptr<arrow::Array> array;
-};
-} // namespace arrow::matlab::array::proxy
diff --git a/matlab/src/cpp/arrow/matlab/array/proxy/numeric_array.h b/matlab/src/cpp/arrow/matlab/array/proxy/numeric_array.h
new file mode 100644
index 0000000000..f7f19a60b0
--- /dev/null
+++ b/matlab/src/cpp/arrow/matlab/array/proxy/numeric_array.h
@@ -0,0 +1,97 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+
+#include "arrow/array.h"
+#include "arrow/array/data.h"
+#include "arrow/array/util.h"
+
+#include "arrow/builder.h"
+#include "arrow/type_traits.h"
+
+#include "arrow/matlab/array/proxy/array.h"
+
+#include "libmexclass/proxy/Proxy.h"
+
+namespace arrow::matlab::array::proxy {
+
+template<typename CType>
+class NumericArray : public arrow::matlab::array::proxy::Array {
+    public:
+        NumericArray(const libmexclass::proxy::FunctionArguments& constructor_arguments)
+            : arrow::matlab::array::proxy::Array(constructor_arguments) {
+            using ArrowType = typename arrow::CTypeTraits<CType>::ArrowType;
+            using BuilderType = typename arrow::CTypeTraits<CType>::BuilderType;
+
+            // Get the mxArray from constructor arguments
+            const ::matlab::data::TypedArray<CType> numeric_mda = constructor_arguments[0];
+            const ::matlab::data::TypedArray<bool> make_copy = constructor_arguments[1];
+
+            // Get raw pointer of mxArray
+            auto it(numeric_mda.cbegin());
+            auto dt = it.operator->();
+
+            const auto make_deep_copy = make_copy[0];
+
+            if (make_deep_copy) {
+                BuilderType builder;
+                auto st = builder.AppendValues(dt, numeric_mda.getNumberOfElements());
+
+                // TODO: handle error case
+                if (st.ok()) {
+                    auto maybe_array = builder.Finish();
+                    if (maybe_array.ok()) {
+                        array = *maybe_array;
+                    }
+                }
+            } else {
+                const auto data_type = arrow::CTypeTraits<CType>::type_singleton();
+                const auto length = static_cast<int64_t>(numeric_mda.getNumberOfElements()); // cast size_t to int64_t
+
+                // Do not make a copy when creating arrow::Buffer
+                auto data_buffer = std::make_shared<arrow::Buffer>(reinterpret_cast<const uint8_t*>(dt),
+                                                              sizeof(CType) * numeric_mda.getNumberOfElements());
+
+                // TODO: Implement null support
+                std::shared_ptr<arrow::Buffer> null_buffer = nullptr;
+
+                auto array_data = arrow::ArrayData::Make(data_type, length, {null_buffer, data_buffer});
+                array = arrow::MakeArray(array_data);
+
+            }
+        }
+
+    protected:
+        void ToMatlab(libmexclass::proxy::method::Context& context) override {
+            using ArrowArrayType = typename arrow::CTypeTraits<CType>::ArrayType;
+
+            const auto num_elements = static_cast<size_t>(array->length());
+            const auto numeric_array = std::static_pointer_cast<ArrowArrayType>(array);
+            const CType* const data_begin = numeric_array->raw_values();
+            const CType* const data_end = data_begin + num_elements;
+
+            ::matlab::data::ArrayFactory factory;
+
+            // Constructs a TypedArray from the raw values. Makes a copy.
+            ::matlab::data::TypedArray<CType> result = factory.createArray({num_elements, 1}, data_begin, data_end);
+            context.outputs[0] = result;
+        }
+};
+
+}
diff --git a/matlab/src/cpp/arrow/matlab/proxy/factory.cc b/matlab/src/cpp/arrow/matlab/proxy/factory.cc
index 9bf7ec2dc9..c767d7dc37 100644
--- a/matlab/src/cpp/arrow/matlab/proxy/factory.cc
+++ b/matlab/src/cpp/arrow/matlab/proxy/factory.cc
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include "arrow/matlab/array/proxy/float64_array.h"
+#include "arrow/matlab/array/proxy/numeric_array.h"
 
 #include "factory.h"
 
@@ -26,7 +26,7 @@ namespace arrow::matlab::proxy {
 std::shared_ptr<Proxy> Factory::make_proxy(const ClassName& class_name, const FunctionArguments& constructor_arguments) {
 
     // Register MATLAB Proxy classes with corresponding C++ Proxy classes.
-    REGISTER_PROXY(arrow.array.proxy.Float64Array, arrow::matlab::array::proxy::Float64Array);
+    REGISTER_PROXY(arrow.array.proxy.Float64Array, arrow::matlab::array::proxy::NumericArray<double>);
 
     // TODO: Decide what to do in the case that there isn't a Proxy match.
     std::cout << "Did not find a matching C++ proxy for: " + class_name << std::endl;
diff --git a/matlab/src/matlab/+arrow/+array/Float64Array.m b/matlab/src/matlab/+arrow/+array/Float64Array.m
index 71d56e6cc0..c48ee45574 100644
--- a/matlab/src/matlab/+arrow/+array/Float64Array.m
+++ b/matlab/src/matlab/+arrow/+array/Float64Array.m
@@ -20,25 +20,38 @@ classdef Float64Array < matlab.mixin.CustomDisplay
         Proxy
     end
 
-    properties (Access=private)
+    properties (Hidden, SetAccess=private)
         MatlabArray
     end
 
     methods
-        function obj = Float64Array(matlabArray)
-            obj.MatlabArray = matlabArray;
-            obj.Proxy = libmexclass.proxy.Proxy("Name", "arrow.array.proxy.Float64Array", "ConstructorArguments", {obj.MatlabArray});
+        function obj = Float64Array(data, opts)
+            arguments
+                data
+                opts.DeepCopy = false
+            end
+
+            validateattributes(data, "double", ["2d", "nonsparse", "real"]);
+            if ~isempty(data), validateattributes(data, "double", "vector"); end
+            % Store a reference to the array if not doing a deep copy
+            if (~opts.DeepCopy), obj.MatlabArray = data; end
+            obj.Proxy = libmexclass.proxy.Proxy("Name", "arrow.array.proxy.Float64Array", "ConstructorArguments", {data, opts.DeepCopy});
         end
 
-        function Print(obj)
-            obj.Proxy.Print();
+        function data = double(obj)
+            data = obj.Proxy.ToMatlab();
         end
     end
 
     methods (Access=protected)
         function displayScalarObject(obj)
-            obj.Print();
+            disp(obj.ToString());
         end
     end
 
+    methods (Access=private)
+        function str = ToString(obj)
+            str = obj.Proxy.ToString();
+        end
+    end
 end
diff --git a/matlab/test/arrow/array/tFloat64Array.m b/matlab/test/arrow/array/tFloat64Array.m
index f104ad4a7d..7e1a878c42 100755
--- a/matlab/test/arrow/array/tFloat64Array.m
+++ b/matlab/test/arrow/array/tFloat64Array.m
@@ -16,6 +16,10 @@ classdef tFloat64Array < matlab.unittest.TestCase
     % implied.  See the License for the specific language governing
     % permissions and limitations under the License.
     
+    properties (TestParameter)
+        MakeDeepCopy = {true false}
+    end
+
     methods(TestClassSetup)
         function verifyOnMatlabPath(testCase)
             % arrow.array.Float64Array must be on the MATLAB path.
@@ -24,18 +28,77 @@ classdef tFloat64Array < matlab.unittest.TestCase
         end
     end
     
-    methods(TestMethodSetup)
-        function setupTempWorkingDirectory(testCase)
-            import matlab.unittest.fixtures.WorkingFolderFixture;
-            testCase.applyFixture(WorkingFolderFixture);
-        end
-    end
-    
     methods(Test)
-        function Basic(testCase)
-            A = arrow.array.Float64Array([1, 2, 3]);
+        function Basic(testCase, MakeDeepCopy)
+            A = arrow.array.Float64Array([1, 2, 3], DeepCopy=MakeDeepCopy);
             className = string(class(A));
             testCase.verifyEqual(className, "arrow.array.Float64Array");
         end
+
+        function ShallowCopy(testCase)
+            % By default, Float64Array does not create a deep copy on
+            % construction when constructed from a MATLAB array. Instead,
+            % it stores a shallow copy of the array keep the memory alive.
+            A = arrow.array.Float64Array([1, 2, 3]);
+            testCase.verifyEqual(A.MatlabArray, [1 2 3]);
+            testCase.verifyEqual(double(A), [1 2 3]');
+
+            A = arrow.array.Float64Array([1, 2, 3], DeepCopy=false);
+            testCase.verifyEqual(A.MatlabArray, [1 2 3]);
+            testCase.verifyEqual(double(A), [1 2 3]');
+        end
+
+        function DeepCopy(testCase)
+            % Verify Float64Array does not store shallow copy of the MATLAB
+            % array if DeepCopy=true was supplied.
+            A = arrow.array.Float64Array([1, 2, 3], DeepCopy=true);
+            testCase.verifyEqual(A.MatlabArray, []);
+            testCase.verifyEqual(double(A), [1 2 3]');
+        end
+
+        function Double(testCase, MakeDeepCopy)
+            % Create a Float64Array from a scalar double
+            A1 = arrow.array.Float64Array(100, DeepCopy=MakeDeepCopy);
+            data = double(A1);
+            testCase.verifyEqual(data, 100);
+
+            % Create a Float64Array from a double vector 
+            A2 = arrow.array.Float64Array([1 2 3], DeepCopy=MakeDeepCopy);
+            data = double(A2);
+            testCase.verifyEqual(data, [1 2 3]');
+        
+            % Create a Float64Array from an empty double vector
+            A3 = arrow.array.Float64Array([], DeepCopy=MakeDeepCopy);
+            data = double(A3);
+            testCase.verifyEqual(data, double.empty(0, 1));
+        end
+
+        function MinValue(testCase, MakeDeepCopy)
+            A1 = arrow.array.Float64Array(realmin, DeepCopy=MakeDeepCopy);
+            data = double(A1);
+            testCase.verifyEqual(data, realmin);
+        end
+
+        function MaxValue(testCase, MakeDeepCopy)
+            A1 = arrow.array.Float64Array(realmax, DeepCopy=MakeDeepCopy);
+            data = double(A1);
+            testCase.verifyEqual(data, realmax);
+        end
+
+        function InfValues(testCase, MakeDeepCopy)
+            A1 = arrow.array.Float64Array([Inf -Inf], DeepCopy=MakeDeepCopy);
+            data = double(A1);
+            testCase.verifyEqual(data, [Inf -Inf]');
+        end
+
+        function ErrorIfComplex(testCase, MakeDeepCopy)
+            fcn = @() arrow.array.Float64Array([10 + 1i, 4], DeepCopy=MakeDeepCopy);
+            testCase.verifyError(fcn, "MATLAB:expectedReal");
+        end
+
+        function ErrorIfSparse(testCase, MakeDeepCopy)
+            fcn = @() arrow.array.Float64Array(sparse(ones([10 1])), DeepCopy=MakeDeepCopy);
+            testCase.verifyError(fcn, "MATLAB:expectedNonsparse");
+        end
     end
 end
diff --git a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake
index 59c48c0235..92ed955ed4 100644
--- a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake
+++ b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake
@@ -34,7 +34,8 @@ set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_FETCH_CONTENT_SOURCE_SUBDIR "libmexclass/cpp
 set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_LIBRARY_NAME arrowproxy)
 set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_LIBRARY_ROOT_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/src/cpp")
 set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/array/proxy")
-set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_SOURCES "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/array/proxy/float64_array.cc")
+set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_SOURCES "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/array/proxy/array.cc")
+
 set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_FACTORY_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/proxy")
 set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_FACTORY_SOURCES "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/proxy/factory.cc")
 set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_LIBRARY_INCLUDE_DIRS ${MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_LIBRARY_ROOT_INCLUDE_DIR}