You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "kou (via GitHub)" <gi...@apache.org> on 2023/06/30 21:25:37 UTC

[GitHub] [arrow] kou commented on a diff in pull request #36366: GH-36250: [MATLAB] Add `arrow.array.StringArray` class

kou commented on code in PR #36366:
URL: https://github.com/apache/arrow/pull/36366#discussion_r1248295638


##########
matlab/src/cpp/arrow/matlab/array/proxy/string_array.cc:
##########
@@ -0,0 +1,78 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/matlab/array/proxy/string_array.h"
+
+#include "arrow/array/builder_binary.h"
+
+#include "arrow/matlab/error/error.h"
+#include "arrow/matlab/bit/pack.h"
+#include "arrow/matlab/bit/unpack.h"
+#include "arrow/util/utf8.h"
+
+namespace arrow::matlab::array::proxy {
+
+        libmexclass::proxy::MakeResult StringArray::make(const libmexclass::proxy::FunctionArguments& constructor_arguments) {
+            namespace mda = ::matlab::data;
+
+            mda::StructArray opts = constructor_arguments[0];
+            const mda::StringArray array_mda = opts[0]["MatlabArray"];
+            const mda::TypedArray<bool> unpacked_validity_bitmap_mda = opts[0]["Valid"];
+
+            // Convert UTF-16 encoded MATLAB string values to UTF-8 encoded Arrow string values.
+            const auto array_length = array_mda.getNumberOfElements();
+            std::vector<std::string> strings;
+            strings.reserve(array_length);
+            for (const auto& str : array_mda) {
+                // Substitute MATLAB string(missing) values with the empty string value ("") before converting to Arrow format.
+                const auto str_utf16 = str ? *str : std::u16string{u""};
+                MATLAB_ASSIGN_OR_ERROR(auto str_utf8, arrow::util::UTF16StringToUTF8(str_utf16), error::UNICODE_CONVERSION_ERROR_ID);

Review Comment:
   Can we avoid calling `arrow::util::UTF16StringToUTF8()` when `str` is missing?



##########
matlab/src/cpp/arrow/matlab/array/proxy/string_array.cc:
##########
@@ -0,0 +1,78 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/matlab/array/proxy/string_array.h"
+
+#include "arrow/array/builder_binary.h"
+
+#include "arrow/matlab/error/error.h"
+#include "arrow/matlab/bit/pack.h"
+#include "arrow/matlab/bit/unpack.h"
+#include "arrow/util/utf8.h"
+
+namespace arrow::matlab::array::proxy {
+
+        libmexclass::proxy::MakeResult StringArray::make(const libmexclass::proxy::FunctionArguments& constructor_arguments) {
+            namespace mda = ::matlab::data;
+
+            mda::StructArray opts = constructor_arguments[0];
+            const mda::StringArray array_mda = opts[0]["MatlabArray"];
+            const mda::TypedArray<bool> unpacked_validity_bitmap_mda = opts[0]["Valid"];
+
+            // Convert UTF-16 encoded MATLAB string values to UTF-8 encoded Arrow string values.
+            const auto array_length = array_mda.getNumberOfElements();
+            std::vector<std::string> strings;
+            strings.reserve(array_length);
+            for (const auto& str : array_mda) {
+                // Substitute MATLAB string(missing) values with the empty string value ("") before converting to Arrow format.
+                const auto str_utf16 = str ? *str : std::u16string{u""};
+                MATLAB_ASSIGN_OR_ERROR(auto str_utf8, arrow::util::UTF16StringToUTF8(str_utf16), error::UNICODE_CONVERSION_ERROR_ID);
+                strings.push_back(str_utf8);
+            }
+
+            auto unpacked_validity_bitmap_ptr = bit::unpacked_as_ptr(unpacked_validity_bitmap_mda);
+
+            // Build up an Arrow StringArray from a vector of UTF-8 encoded strings.
+            arrow::StringBuilder builder;
+            MATLAB_ERROR_IF_NOT_OK(builder.AppendValues(strings, unpacked_validity_bitmap_ptr), error::STRING_BUILDER_APPEND_FAILED);
+            MATLAB_ASSIGN_OR_ERROR(auto array, builder.Finish(), error::STRING_BUILDER_FINISH_FAILED);
+
+            return std::make_shared<arrow::matlab::array::proxy::StringArray>(array);
+        }
+
+        void StringArray::toMATLAB(libmexclass::proxy::method::Context& context) {
+            namespace mda = ::matlab::data;
+
+            // Convert UTF-8 encoded Arrow string values to UTF-16 encoded MATLAB string values.
+            auto array_length = static_cast<size_t>(array->length());
+            std::vector<mda::MATLABString> strings;
+            strings.reserve(array_length);
+            for (size_t i = 0; i < array_length; ++i) {
+                auto string_array = std::static_pointer_cast<arrow::StringArray>(array);
+                const auto& str_utf8 = string_array->GetString(i);

Review Comment:
   How about using `GetView()` instead of `GetString()` to avoid copying?
   
   ```suggestion
                   const auto& str_utf8 = string_array->GetView(i);
   ```
   
   (Ah, we should have used `std::string_view` for `UTF8StringToUTF16()`... cc: @sgilmore10 )



##########
matlab/src/cpp/arrow/matlab/bit/unpack.cc:
##########
@@ -38,4 +38,13 @@ namespace arrow::matlab::bit {
 
         return unpacked_matlab_logical_Array;
     }
+
+    const uint8_t* unpacked_as_ptr(const ::matlab::data::TypedArray<bool>& unpacked_validity_bitmap) {

Review Comment:
   Hmm. I think that we can have better name for this but I don't have any idea...
   (For example, I think that using verb is better... `extract_ptr()`? `extract_raw_data()`?)



##########
matlab/src/cpp/arrow/matlab/array/proxy/string_array.cc:
##########
@@ -0,0 +1,78 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/matlab/array/proxy/string_array.h"
+
+#include "arrow/array/builder_binary.h"
+
+#include "arrow/matlab/error/error.h"
+#include "arrow/matlab/bit/pack.h"
+#include "arrow/matlab/bit/unpack.h"
+#include "arrow/util/utf8.h"
+
+namespace arrow::matlab::array::proxy {
+
+        libmexclass::proxy::MakeResult StringArray::make(const libmexclass::proxy::FunctionArguments& constructor_arguments) {
+            namespace mda = ::matlab::data;
+
+            mda::StructArray opts = constructor_arguments[0];
+            const mda::StringArray array_mda = opts[0]["MatlabArray"];
+            const mda::TypedArray<bool> unpacked_validity_bitmap_mda = opts[0]["Valid"];
+
+            // Convert UTF-16 encoded MATLAB string values to UTF-8 encoded Arrow string values.
+            const auto array_length = array_mda.getNumberOfElements();
+            std::vector<std::string> strings;
+            strings.reserve(array_length);
+            for (const auto& str : array_mda) {
+                // Substitute MATLAB string(missing) values with the empty string value ("") before converting to Arrow format.
+                const auto str_utf16 = str ? *str : std::u16string{u""};
+                MATLAB_ASSIGN_OR_ERROR(auto str_utf8, arrow::util::UTF16StringToUTF8(str_utf16), error::UNICODE_CONVERSION_ERROR_ID);
+                strings.push_back(str_utf8);

Review Comment:
   Can we use `std::move()` here to avoid copying?
   
   ```suggestion
                   strings.push_back(std::move(str_utf8));
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org