You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ko...@apache.org on 2023/06/19 21:03:28 UTC

[arrow] branch main updated: GH-36166: [C++][MATLAB] Add utility to convert UTF-8 strings to UTF-16 and UTF-16 strings to UTF-8 (#36167)

This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new bd7455f02b GH-36166: [C++][MATLAB] Add utility to convert UTF-8 strings to UTF-16 and UTF-16 strings to UTF-8 (#36167)
bd7455f02b is described below

commit bd7455f02b1a3c82f5e57b2ac221a6a70f30da88
Author: sgilmore10 <74...@users.noreply.github.com>
AuthorDate: Mon Jun 19 17:03:22 2023 -0400

    GH-36166: [C++][MATLAB] Add utility to convert UTF-8 strings to UTF-16 and UTF-16 strings to UTF-8 (#36167)
    
    ### Rationale for this change
    
    MATLAB uses UTF-16-encoded strings, but Arrow uses UTF-8. We need a way to convert between the two encodings.
    
    ### What changes are included in this PR?
    
    Added two new utility functions:
    
    1. `Result<std::string> UTF16StringToUTF8(const std::u16string& source)`
    2. `Result<std::u16string> UTF8StringToUTF16(const std::string& source)`
    
    ### Are these changes tested?
    
    Added two test cases to `utf8_util_test.cc`:
    
    1. `UTF16StringToUTF8`
    2. `UTF8StringToUTF16`
    
    ### Are there any user-facing changes?
    
    No, these APIs are intended for developers.
    
    ### Future Directions
    
    In a follow-up PR, we will update the MATLAB Interface source code to use these utilities when converting between UTF-16 and UTF-8 encoded strings.
    * Closes: #36166
    
    Authored-by: Sarah Gilmore <sg...@mathworks.com>
    Signed-off-by: Sutou Kouhei <ko...@clear-code.com>
---
 cpp/src/arrow/util/utf8.cc           | 28 +++++++++++++++++++++++
 cpp/src/arrow/util/utf8.h            |  6 +++++
 cpp/src/arrow/util/utf8_util_test.cc | 43 ++++++++++++++++++++++++++++++++++++
 3 files changed, 77 insertions(+)

diff --git a/cpp/src/arrow/util/utf8.cc b/cpp/src/arrow/util/utf8.cc
index fbe94f7b31..3aa46347ba 100644
--- a/cpp/src/arrow/util/utf8.cc
+++ b/cpp/src/arrow/util/utf8.cc
@@ -146,6 +146,18 @@ std::string WideStringToUTF8Internal(const std::wstring& source) {
   return s;
 }
 
+std::string UTF16StringToUTF8Internal(const std::u16string& source) {
+  std::string s;
+  ::utf8::utf16to8(source.begin(), source.end(), std::back_inserter(s));
+  return s;
+}
+
+std::u16string UTF8StringToUTF16Internal(const std::string& source) {
+  std::u16string s;
+  ::utf8::utf8to16(source.begin(), source.end(), std::back_inserter(s));
+  return s;
+}
+
 }  // namespace
 
 Result<std::wstring> UTF8ToWideString(std::string_view source) {
@@ -164,5 +176,21 @@ ARROW_EXPORT Result<std::string> WideStringToUTF8(const std::wstring& source) {
   }
 }
 
+ARROW_EXPORT Result<std::string> UTF16StringToUTF8(const std::u16string& source) {
+  try {
+    return UTF16StringToUTF8Internal(source);
+  } catch (std::exception& e) {
+    return Status::Invalid(e.what());
+  }
+}
+
+ARROW_EXPORT Result<std::u16string> UTF8StringToUTF16(const std::string& source) {
+  try {
+    return UTF8StringToUTF16Internal(source);
+  } catch (std::exception& e) {
+    return Status::Invalid(e.what());
+  }
+}
+
 }  // namespace util
 }  // namespace arrow
diff --git a/cpp/src/arrow/util/utf8.h b/cpp/src/arrow/util/utf8.h
index 44d33e3e43..3be5788f93 100644
--- a/cpp/src/arrow/util/utf8.h
+++ b/cpp/src/arrow/util/utf8.h
@@ -36,6 +36,12 @@ ARROW_EXPORT Result<std::wstring> UTF8ToWideString(std::string_view source);
 // Similarly, convert a wstring to a UTF8 string.
 ARROW_EXPORT Result<std::string> WideStringToUTF8(const std::wstring& source);
 
+// Convert UTF8 string to a UTF16 string.
+ARROW_EXPORT Result<std::u16string> UTF8StringToUTF16(const std::string& source);
+
+// Convert UTF16 string to a UTF8 string.
+ARROW_EXPORT Result<std::string> UTF16StringToUTF8(const std::u16string& source);
+
 // This function needs to be called before doing UTF8 validation.
 ARROW_EXPORT void InitializeUTF8();
 
diff --git a/cpp/src/arrow/util/utf8_util_test.cc b/cpp/src/arrow/util/utf8_util_test.cc
index 3c8059d904..cb59ba9be0 100644
--- a/cpp/src/arrow/util/utf8_util_test.cc
+++ b/cpp/src/arrow/util/utf8_util_test.cc
@@ -397,6 +397,49 @@ TEST(WideStringToUTF8, Basics) {
 #endif
 }
 
+TEST(UTF8StringToUTF16, Basics) {
+  auto CheckOk = [](const std::string& s, const std::u16string& expected) -> void {
+    ASSERT_OK_AND_ASSIGN(std::u16string u16s, UTF8StringToUTF16(s));
+    ASSERT_EQ(u16s, expected);
+  };
+
+  auto CheckInvalid = [](const std::string& s) -> void {
+    ASSERT_RAISES(Invalid, UTF8StringToUTF16(s));
+  };
+
+  CheckOk("", u"");
+  CheckOk("foo", u"foo");
+  CheckOk("h\xc3\xa9h\xc3\xa9", u"h\u00e9h\u00e9");
+  CheckOk("\xf0\x9f\x98\x80", u"\U0001F600");
+  CheckOk("\xf4\x8f\xbf\xbf", u"\U0010FFFF");
+  CheckOk({0, 'x'}, {0, u'x'});
+
+  CheckInvalid("\xff");
+  CheckInvalid("h\xc3");
+}
+
+TEST(UTF16StringToUTF8, Basics) {
+  auto CheckOk = [](const std::u16string& u16s, const std::string& expected) -> void {
+    ASSERT_OK_AND_ASSIGN(std::string s, UTF16StringToUTF8(u16s));
+    ASSERT_EQ(s, expected);
+  };
+
+  auto CheckInvalid = [](const std::u16string& u16s) -> void {
+    ASSERT_RAISES(Invalid, UTF16StringToUTF8(u16s));
+  };
+
+  CheckOk(u"", "");
+  CheckOk(u"foo", "foo");
+  CheckOk(u"h\u00e9h\u00e9", "h\xc3\xa9h\xc3\xa9");
+  CheckOk(u"\U0001F600", "\xf0\x9f\x98\x80");
+  CheckOk(u"\U0010FFFF", "\xf4\x8f\xbf\xbf");
+  CheckOk({0, u'x'}, {0, 'x'});
+
+  // Lone surrogate
+  CheckInvalid({0xD800});
+  CheckInvalid({0xDFFF});
+}
+
 TEST(UTF8DecodeReverse, Basics) {
   auto CheckOk = [](const std::string& s) -> void {
     const uint8_t* begin = reinterpret_cast<const uint8_t*>(s.c_str());