You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ko...@apache.org on 2023/06/19 21:03:28 UTC
[arrow] branch main updated: GH-36166: [C++][MATLAB] Add utility to convert UTF-8 strings to UTF-16 and UTF-16 strings to UTF-8 (#36167)
This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new bd7455f02b GH-36166: [C++][MATLAB] Add utility to convert UTF-8 strings to UTF-16 and UTF-16 strings to UTF-8 (#36167)
bd7455f02b is described below
commit bd7455f02b1a3c82f5e57b2ac221a6a70f30da88
Author: sgilmore10 <74...@users.noreply.github.com>
AuthorDate: Mon Jun 19 17:03:22 2023 -0400
GH-36166: [C++][MATLAB] Add utility to convert UTF-8 strings to UTF-16 and UTF-16 strings to UTF-8 (#36167)
### Rationale for this change
MATLAB uses UTF-16 encoded strings, but Arrow uses UTF-8. We need a way to convert between the two encodings.
### What changes are included in this PR?
Added two new utility functions:
1. `Result<std::string> UTF16StringToUTF8(const std::u16string& source)`
2. `Result<std::u16string> UTF8StringToUTF16(const std::string& source)`
### Are these changes tested?
Added two test cases to `utf8_util_test.cc`:
1. `UTF16StringToUTF8`
2. `UTF8StringToUTF16`
### Are there any user-facing changes?
No, these APIs are intended for developers.
### Future Directions
In a follow-up PR, we will update the MATLAB Interface source code to use these utilities when converting between UTF-16 and UTF-8 encoded strings.
* Closes: #36166
Authored-by: Sarah Gilmore <sg...@mathworks.com>
Signed-off-by: Sutou Kouhei <ko...@clear-code.com>
---
cpp/src/arrow/util/utf8.cc | 28 +++++++++++++++++++++++
cpp/src/arrow/util/utf8.h | 6 +++++
cpp/src/arrow/util/utf8_util_test.cc | 43 ++++++++++++++++++++++++++++++++++++
3 files changed, 77 insertions(+)
diff --git a/cpp/src/arrow/util/utf8.cc b/cpp/src/arrow/util/utf8.cc
index fbe94f7b31..3aa46347ba 100644
--- a/cpp/src/arrow/util/utf8.cc
+++ b/cpp/src/arrow/util/utf8.cc
@@ -146,6 +146,18 @@ std::string WideStringToUTF8Internal(const std::wstring& source) {
return s;
}
+// Transcode `source` from UTF-16 to UTF-8 with ::utf8::utf16to8, appending the
+// output to a fresh std::string. Any exception thrown by the converter
+// propagates to the caller.
+std::string UTF16StringToUTF8Internal(const std::u16string& source) {
+ std::string encoded;
+ ::utf8::utf16to8(source.cbegin(), source.cend(), std::back_inserter(encoded));
+ return encoded;
+}
+
+// Transcode `source` from UTF-8 to UTF-16 with ::utf8::utf8to16, appending the
+// output to a fresh std::u16string. Any exception thrown by the converter
+// propagates to the caller.
+std::u16string UTF8StringToUTF16Internal(const std::string& source) {
+ std::u16string encoded;
+ ::utf8::utf8to16(source.cbegin(), source.cend(), std::back_inserter(encoded));
+ return encoded;
+}
+
} // namespace
Result<std::wstring> UTF8ToWideString(std::string_view source) {
@@ -164,5 +176,21 @@ ARROW_EXPORT Result<std::string> WideStringToUTF8(const std::wstring& source) {
}
}
+// Convert a UTF-16 string to a UTF-8 string.
+//
+// Returns the converted string on success. Any std::exception thrown by the
+// underlying conversion is translated into Status::Invalid carrying the
+// exception's message.
+ARROW_EXPORT Result<std::string> UTF16StringToUTF8(const std::u16string& source) {
+ try {
+ return UTF16StringToUTF8Internal(source);
+ } catch (const std::exception& e) {
+ // Caught by const reference: the exception is only inspected, never modified.
+ return Status::Invalid(e.what());
+ }
+}
+
+// Convert a UTF-8 string to a UTF-16 string.
+//
+// Returns the converted string on success. Any std::exception thrown by the
+// underlying conversion is translated into Status::Invalid carrying the
+// exception's message.
+ARROW_EXPORT Result<std::u16string> UTF8StringToUTF16(const std::string& source) {
+ try {
+ return UTF8StringToUTF16Internal(source);
+ } catch (const std::exception& e) {
+ // Caught by const reference: the exception is only inspected, never modified.
+ return Status::Invalid(e.what());
+ }
+}
+
} // namespace util
} // namespace arrow
diff --git a/cpp/src/arrow/util/utf8.h b/cpp/src/arrow/util/utf8.h
index 44d33e3e43..3be5788f93 100644
--- a/cpp/src/arrow/util/utf8.h
+++ b/cpp/src/arrow/util/utf8.h
@@ -36,6 +36,12 @@ ARROW_EXPORT Result<std::wstring> UTF8ToWideString(std::string_view source);
// Similarly, convert a wstring to a UTF8 string.
ARROW_EXPORT Result<std::string> WideStringToUTF8(const std::wstring& source);
+// Convert a UTF8 string to a UTF16 string; yields Invalid if the conversion fails.
+ARROW_EXPORT Result<std::u16string> UTF8StringToUTF16(const std::string& source);
+
+// Convert a UTF16 string to a UTF8 string; yields Invalid if the conversion fails.
+ARROW_EXPORT Result<std::string> UTF16StringToUTF8(const std::u16string& source);
+
// This function needs to be called before doing UTF8 validation.
ARROW_EXPORT void InitializeUTF8();
diff --git a/cpp/src/arrow/util/utf8_util_test.cc b/cpp/src/arrow/util/utf8_util_test.cc
index 3c8059d904..cb59ba9be0 100644
--- a/cpp/src/arrow/util/utf8_util_test.cc
+++ b/cpp/src/arrow/util/utf8_util_test.cc
@@ -397,6 +397,49 @@ TEST(WideStringToUTF8, Basics) {
#endif
}
+// Checks UTF8StringToUTF16 on well-formed inputs (exact expected output) and on
+// malformed inputs (expected to produce an Invalid status).
+TEST(UTF8StringToUTF16, Basics) {
+ auto ExpectConversion = [](const std::string& input,
+ const std::u16string& want) -> void {
+ ASSERT_OK_AND_ASSIGN(std::u16string got, UTF8StringToUTF16(input));
+ ASSERT_EQ(got, want);
+ };
+
+ auto ExpectInvalid = [](const std::string& input) -> void {
+ ASSERT_RAISES(Invalid, UTF8StringToUTF16(input));
+ };
+
+ // Empty and plain ASCII
+ ExpectConversion("", u"");
+ ExpectConversion("foo", u"foo");
+ // Two-byte UTF-8 sequences (U+00E9)
+ ExpectConversion("h\xc3\xa9h\xc3\xa9", u"h\u00e9h\u00e9");
+ // Four-byte UTF-8 sequences: U+1F600 and the largest code point U+10FFFF
+ ExpectConversion("\xf0\x9f\x98\x80", u"\U0001F600");
+ ExpectConversion("\xf4\x8f\xbf\xbf", u"\U0010FFFF");
+ // Embedded NUL character
+ ExpectConversion({0, 'x'}, {0, u'x'});
+
+ // Invalid lead byte
+ ExpectInvalid("\xff");
+ // Truncated two-byte sequence
+ ExpectInvalid("h\xc3");
+}
+
+// Checks UTF16StringToUTF8 on well-formed inputs (exact expected output) and on
+// lone surrogates (expected to produce an Invalid status).
+TEST(UTF16StringToUTF8, Basics) {
+ auto ExpectConversion = [](const std::u16string& input,
+ const std::string& want) -> void {
+ ASSERT_OK_AND_ASSIGN(std::string got, UTF16StringToUTF8(input));
+ ASSERT_EQ(got, want);
+ };
+
+ auto ExpectInvalid = [](const std::u16string& input) -> void {
+ ASSERT_RAISES(Invalid, UTF16StringToUTF8(input));
+ };
+
+ // Empty and plain ASCII
+ ExpectConversion(u"", "");
+ ExpectConversion(u"foo", "foo");
+ // U+00E9 encodes as two UTF-8 bytes
+ ExpectConversion(u"h\u00e9h\u00e9", "h\xc3\xa9h\xc3\xa9");
+ // U+1F600 and the largest code point U+10FFFF encode as four UTF-8 bytes
+ ExpectConversion(u"\U0001F600", "\xf0\x9f\x98\x80");
+ ExpectConversion(u"\U0010FFFF", "\xf4\x8f\xbf\xbf");
+ // Embedded NUL character
+ ExpectConversion({0, u'x'}, {0, 'x'});
+
+ // Lone surrogate
+ ExpectInvalid({0xD800});
+ ExpectInvalid({0xDFFF});
+}
+
TEST(UTF8DecodeReverse, Basics) {
auto CheckOk = [](const std::string& s) -> void {
const uint8_t* begin = reinterpret_cast<const uint8_t*>(s.c_str());