You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by st...@apache.org on 2022/02/15 23:42:15 UTC

[impala] 02/02: IMPALA-2019(part-4): Add UTF-8 support for case conversion functions

This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 35375b3287859e2f047ce546178bd14645e0d965
Author: stiga-huang <hu...@gmail.com>
AuthorDate: Thu Jun 17 13:51:57 2021 +0800

    IMPALA-2019(part-4): Add UTF-8 support for case conversion functions
    
    There are 3 builtin case conversion string functions: upper(), lower(),
    and initcap(). Previously they only convert English alphabetic
    characters. This patch adds support to deal with Unicode characters.
    
    There are many corner cases in case conversion depending on the locale
    and context. E.g.
    1) Case conversion is locale-sensitive.
    Turkish has 4 letter "I"s. English has only two, a lowercase dotted i
    and an uppercase dotless I. Turkish has lowercase and uppercase forms of
    both dotted and dotless I. So simply converting "i" to "I" for upper
    case is wrong in Turkish:
        +-------+--------+---------+
        |       | Dotted | Dotless |
        +-------+--------+---------+
        | Upper | İ      | I       |
        +-------+--------+---------+
        | Lower | i      | ı       |
        +-------+--------+---------+
    
    2) Case conversion may change a string's length.
    The German word "grüßen" should be converted to "GRÜSSEN" in upper case:
    the letter "ß" should be converted to "SS".
    
    3) Case conversion is context-sensitive.
    The Greek word "ὈΔΥΣΣΕΎΣ" should be converted to "ὀδυσσεύς", where the
    Greek letter "Σ" is converted to "σ" or to "ς", depending on its
    position in the word.
    
    The above cases will be addressed in follow-up JIRAs. This patch adds the
    initial implementation of UTF-8 aware case conversion functions.
    
    --------
    Implementation:
    In UTF-8 mode (turned on by setting UTF8_MODE=true), the bytes in
    strings are converted to wide characters using std::mbrtowc(). Each
    wide character (wchar_t) is then converted using std::towupper or
    std::towlower as appropriate. The results are converted back to
    multibyte sequences using std::wcrtomb().
    
    Note that these builtins are locale aware. If impalad is launched
    without a UTF-8 aware locale, e.g. LC_ALL="C", these builtins can't
    recognize non-ASCII characters and will return unexpected results.
    Thus we modify our docker images to set LC_ALL="C.UTF-8" instead of "C".
    This patch also logs the current locale when launching impala daemons
    for better debugging. We will support customized locale in IMPALA-11080.
    
    Test:
     - Add BE unit tests and e2e tests.
    
    Change-Id: I443e89d46f4638ce85664b021666bc4f03ee8abd
    Reviewed-on: http://gerrit.cloudera.org:8080/17785
    Reviewed-by: Csaba Ringhofer <cs...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/common/init.cc                              |   1 +
 be/src/exprs/expr-test.cc                          |  50 ++++++++
 be/src/exprs/string-functions-ir.cc                | 132 +++++++++++++++++++++
 be/src/exprs/string-functions.h                    |   6 +
 common/function-registry/impala_functions.py       |   6 +
 docker/daemon_entrypoint.sh                        |   3 +
 docker/test-with-docker.py                         |   2 +-
 .../queries/QueryTest/utf8-string-functions.test   |  50 ++++++++
 8 files changed, 249 insertions(+), 1 deletion(-)

diff --git a/be/src/common/init.cc b/be/src/common/init.cc
index 58cb601..b0470ad 100644
--- a/be/src/common/init.cc
+++ b/be/src/common/init.cc
@@ -386,6 +386,7 @@ void impala::InitCommonRuntime(int argc, char** argv, bool init_jvm,
 
   LOG(INFO) << impala::GetVersionString();
   LOG(INFO) << "Using hostname: " << FLAGS_hostname;
+  LOG(INFO) << "Using locale: " << std::locale("").name();
   impala::LogCommandLineFlags();
 
   // When a process calls send(2) on a socket closed on the other end, linux generates
diff --git a/be/src/exprs/expr-test.cc b/be/src/exprs/expr-test.cc
index 62b59ac..d8a29cb 100644
--- a/be/src/exprs/expr-test.cc
+++ b/be/src/exprs/expr-test.cc
@@ -10854,6 +10854,56 @@ TEST_P(ExprTest, Utf8Test) {
   TestValue("locate('SQL', '最快的SQL引擎跑SQL', 0)", TYPE_INT, 0);
   TestValue("locate('SQL', '最快的SQL引擎跑SQL', -1)", TYPE_INT, 0);
   TestIsNull("locate('SQL', '最快的SQL引擎跑SQL', NULL)", TYPE_INT);
+
+  TestStringValue("upper('abcd áäèü')", "ABCD ÁÄÈÜ");
+  TestStringValue("lower('ABCD ÁÄÈÜ')", "abcd áäèü");
+  TestStringValue("initcap('abcd áäèü ABCD ÁÄÈÜ')",
+      "Abcd Áäèü Abcd Áäèü");
+
+  TestStringValue("upper('aáàâãäăāåąæ')", "AÁÀÂÃÄĂĀÅĄÆ");
+  TestStringValue("lower('AÁÀÂÃÄĂĀÅĄÆ')", "aáàâãäăāåąæ");
+  TestStringValue("initcap('AÁÀÂÃÄĂĀÅĄÆ')", "Aáàâãäăāåąæ");
+
+  TestStringValue("upper('eéèêëěēėę')", "EÉÈÊËĚĒĖĘ");
+  TestStringValue("lower('EÉÈÊËĚĒĖĘ')", "eéèêëěēėę");
+  TestStringValue("initcap('EÉÈÊËĚĒĖĘ')", "Eéèêëěēėę");
+
+  // The uppercase of "i" and "ı" are both "I". However, the lowercase of "I" is "i"
+  // because we don't support Turkish locale yet (IMPALA-11080).
+  // Due to the same reason, the lowercase of "İ" and "I" are both "i", but the uppercase
+  // of "i" is "I".
+  TestStringValue("upper('iíìîïīįı')", "IÍÌÎÏĪĮI");
+  TestStringValue("lower('IÍÌÎÏĪĮIİ')", "iíìîïīįii");
+  TestStringValue("initcap('IÍÌÎÏĪĮIİ')", "Iíìîïīįii");
+
+  TestStringValue("upper('oóòôõöőøœ')", "OÓÒÔÕÖŐØŒ");
+  TestStringValue("lower('OÓÒÔÕÖŐØŒ')", "oóòôõöőøœ");
+  TestStringValue("initcap('OÓÒÔÕÖŐØŒ')", "Oóòôõöőøœ");
+
+  TestStringValue("upper('uúùûüűū')", "UÚÙÛÜŰŪ");
+  TestStringValue("lower('UÚÙÛÜŰŪ')", "uúùûüűū");
+  TestStringValue("initcap('UÚÙÛÜŰŪ')", "Uúùûüűū");
+
+  // The uppercase of "đ" and "ð" are both "Đ", but the lowercase of "Đ" is "đ"
+  // due to the hard-coded locale, i.e. "en_US.UTF-8".
+  TestStringValue("upper('ýćčďđðģğķłļ')", "ÝĆČĎĐÐĢĞĶŁĻ");
+  TestStringValue("lower('ÝĆČĎĐĢĞĶŁĻ')", "ýćčďđģğķłļ");
+  TestStringValue("initcap('ÝĆČĎĐĢĞĶŁĻ')", "Ýćčďđģğķłļ");
+
+  TestStringValue("upper('ńñňņŋ')", "ŃÑŇŅŊ");
+  TestStringValue("lower('ŃÑŇŅŊ')", "ńñňņŋ");
+  TestStringValue("initcap('ŃÑŇŅŊ')", "Ńñňņŋ");
+
+  TestStringValue("upper('řśšşťŧþţżźž')", "ŘŚŠŞŤŦÞŢŻŹŽ");
+  TestStringValue("lower('ŘŚŠŞŤŦÞŢŻŹŽ')", "řśšşťŧþţżźž");
+  TestStringValue("initcap('ŘŚŠŞŤŦÞŢŻŹŽ')", "Řśšşťŧþţżźž");
+
+  // Tests with the null byte ('\0') in the middle. Explicitly create the expected
+  // results as std::string in case they are truncated at '\0'.
+  TestStringValue("upper('ábć\\0èfğ')", string("ÁBĆ\0ÈFĞ", 11));
+  TestStringValue("lower('ÁBĆ\\0ÈFĞ')", string("ábć\0èfğ", 11));
+  TestStringValue("initcap('ábć\\0ÈFĞ')", string("Ábć\0èfğ", 11));
+
   executor_->PopExecOption();
 }
 
diff --git a/be/src/exprs/string-functions-ir.cc b/be/src/exprs/string-functions-ir.cc
index f8a1a61..96724ad 100644
--- a/be/src/exprs/string-functions-ir.cc
+++ b/be/src/exprs/string-functions-ir.cc
@@ -286,6 +286,14 @@ IntVal StringFunctions::Utf8Length(FunctionContext* context, const StringVal& st
 
 StringVal StringFunctions::Lower(FunctionContext* context, const StringVal& str) {
   if (str.is_null) return StringVal::null();
+  if (context->impl()->GetConstFnAttr(FunctionContextImpl::UTF8_MODE)) {
+    return LowerUtf8(context, str);
+  }
+  return LowerAscii(context, str);
+}
+
+StringVal StringFunctions::LowerAscii(FunctionContext* context, const StringVal& str) {
+  // Not in UTF-8 mode, only English alphabetic characters will be converted.
   StringVal result(context, str.len);
   if (UNLIKELY(result.is_null)) return StringVal::null();
   for (int i = 0; i < str.len; ++i) {
@@ -296,6 +304,14 @@ StringVal StringFunctions::Lower(FunctionContext* context, const StringVal& str)
 
 StringVal StringFunctions::Upper(FunctionContext* context, const StringVal& str) {
   if (str.is_null) return StringVal::null();
+  if (context->impl()->GetConstFnAttr(FunctionContextImpl::UTF8_MODE)) {
+    return UpperUtf8(context, str);
+  }
+  return UpperAscii(context, str);
+}
+
+StringVal StringFunctions::UpperAscii(FunctionContext* context, const StringVal& str) {
+  // Not in UTF-8 mode, only English alphabetic characters will be converted.
   StringVal result(context, str.len);
   if (UNLIKELY(result.is_null)) return StringVal::null();
   for (int i = 0; i < str.len; ++i) {
@@ -310,6 +326,13 @@ StringVal StringFunctions::Upper(FunctionContext* context, const StringVal& str)
 // will return NULL
 StringVal StringFunctions::InitCap(FunctionContext* context, const StringVal& str) {
   if (str.is_null) return StringVal::null();
+  if (context->impl()->GetConstFnAttr(FunctionContextImpl::UTF8_MODE)) {
+    return InitCapUtf8(context, str);
+  }
+  return InitCapAscii(context, str);
+}
+
+StringVal StringFunctions::InitCapAscii(FunctionContext* context, const StringVal& str) {
   StringVal result(context, str.len);
   if (UNLIKELY(result.is_null)) return StringVal::null();
   uint8_t* result_ptr = result.ptr;
@@ -326,6 +349,115 @@ StringVal StringFunctions::InitCap(FunctionContext* context, const StringVal& st
   return result;
 }
 
+/// Reports the error in parsing multibyte characters with leading bytes and current
+/// locale. Used in Utf8CaseConversion().
+static void ReportErrorBytes(FunctionContext* context, const StringVal& str,
+    int current_idx) {
+  DCHECK_LT(current_idx, str.len);
+  stringstream ss;
+  ss << "[0x" << std::hex << (int)DCHECK_NOTNULL(str.ptr)[current_idx];
+  for (int k = 1; k < 4 && current_idx + k < str.len; ++k) {
+    ss << ", 0x" << std::hex << (int)str.ptr[current_idx + k];
+  }
+  ss << "]";
+  context->AddWarning(Substitute(
+      "Illegal multi-byte character in string. Leading bytes: $0. Current locale: $1",
+      ss.str(), std::locale("").name()).c_str());
+}
+
+/// Converts string based on the transform function 'fn'. The unit of the conversion is
+/// a wchar_t which is parsed from a multibyte sequence using std::mbrtowc().
+/// The transform function 'fn' accepts two parameters: the original wide character and
+/// a flag indicating whether it's the first character of a word.
+/// After the transformation, the wchar_t is converted back to bytes using std::wcrtomb().
+static StringVal Utf8CaseConversion(FunctionContext* context, const StringVal& str,
+    uint32_t (*fn)(uint32_t, bool*)) {
+  // Usually the upper/lower cases have the same size in bytes. Here we add a 4-byte
+  // buffer in case of illegal Unicode characters.
+  int max_result_bytes = str.len + 4;
+  StringVal result(context, max_result_bytes);
+  if (UNLIKELY(result.is_null)) return StringVal::null();
+  wchar_t wc;
+  int wc_bytes;
+  bool word_start = true;
+  uint8_t* result_ptr = result.ptr;
+  std::mbstate_t wc_state{};
+  std::mbstate_t mb_state{};
+  for (int i = 0; i < str.len; i += wc_bytes) {
+    // std::mbtowc converts a multibyte sequence to a wide character. It's not
+    // thread safe. Here we use std::mbrtowc instead.
+    wc_bytes = std::mbrtowc(&wc, reinterpret_cast<char*>(str.ptr + i), str.len - i,
+        &wc_state);
+    bool needs_conversion = true;
+    if (wc_bytes == 0) {
+      // std::mbrtowc returns 0 when hitting '\0'.
+      wc = 0;
+      wc_bytes = 1;
+    } else if (wc_bytes < 0) {
+      ReportErrorBytes(context, str, i);
+      // Replace it with the replacement character (U+FFFD).
+      wc = 0xFFFD;
+      needs_conversion = false;
+      // Jump to the next legal UTF-8 start byte.
+      wc_bytes = 1;
+      while (i + wc_bytes < str.len && !BitUtil::IsUtf8StartByte(str.ptr[i + wc_bytes])) {
+        wc_bytes++;
+      }
+    }
+    if (needs_conversion) wc = fn(wc, &word_start);
+    // std::wctomb converts a wide character to a multibyte sequence. It's not
+    // thread safe. Here we use std::wcrtomb instead.
+    int res_bytes = std::wcrtomb(reinterpret_cast<char*>(result_ptr), wc, &mb_state);
+    if (res_bytes <= 0) {
+      if (needs_conversion) {
+        context->AddWarning(Substitute(
+            "Ignored illegal wide character in results: $0. Current locale: $1",
+            wc, std::locale("").name()).c_str());
+      }
+      continue;
+    }
+    result_ptr += res_bytes;
+    if (result_ptr - result.ptr > max_result_bytes - 4) {
+      // Double the result buffer for overflow
+      max_result_bytes *= 2;
+      max_result_bytes = min<int>(StringVal::MAX_LENGTH,
+          static_cast<int>(BitUtil::RoundUpToPowerOfTwo(max_result_bytes)));
+      int offset = result_ptr - result.ptr;
+      if (UNLIKELY(!result.Resize(context, max_result_bytes))) return StringVal::null();
+      result_ptr = result.ptr + offset;
+    }
+  }
+  result.len = result_ptr - result.ptr;
+  return result;
+}
+
+StringVal StringFunctions::LowerUtf8(FunctionContext* context, const StringVal& str) {
+  return Utf8CaseConversion(context, str,
+      [](uint32_t wide_char, bool* word_start) {
+        return std::towlower(wide_char);
+      });
+}
+
+StringVal StringFunctions::UpperUtf8(FunctionContext* context, const StringVal& str) {
+  return Utf8CaseConversion(context, str,
+      [](uint32_t wide_char, bool* word_start) {
+        return std::towupper(wide_char);
+      });
+}
+
+StringVal StringFunctions::InitCapUtf8(FunctionContext* context, const StringVal& str) {
+  return Utf8CaseConversion(context, str,
+      [](uint32_t wide_char, bool* word_start) {
+        if (UNLIKELY(iswspace(wide_char))) {
+          *word_start = true;
+          return wide_char;
+        }
+        uint32_t res = *word_start ? std::towupper(wide_char) : std::towlower(wide_char);
+        *word_start = false;
+        return res;
+      });
+}
+
 struct ReplaceContext {
   ReplaceContext(StringVal *pattern_in) {
     pattern = StringValue::FromStringVal(*pattern_in);
diff --git a/be/src/exprs/string-functions.h b/be/src/exprs/string-functions.h
index b9a2248..e2b7dea 100644
--- a/be/src/exprs/string-functions.h
+++ b/be/src/exprs/string-functions.h
@@ -77,8 +77,14 @@ class StringFunctions {
   static IntVal CharLength(FunctionContext*, const StringVal& str);
   static IntVal Utf8Length(FunctionContext*, const StringVal& str);
   static StringVal Lower(FunctionContext*, const StringVal& str);
+  static StringVal LowerAscii(FunctionContext*, const StringVal& str);
+  static StringVal LowerUtf8(FunctionContext*, const StringVal& str);
   static StringVal Upper(FunctionContext*, const StringVal& str);
+  static StringVal UpperAscii(FunctionContext*, const StringVal& str);
+  static StringVal UpperUtf8(FunctionContext*, const StringVal& str);
   static StringVal InitCap(FunctionContext*, const StringVal& str);
+  static StringVal InitCapAscii(FunctionContext*, const StringVal& str);
+  static StringVal InitCapUtf8(FunctionContext*, const StringVal& str);
   static void ReplacePrepare(FunctionContext*, FunctionContext::FunctionStateScope);
   static void ReplaceClose(FunctionContext*, FunctionContext::FunctionStateScope);
   static StringVal Replace(FunctionContext*, const StringVal& str,
diff --git a/common/function-registry/impala_functions.py b/common/function-registry/impala_functions.py
index ee2a9a5..30fb3c4 100644
--- a/common/function-registry/impala_functions.py
+++ b/common/function-registry/impala_functions.py
@@ -514,8 +514,14 @@ visible_functions = [
   [['character_length'], 'INT', ['STRING'], 'impala::StringFunctions::Length'],
   [['utf8_length'], 'INT', ['STRING'], 'impala::StringFunctions::Utf8Length'],
   [['lower', 'lcase'], 'STRING', ['STRING'], 'impala::StringFunctions::Lower'],
+  [['lower_ascii', 'lcase_ascii'], 'STRING', ['STRING'], 'impala::StringFunctions::LowerAscii'],
+  [['lower_utf8', 'lcase_utf8'], 'STRING', ['STRING'], 'impala::StringFunctions::LowerUtf8'],
   [['upper', 'ucase'], 'STRING', ['STRING'], 'impala::StringFunctions::Upper'],
+  [['upper_ascii', 'ucase_ascii'], 'STRING', ['STRING'], 'impala::StringFunctions::UpperAscii'],
+  [['upper_utf8', 'ucase_utf8'], 'STRING', ['STRING'], 'impala::StringFunctions::UpperUtf8'],
   [['initcap'], 'STRING', ['STRING'], 'impala::StringFunctions::InitCap'],
+  [['initcap_ascii'], 'STRING', ['STRING'], 'impala::StringFunctions::InitCapAscii'],
+  [['initcap_utf8'], 'STRING', ['STRING'], 'impala::StringFunctions::InitCapUtf8'],
   [['replace'], 'STRING', ['STRING', 'STRING', 'STRING'], 'impala::StringFunctions::Replace',
    '_ZN6impala15StringFunctions14ReplacePrepareEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE',
    '_ZN6impala15StringFunctions12ReplaceCloseEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE'],
diff --git a/docker/daemon_entrypoint.sh b/docker/daemon_entrypoint.sh
index f253296..e670bd5 100755
--- a/docker/daemon_entrypoint.sh
+++ b/docker/daemon_entrypoint.sh
@@ -66,4 +66,7 @@ fi
 # Set ulimit core file size 0.
 ulimit -c 0
 
+# Set a UTF-8 locale to enable upper/lower/initcap functions with UTF-8 mode.
+export LC_ALL=C.UTF-8
+
 exec "$@"
diff --git a/docker/test-with-docker.py b/docker/test-with-docker.py
index 35f64aa..f01b4e5 100755
--- a/docker/test-with-docker.py
+++ b/docker/test-with-docker.py
@@ -553,7 +553,7 @@ class TestWithDocker(object):
           # Label with the git root directory for easier cleanup
           "--label=pwd=" + self.git_root,
           # Consistent locales
-          "-e", "LC_ALL=C",
+          "-e", "LC_ALL=C.UTF-8",
           "-e", "IMPALAD_MEM_LIMIT_BYTES=" +
           str(self.impalad_mem_limit_bytes),
           # Mount the git directory so that clones can be local.
diff --git a/testdata/workloads/functional-query/queries/QueryTest/utf8-string-functions.test b/testdata/workloads/functional-query/queries/QueryTest/utf8-string-functions.test
index 84bab4b..8d607c9 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/utf8-string-functions.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/utf8-string-functions.test
@@ -180,3 +180,53 @@ select mask('SQL引擎', 'x', 'x', 'x', 'x'),
 ---- TYPES
 STRING,STRING,STRING,STRING,STRING
 ====
+---- QUERY
+set utf8_mode=false;
+select upper('abcd áäèü'), lower('ABCD ÁÄÈÜ'), initcap('abcd áäèü ABCD ÁÄÈÜ');
+---- RESULTS: RAW_STRING
+'ABCD áäèü','abcd ÁÄÈÜ','Abcd áäèü Abcd ÁÄÈÜ'
+---- TYPES
+STRING,STRING,STRING
+====
+---- QUERY
+set utf8_mode=true;
+select upper('abcd áäèü'), lower('ABCD ÁÄÈÜ'), initcap('abcd áäèü ABCD ÁÄÈÜ');
+---- RESULTS: RAW_STRING
+'ABCD ÁÄÈÜ','abcd áäèü','Abcd Áäèü Abcd Áäèü'
+---- TYPES
+STRING,STRING,STRING
+====
+---- QUERY
+set utf8_mode=false;
+select id, upper(name), lower(name), initcap(name) from utf8_str_tiny;
+---- RESULTS: RAW_STRING
+1,'张三','张三','张三'
+2,'李四','李四','李四'
+3,'王五','王五','王五'
+4,'李小龙','李小龙','李小龙'
+5,'ALICE','alice','Alice'
+6,'陈BOB','陈bob','陈bob'
+7,'БOPиC','Бopиc','Бopиc'
+8,'JöRG','jörg','Jörg'
+9,'ひなた','ひなた','ひなた'
+10,'서연','서연','서연'
+---- TYPES
+INT,STRING,STRING,STRING
+====
+---- QUERY
+set utf8_mode=true;
+select id, upper(name), lower(name), initcap(name) from utf8_str_tiny;
+---- RESULTS: RAW_STRING
+1,'张三','张三','张三'
+2,'李四','李四','李四'
+3,'王五','王五','王五'
+4,'李小龙','李小龙','李小龙'
+5,'ALICE','alice','Alice'
+6,'陈BOB','陈bob','陈bob'
+7,'БOPИC','бopиc','Бopиc'
+8,'JÖRG','jörg','Jörg'
+9,'ひなた','ひなた','ひなた'
+10,'서연','서연','서연'
+---- TYPES
+INT,STRING,STRING,STRING
+====