You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jo...@apache.org on 2022/10/11 20:34:22 UTC

[impala] 04/04: IMPALA-11526: Install en_US.UTF-8 locale into docker images

This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 11e66523d6070957f84c1fdbba3e26ecf3888d74
Author: Joe McDonnell <jo...@cloudera.com>
AuthorDate: Thu Sep 29 14:36:34 2022 -0700

    IMPALA-11526: Install en_US.UTF-8 locale into docker images
    
    In IMPALA-11492, ExprTest.Utf8MaskTest was failing on some
    configurations because the en_US.UTF-8 locale was missing. Since the
    Docker images don't contain en_US.UTF-8, they are subject
    to the same bug. This was confirmed by adding test cases
    to the test_utf8_strings.py end-to-end test and running it
    in the dockerized tests.
    
    This adds the appropriate language pack to the list of packages
    installed for the Docker build.
    
    Testing:
     - This adds end-to-end tests to test_utf8_strings.py covering the
       same cases that were failing in ExprTest.Utf8MaskTest. They
       failed without the added language packs, and now succeed.
    
    Change-Id: I353f257b3cb6d45f7d0a28f7d5319fdb457e6e3d
    Reviewed-on: http://gerrit.cloudera.org:8080/19080
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
    Reviewed-by: Laszlo Gaal <la...@cloudera.com>
---
 bin/bootstrap_system.sh                            |  2 +-
 docker/daemon_entrypoint.sh                        | 21 +++++-
 docker/install_os_packages.sh                      | 17 +++++
 .../queries/QueryTest/utf8-string-functions.test   | 80 ++++++++++++++++++++++
 4 files changed, 118 insertions(+), 2 deletions(-)

diff --git a/bin/bootstrap_system.sh b/bin/bootstrap_system.sh
index d637106cb..45c61c34e 100755
--- a/bin/bootstrap_system.sh
+++ b/bin/bootstrap_system.sh
@@ -268,7 +268,7 @@ redhat sudo yum install -y curl gawk gcc gcc-c++ git krb5-devel krb5-server \
         wget vim-common nscd cmake fuse-devel zlib-devel \
         psmisc lsof openssh-server redhat-lsb java-1.8.0-openjdk-devel \
         java-1.8.0-openjdk-src python3-devel python3-setuptools net-tools \
-        langpacks-en
+        langpacks-en glibc-langpack-en
 
 # Enable the Powertools repo for snappy-devel on RedHat 8
 redhat8 sudo yum install -y dnf-plugins-core
diff --git a/docker/daemon_entrypoint.sh b/docker/daemon_entrypoint.sh
index a62cc81f5..08deadcab 100755
--- a/docker/daemon_entrypoint.sh
+++ b/docker/daemon_entrypoint.sh
@@ -176,7 +176,26 @@ fi
 # Set ulimit core file size 0.
 ulimit -c 0
 
+# The UTF-8 masking functions rely on the presence of en_US.utf8. Make sure
+# it is present.
+if locale -a | grep en_US.utf8 ; then
+  echo "en_US.utf8 is present"
+else
+  echo "ERROR: en_US.utf8 locale is not present."
+  exit 1
+fi
+
 # Set a UTF-8 locale to enable upper/lower/initcap functions with UTF-8 mode.
-export LC_ALL=C.UTF-8
+# Use C.UTF-8 (aka C.utf8) if it is available, and fall back to en_US.utf8 if not
+#
+# Distributions can show either C.UTF-8 or C.utf8 in "locale -a", match either one
+if locale -a | grep -e "^C.UTF-8" -e "^C.utf8" ; then
+  # C.UTF-8 and C.utf8 are interchangeable as a setting for LC_ALL.
+  export LC_ALL=C.UTF-8
+else
+  # Presence of en_US.utf8 was verified above
+  export LC_ALL=en_US.utf8
+fi
+echo "LC_ALL: ${LC_ALL}"
 
 exec "$@"
diff --git a/docker/install_os_packages.sh b/docker/install_os_packages.sh
index e05fdaa68..f11fe94b4 100755
--- a/docker/install_os_packages.sh
+++ b/docker/install_os_packages.sh
@@ -92,6 +92,7 @@ if [[ $DISTRIBUTION == Ubuntu ]]; then
   fi
   apt-get install -y \
       krb5-user \
+      language-pack-en \
       libsasl2-2 \
       libsasl2-modules \
       libsasl2-modules-gssapi-mit \
@@ -122,6 +123,16 @@ elif [[ $DISTRIBUTION == Redhat ]]; then
       krb5-workstation \
       openldap-devel \
       tzdata
+
+  # UTF-8 masking functions require the presence of en_US.utf8.
+  # Install the appropriate language packs. Redhat/Centos 7 come
+  # with en_US.utf8, so there is no need to install anything.
+  if ! grep 'release 7\.' /etc/redhat-release; then
+      yum install -y --disableplugin=subscription-manager \
+          glibc-langpack-en \
+          langpacks-en
+  fi
+
   if $INSTALL_DEBUG_TOOLS ; then
     echo "Installing extra debug tools"
     yum install -y --disableplugin=subscription-manager \
@@ -137,6 +148,12 @@ elif [[ $DISTRIBUTION == Redhat ]]; then
   fi
 fi
 
+# Verify en_US.utf8 is present
+if ! locale -a | grep en_US.utf8 ; then
+  echo "ERROR: en_US.utf8 locale is not present."
+  exit 1
+fi
+
 # To minimize the size for the Docker image, clean up any unnecessary files.
 if [[ $DISTRIBUTION == Ubuntu ]]; then
   apt-get clean
diff --git a/testdata/workloads/functional-query/queries/QueryTest/utf8-string-functions.test b/testdata/workloads/functional-query/queries/QueryTest/utf8-string-functions.test
index 8d607c95e..9417e5ad4 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/utf8-string-functions.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/utf8-string-functions.test
@@ -181,6 +181,86 @@ select mask('SQL引擎', 'x', 'x', 'x', 'x'),
 STRING,STRING,STRING,STRING,STRING
 ====
 ---- QUERY
+set utf8_mode=true;
+select mask('abcd áäèü ABCD ÁÄÈÜ');
+---- RESULTS: RAW_STRING
+'xxxx xxxx XXXX XXXX'
+---- TYPES
+STRING
+====
+---- QUERY
+set utf8_mode=true;
+select mask('Ich möchte ein Bier. Tschüss');
+---- RESULTS: RAW_STRING
+'Xxx xxxxxx xxx Xxxx. Xxxxxxx'
+---- TYPES
+STRING
+====
+---- QUERY
+set utf8_mode=true;
+select mask('Hungarian áéíöóőüúű ÁÉÍÖÓŐÜÚŰ');
+---- RESULTS: RAW_STRING
+'Xxxxxxxxx xxxxxxxxx XXXXXXXXX'
+---- TYPES
+STRING
+====
+---- QUERY
+set utf8_mode=true;
+select mask('German äöüß ÄÖÜẞ');
+---- RESULTS: RAW_STRING
+'Xxxxxx xxxx XXXX'
+---- TYPES
+STRING
+====
+---- QUERY
+set utf8_mode=true;
+select mask('French àâæçéèêëïîôœùûüÿ ÀÂÆÇÉÈÊËÏÎÔŒÙÛÜŸ');
+---- RESULTS: RAW_STRING
+'Xxxxxx xxxxxxxxxxxxxxxx XXXXXXXXXXXXXXXX'
+---- TYPES
+STRING
+====
+---- QUERY
+set utf8_mode=true;
+select mask('Greek αβξδ άέήώ ΑΒΞΔ ΆΈΉΏ 1234');
+---- RESULTS: RAW_STRING
+'Xxxxx xxxx xxxx XXXX XXXX nnnn'
+---- TYPES
+STRING
+====
+---- QUERY
+set utf8_mode=true;
+select mask_first_n('áéíöóőüúű');
+---- RESULTS: RAW_STRING
+'xxxxóőüúű'
+---- TYPES
+STRING
+====
+---- QUERY
+set utf8_mode=true;
+select mask_show_first_n('áéíöóőüúű');
+---- RESULTS: RAW_STRING
+'áéíöxxxxx'
+---- TYPES
+STRING
+====
+---- QUERY
+set utf8_mode=true;
+select mask_last_n('áéíöóőüúű');
+---- RESULTS: RAW_STRING
+'áéíöóxxxx'
+---- TYPES
+STRING
+====
+---- QUERY
+set utf8_mode=true;
+select mask_show_last_n('áéíöóőüúű')
+---- RESULTS: RAW_STRING
+'xxxxxőüúű'
+---- TYPES
+STRING
+====
+---- QUERY
 set utf8_mode=false;
 select upper('abcd áäèü'), lower('ABCD ÁÄÈÜ'), initcap('abcd áäèü ABCD ÁÄÈÜ');
 ---- RESULTS: RAW_STRING