You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jo...@apache.org on 2022/10/11 20:34:22 UTC
[impala] 04/04: IMPALA-11526: Install en_US.UTF-8 locale into docker images
This is an automated email from the ASF dual-hosted git repository.
joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
commit 11e66523d6070957f84c1fdbba3e26ecf3888d74
Author: Joe McDonnell <jo...@cloudera.com>
AuthorDate: Thu Sep 29 14:36:34 2022 -0700
IMPALA-11526: Install en_US.UTF-8 locale into docker images
In IMPALA-11492, ExprTest.Utf8MaskTest was failing on some
configurations because the en_US.UTF-8 locale was missing. Since the
Docker images don't contain en_US.UTF-8, they are subject
to the same bug. This was confirmed by adding test cases
to the test_utf8_strings.py end-to-end test and running it
in the dockerized tests.
This adds the appropriate language packs to the list of packages
installed for the Docker build.
Testing:
- This adds end-to-end tests to test_utf8_strings.py covering the
same cases that were failing in ExprTest.Utf8MaskTest. They
failed without the added language packs, and now succeed.
Change-Id: I353f257b3cb6d45f7d0a28f7d5319fdb457e6e3d
Reviewed-on: http://gerrit.cloudera.org:8080/19080
Tested-by: Impala Public Jenkins <im...@cloudera.com>
Reviewed-by: Laszlo Gaal <la...@cloudera.com>
---
bin/bootstrap_system.sh | 2 +-
docker/daemon_entrypoint.sh | 21 +++++-
docker/install_os_packages.sh | 17 +++++
.../queries/QueryTest/utf8-string-functions.test | 80 ++++++++++++++++++++++
4 files changed, 118 insertions(+), 2 deletions(-)
diff --git a/bin/bootstrap_system.sh b/bin/bootstrap_system.sh
index d637106cb..45c61c34e 100755
--- a/bin/bootstrap_system.sh
+++ b/bin/bootstrap_system.sh
@@ -268,7 +268,7 @@ redhat sudo yum install -y curl gawk gcc gcc-c++ git krb5-devel krb5-server \
wget vim-common nscd cmake fuse-devel zlib-devel \
psmisc lsof openssh-server redhat-lsb java-1.8.0-openjdk-devel \
java-1.8.0-openjdk-src python3-devel python3-setuptools net-tools \
- langpacks-en
+ langpacks-en glibc-langpack-en
# Enable the Powertools repo for snappy-devel on RedHat 8
redhat8 sudo yum install -y dnf-plugins-core
diff --git a/docker/daemon_entrypoint.sh b/docker/daemon_entrypoint.sh
index a62cc81f5..08deadcab 100755
--- a/docker/daemon_entrypoint.sh
+++ b/docker/daemon_entrypoint.sh
@@ -176,7 +176,26 @@ fi
# Set ulimit core file size 0.
ulimit -c 0
+# The UTF-8 masking functions rely on the presence of en_US.utf8. Make sure
+# it is present.
+if locale -a | grep en_US.utf8 ; then
+ echo "en_US.utf8 is present"
+else
+ echo "ERROR: en_US.utf8 locale is not present."
+ exit 1
+fi
+
# Set a UTF-8 locale to enable upper/lower/initcap functions with UTF-8 mode.
-export LC_ALL=C.UTF-8
+# Use C.UTF-8 (aka C.utf8) if it is available, and fall back to en_US.utf8 if not
+#
+# Distributions can show either C.UTF-8 or C.utf8 in "locale -a", match either one
+if locale -a | grep -e "^C.UTF-8" -e "^C.utf8" ; then
+ # C.UTF-8 and C.utf8 are interchangeable as a setting for LC_ALL.
+ export LC_ALL=C.UTF-8
+else
+ # Presence of en_US.utf8 was verified above
+ export LC_ALL=en_US.utf8
+fi
+echo "LC_ALL: ${LC_ALL}"
exec "$@"
diff --git a/docker/install_os_packages.sh b/docker/install_os_packages.sh
index e05fdaa68..f11fe94b4 100755
--- a/docker/install_os_packages.sh
+++ b/docker/install_os_packages.sh
@@ -92,6 +92,7 @@ if [[ $DISTRIBUTION == Ubuntu ]]; then
fi
apt-get install -y \
krb5-user \
+ language-pack-en \
libsasl2-2 \
libsasl2-modules \
libsasl2-modules-gssapi-mit \
@@ -122,6 +123,16 @@ elif [[ $DISTRIBUTION == Redhat ]]; then
krb5-workstation \
openldap-devel \
tzdata
+
+ # UTF-8 masking functions require the presence of en_US.utf8.
+ # Install the appropriate language packs. Redhat/Centos 7 come
+ # with en_US.utf8, so there is no need to install anything.
+ if ! grep 'release 7\.' /etc/redhat-release; then
+ yum install -y --disableplugin=subscription-manager \
+ glibc-langpack-en \
+ langpacks-en
+ fi
+
if $INSTALL_DEBUG_TOOLS ; then
echo "Installing extra debug tools"
yum install -y --disableplugin=subscription-manager \
@@ -137,6 +148,12 @@ elif [[ $DISTRIBUTION == Redhat ]]; then
fi
fi
+# Verify en_US.utf8 is present
+if ! locale -a | grep en_US.utf8 ; then
+ echo "ERROR: en_US.utf8 locale is not present."
+ exit 1
+fi
+
# To minimize the size for the Docker image, clean up any unnecessary files.
if [[ $DISTRIBUTION == Ubuntu ]]; then
apt-get clean
diff --git a/testdata/workloads/functional-query/queries/QueryTest/utf8-string-functions.test b/testdata/workloads/functional-query/queries/QueryTest/utf8-string-functions.test
index 8d607c95e..9417e5ad4 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/utf8-string-functions.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/utf8-string-functions.test
@@ -181,6 +181,86 @@ select mask('SQL引擎', 'x', 'x', 'x', 'x'),
STRING,STRING,STRING,STRING,STRING
====
---- QUERY
+set utf8_mode=true;
+select mask('abcd áäèü ABCD ÁÄÈÜ');
+---- RESULTS: RAW_STRING
+'xxxx xxxx XXXX XXXX'
+---- TYPES
+STRING
+====
+---- QUERY
+set utf8_mode=true;
+select mask('Ich möchte ein Bier. Tschüss');
+---- RESULTS: RAW_STRING
+'Xxx xxxxxx xxx Xxxx. Xxxxxxx'
+---- TYPES
+STRING
+====
+---- QUERY
+set utf8_mode=true;
+select mask('Hungarian áéíöóőüúű ÁÉÍÖÓŐÜÚŰ');
+---- RESULTS: RAW_STRING
+'Xxxxxxxxx xxxxxxxxx XXXXXXXXX'
+---- TYPES
+STRING
+====
+---- QUERY
+set utf8_mode=true;
+select mask('German äöüß ÄÖÜẞ');
+---- RESULTS: RAW_STRING
+'Xxxxxx xxxx XXXX'
+---- TYPES
+STRING
+====
+---- QUERY
+set utf8_mode=true;
+select mask('French àâæçéèêëïîôœùûüÿ ÀÂÆÇÉÈÊËÏÎÔŒÙÛÜŸ');
+---- RESULTS: RAW_STRING
+'Xxxxxx xxxxxxxxxxxxxxxx XXXXXXXXXXXXXXXX'
+---- TYPES
+STRING
+====
+---- QUERY
+set utf8_mode=true;
+select mask('Greek αβξδ άέήώ ΑΒΞΔ ΆΈΉΏ 1234');
+---- RESULTS: RAW_STRING
+'Xxxxx xxxx xxxx XXXX XXXX nnnn'
+---- TYPES
+STRING
+====
+---- QUERY
+set utf8_mode=true;
+select mask_first_n('áéíöóőüúű');
+---- RESULTS: RAW_STRING
+'xxxxóőüúű'
+---- TYPES
+STRING
+====
+---- QUERY
+set utf8_mode=true;
+select mask_show_first_n('áéíöóőüúű');
+---- RESULTS: RAW_STRING
+'áéíöxxxxx'
+---- TYPES
+STRING
+====
+---- QUERY
+set utf8_mode=true;
+select mask_last_n('áéíöóőüúű');
+---- RESULTS: RAW_STRING
+'áéíöóxxxx'
+---- TYPES
+STRING
+====
+---- QUERY
+set utf8_mode=true;
+select mask_show_last_n('áéíöóőüúű')
+---- RESULTS: RAW_STRING
+'xxxxxőüúű'
+---- TYPES
+STRING
+====
+---- QUERY
set utf8_mode=false;
select upper('abcd áäèü'), lower('ABCD ÁÄÈÜ'), initcap('abcd áäèü ABCD ÁÄÈÜ');
---- RESULTS: RAW_STRING