You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by st...@apache.org on 2022/08/08 23:37:04 UTC
[impala] 07/27: IMPALA-11325: Fix UnicodeDecodeError for shell file output
This is an automated email from the ASF dual-hosted git repository.
stigahuang pushed a commit to branch branch-4.1.1
in repository https://gitbox.apache.org/repos/asf/impala.git
commit f6a870ccdf1ce24b80b40ff3af7ac6c023138306
Author: Joe McDonnell <jo...@cloudera.com>
AuthorDate: Tue May 31 16:14:55 2022 -0700
IMPALA-11325: Fix UnicodeDecodeError for shell file output
When using the --output_file commandline option for
impala-shell, the shell fails with UnicodeDecodeError
if the output contains Unicode characters.
For example, if running this command:
impala-shell -B -q "select '引'" --output_file=output.txt
This fails with:
UnicodeDecodeError : 'ascii' codec can't decode byte 0xe5 in position 0: ordinal not in range(128)
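This happens because OutputStream.write() calls encode('utf-8') on a
string that is already UTF-8 encoded. In Python 2, calling encode() on a
byte string first implicitly decodes it with the 'ascii' codec, which
fails on any non-ASCII byte.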
This change skips the encode('utf-8') call on Python 2 when the data is
already a byte string (str). On Python 3 the data is a text string (str)
and still needs the encode call.
This is mostly a pragmatic fix to make the code a little bit
more functional, and there is more work to be done to have
clear contracts for the format() methods and clear points
of conversion to/from bytes.
Testing:
- Ran shell tests with Python 2 and Python 3 on Ubuntu 18
- Added a shell test that outputs a Unicode character
to an output file. Without the fix, this test fails.
Change-Id: Ic40be3d530c2694465f7bd2edb0e0586ff0e1fba
Reviewed-on: http://gerrit.cloudera.org:8080/18576
Reviewed-by: Michael Smith <mi...@cloudera.com>
Reviewed-by: Quanlong Huang <hu...@gmail.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
shell/shell_output.py | 10 +++++++++-
tests/shell/test_shell_commandline.py | 17 +++++++++++++++++
2 files changed, 26 insertions(+), 1 deletion(-)
diff --git a/shell/shell_output.py b/shell/shell_output.py
index 978196539..becc4dd06 100644
--- a/shell/shell_output.py
+++ b/shell/shell_output.py
@@ -112,7 +112,15 @@ class OutputStream(object):
with open(self.filename, 'ab') as out_file:
# Note that instances of this class do not persist, so it's fine to
# close the file handle after each write.
- out_file.write(formatted_data.encode('utf-8')) # file opened in binary mode
+ # The file is opened in binary mode. On Python 2 the formatted data may
+ # already be a UTF-8 encoded byte string (str) that can be written
+ # directly. On Python 3 it is a text string, which we need to encode
+ # before writing.
+ # TODO: Reexamine the contract of the format() function and see if
+ # we can remove this.
+ if sys.version_info.major == 2 and isinstance(formatted_data, str):
+ out_file.write(formatted_data)
+ else:
+ out_file.write(formatted_data.encode('utf-8'))
out_file.write(b'\n')
except IOError as err:
file_err_msg = "Error opening file %s: %s" % (self.filename, str(err))
diff --git a/tests/shell/test_shell_commandline.py b/tests/shell/test_shell_commandline.py
index 14a6fdcfb..c8393ecc4 100644
--- a/tests/shell/test_shell_commandline.py
+++ b/tests/shell/test_shell_commandline.py
@@ -1193,6 +1193,23 @@ class TestImpalaShell(ImpalaTestSuite):
rows_from_file = [line.rstrip() for line in f]
assert rows_from_stdout == rows_from_file
+ def test_output_file_utf8(self, vector, tmp_file):
+ """Test that writing UTF-8 output to a file using '--output_file' produces the
+ same output as written to stdout."""
+ # This is purely about UTF-8 output, so it doesn't need multiple rows.
+ query = "select '引'"
+ # Run the query normally and keep the stdout
+ output = run_impala_shell_cmd(vector, ['-q', query, '-B', '--output_delimiter=;'])
+ assert "Fetched 1 row(s)" in output.stderr
+ rows_from_stdout = output.stdout.strip().split('\n')
+ # Run the query with output sent to a file using '--output_file'.
+ result = run_impala_shell_cmd(vector, ['-q', query, '-B', '--output_delimiter=;',
+ '--output_file=%s' % tmp_file])
+ assert "Fetched 1 row(s)" in result.stderr
+ with open(tmp_file, "r") as f:
+ rows_from_file = [line.rstrip() for line in f]
+ assert rows_from_stdout == rows_from_file
+
def test_http_socket_timeout(self, vector):
"""Test setting different http_socket_timeout_s values."""
if (vector.get_value('strict_hs2_protocol') or