You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jo...@apache.org on 2022/06/02 01:53:39 UTC

[impala] 02/02: IMPALA-11325: Fix UnicodeDecodeError for shell file output

This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit ed0d9341d3229b5857c8583d1817172d61b0f68c
Author: Joe McDonnell <jo...@cloudera.com>
AuthorDate: Tue May 31 16:14:55 2022 -0700

    IMPALA-11325: Fix UnicodeDecodeError for shell file output
    
    When using the --output_file command-line option for
    impala-shell, the shell fails with UnicodeDecodeError
    if the output contains non-ASCII characters.
    
    For example, if running this command:
    impala-shell -B -q "select '引'" --output_file=output.txt
    This fails with:
    UnicodeDecodeError : 'ascii' codec can't decode byte 0xe5 in position 0: ordinal not in range(128)
    
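    This happens because OutputStream::write() calls encode('utf-8')
    on a string that is already UTF-8 encoded. On Python 2, calling
    encode() on a byte string first decodes it implicitly with the
    'ascii' codec, which fails on non-ASCII bytes.
    This change skips the encode('utf-8') call on Python 2 when the
    data is already a byte string. On Python 3 the data is a text
    string and still needs the encode call.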
    
    This is mostly a pragmatic fix to make the code a little bit
    more functional, and there is more work to be done to have
    clear contracts for the format() methods and clear points
    of conversion to/from bytes.
    
    Testing:
     - Ran shell tests with Python 2 and Python 3 on Ubuntu 18
     - Added a shell test that outputs a Unicode character
       to an output file. Without the fix, this test fails.
    
    Change-Id: Ic40be3d530c2694465f7bd2edb0e0586ff0e1fba
    Reviewed-on: http://gerrit.cloudera.org:8080/18576
    Reviewed-by: Michael Smith <mi...@cloudera.com>
    Reviewed-by: Quanlong Huang <hu...@gmail.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 shell/shell_output.py                 | 10 +++++++++-
 tests/shell/test_shell_commandline.py | 17 +++++++++++++++++
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/shell/shell_output.py b/shell/shell_output.py
index 978196539..becc4dd06 100644
--- a/shell/shell_output.py
+++ b/shell/shell_output.py
@@ -112,7 +112,15 @@ class OutputStream(object):
         with open(self.filename, 'ab') as out_file:
           # Note that instances of this class do not persist, so it's fine to
           # close the we close the file handle after each write.
-          out_file.write(formatted_data.encode('utf-8'))  # file opened in binary mode
+          # The file is opened in binary mode. On Python 2 the formatted data is
+          # already a UTF-8 encoded byte string (str) and can be written directly.
+          # On Python 3 it is a text string, which we must encode before writing.
+          # TODO: Reexamine the contract of the format() function and see if
+          # we can remove this.
+          if sys.version_info.major == 2 and isinstance(formatted_data, str):
+            out_file.write(formatted_data)
+          else:
+            out_file.write(formatted_data.encode('utf-8'))
           out_file.write(b'\n')
       except IOError as err:
         file_err_msg = "Error opening file %s: %s" % (self.filename, str(err))
diff --git a/tests/shell/test_shell_commandline.py b/tests/shell/test_shell_commandline.py
index 3c6454d5a..7b410d333 100644
--- a/tests/shell/test_shell_commandline.py
+++ b/tests/shell/test_shell_commandline.py
@@ -1202,6 +1202,23 @@ class TestImpalaShell(ImpalaTestSuite):
       rows_from_file = [line.rstrip() for line in f]
       assert rows_from_stdout == rows_from_file
 
+  def test_output_file_utf8(self, vector, tmp_file):
+    """Test that writing UTF-8 output to a file using '--output_file' produces the
+    same output as written to stdout."""
+    # This is purely about UTF-8 output, so it doesn't need multiple rows.
+    query = "select '引'"
+    # Run the query normally and keep the stdout
+    output = run_impala_shell_cmd(vector, ['-q', query, '-B', '--output_delimiter=;'])
+    assert "Fetched 1 row(s)" in output.stderr
+    rows_from_stdout = output.stdout.strip().split('\n')
+    # Run the query with output sent to a file using '--output_file'.
+    result = run_impala_shell_cmd(vector, ['-q', query, '-B', '--output_delimiter=;',
+                                           '--output_file=%s' % tmp_file])
+    assert "Fetched 1 row(s)" in result.stderr
+    with open(tmp_file, "r") as f:
+      rows_from_file = [line.rstrip() for line in f]
+      assert rows_from_stdout == rows_from_file
+
   def test_http_socket_timeout(self, vector):
     """Test setting different http_socket_timeout_s values."""
     if (vector.get_value('strict_hs2_protocol') or