You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ph...@apache.org on 2018/04/12 23:10:15 UTC
[3/6] impala git commit: IMPALA-2717: fix output of formatted unicode to non-TTY
IMPALA-2717: fix output of formatted unicode to non-TTY
The bug is that PrettyOutputFormatter.format() returned a unicode
object, and Python cannot automatically write unicode objects to
output streams where there is no default encoding.
The fix is to encode the result as UTF-8 in a regular string, which
can be written to any output device. This makes the output type
consistent with DelimitedOutputFormatter.format().
Based on code by Marcell Szabo.
Testing:
Added a basic test.
Played around in an interactive shell to make sure that unicode
characters still work in interactive mode.
Change-Id: I9de641ecf767a2feef3b9f48b344ef2d55e17a7f
Reviewed-on: http://gerrit.cloudera.org:8080/9928
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/318051cc
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/318051cc
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/318051cc
Branch: refs/heads/master
Commit: 318051cc21cc7fbe96886e30b3f13b90bbb7b50a
Parents: 75b612a
Author: Tim Armstrong <ta...@cloudera.com>
Authored: Wed Apr 4 11:51:51 2018 -0700
Committer: Impala Public Jenkins <im...@cloudera.com>
Committed: Thu Apr 12 20:34:47 2018 +0000
----------------------------------------------------------------------
shell/impala_shell.py | 4 +++-
shell/shell_output.py | 8 +++++++-
tests/shell/test_shell_commandline.py | 25 +++++++++++++++++++++----
3 files changed, 31 insertions(+), 6 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/impala/blob/318051cc/shell/impala_shell.py
----------------------------------------------------------------------
diff --git a/shell/impala_shell.py b/shell/impala_shell.py
index 93bdafb..55ea692 100755
--- a/shell/impala_shell.py
+++ b/shell/impala_shell.py
@@ -70,7 +70,9 @@ class CmdStatus:
ERROR = False
class ImpalaPrettyTable(prettytable.PrettyTable):
- """Patched version of PrettyTable that TODO"""
+ """Patched version of PrettyTable with different unicode handling - instead of throwing
+ exceptions when a character can't be converted to unicode, it is replaced with a
+ placeholder character."""
def _unicode(self, value):
if not isinstance(value, basestring):
value = str(value)
http://git-wip-us.apache.org/repos/asf/impala/blob/318051cc/shell/shell_output.py
----------------------------------------------------------------------
diff --git a/shell/shell_output.py b/shell/shell_output.py
index f0cecc8..8ab3bee 100644
--- a/shell/shell_output.py
+++ b/shell/shell_output.py
@@ -28,11 +28,16 @@ class PrettyOutputFormatter(object):
self.prettytable = prettytable
def format(self, rows):
+ """Returns string containing UTF-8-encoded representation of the table data."""
# Clear rows that already exist in the table.
self.prettytable.clear_rows()
try:
map(self.prettytable.add_row, rows)
- return self.prettytable.get_string()
+ # PrettyTable.get_string() converts UTF-8-encoded strs added via add_row() into
+ # Python unicode strings. We need to convert it back to a UTF-8-encoded str for
+ # output, since Python won't do the encoding automatically when outputting to a
+ # non-terminal (see IMPALA-2717).
+ return self.prettytable.get_string().encode('utf-8')
except Exception, e:
# beeswax returns each row as a tab separated string. If a string column
# value in a row has tabs, it will break the row split. Default to displaying
@@ -53,6 +58,7 @@ class DelimitedOutputFormatter(object):
raise ValueError, error_msg
def format(self, rows):
+ """Returns string containing UTF-8-encoded representation of the table data."""
# csv.writer expects a file handle to the input.
# cStringIO is used as the temporary buffer.
temp_buffer = StringIO()
http://git-wip-us.apache.org/repos/asf/impala/blob/318051cc/tests/shell/test_shell_commandline.py
----------------------------------------------------------------------
diff --git a/tests/shell/test_shell_commandline.py b/tests/shell/test_shell_commandline.py
index e69c512..6aa05f6 100644
--- a/tests/shell/test_shell_commandline.py
+++ b/tests/shell/test_shell_commandline.py
@@ -33,6 +33,8 @@ from util import assert_var_substitution, run_impala_shell_cmd, ImpalaShell
DEFAULT_QUERY = 'select 1'
QUERY_FILE_PATH = os.path.join(os.environ['IMPALA_HOME'], 'tests', 'shell')
+RUSSIAN_CHARS = (u"А, Б, В, Г, Д, Е, Ё, Ж, З, И, Й, К, Л, М, Н, О, П, Р,"
+ u"С, Т, У, Ф, Х, Ц,Ч, Ш, Щ, Ъ, Ы, Ь, Э, Ю, Я")
@pytest.fixture
def empty_table(unique_database, request):
@@ -405,12 +407,27 @@ class TestImpalaShell(ImpalaTestSuite):
def test_international_characters(self):
"""Sanity test to ensure that the shell can read international characters."""
- russian_chars = (u"А, Б, В, Г, Д, Е, Ё, Ж, З, И, Й, К, Л, М, Н, О, П, Р,"
- u"С, Т, У, Ф, Х, Ц,Ч, Ш, Щ, Ъ, Ы, Ь, Э, Ю, Я")
- args = """-B -q "select '%s'" """ % russian_chars
+ args = """-B -q "select '%s'" """ % RUSSIAN_CHARS
result = run_impala_shell_cmd(args.encode('utf-8'))
assert 'UnicodeDecodeError' not in result.stderr
- assert russian_chars.encode('utf-8') in result.stdout
+ assert RUSSIAN_CHARS.encode('utf-8') in result.stdout
+
+ def test_international_characters_prettyprint(self):
+ """IMPALA-2717: ensure we can handle international characters in pretty-printed
+ output"""
+ args = """-q "select '%s'" """ % RUSSIAN_CHARS
+ result = run_impala_shell_cmd(args.encode('utf-8'))
+ assert 'UnicodeDecodeError' not in result.stderr
+ assert RUSSIAN_CHARS.encode('utf-8') in result.stdout
+
+ def test_international_characters_prettyprint_tabs(self):
+ """IMPALA-2717: ensure we can handle international characters in pretty-printed
+ output when pretty-printing falls back to delimited output."""
+ args = """-q "select '%s\\t'" """ % RUSSIAN_CHARS
+ result = run_impala_shell_cmd(args.encode('utf-8'))
+ assert 'Reverting to tab delimited text' in result.stderr
+ assert 'UnicodeDecodeError' not in result.stderr
+ assert RUSSIAN_CHARS.encode('utf-8') in result.stdout
@pytest.mark.execute_serially # This tests invalidates metadata, and must run serially
def test_config_file(self):