You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2019/06/25 15:45:17 UTC
[impala] 05/20: IMPALA-7259: Improve Impala shell performance
This is an automated email from the ASF dual-hosted git repository.
tarmstrong pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/impala.git
commit eee10343558c790cdb20238c3d132d486b190a9c
Author: Fredy Wijaya <fw...@cloudera.com>
AuthorDate: Fri Jul 13 03:23:37 2018 -0500
IMPALA-7259: Improve Impala shell performance
This patch fixes the slow performance in Impala shell, especially for
large queries by replacing all calls to sqlparse.format(sql_string,
strip_comments=True) with the custom implementation of strip comments
that does not use grouping. The code to strip leading comments was also
refactored to not use grouping.
* Benchmark running a query with 12K columns *
Before the patch:
$ time impala-shell.sh -f large.sql --quiet
real 2m4.154s
user 2m0.536s
sys 0m0.088s
After the patch:
$ time impala-shell.sh -f large.sql --quiet
real 0m3.885s
user 0m1.516s
sys 0m0.048s
Testing:
- Added a new test to test the Impala shell performance
- Ran all shell tests on Python 2.6 and Python 2.7
Change-Id: Idac9f3caed7c44846a8c922dbe5ca3bf3b095b81
Reviewed-on: http://gerrit.cloudera.org:8080/10939
Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
shell/impala_shell.py | 47 ++++++++++++++++++++++++-----------
tests/shell/test_shell_commandline.py | 38 +++++++++++++++++++++++++++-
2 files changed, 70 insertions(+), 15 deletions(-)
diff --git a/shell/impala_shell.py b/shell/impala_shell.py
index a6ab277..0446f69 100755
--- a/shell/impala_shell.py
+++ b/shell/impala_shell.py
@@ -59,6 +59,18 @@ try:
except Exception:
pass
+def strip_comments(sql):
+ """sqlparse default implementation of strip comments has a bad performance when parsing
+ very large SQL due to the grouping. This is because the default implementation tries to
+ format the SQL for pretty-printing. Impala shell use of strip comments is mostly for
+ checking and not for altering the actual SQL, so having a pretty-formatted SQL is
+ irrelevant in Impala shell. Removing the grouping gives a significant performance boost.
+ """
+ stack = sqlparse.engine.FilterStack()
+ stack.stmtprocess.append(sqlparse.filters.StripCommentsFilter())
+ stack.postprocess.append(sqlparse.filters.SerializerUnicode())
+ return ''.join(stack.run(sql, 'utf-8')).strip()
+
class CmdStatus:
"""Values indicate the execution status of a command to the cmd shell driver module
SUCCESS and ERROR continue running the shell and ABORT exits the shell
@@ -401,13 +413,13 @@ class ImpalaShell(object, cmd.Cmd):
# Strip any comments to make a statement such as the following be considered as
# ending with a delimiter:
# select 1 + 1; -- this is a comment
- line = sqlparse.format(line, strip_comments=True).encode('utf-8').rstrip()
+ line = strip_comments(line).encode('utf-8').rstrip()
if line.endswith(ImpalaShell.CMD_DELIM):
try:
# Look for an open quotation in the entire command, and not just the
# current line.
if self.partial_cmd:
- line = sqlparse.format('%s %s' % (self.partial_cmd, line), strip_comments=True)
+ line = strip_comments('%s %s' % (self.partial_cmd, line))
self._shlex_split(line)
return True
# If the command ends with a delimiter, check if it has an open quotation.
@@ -1142,7 +1154,7 @@ class ImpalaShell(object, cmd.Cmd):
query = self._create_beeswax_query(args)
# Set posix=True and add "'" to escaped quotes
# to deal with escaped quotes in string literals
- lexer = shlex.shlex(sqlparse.format(query.query.lstrip(), strip_comments=True)
+ lexer = shlex.shlex(strip_comments(query.query.lstrip())
.encode('utf-8'), posix=True)
lexer.escapedquotes += "'"
try:
@@ -1345,24 +1357,31 @@ class ImpalaShell(object, cmd.Cmd):
def _process(self, tlist):
token = tlist.token_first()
if self._is_comment(token):
- self.comment = token.value
- tidx = tlist.token_index(token)
- tlist.tokens.pop(tidx)
+ self.comment = ''
+ while token:
+ if self._is_comment(token) or self._is_whitespace(token):
+ tidx = tlist.token_index(token)
+ self.comment += token.value
+ tlist.tokens.pop(tidx)
+ tidx -= 1
+ token = tlist.token_next(tidx, False)
+ else:
+ break
def _is_comment(self, token):
- if isinstance(token, sqlparse.sql.Comment):
- return True
- for comment in sqlparse.tokens.Comment:
- if token.ttype == comment:
- return True
- return False
+ return isinstance(token, sqlparse.sql.Comment) or \
+ token.ttype == sqlparse.tokens.Comment.Single or \
+ token.ttype == sqlparse.tokens.Comment.Multiline
+
+ def _is_whitespace(self, token):
+ return token.ttype == sqlparse.tokens.Whitespace or \
+ token.ttype == sqlparse.tokens.Newline
def process(self, stack, stmt):
[self.process(stack, sgroup) for sgroup in stmt.get_sublists()]
self._process(stmt)
stack = sqlparse.engine.FilterStack()
- stack.enable_grouping()
strip_leading_comment_filter = StripLeadingCommentFilter()
stack.stmtprocess.append(strip_leading_comment_filter)
stack.postprocess.append(sqlparse.filters.SerializerUnicode())
@@ -1492,7 +1511,7 @@ def parse_query_text(query_text, utf8_encode_policy='strict'):
# "--comment2" is sent as is. Impala's parser however doesn't consider it a valid SQL
# and throws an exception. We identify such trailing comments and ignore them (do not
# send them to Impala).
- if query_list and not sqlparse.format(query_list[-1], strip_comments=True).strip("\n"):
+ if query_list and not strip_comments(query_list[-1]).strip("\n"):
query_list.pop()
return query_list
diff --git a/tests/shell/test_shell_commandline.py b/tests/shell/test_shell_commandline.py
index 858e46b..6fc34fe 100644
--- a/tests/shell/test_shell_commandline.py
+++ b/tests/shell/test_shell_commandline.py
@@ -23,12 +23,13 @@ import re
import signal
import shlex
import socket
+import tempfile
from subprocess import call, Popen
from tests.common.impala_service import ImpaladService
from tests.common.impala_test_suite import ImpalaTestSuite
from tests.common.skip import SkipIf
-from time import sleep
+from time import sleep, time
from util import IMPALAD, SHELL_CMD
from util import assert_var_substitution, run_impala_shell_cmd, ImpalaShell
@@ -667,3 +668,38 @@ class TestImpalaShell(ImpalaTestSuite):
args = "-q \"with v as (select 1) \;\""
result = run_impala_shell_cmd(args, expect_success=False)
assert "Encountered: Unexpected character" in result.stderr
+
+ def test_large_sql(self, unique_database):
+ sql_file, sql_path = tempfile.mkstemp()
+ os.write(sql_file, "create table large_table (\n")
+ num_cols = 4000
+ for i in xrange(num_cols):
+ os.write(sql_file, "col_{0} int".format(i))
+ if i < num_cols - 1:
+ os.write(sql_file, ",")
+ os.write(sql_file, "\n")
+ os.write(sql_file, ");")
+ os.write(sql_file, "select \n")
+
+ for i in xrange(num_cols):
+ if i < num_cols:
+ os.write(sql_file, 'col_{0} as a{1},\n'.format(i, i))
+ os.write(sql_file, 'col_{0} as b{1},\n'.format(i, i))
+ os.write(sql_file, 'col_{0} as c{1}{2}\n'.format(i, i,
+ ',' if i < num_cols - 1 else ''))
+ os.write(sql_file, 'from large_table;')
+ os.close(sql_file)
+
+ try:
+ args = "-f {0} -d {1}".format(sql_path, unique_database)
+ start_time = time()
+ result = run_impala_shell_cmd(args)
+ end_time = time()
+ assert "Fetched 0 row(s)" in result.stderr
+ time_limit_s = 30
+ actual_time_s = end_time - start_time
+ assert actual_time_s <= time_limit_s, (
+ "It took {0} seconds to execute the query. Time limit is {1} seconds.".format(
+ actual_time_s, time_limit_s))
+ finally:
+ os.remove(sql_path)