You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2019/06/25 15:45:17 UTC

[impala] 05/20: IMPALA-7259: Improve Impala shell performance

This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/impala.git

commit eee10343558c790cdb20238c3d132d486b190a9c
Author: Fredy Wijaya <fw...@cloudera.com>
AuthorDate: Fri Jul 13 03:23:37 2018 -0500

    IMPALA-7259: Improve Impala shell performance
    
    This patch fixes the slow performance in Impala shell, especially for
    large queries by replacing all calls to sqlparse.format(sql_string,
    strip_comments=True) with the custom implementation of strip comments
    that does not use grouping. The code to strip leading comments was also
    refactored to not use grouping.
    
    * Benchmark running a query with 12K columns *
    
    Before the patch:
    $ time impala-shell.sh -f large.sql --quiet
    real    2m4.154s
    user    2m0.536s
    sys     0m0.088s
    
    After the patch:
    $ time impala-shell.sh -f large.sql --quiet
    real    0m3.885s
    user    0m1.516s
    sys     0m0.048s
    
    Testing:
    - Added a new test to test the Impala shell performance
    - Ran all shell tests on Python 2.6 and Python 2.7
    
    Change-Id: Idac9f3caed7c44846a8c922dbe5ca3bf3b095b81
    Reviewed-on: http://gerrit.cloudera.org:8080/10939
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 shell/impala_shell.py                 | 47 ++++++++++++++++++++++++-----------
 tests/shell/test_shell_commandline.py | 38 +++++++++++++++++++++++++++-
 2 files changed, 70 insertions(+), 15 deletions(-)

diff --git a/shell/impala_shell.py b/shell/impala_shell.py
index a6ab277..0446f69 100755
--- a/shell/impala_shell.py
+++ b/shell/impala_shell.py
@@ -59,6 +59,18 @@ try:
 except Exception:
   pass
 
+def strip_comments(sql):
+  """sqlparse default implementation of strip comments has a bad performance when parsing
+  very large SQL due to the grouping. This is because the default implementation tries to
+  format the SQL for pretty-printing. Impala shell use of strip comments is mostly for
+  checking and not for altering the actual SQL, so having a pretty-formatted SQL is
+  irrelevant in Impala shell. Removing the grouping gives a significant performance boost.
+  """
+  stack = sqlparse.engine.FilterStack()
+  stack.stmtprocess.append(sqlparse.filters.StripCommentsFilter())
+  stack.postprocess.append(sqlparse.filters.SerializerUnicode())
+  return ''.join(stack.run(sql, 'utf-8')).strip()
+
 class CmdStatus:
   """Values indicate the execution status of a command to the cmd shell driver module
   SUCCESS and ERROR continue running the shell and ABORT exits the shell
@@ -401,13 +413,13 @@ class ImpalaShell(object, cmd.Cmd):
     # Strip any comments to make a statement such as the following be considered as
     # ending with a delimiter:
     # select 1 + 1; -- this is a comment
-    line = sqlparse.format(line, strip_comments=True).encode('utf-8').rstrip()
+    line = strip_comments(line).encode('utf-8').rstrip()
     if line.endswith(ImpalaShell.CMD_DELIM):
       try:
         # Look for an open quotation in the entire command, and not just the
         # current line.
         if self.partial_cmd:
-          line = sqlparse.format('%s %s' % (self.partial_cmd, line), strip_comments=True)
+          line = strip_comments('%s %s' % (self.partial_cmd, line))
         self._shlex_split(line)
         return True
       # If the command ends with a delimiter, check if it has an open quotation.
@@ -1142,7 +1154,7 @@ class ImpalaShell(object, cmd.Cmd):
     query = self._create_beeswax_query(args)
     # Set posix=True and add "'" to escaped quotes
     # to deal with escaped quotes in string literals
-    lexer = shlex.shlex(sqlparse.format(query.query.lstrip(), strip_comments=True)
+    lexer = shlex.shlex(strip_comments(query.query.lstrip())
                         .encode('utf-8'), posix=True)
     lexer.escapedquotes += "'"
     try:
@@ -1345,24 +1357,31 @@ class ImpalaShell(object, cmd.Cmd):
       def _process(self, tlist):
         token = tlist.token_first()
         if self._is_comment(token):
-          self.comment = token.value
-          tidx = tlist.token_index(token)
-          tlist.tokens.pop(tidx)
+          self.comment = ''
+          while token:
+            if self._is_comment(token) or self._is_whitespace(token):
+              tidx = tlist.token_index(token)
+              self.comment += token.value
+              tlist.tokens.pop(tidx)
+              tidx -= 1
+              token = tlist.token_next(tidx, False)
+            else:
+              break
 
       def _is_comment(self, token):
-        if isinstance(token, sqlparse.sql.Comment):
-          return True
-        for comment in sqlparse.tokens.Comment:
-          if token.ttype == comment:
-            return True
-        return False
+        return isinstance(token, sqlparse.sql.Comment) or \
+               token.ttype == sqlparse.tokens.Comment.Single or \
+               token.ttype == sqlparse.tokens.Comment.Multiline
+
+      def _is_whitespace(self, token):
+        return token.ttype == sqlparse.tokens.Whitespace or \
+               token.ttype == sqlparse.tokens.Newline
 
       def process(self, stack, stmt):
         [self.process(stack, sgroup) for sgroup in stmt.get_sublists()]
         self._process(stmt)
 
     stack = sqlparse.engine.FilterStack()
-    stack.enable_grouping()
     strip_leading_comment_filter = StripLeadingCommentFilter()
     stack.stmtprocess.append(strip_leading_comment_filter)
     stack.postprocess.append(sqlparse.filters.SerializerUnicode())
@@ -1492,7 +1511,7 @@ def parse_query_text(query_text, utf8_encode_policy='strict'):
   # "--comment2" is sent as is. Impala's parser however doesn't consider it a valid SQL
   # and throws an exception. We identify such trailing comments and ignore them (do not
   # send them to Impala).
-  if query_list and not sqlparse.format(query_list[-1], strip_comments=True).strip("\n"):
+  if query_list and not strip_comments(query_list[-1]).strip("\n"):
     query_list.pop()
   return query_list
 
diff --git a/tests/shell/test_shell_commandline.py b/tests/shell/test_shell_commandline.py
index 858e46b..6fc34fe 100644
--- a/tests/shell/test_shell_commandline.py
+++ b/tests/shell/test_shell_commandline.py
@@ -23,12 +23,13 @@ import re
 import signal
 import shlex
 import socket
+import tempfile
 
 from subprocess import call, Popen
 from tests.common.impala_service import ImpaladService
 from tests.common.impala_test_suite import ImpalaTestSuite
 from tests.common.skip import SkipIf
-from time import sleep
+from time import sleep, time
 from util import IMPALAD, SHELL_CMD
 from util import assert_var_substitution, run_impala_shell_cmd, ImpalaShell
 
@@ -667,3 +668,38 @@ class TestImpalaShell(ImpalaTestSuite):
     args = "-q \"with v as (select 1) \;\""
     result = run_impala_shell_cmd(args, expect_success=False)
     assert "Encountered: Unexpected character" in result.stderr
+
+  def test_large_sql(self, unique_database):
+    sql_file, sql_path = tempfile.mkstemp()
+    os.write(sql_file, "create table large_table (\n")
+    num_cols = 4000
+    for i in xrange(num_cols):
+      os.write(sql_file, "col_{0} int".format(i))
+      if i < num_cols - 1:
+        os.write(sql_file, ",")
+      os.write(sql_file, "\n")
+    os.write(sql_file, ");")
+    os.write(sql_file, "select \n")
+
+    for i in xrange(num_cols):
+      if i < num_cols:
+        os.write(sql_file, 'col_{0} as a{1},\n'.format(i, i))
+        os.write(sql_file, 'col_{0} as b{1},\n'.format(i, i))
+        os.write(sql_file, 'col_{0} as c{1}{2}\n'.format(i, i,
+                                                     ',' if i < num_cols - 1 else ''))
+    os.write(sql_file, 'from large_table;')
+    os.close(sql_file)
+
+    try:
+      args = "-f {0} -d {1}".format(sql_path, unique_database)
+      start_time = time()
+      result = run_impala_shell_cmd(args)
+      end_time = time()
+      assert "Fetched 0 row(s)" in result.stderr
+      time_limit_s = 30
+      actual_time_s = end_time - start_time
+      assert actual_time_s <= time_limit_s, (
+          "It took {0} seconds to execute the query. Time limit is {1} seconds.".format(
+              actual_time_s, time_limit_s))
+    finally:
+      os.remove(sql_path)