You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ph...@apache.org on 2018/07/19 21:01:22 UTC

[1/2] impala git commit: IMPALA-7259: Improve Impala shell performance

Repository: impala
Updated Branches:
  refs/heads/master 70e2d57fc -> 2a40e8f2a


IMPALA-7259: Improve Impala shell performance

This patch fixes the slow performance in Impala shell, especially for
large queries by replacing all calls to sqlparse.format(sql_string,
strip_comments=True) with the custom implementation of strip comments
that does not use grouping. The code to strip leading comments was also
refactored to not use grouping.

* Benchmark running a query with 12K columns *

Before the patch:
$ time impala-shell.sh -f large.sql --quiet
real    2m4.154s
user    2m0.536s
sys     0m0.088s

After the patch:
$ time impala-shell.sh -f large.sql --quiet
real    0m3.885s
user    0m1.516s
sys     0m0.048s

Testing:
- Added a new test to test the Impala shell performance
- Ran all shell tests on Python 2.6 and Python 2.7

Change-Id: Idac9f3caed7c44846a8c922dbe5ca3bf3b095b81
Reviewed-on: http://gerrit.cloudera.org:8080/10939
Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/1d491d64
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/1d491d64
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/1d491d64

Branch: refs/heads/master
Commit: 1d491d64803908ece376df2d445c7a7f6d5920c8
Parents: 70e2d57
Author: Fredy Wijaya <fw...@cloudera.com>
Authored: Fri Jul 13 03:23:37 2018 -0500
Committer: Fredy Wijaya <fw...@cloudera.com>
Committed: Thu Jul 19 15:32:46 2018 +0000

----------------------------------------------------------------------
 shell/impala_shell.py                 | 47 +++++++++++++++++++++---------
 tests/shell/test_shell_commandline.py | 38 +++++++++++++++++++++++-
 2 files changed, 70 insertions(+), 15 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/1d491d64/shell/impala_shell.py
----------------------------------------------------------------------
diff --git a/shell/impala_shell.py b/shell/impala_shell.py
index aea8350..95ff8ab 100755
--- a/shell/impala_shell.py
+++ b/shell/impala_shell.py
@@ -59,6 +59,18 @@ try:
 except Exception:
   pass
 
+def strip_comments(sql):
+  """sqlparse default implementation of strip comments has a bad performance when parsing
+  very large SQL due to the grouping. This is because the default implementation tries to
+  format the SQL for pretty-printing. Impala shell use of strip comments is mostly for
+  checking and not for altering the actual SQL, so having a pretty-formatted SQL is
+  irrelevant in Impala shell. Removing the grouping gives a significant performance boost.
+  """
+  stack = sqlparse.engine.FilterStack()
+  stack.stmtprocess.append(sqlparse.filters.StripCommentsFilter())
+  stack.postprocess.append(sqlparse.filters.SerializerUnicode())
+  return ''.join(stack.run(sql, 'utf-8')).strip()
+
 class CmdStatus:
   """Values indicate the execution status of a command to the cmd shell driver module
   SUCCESS and ERROR continue running the shell and ABORT exits the shell
@@ -400,13 +412,13 @@ class ImpalaShell(object, cmd.Cmd):
     # Strip any comments to make a statement such as the following be considered as
     # ending with a delimiter:
     # select 1 + 1; -- this is a comment
-    line = sqlparse.format(line, strip_comments=True).encode('utf-8').rstrip()
+    line = strip_comments(line).encode('utf-8').rstrip()
     if line.endswith(ImpalaShell.CMD_DELIM):
       try:
         # Look for an open quotation in the entire command, and not just the
         # current line.
         if self.partial_cmd:
-          line = sqlparse.format('%s %s' % (self.partial_cmd, line), strip_comments=True)
+          line = strip_comments('%s %s' % (self.partial_cmd, line))
         self._shlex_split(line)
         return True
       # If the command ends with a delimiter, check if it has an open quotation.
@@ -1138,7 +1150,7 @@ class ImpalaShell(object, cmd.Cmd):
     query = self._create_beeswax_query(args)
     # Set posix=True and add "'" to escaped quotes
     # to deal with escaped quotes in string literals
-    lexer = shlex.shlex(sqlparse.format(query.query.lstrip(), strip_comments=True)
+    lexer = shlex.shlex(strip_comments(query.query.lstrip())
                         .encode('utf-8'), posix=True)
     lexer.escapedquotes += "'"
     try:
@@ -1341,24 +1353,31 @@ class ImpalaShell(object, cmd.Cmd):
       def _process(self, tlist):
         token = tlist.token_first()
         if self._is_comment(token):
-          self.comment = token.value
-          tidx = tlist.token_index(token)
-          tlist.tokens.pop(tidx)
+          self.comment = ''
+          while token:
+            if self._is_comment(token) or self._is_whitespace(token):
+              tidx = tlist.token_index(token)
+              self.comment += token.value
+              tlist.tokens.pop(tidx)
+              tidx -= 1
+              token = tlist.token_next(tidx, False)
+            else:
+              break
 
       def _is_comment(self, token):
-        if isinstance(token, sqlparse.sql.Comment):
-          return True
-        for comment in sqlparse.tokens.Comment:
-          if token.ttype == comment:
-            return True
-        return False
+        return isinstance(token, sqlparse.sql.Comment) or \
+               token.ttype == sqlparse.tokens.Comment.Single or \
+               token.ttype == sqlparse.tokens.Comment.Multiline
+
+      def _is_whitespace(self, token):
+        return token.ttype == sqlparse.tokens.Whitespace or \
+               token.ttype == sqlparse.tokens.Newline
 
       def process(self, stack, stmt):
         [self.process(stack, sgroup) for sgroup in stmt.get_sublists()]
         self._process(stmt)
 
     stack = sqlparse.engine.FilterStack()
-    stack.enable_grouping()
     strip_leading_comment_filter = StripLeadingCommentFilter()
     stack.stmtprocess.append(strip_leading_comment_filter)
     stack.postprocess.append(sqlparse.filters.SerializerUnicode())
@@ -1481,7 +1500,7 @@ def parse_query_text(query_text, utf8_encode_policy='strict'):
   # "--comment2" is sent as is. Impala's parser however doesn't consider it a valid SQL
   # and throws an exception. We identify such trailing comments and ignore them (do not
   # send them to Impala).
-  if query_list and not sqlparse.format(query_list[-1], strip_comments=True).strip("\n"):
+  if query_list and not strip_comments(query_list[-1]).strip("\n"):
     query_list.pop()
   return query_list
 

http://git-wip-us.apache.org/repos/asf/impala/blob/1d491d64/tests/shell/test_shell_commandline.py
----------------------------------------------------------------------
diff --git a/tests/shell/test_shell_commandline.py b/tests/shell/test_shell_commandline.py
index 93387fa..40453c8 100644
--- a/tests/shell/test_shell_commandline.py
+++ b/tests/shell/test_shell_commandline.py
@@ -23,12 +23,13 @@ import re
 import signal
 import shlex
 import socket
+import tempfile
 
 from subprocess import call, Popen
 from tests.common.impala_service import ImpaladService
 from tests.common.impala_test_suite import ImpalaTestSuite
 from tests.common.skip import SkipIf
-from time import sleep
+from time import sleep, time
 from util import IMPALAD, SHELL_CMD
 from util import assert_var_substitution, run_impala_shell_cmd, ImpalaShell
 
@@ -659,3 +660,38 @@ class TestImpalaShell(ImpalaTestSuite):
     args = "-q \"with v as (select 1) \;\""
     result = run_impala_shell_cmd(args, expect_success=False)
     assert "Encountered: Unexpected character" in result.stderr
+
+  def test_large_sql(self, unique_database):
+    sql_file, sql_path = tempfile.mkstemp()
+    os.write(sql_file, "create table large_table (\n")
+    num_cols = 4000
+    for i in xrange(num_cols):
+      os.write(sql_file, "col_{0} int".format(i))
+      if i < num_cols - 1:
+        os.write(sql_file, ",")
+      os.write(sql_file, "\n")
+    os.write(sql_file, ");")
+    os.write(sql_file, "select \n")
+
+    for i in xrange(num_cols):
+      if i < num_cols:
+        os.write(sql_file, 'col_{0} as a{1},\n'.format(i, i))
+        os.write(sql_file, 'col_{0} as b{1},\n'.format(i, i))
+        os.write(sql_file, 'col_{0} as c{1}{2}\n'.format(i, i,
+                                                     ',' if i < num_cols - 1 else ''))
+    os.write(sql_file, 'from large_table;')
+    os.close(sql_file)
+
+    try:
+      args = "-f {0} -d {1}".format(sql_path, unique_database)
+      start_time = time()
+      result = run_impala_shell_cmd(args)
+      end_time = time()
+      assert "Fetched 0 row(s)" in result.stderr
+      time_limit_s = 30
+      actual_time_s = end_time - start_time
+      assert actual_time_s <= time_limit_s, (
+          "It took {0} seconds to execute the query. Time limit is {1} seconds.".format(
+              actual_time_s, time_limit_s))
+    finally:
+      os.remove(sql_path)


[2/2] impala git commit: test_recover_partitions.py had asserts that were always true.

Posted by ph...@apache.org.
test_recover_partitions.py had asserts that were always true.

Running "python -m compileall" discovered some assertions that
were always true. I've re-instated them to their true spirit.

Change-Id: Id49171304b853f15c43c8cfca066b6694c4a669f
Reviewed-on: http://gerrit.cloudera.org:8080/10993
Reviewed-by: Vuk Ercegovac <ve...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/2a40e8f2
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/2a40e8f2
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/2a40e8f2

Branch: refs/heads/master
Commit: 2a40e8f2a973391b61165ebd95cb30b9b67d93ba
Parents: 1d491d6
Author: Philip Zeyliger <ph...@cloudera.com>
Authored: Thu Jul 19 10:09:11 2018 -0700
Committer: Impala Public Jenkins <im...@cloudera.com>
Committed: Thu Jul 19 20:35:31 2018 +0000

----------------------------------------------------------------------
 tests/metadata/test_recover_partitions.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/2a40e8f2/tests/metadata/test_recover_partitions.py
----------------------------------------------------------------------
diff --git a/tests/metadata/test_recover_partitions.py b/tests/metadata/test_recover_partitions.py
index 1894676..9ba4164 100644
--- a/tests/metadata/test_recover_partitions.py
+++ b/tests/metadata/test_recover_partitions.py
@@ -346,12 +346,11 @@ class TestRecoverPartitions(ImpalaTestSuite):
         self.client, "ALTER TABLE %s RECOVER PARTITIONS" % FQ_TBL_NAME)
       result = self.execute_query_expect_success(
         self.client, "SHOW PARTITIONS %s" % FQ_TBL_NAME)
-      assert (self.count_partition(result.data) == 1,
-        "ALTER TABLE %s RECOVER PARTITIONS produced more than 1 partitions" %
-        FQ_TBL_NAME)
-      assert (self.count_value('p=100%25', result.data) == 1,
-        "ALTER TABLE %s RECOVER PARTITIONS failed to handle encoded partitioned value" %
-        FQ_TBL_NAME)
+      assert self.count_partition(result.data) == 1, \
+        "ALTER TABLE %s RECOVER PARTITIONS produced more than 1 partitions" % FQ_TBL_NAME
+      assert self.count_value('p=100%25', result.data) == 1, \
+        "ALTER TABLE %s RECOVER PARTITIONS failed to handle encoded partitioned value" % \
+        FQ_TBL_NAME
 
   @SkipIfLocal.hdfs_client
   @SkipIfS3.empty_directory