You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by mi...@apache.org on 2018/08/23 02:46:34 UTC

impala git commit: IMPALA-7460 part 1: require user to install Paramiko and Fabric

Repository: impala
Updated Branches:
  refs/heads/master 649f175df -> 971cf179f


IMPALA-7460 part 1: require user to install Paramiko and Fabric

- Remove Fabric and Paramiko as requirements. They aren't needed by
  anything in buildall.sh.
- Add a means to install into the impala-python virtual environment by hand.
  impala-pip is fine for this.
- Add another requirements file for extended testing. The dependency
  situation is messy and untangling that out of impala-python and into
  lib/python should be out of the scope of IMPALA-7460.
- Update core tests, which cover real regressions that have happened in
  the past, to run against locations that don't require a Paramiko
  import. This moves some logic out of concurrent_select.py into a
  thinner module.
- Insulate ssh_util from globally-scoped import so that it only imports
  when needed.

Testing:
- This works in my development environment.
- This works in my downstream stress and query gen environments.
- This works when doing a full data load.
- Impala still builds on a variety of OSs.

Todo:
- A subsequent review will update the versions.

Change-Id: Ibf9010a0387b52c95b7bda5d1d4606eba1008b65
Reviewed-on: http://gerrit.cloudera.org:8080/11264
Reviewed-by: Michael Brown <mi...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/971cf179
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/971cf179
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/971cf179

Branch: refs/heads/master
Commit: 971cf179f674b312cd133397fe74ee8c5f29a215
Parents: 649f175
Author: Michael Brown <mi...@cloudera.com>
Authored: Mon Aug 20 10:24:37 2018 -0700
Committer: Impala Public Jenkins <im...@cloudera.com>
Committed: Thu Aug 23 00:20:15 2018 +0000

----------------------------------------------------------------------
 bin/impala-pip                                  | 21 ++++++
 infra/python/deps/compiled-requirements.txt     |  4 --
 .../python/deps/extended-test-requirements.txt  | 24 +++++++
 tests/comparison/cluster.py                     |  4 +-
 tests/comparison/leopard/impala_docker_env.py   |  7 +-
 tests/infra/test_stress_infra.py                | 10 ++-
 tests/stress/concurrent_select.py               | 69 ++------------------
 tests/util/cluster_controller.py                | 14 ++--
 tests/util/parse_util.py                        | 42 ++++++++++++
 tests/util/ssh_util.py                          | 10 ++-
 tests/util/test_file_parser.py                  | 34 +++++++++-
 11 files changed, 157 insertions(+), 82 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/971cf179/bin/impala-pip
----------------------------------------------------------------------
diff --git a/bin/impala-pip b/bin/impala-pip
new file mode 100755
index 0000000..0041600
--- /dev/null
+++ b/bin/impala-pip
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+source "$(dirname "$0")/impala-python-common.sh"
+exec "$PY_DIR/env/bin/python" "$PY_DIR/env/bin/pip" "$@"

http://git-wip-us.apache.org/repos/asf/impala/blob/971cf179/infra/python/deps/compiled-requirements.txt
----------------------------------------------------------------------
diff --git a/infra/python/deps/compiled-requirements.txt b/infra/python/deps/compiled-requirements.txt
index 2c5590e..72e939a 100644
--- a/infra/python/deps/compiled-requirements.txt
+++ b/infra/python/deps/compiled-requirements.txt
@@ -19,10 +19,6 @@
 # after the toolchain is bootstrapped. Installed after requirements.txt
 
 argparse == 1.4.0
-Fabric == 1.10.2
-  paramiko == 1.15.2
-    ecdsa == 0.13
-  pycrypto == 2.6.1
 impyla == 0.14.0
   bitarray == 0.8.1
   sasl == 0.1.3

http://git-wip-us.apache.org/repos/asf/impala/blob/971cf179/infra/python/deps/extended-test-requirements.txt
----------------------------------------------------------------------
diff --git a/infra/python/deps/extended-test-requirements.txt b/infra/python/deps/extended-test-requirements.txt
new file mode 100644
index 0000000..c62c99a
--- /dev/null
+++ b/infra/python/deps/extended-test-requirements.txt
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Requirements for use of test tooling (tests/comparison and tests/stress).
+
+# IMPALA-7460 TODO: upgrade Fabric and Paramiko
+Fabric == 1.10.2
+  paramiko == 1.15.2
+    ecdsa == 0.13
+  pycrypto == 2.6.1

http://git-wip-us.apache.org/repos/asf/impala/blob/971cf179/tests/comparison/cluster.py
----------------------------------------------------------------------
diff --git a/tests/comparison/cluster.py b/tests/comparison/cluster.py
index 1cf9516..a2d8fc0 100644
--- a/tests/comparison/cluster.py
+++ b/tests/comparison/cluster.py
@@ -48,7 +48,6 @@ from zipfile import ZipFile
 from db_connection import HiveConnection, ImpalaConnection
 from tests.common.errors import Timeout
 from tests.util.shell_util import shell as local_shell
-from tests.util.ssh_util import SshClient
 from tests.util.parse_util import parse_glog, parse_mem_to_mb
 
 LOG = logging.getLogger(os.path.splitext(os.path.basename(__file__))[0])
@@ -292,6 +291,9 @@ class CmCluster(Cluster):
       if clients:
         client = clients.pop()
       else:
+        # IMPALA-7460: Insulate this import away from the global context so as to avoid
+        # requiring Paramiko unless it's absolutely needed.
+        from tests.util.ssh_util import SshClient
         LOG.debug("Creating new SSH client for %s", host_name)
         client = SshClient()
         client.connect(host_name, username=self.ssh_user, key_filename=self.ssh_key_file)

http://git-wip-us.apache.org/repos/asf/impala/blob/971cf179/tests/comparison/leopard/impala_docker_env.py
----------------------------------------------------------------------
diff --git a/tests/comparison/leopard/impala_docker_env.py b/tests/comparison/leopard/impala_docker_env.py
index 38555a4..a837c00 100755
--- a/tests/comparison/leopard/impala_docker_env.py
+++ b/tests/comparison/leopard/impala_docker_env.py
@@ -18,7 +18,12 @@
 '''This module generates a docker environment for a job'''
 
 from __future__ import division
-from fabric.api import sudo, run, settings
+try:
+  from fabric.api import sudo, run, settings
+except ImportError as e:
+  raise Exception(
+      "Please run impala-pip install -r $IMPALA_HOME/infra/python/deps/extended-test-"
+      "requirements.txt:\n{0}".format(str(e)))
 from logging import getLogger
 from os.path import (
     join as join_path,

http://git-wip-us.apache.org/repos/asf/impala/blob/971cf179/tests/infra/test_stress_infra.py
----------------------------------------------------------------------
diff --git a/tests/infra/test_stress_infra.py b/tests/infra/test_stress_infra.py
index 58f7625..7e97ffa 100644
--- a/tests/infra/test_stress_infra.py
+++ b/tests/infra/test_stress_infra.py
@@ -24,12 +24,10 @@ import pytest
 from decimal import Decimal
 
 from tests.common.impala_test_suite import ImpalaTestSuite
-from tests.stress.concurrent_select import (
-    EXPECTED_TPCDS_QUERIES_COUNT,
-    EXPECTED_TPCH_NESTED_QUERIES_COUNT,
-    EXPECTED_TPCH_QUERIES_COUNT,
-    load_tpc_queries,
-    match_memory_estimate)
+from tests.util.parse_util import (
+    EXPECTED_TPCDS_QUERIES_COUNT, EXPECTED_TPCH_NESTED_QUERIES_COUNT,
+    EXPECTED_TPCH_QUERIES_COUNT, match_memory_estimate)
+from tests.util.test_file_parser import load_tpc_queries
 
 
 class TestStressInfra(ImpalaTestSuite):

http://git-wip-us.apache.org/repos/asf/impala/blob/971cf179/tests/stress/concurrent_select.py
----------------------------------------------------------------------
diff --git a/tests/stress/concurrent_select.py b/tests/stress/concurrent_select.py
index 64e06e1..71fc146 100755
--- a/tests/stress/concurrent_select.py
+++ b/tests/stress/concurrent_select.py
@@ -85,25 +85,13 @@ from tests.comparison.db_types import Int, TinyInt, SmallInt, BigInt
 from tests.comparison.model_translator import SqlWriter
 from tests.comparison.query_generator import QueryGenerator
 from tests.comparison.query_profile import DefaultProfile
-from tests.util.parse_util import parse_mem_to_mb
+from tests.util.parse_util import (
+    EXPECTED_TPCDS_QUERIES_COUNT, EXPECTED_TPCH_NESTED_QUERIES_COUNT,
+    EXPECTED_TPCH_QUERIES_COUNT, match_memory_estimate, parse_mem_to_mb)
 from tests.util.thrift_util import op_handle_to_query_id
 
 LOG = logging.getLogger(os.path.splitext(os.path.basename(__file__))[0])
 
-# IMPALA-6715: Every so often the stress test or the TPC workload directories get
-# changed, and the stress test loses the ability to run the full set of queries. Set
-# these constants and assert that when a workload is used, all the queries we expect to
-# use are there.
-EXPECTED_TPCDS_QUERIES_COUNT = 71
-EXPECTED_TPCH_NESTED_QUERIES_COUNT = 22
-EXPECTED_TPCH_QUERIES_COUNT = 22
-
-# Regex to extract the estimated memory from an explain plan.
-# The unit prefixes can be found in
-# fe/src/main/java/org/apache/impala/common/PrintUtils.java
-MEM_ESTIMATE_PATTERN = re.compile(
-    r"Per-Host Resource Estimates: Memory=(\d+\.?\d*)(P|T|G|M|K)?B")
-
 PROFILES_DIR = "profiles"
 RESULT_HASHES_DIR = "result_hashes"
 
@@ -1230,25 +1218,10 @@ class QueryRunner(object):
 def load_tpc_queries(workload):
   """Returns a list of TPC queries. 'workload' should either be 'tpch' or 'tpcds'."""
   LOG.info("Loading %s queries", workload)
-  queries = list()
-  query_dir = os.path.join(
-      os.path.dirname(__file__), "..", "..", "testdata", "workloads", workload, "queries")
-  # IMPALA-6715 and others from the past: This pattern enforces the queries we actually
-  # find. Both workload directories contain other queries that are not part of the TPC
-  # spec.
-  file_name_pattern = re.compile(r"^{0}-(q.*).test$".format(workload))
-  for query_file in os.listdir(query_dir):
-    match = file_name_pattern.search(query_file)
-    if not match:
-      continue
-    file_path = os.path.join(query_dir, query_file)
-    file_queries = load_queries_from_test_file(file_path)
-    if len(file_queries) != 1:
-      raise Exception(
-          "Expected exactly 1 query to be in file %s but got %s"
-          % (file_path, len(file_queries)))
-    query = file_queries[0]
-    query.name = match.group(1)
+  queries = []
+  for query_text in test_file_parser.load_tpc_queries(workload):
+    query = Query()
+    query.sql = query_text
     queries.append(query)
   return queries
 
@@ -1537,34 +1510,6 @@ def populate_runtime_info(query, impala, converted_args, timeout_secs=maxint):
   LOG.debug("Query after populating runtime info: %s", query)
 
 
-def match_memory_estimate(explain_lines):
-  """
-  Given a list of strings from EXPLAIN output, find the estimated memory needed. This is
-  used as a binary search start point.
-
-  Params:
-    explain_lines: list of str
-
-  Returns:
-    2-tuple str of memory limit in decimal string and units (one of 'P', 'T', 'G', 'M',
-    'K', '' bytes)
-
-  Raises:
-    Exception if no match found
-  """
-  # IMPALA-6441: This method is a public, first class method so it can be importable and
-  # tested with actual EXPLAIN output to make sure we always find the start point.
-  mem_limit, units = None, None
-  for line in explain_lines:
-    regex_result = MEM_ESTIMATE_PATTERN.search(line)
-    if regex_result:
-      mem_limit, units = regex_result.groups()
-      break
-  if None in (mem_limit, units):
-    raise Exception('could not parse explain string:\n' + '\n'.join(explain_lines))
-  return mem_limit, units
-
-
 def estimate_query_mem_mb_usage(query, query_runner):
   """Runs an explain plan then extracts and returns the estimated memory needed to run
   the query.

http://git-wip-us.apache.org/repos/asf/impala/blob/971cf179/tests/util/cluster_controller.py
----------------------------------------------------------------------
diff --git a/tests/util/cluster_controller.py b/tests/util/cluster_controller.py
index 6b45429..388fa49 100644
--- a/tests/util/cluster_controller.py
+++ b/tests/util/cluster_controller.py
@@ -15,17 +15,23 @@
 # specific language governing permissions and limitations
 # under the License.
 
-import fabric.decorators
+try:
+  import fabric.decorators
+  from fabric.context_managers import hide, settings
+  from fabric.operations import local, run, sudo
+  from fabric.tasks import execute
+except ImportError as e:
+  raise Exception(
+      "Please run impala-pip install -r $IMPALA_HOME/infra/python/deps/extended-test-"
+      "requirements.txt:\n{0}".format(str(e)))
 import logging
 import os
 from contextlib import contextmanager
-from fabric.context_managers import hide, settings
-from fabric.operations import local, run, sudo
-from fabric.tasks import execute
 from textwrap import dedent
 
 LOG = logging.getLogger('cluster_controller')
 
+
 class ClusterController(object):
   """A convenience wrapper around fabric."""
 

http://git-wip-us.apache.org/repos/asf/impala/blob/971cf179/tests/util/parse_util.py
----------------------------------------------------------------------
diff --git a/tests/util/parse_util.py b/tests/util/parse_util.py
index 592716b..b84f635 100644
--- a/tests/util/parse_util.py
+++ b/tests/util/parse_util.py
@@ -18,8 +18,21 @@
 import re
 from datetime import datetime
 
+# IMPALA-6715: Every so often the stress test or the TPC workload directories get
+# changed, and the stress test loses the ability to run the full set of queries. Set
+# these constants and assert that when a workload is used, all the queries we expect to
+# use are there.
+EXPECTED_TPCDS_QUERIES_COUNT = 71
+EXPECTED_TPCH_NESTED_QUERIES_COUNT = 22
+EXPECTED_TPCH_QUERIES_COUNT = 22
+# Regex to extract the estimated memory from an explain plan.
+# The unit prefixes can be found in
+# fe/src/main/java/org/apache/impala/common/PrintUtils.java
+MEM_ESTIMATE_PATTERN = re.compile(
+    r"Per-Host Resource Estimates: Memory=(\d+\.?\d*)(P|T|G|M|K)?B")
 NEW_GLOG_ENTRY_PATTERN = re.compile(r"[IWEF](?P<Time>\d{4} \d{2}:\d{2}:\d{2}\.\d{6}).*")
 
+
 def parse_glog(text, start_time=None):
   '''Parses the log 'text' and returns a list of log entries. If a 'start_time' is
      provided only log entries that are after the time will be returned.
@@ -71,6 +84,7 @@ def parse_mem_to_mb(mem, units):
     raise Exception('Unexpected memory unit "%s"' % units)
   return int(mem)
 
+
 def parse_duration_string_ms(duration):
   """Parses a duration string of the form 1h2h3m4s5.6ms4.5us7.8ns into milliseconds."""
   pattern = r'(?P<value>[0-9]+\.?[0-9]*?)(?P<units>\D+)'
@@ -83,3 +97,31 @@ def parse_duration_string_ms(duration):
     times[parsed['units']] = float(parsed['value'])
 
   return (times['h'] * 60 * 60 + times['m'] * 60 + times['s']) * 1000 + times['ms']
+
+
+def match_memory_estimate(explain_lines):
+  """
+  Given a list of strings from EXPLAIN output, find the estimated memory needed. This is
+  used as a binary search start point.
+
+  Params:
+    explain_lines: list of str
+
+  Returns:
+    2-tuple str of memory limit in decimal string and units (one of 'P', 'T', 'G', 'M',
+    'K', '' bytes)
+
+  Raises:
+    Exception if no match found
+  """
+  # IMPALA-6441: This method is a public, first class method so it can be importable and
+  # tested with actual EXPLAIN output to make sure we always find the start point.
+  mem_limit, units = None, None
+  for line in explain_lines:
+    regex_result = MEM_ESTIMATE_PATTERN.search(line)
+    if regex_result:
+      mem_limit, units = regex_result.groups()
+      break
+  if None in (mem_limit, units):
+    raise Exception('could not parse explain string:\n' + '\n'.join(explain_lines))
+  return mem_limit, units

http://git-wip-us.apache.org/repos/asf/impala/blob/971cf179/tests/util/ssh_util.py
----------------------------------------------------------------------
diff --git a/tests/util/ssh_util.py b/tests/util/ssh_util.py
index 8a56d96..a26f1bc 100644
--- a/tests/util/ssh_util.py
+++ b/tests/util/ssh_util.py
@@ -18,13 +18,19 @@
 import atexit
 import logging
 import os
-import paramiko
+try:
+  import paramiko
+except ImportError as e:
+  raise Exception(
+      "Please run impala-pip install -r $IMPALA_HOME/infra/python/deps/extended-test-"
+      "requirements.txt:\n{0}".format(str(e)))
 import textwrap
 import time
 
 LOG = logging.getLogger(os.path.splitext(os.path.basename(__file__))[0])
 
-from tests.common.errors import Timeout
+from tests.common.errors import Timeout  # noqa:E402
+
 
 class SshClient(paramiko.SSHClient):
   """A paramiko SSH client modified to:

http://git-wip-us.apache.org/repos/asf/impala/blob/971cf179/tests/util/test_file_parser.py
----------------------------------------------------------------------
diff --git a/tests/util/test_file_parser.py b/tests/util/test_file_parser.py
index 9fa8d70..8d3c58a 100644
--- a/tests/util/test_file_parser.py
+++ b/tests/util/test_file_parser.py
@@ -20,9 +20,11 @@
 import codecs
 import collections
 import logging
+import os
+import os.path
 import re
+
 from collections import defaultdict
-from os.path import isfile
 from textwrap import dedent
 
 LOG = logging.getLogger('impala_test_suite')
@@ -102,7 +104,7 @@ def parse_table_constraints(constraints_file):
   schema_include = defaultdict(list)
   schema_exclude = defaultdict(list)
   schema_only = defaultdict(list)
-  if not isfile(constraints_file):
+  if not os.path.isfile(constraints_file):
     LOG.info('No schema constraints file file found')
   else:
     with open(constraints_file, 'rb') as constraints_file:
@@ -313,3 +315,31 @@ def write_test_file(test_file_name, test_file_sections, encoding=None):
           test_file_text.append(section_value)
     test_file_text.append(SECTION_DELIMITER)
     test_file.write(('\n').join(test_file_text))
+
+
+def load_tpc_queries(workload):
+  """Returns a list of TPC queries. 'workload' should either be 'tpch' or 'tpcds'."""
+  LOG.info("Loading %s queries", workload)
+  queries = list()
+  query_dir = os.path.join(
+      os.environ['IMPALA_HOME'], "testdata", "workloads", workload, "queries")
+  # IMPALA-6715 and others from the past: This pattern enforces the queries we actually
+  # find. Both workload directories contain other queries that are not part of the TPC
+  # spec.
+  file_name_pattern = re.compile(r"^{0}-(q.*).test$".format(workload))
+  for query_file in os.listdir(query_dir):
+    match = file_name_pattern.search(query_file)
+    if not match:
+      continue
+    file_path = os.path.join(query_dir, query_file)
+    test_cases = parse_query_test_file(file_path)
+    file_queries = list()
+    for test_case in test_cases:
+      query_sql = remove_comments(test_case["QUERY"])
+      file_queries.append(query_sql)
+    if len(file_queries) != 1:
+      raise Exception(
+          "Expected exactly 1 query to be in file %s but got %s"
+          % (file_path, len(file_queries)))
+    queries.append(file_queries[0])
+  return queries