You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2019/10/11 00:15:40 UTC

[impala] branch master updated (0dff5ef -> 89dc05b)

This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git.


    from 0dff5ef  IMPALA-9015: improve mt_dop scan scheduling
     new ba808f6  IMPALA-1071: Distributable python package for impala-shell
     new cb2430c  IMPALA-8893: [DOCS] Document the new startup flag for cookie-based authn
     new 89dc05b  IMPALA-9033: log on slow HDFS I/Os

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 CMakeLists.txt                         |   4 +
 be/src/runtime/io/hdfs-file-reader.cc  |  37 +++++++-
 be/src/runtime/io/hdfs-file-reader.h   |   4 +-
 be/src/util/runtime-profile-counters.h |   3 +
 bin/rat_exclude_files.txt              |   3 +
 docs/topics/impala_client.xml          |  63 ++++++++++--
 shell/impala_client.py                 |  37 ++++++++
 shell/packaging/MANIFEST.in            |   3 +
 shell/packaging/README.md              |  73 ++++++++++++++
 shell/packaging/__init__.py            |  40 ++++++++
 shell/packaging/make_python_package.sh |  87 +++++++++++++++++
 shell/packaging/requirements.txt       |   8 ++
 shell/packaging/setup.py               | 169 +++++++++++++++++++++++++++++++++
 13 files changed, 519 insertions(+), 12 deletions(-)
 create mode 100644 shell/packaging/MANIFEST.in
 create mode 100644 shell/packaging/README.md
 create mode 100644 shell/packaging/__init__.py
 create mode 100755 shell/packaging/make_python_package.sh
 create mode 100644 shell/packaging/requirements.txt
 create mode 100644 shell/packaging/setup.py


[impala] 01/03: IMPALA-1071: Distributable python package for impala-shell

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit ba808f67ddb55639d468da1d06bcad0da332b9be
Author: David Knupp <dk...@cloudera.com>
AuthorDate: Fri Sep 20 17:09:28 2019 -0700

    IMPALA-1071: Distributable python package for impala-shell
    
    The patch adds a set of scripts for converting the impala-shell
    into a true distributable python package. The package can be
    installed using familiar python commands, e.g.:
    
      $ python setup.py (install|develop)
    
    or
    
      $ pip install -e /path/to/dist/dir
    
    The entry point script, make_python_package.sh, will run as a
    part of the standard sequence of steps that results from calling
    buildall.sh, and will produce a gzipped tarball inside of
    Impala/shell/dist as an artifact. Thereafter, make_python_package.sh
    can be run manually any time.
    
    The expectation is that an official maintainer would need to manually
    upload official releases to the Python Package Index as appropriate.
    
    Change-Id: Ib8c745bddddf6a16f0c039430152745a2f00e044
    Reviewed-on: http://gerrit.cloudera.org:8080/14181
    Reviewed-by: David Knupp <dk...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 CMakeLists.txt                         |   4 +
 bin/rat_exclude_files.txt              |   3 +
 shell/impala_client.py                 |  37 ++++++++
 shell/packaging/MANIFEST.in            |   3 +
 shell/packaging/README.md              |  73 ++++++++++++++
 shell/packaging/__init__.py            |  40 ++++++++
 shell/packaging/make_python_package.sh |  87 +++++++++++++++++
 shell/packaging/requirements.txt       |   8 ++
 shell/packaging/setup.py               | 169 +++++++++++++++++++++++++++++++++
 9 files changed, 424 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6d72430..c355218 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -421,6 +421,10 @@ add_custom_target(shell_tarball DEPENDS gen-deps
   COMMAND "${CMAKE_SOURCE_DIR}/shell/make_shell_tarball.sh"
 )
 
+add_custom_target(shell_pypi_package DEPENDS shell_tarball
+  COMMAND "DIST_DIR=${CMAKE_SOURCE_DIR}/shell/dist CLEAN_DIST=true ${CMAKE_SOURCE_DIR}/shell/packaging/make_python_package.sh"
+)
+
 add_custom_target(cscope ALL DEPENDS gen-deps
   COMMAND "${CMAKE_SOURCE_DIR}/bin/gen-cscope.sh"
 )
diff --git a/bin/rat_exclude_files.txt b/bin/rat_exclude_files.txt
index 9e15b0a..aefaeef 100644
--- a/bin/rat_exclude_files.txt
+++ b/bin/rat_exclude_files.txt
@@ -24,6 +24,8 @@ bin/diagnostics/__init__.py
 www/index.html
 lib/python/impala_py_lib/__init__.py
 lib/python/impala_py_lib/jenkins/__init__.py
+shell/packaging/MANIFEST.in
+shell/packaging/requirements.txt
 
 # See $IMPALA_HOME/LICENSE.txt
 be/src/gutil/*
@@ -91,6 +93,7 @@ docker/README.md
 be/src/thirdparty/pcg-cpp-0.98/README.md
 lib/python/README.md
 lib/python/impala_py_lib/gdb/README.md
+shell/packaging/README.md
 
 # http://www.apache.org/legal/src-headers.html: "Test data for which the addition of a
 # source header would cause the tests to fail."
diff --git a/shell/impala_client.py b/shell/impala_client.py
index 6761e8e..4dd22a8 100755
--- a/shell/impala_client.py
+++ b/shell/impala_client.py
@@ -939,6 +939,8 @@ class ImpalaBeeswaxClient(ImpalaClient):
       if t.type == TApplicationException.UNKNOWN_METHOD:
         raise MissingThriftMethodException(t.message)
       raise
+    except TTransportException as e:
+      raise DisconnectedException("Error communicating with impalad: %s" % e)
     return (resp.version, resp.webserver_address)
 
   def _create_query_req(self, query_str, set_query_options):
@@ -1094,4 +1096,39 @@ class ImpalaBeeswaxClient(ImpalaClient):
       if t.type == TApplicationException.UNKNOWN_METHOD:
         raise MissingThriftMethodException(t.message)
       raise RPCException("Application Exception : %s" % t)
+    except Exception as e:
+      # This final except clause should ONLY be exercised in the case of Impala
+      # shell being installed as a standalone python package from public PyPI,
+      # rather than being included as part of a typical Impala deployment.
+      #
+      # Essentially, it's a hack that is required due to issues stemming from
+      # IMPALA-6808. Because of the way the Impala python environment has been
+      # somewhat haphazardly constructed, we end up polluting the top level Impala
+      # python environment with modules that should really be sub-modules. One of
+      # the principal places this occurs is with the various modules required by
+      # the Impala shell. This isn't a concern when the shell is invoked via a
+      # specially installed version of python that belongs to Impala, but it does
+      # become an issue when the shell is being run using the system python.
+      #
+      # When we install the shell as a standalone package, we need to construct
+      # it in such a way that all of the internal modules are contained within
+      # a top-level impala_shell namespace. However, this then breaks various
+      # imports and, in this case, exception handling in the original code.
+      # As far as I can tell, there's no clean way to address this without fully
+      # resolving IMPALA-6808.
+      #
+      # Without taking some additional measure here to recognize certain common
+      # exceptions, especially Beeswax exceptions raised by RPC calls, when
+      # errors occur during a standalone shell session, we wind up falling
+      # entirely through this block and returning nothing to the caller (which
+      # happens to be the primary command loop in impala_shell.py). This in turn
+      # has the result of disconnecting the shell in the case of, say, even simple
+      # typos in database or table names.
+      if suppress_error_on_cancel and self.is_query_cancelled:
+        raise QueryCancelledByShellException()
+      else:
+        if "BeeswaxException" in str(e):
+          raise RPCException("ERROR: %s" % e.message)
+        if "QueryNotFoundException" in str(e):
+          raise QueryStateException('Error: Stale query handle')
 
diff --git a/shell/packaging/MANIFEST.in b/shell/packaging/MANIFEST.in
new file mode 100644
index 0000000..ec0d80f
--- /dev/null
+++ b/shell/packaging/MANIFEST.in
@@ -0,0 +1,3 @@
+include *.txt *.md *.py
+recursive-include impala_shell *.py
+recursive-exclude impala_shell *.pyc
diff --git a/shell/packaging/README.md b/shell/packaging/README.md
new file mode 100644
index 0000000..cd40b12
--- /dev/null
+++ b/shell/packaging/README.md
@@ -0,0 +1,73 @@
+# Impala Interactive Shell
+
+You can use the Impala shell tool (impala-shell) to connect to an Impala
+service. The shell allows you to set up databases and tables, insert data,
+and issue queries. For ad hoc queries and exploration, you can submit SQL
+statements in an interactive session. The impala-shell interpreter accepts
+all the same SQL statements listed in
+[Impala SQL Statements](http://impala.apache.org/docs/build/html/topics/impala_langref_sql.html),
+plus some shell-only commands that you can use for tuning performance and
+diagnosing problems.
+
+To automate your work, you can specify command-line options to process a single
+statement or a script file. (Other avenues for Impala automation via python
+are provided by Impyla or ODBC.)
+
+## Installing
+
+```
+$ pip install impala-shell
+```
+
+## Online documentation
+
+* [Impala Shell Documentation](http://impala.apache.org/docs/build/html/topics/impala_impala_shell.html)
+* [Apache Impala Documentation](http://impala.apache.org/impala-docs.html)
+
+## Quickstart
+
+### Non-interactive mode
+
+Processing a single query, e.g., ```show tables```:
+
+```
+$ impala-shell -i impalad-host.domain.com -d some_database -q 'show tables'
+```
+
+Processing a text file with a series of queries:
+
+```
+$ impala-shell -i impalad-host.domain.com -d some_database -f /path/to/queries.sql
+```
+
+### Launching the interactive shell
+
+To connect to an impalad host at the default service port (21000):
+
+```
+$ impala-shell -i impalad-host.domain.com
+Starting Impala Shell without Kerberos authentication
+Connected to impalad-host.domain.com:21000
+Server version: impalad version 2.11.0-SNAPSHOT RELEASE (build d4596f9ca3ea32a8008cdc809a7ac9a3dea47962)
+***********************************************************************************
+Welcome to the Impala shell.
+(Impala Shell v3.0.0-SNAPSHOT (73e90d2) built on Thu Mar  8 00:59:00 PST 2018)
+
+The '-B' command line flag turns off pretty-printing for query results. Use this
+flag to remove formatting from results you want to save for later, or to benchmark
+Impala.
+***********************************************************************************
+[impalad-host.domain.com:21000] >
+```
+
+### Launching the interactive shell (secure mode)
+
+To connect to a secure host using Kerberos and SSL:
+
+```
+$ impala-shell -k --ssl -i impalad-secure-host.domain.com
+```
+
+### Disconnecting
+
+To exit the shell when running interactively, press ```Ctrl-D``` at the shell prompt.
diff --git a/shell/packaging/__init__.py b/shell/packaging/__init__.py
new file mode 100644
index 0000000..43e0baa
--- /dev/null
+++ b/shell/packaging/__init__.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+from os.path import dirname, abspath
+import sys
+
+# When installing the python shell as a standalone package, this __init__ is
+# used to work around the issues stemming from IMPALA-6808. Because of the way
+# the Impala python environment has been somewhat haphazardly constructed in
+# a deployed cluster, it ends up being "polluted" with top-level modules that
+# should really be sub-modules. One of the principal places this occurs is with
+# the various modules required by the Impala shell. This isn't a concern when
+# the shell is invoked via a specially installed version of python that belongs
+# to Impala, but it does become an issue when the shell is being run using the
+# system python.
+#
+# If we want to install the shell as a standalone package, we need to construct
+# it in such a way that all of the internal modules are contained within a
+# top-level impala_shell namespace. However, this then breaks various imports
+# throughout the Impala shell code. The way this file corrects that is to add
+# the impala_shell directory to PYTHONPATH only when the shell is invoked. As
+# far as I can tell, there's no cleaner way to address this without fully
+# resolving IMPALA-6808.
+impala_shell_dir = dirname(abspath(__file__))
+sys.path.append(impala_shell_dir)
diff --git a/shell/packaging/make_python_package.sh b/shell/packaging/make_python_package.sh
new file mode 100755
index 0000000..ba95148
--- /dev/null
+++ b/shell/packaging/make_python_package.sh
@@ -0,0 +1,87 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# ----------------------------------------------------------------------
+# This script is invoked during the Impala build process, and creates
+# a distributable python package of the Impala shell. The resulting
+# archive will be saved to:
+#
+#   ${IMPALA_HOME}/shell/dist/impala_shell-<version>.tar.gz
+#
+# Until the thrift-generated python files in ${IMPALA_HOME}/shell/gen-py
+# have been created by the build process, this script will not work.
+# It also relies upon the impala_build_version.py file created by the
+# parent packaging script, ${IMPALA_HOME}/shell/make_shell_tarball.sh,
+# which needs to be run before this script will work.
+#
+# After those files exist, however, this script can be run again at will.
+
+set -eu -o pipefail
+
+WORKING_DIR="$(cd "$(dirname "$0")" ; pwd -P )"
+SHELL_HOME="${IMPALA_HOME}"/shell
+STAGING_DIR="${WORKING_DIR}"/staging
+DIST_DIR="${DIST_DIR:-$WORKING_DIR/dist}"
+PACKAGE_DIR="${STAGING_DIR}"/impala_shell_package
+MODULE_LIB_DIR="${PACKAGE_DIR}"/impala_shell
+NO_CLEAN_DIST="${NO_CLEAN_DIST:-}"
+
+assemble_package_files() {
+  mkdir -p "${MODULE_LIB_DIR}"
+
+  cp -r "${SHELL_HOME}/gen-py"/* "${MODULE_LIB_DIR}"
+  cp -r "${THRIFT_HOME}/python/lib/python2.7/site-packages/thrift" "${MODULE_LIB_DIR}"
+
+  cp "${WORKING_DIR}/__init__.py" "${MODULE_LIB_DIR}"
+  cp "${SHELL_HOME}/impala_shell.py" "${MODULE_LIB_DIR}"
+  cp "${SHELL_HOME}/impala_client.py" "${MODULE_LIB_DIR}"
+  cp "${SHELL_HOME}/option_parser.py" "${MODULE_LIB_DIR}"
+  cp "${SHELL_HOME}/shell_output.py" "${MODULE_LIB_DIR}"
+  cp "${SHELL_HOME}/impala_shell_config_defaults.py" "${MODULE_LIB_DIR}"
+  cp "${SHELL_HOME}/TSSLSocketWithWildcardSAN.py" "${MODULE_LIB_DIR}"
+
+  cp "${SHELL_HOME}/packaging/README.md" "${PACKAGE_DIR}"
+  cp "${SHELL_HOME}/packaging/MANIFEST.in" "${PACKAGE_DIR}"
+  cp "${SHELL_HOME}/packaging/requirements.txt" "${PACKAGE_DIR}"
+  cp "${SHELL_HOME}/packaging/setup.py" "${PACKAGE_DIR}"
+
+  cp "${IMPALA_HOME}/LICENSE.txt" "${PACKAGE_DIR}"
+}
+
+create_distributable_python_package() {
+  # Generate a new python package tarball in ${DIST_DIR}
+  if [[ "${NO_CLEAN_DIST}" != "true" ]]; then
+    rm -rf "${DIST_DIR}"
+  fi
+
+  mkdir -p "${DIST_DIR}"
+
+  pushd "${PACKAGE_DIR}"
+  echo "Building package..."
+  PACKAGE_TYPE="${PACKAGE_TYPE:-}" OFFICIAL="${OFFICIAL:-}" \
+    python setup.py sdist --dist-dir "${DIST_DIR}"
+  popd
+
+  if [[ "${NO_CLEAN_DIST}" != "true" ]]; then
+    rm -rf "${STAGING_DIR}"
+  fi
+}
+
+assemble_package_files
+create_distributable_python_package
diff --git a/shell/packaging/requirements.txt b/shell/packaging/requirements.txt
new file mode 100644
index 0000000..32aef56
--- /dev/null
+++ b/shell/packaging/requirements.txt
@@ -0,0 +1,8 @@
+bitarray==1.0.1
+prettytable==0.7.1
+sasl==0.2.1
+setuptools>=36.8.0
+six==1.11.0
+sqlparse==0.1.19
+thrift==0.9.3
+thrift_sasl==0.2.1
diff --git a/shell/packaging/setup.py b/shell/packaging/setup.py
new file mode 100644
index 0000000..173e0d8
--- /dev/null
+++ b/shell/packaging/setup.py
@@ -0,0 +1,169 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+"""Set up the Impala shell python package."""
+
+import datetime
+import os
+import re
+import sys
+import time
+
+from impala_shell import impala_build_version
+from setuptools import find_packages, setup
+from textwrap import dedent
+
+CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
+
+
+def parse_requirements(requirements_file='requirements.txt'):
+    """
+    Parse requirements from the requirements file, stripping comments.
+
+    Args:
+      requirements_file: path to a requirements file
+
+    Returns:
+      a list of python packages
+    """
+    lines = []
+    with open(requirements_file) as reqs:
+        for _ in reqs:
+            line = _.split('#')[0]
+            if line.strip():
+                lines.append(line)
+    return lines
+
+
+def get_version():
+  """Generate package version string when calling 'setup.py'.
+
+  When setup.py is being used to CREATE a distribution, e.g., via setup.py sdist
+  or setup.py bdist, then use the output from impala_build_version.get_version(),
+  and append modifiers as specified by the RELEASE_TYPE and OFFICIAL environment
+  variables. By default, the package created will be a dev release, designated
+  by timestamp. For example, if get_version() returns the string 3.0.0-SNAPSHOT,
+  the package version may be something like 3.0.0.dev20180322154653.
+
+  It's also possible to set an environment variable for BUILD_VERSION to override the
+  default build value returned from impala_build_version.get_version().
+
+  E.g., to specify an official 3.4 beta 2 release (3.4b2), one would call:
+
+    BUILD_VERSION=3.4 RELEASE_TYPE=b2 OFFICIAL=true python setup.py sdist
+
+  The generated version string will be written to a version.txt file to be
+  referenced when the distribution is installed.
+
+  When setup.py is invoked during installation, e.g., via pip install or
+  setup.py install, read the package version from the version.txt file, which
+  is presumed to contain a single line containing a valid PEP-440 version string.
+  The file should have been generated when the distribution being installed was
+  created. (Although a version.txt file can also be created manually.)
+
+  See https://www.python.org/dev/peps/pep-0440/ for more info on python
+  version strings.
+
+  Returns:
+    A package version string compliant with PEP-440
+  """
+  version_file = os.path.join(CURRENT_DIR, 'version.txt')
+
+  if not os.path.isfile(version_file):
+    # If setup.py is being executed to create a distribution, e.g., via setup.py
+    # sdist or setup.py bdist, then derive the version and WRITE the version.txt
+    # file that will later be used for installations.
+    if os.getenv('BUILD_VERSION') is not None:
+      package_version = os.getenv('BUILD_VERSION')
+    else:
+      version_match = re.search('\d+\.\d+\.\d+', impala_build_version.get_version())
+      if version_match is None:
+        sys.exit('Unable to acquire Impala version.')
+      package_version = version_match.group(0)
+
+    # packages can be marked as alpha, beta, or rc releases via RELEASE_TYPE
+    release_type = os.getenv('RELEASE_TYPE')
+    if release_type:
+      if not re.match('(a|b|rc)\d+?', release_type):
+        msg = """\
+            RELEASE_TYPE \'{0}\' does not conform to any PEP-440 release format:
+
+              aN (for alpha releases)
+              bN (for beta releases)
+              rcN (for release candidates)
+
+            where N is the number of the release"""
+        sys.exit(dedent(msg).format(release_type))
+      package_version += release_type
+
+    # packages that are not marked OFFICIAL have ".dev" + a timestamp appended
+    if os.getenv('OFFICIAL') != 'true':
+      epoch_t = time.time()
+      ts_fmt = '%Y%m%d%H%M%S'
+      timestamp = datetime.datetime.fromtimestamp(epoch_t).strftime(ts_fmt)
+      package_version = '{0}.dev{1}'.format(package_version, timestamp)
+
+    with open('version.txt', 'w') as version_file:
+      version_file.write(package_version)
+  else:
+    # If setup.py is being invoked during installation, e.g., via pip install
+    # or setup.py install, we expect a version.txt file from which to READ the
+    # version string.
+    with open(version_file) as version_file:
+      package_version = version_file.readline()
+
+  return package_version
+
+
+setup(
+  name='impala_shell',
+  python_requires='>2.6, <3.0.0',
+  version=get_version(),
+  description='Impala Shell',
+  long_description_content_type='text/markdown',
+  long_description=open('README.md').read(),
+  author="Impala Dev",
+  author_email='dev@impala.apache.org',
+  url='https://impala.apache.org/',
+  license='Apache Software License',
+  packages=find_packages(),
+  include_package_data=True,
+  install_requires=parse_requirements(),
+  entry_points={
+    'console_scripts': [
+      'impala-shell = impala_shell.impala_shell:impala_shell_main'
+    ]
+  },
+  classifiers=[
+    'Development Status :: 5 - Production/Stable',
+    'Environment :: Console',
+    'Intended Audience :: Developers',
+    'Intended Audience :: End Users/Desktop',
+    'Intended Audience :: Science/Research',
+    'License :: OSI Approved :: Apache Software License',
+    'Operating System :: MacOS :: MacOS X',
+    'Operating System :: POSIX :: Linux',
+    'Programming Language :: Python :: 2 :: Only',
+    'Programming Language :: Python :: 2.6',
+    'Programming Language :: Python :: 2.7',
+    'Topic :: Database :: Front-Ends'
+  ]
+)


[impala] 02/03: IMPALA-8893: [DOCS] Document the new startup flag for cookie-based authn

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit cb2430c24967274819f2739213237789f36ca28a
Author: Alex Rodoni <ar...@cloudera.com>
AuthorDate: Tue Oct 8 16:07:25 2019 -0700

    IMPALA-8893: [DOCS] Document the new startup flag for cookie-based authn
    
    Change-Id: I4bd43430363839ae41e490cd35c92e77e6610f4f
    Reviewed-on: http://gerrit.cloudera.org:8080/14392
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
    Reviewed-by: Thomas Tauber-Marshall <tm...@cloudera.com>
---
 docs/topics/impala_client.xml | 63 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 55 insertions(+), 8 deletions(-)

diff --git a/docs/topics/impala_client.xml b/docs/topics/impala_client.xml
index 9d9b29d..b7ae452 100644
--- a/docs/topics/impala_client.xml
+++ b/docs/topics/impala_client.xml
@@ -21,8 +21,11 @@ under the License.
 <concept id="intro_client">
 
   <title>Impala Client Access</title>
+
   <titlealts audience="PDF">
+
     <navtitle>Client Access</navtitle>
+
   </titlealts>
 
   <conbody>
@@ -220,6 +223,33 @@ under the License.
         <dlentry>
 
           <dt>
+            --hs2_http_port
+          </dt>
+
+          <dd>
+            Specifies the port for clients to connect to Impala server over HTTP.
+            <p>
+              The default port is 28000.
+            </p>
+
+            <p>
+              You can disable the HTTP end point for clients by setting the flag to
+              <codeph>0</codeph>.
+            </p>
+
+            <p>
+              To enable TLS/SSL for HiveServer2 HTTP endpoint use
+              <codeph>--ssl_server_certificate</codeph> and <codeph>--ssl_private_key</codeph>.
+              See <xref
+                href="impala_ssl.xml#ssl"/> for detail.
+            </p>
+          </dd>
+
+        </dlentry>
+
+        <dlentry>
+
+          <dt>
             --idle_client_poll_time_s
           </dt>
 
@@ -252,21 +282,38 @@ under the License.
         <dlentry>
 
           <dt>
-            --hs2_http_port
+            --max_cookie_lifetime_s
           </dt>
 
           <dd>
-            Specifies the port for clients to connect to Impala server over HTTP.
+            Starting in Impala 3.4.0, Impala uses cookies for authentication when clients
+            connect via HiveServer2 over HTTP. Use the <codeph>--max_cookie_lifetime_s</codeph>
+            startup flag to control how long generated cookies are valid for.
             <p>
-              You can disable the HTTP end point for clients by setting the flag to
-              <codeph>0</codeph>.
+              Specify the value in seconds.
+            </p>
+            <p>
+              The default value is 1 day.
             </p>
 
             <p>
-              To enable TLS/SSL for HiveServer2 HTTP endpoint use
-              <codeph>--ssl_server_certificate</codeph> and <codeph>--ssl_private_key</codeph>.
-              See <xref
-                href="impala_ssl.xml#ssl"/> for detail.
+              Setting the flag to <codeph>0</codeph> disables cookie support.
+            </p>
+
+            <p>
+              When an unexpired cookie is successfully verified, the user name contained in the
+              cookie is set on the connection.
+            </p>
+
+            <p>
+              Each <codeph>impalad</codeph> uses its own key to generate the signature, so
+              clients that reconnect to a different <codeph>impalad</codeph> have to
+              re-authenticate.
+            </p>
+
+            <p>
+              On a single <codeph>impalad</codeph>, cookies are valid across sessions and
+              connections.
             </p>
           </dd>
 


[impala] 03/03: IMPALA-9033: log on slow HDFS I/Os

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 89dc05bb24d069b0816b1015795bd3371cd6979c
Author: Tim Armstrong <ta...@cloudera.com>
AuthorDate: Wed Oct 9 15:53:33 2019 -0700

    IMPALA-9033: log on slow HDFS I/Os
    
    This logs a message with the time taken and also logs basic HDFS
    statistics from the I/O, which would tell us if it was a remote
    read, etc.
    
    The threshold is 10s and is configurable via
    --fs_slow_read_log_threshold_ms in case we want to make it more or less
    sensitive.
    
    Here's some example output that I obtained by adding a 500ms sleep
    to the code path, and lowering the threshold to 500ms:
    
    I1010 12:09:38.211959 30292 hdfs-file-reader.cc:173] 2448e3196bf9ee94:69adb16f00000001] Slow FS I/O operation on hdfs://localhost:20500/test-warehouse/tpch.lineitem/lineitem.tbl for instance 2448e3196bf9ee94:69adb16f00000001 of query 2448e3196bf9ee94:69adb16f00000000. Last read returned 8.00 MB. This thread has read 8.00 MB/8.00 MB starting at offset 394264576 in this I/O scheduling quantum and taken 584.129ms so far. I/O status: OK
    I1010 12:09:38.212011 30292 hdfs-file-reader.cc:353] 2448e3196bf9ee94:69adb16f00000001] Stats for last read by this I/O thread: totalBytesRead=8388608 totalLocalBytesRead=8388608 totalShortCircuitBytesRead=8388608 totalZeroCopyBytesRead=0
    
    Change-Id: I1929921495706b482d91d91cffe27bee4478f5c4
    Reviewed-on: http://gerrit.cloudera.org:8080/14406
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/runtime/io/hdfs-file-reader.cc  | 37 +++++++++++++++++++++++++++++++---
 be/src/runtime/io/hdfs-file-reader.h   |  4 +++-
 be/src/util/runtime-profile-counters.h |  3 +++
 3 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/be/src/runtime/io/hdfs-file-reader.cc b/be/src/runtime/io/hdfs-file-reader.cc
index 14cf2b5..c862ee0 100644
--- a/be/src/runtime/io/hdfs-file-reader.cc
+++ b/be/src/runtime/io/hdfs-file-reader.cc
@@ -23,6 +23,7 @@
 #include "runtime/io/hdfs-file-reader.h"
 #include "runtime/io/request-context.h"
 #include "runtime/io/request-ranges.h"
+#include "util/debug-util.h"
 #include "util/hdfs-util.h"
 #include "util/histogram-metric.h"
 #include "util/impalad-metrics.h"
@@ -36,6 +37,10 @@ DEFINE_bool(use_hdfs_pread, false, "Enables using hdfsPread() instead of hdfsRea
     "when performing HDFS read operations. This is necessary to use HDFS hedged reads "
     "(assuming the HDFS client is configured to do so).");
 
+DEFINE_int64(fs_slow_read_log_threshold_ms, 10L * 1000L,
+    "Log diagnostics about I/Os issued via the HDFS client that take longer than this "
+    "threshold.");
+
 #ifndef NDEBUG
 DECLARE_int32(stress_disk_read_delay_ms);
 #endif
@@ -159,6 +164,25 @@ Status HdfsFileReader::ReadFromPos(DiskQueue* queue, int64_t file_offset, uint8_
         status = ReadFromPosInternal(hdfs_file, queue, position_in_file,
             buffer + *bytes_read, chunk_size, &current_bytes_read);
       }
+      // Log diagnostics for slow reads, whether they failed or succeeded.
+      int64_t elapsed_time = req_context_read_timer.ElapsedTime();
+      bool is_slow_read = elapsed_time
+          > FLAGS_fs_slow_read_log_threshold_ms * NANOS_PER_MICRO * MICROS_PER_MILLI;
+      if (is_slow_read) {
+        LOG(INFO) << "Slow FS I/O operation on " << *scan_range_->file_string() << " for "
+                  << "instance " << PrintId(scan_range_->reader_->instance_id())
+                  << " of query " << PrintId(scan_range_->reader_->query_id()) << ". "
+                  << "Last read returned "
+                  << PrettyPrinter::PrintBytes(current_bytes_read) << ". "
+                  << "This thread has read "
+                  << PrettyPrinter::PrintBytes(*bytes_read + current_bytes_read)
+                  << "/" << PrettyPrinter::PrintBytes(bytes_to_read)
+                  << " starting at offset " << file_offset << " in this I/O scheduling "
+                  << "quantum and taken "
+                  << PrettyPrinter::Print(elapsed_time, TUnit::TIME_NS) << " so far. "
+                  << "I/O status: " << (status.ok() ? "OK" : status.GetDetail());
+      }
+
       if (!status.ok()) {
         break;
       }
@@ -171,7 +195,7 @@ Status HdfsFileReader::ReadFromPos(DiskQueue* queue, int64_t file_offset, uint8_
       *bytes_read += current_bytes_read;
 
       // Collect and accumulate statistics
-      GetHdfsStatistics(hdfs_file);
+      GetHdfsStatistics(hdfs_file, is_slow_read);
     }
 
     int64_t cached_bytes_missed = *bytes_read - cached_read;
@@ -274,7 +298,7 @@ void HdfsFileReader::WriteDataCache(DataCache* remote_data_cache, int64_t file_o
 void HdfsFileReader::Close() {
   unique_lock<SpinLock> hdfs_lock(lock_);
   if (exclusive_hdfs_fh_ != nullptr) {
-    GetHdfsStatistics(exclusive_hdfs_fh_->file());
+    GetHdfsStatistics(exclusive_hdfs_fh_->file(), false);
 
     if (cached_buffer_ != nullptr) {
       hadoopRzBufferFree(exclusive_hdfs_fh_->file(), cached_buffer_);
@@ -314,7 +338,7 @@ void HdfsFileReader::Close() {
   }
 }
 
-void HdfsFileReader::GetHdfsStatistics(hdfsFile hdfs_file) {
+void HdfsFileReader::GetHdfsStatistics(hdfsFile hdfs_file, bool log_stats) {
   struct hdfsReadStatistics* stats;
   if (IsHdfsPath(scan_range_->file())) {
     int success = hdfsFileGetReadStatistics(hdfs_file, &stats);
@@ -326,6 +350,13 @@ void HdfsFileReader::GetHdfsStatistics(hdfsFile hdfs_file) {
       if (stats->totalLocalBytesRead != stats->totalBytesRead) {
         num_remote_bytes_ += stats->totalBytesRead - stats->totalLocalBytesRead;
       }
+      if (log_stats) {
+        LOG(INFO) << "Stats for last read by this I/O thread:"
+                  << " totalBytesRead=" << stats->totalBytesRead
+                  << " totalLocalBytesRead=" << stats->totalLocalBytesRead
+                  << " totalShortCircuitBytesRead=" << stats->totalShortCircuitBytesRead
+                  << " totalZeroCopyBytesRead=" << stats->totalZeroCopyBytesRead;
+      }
       hdfsFileFreeReadStatistics(stats);
     }
     hdfsFileClearReadStatistics(hdfs_file);
diff --git a/be/src/runtime/io/hdfs-file-reader.h b/be/src/runtime/io/hdfs-file-reader.h
index 1e228e2..57b9332 100644
--- a/be/src/runtime/io/hdfs-file-reader.h
+++ b/be/src/runtime/io/hdfs-file-reader.h
@@ -74,7 +74,9 @@ private:
   Status ReadFromPosInternal(hdfsFile hdfs_file, DiskQueue* disk_queue,
       int64_t position_in_file, uint8_t* buffer, int64_t chunk_size, int* bytes_read);
 
-  void GetHdfsStatistics(hdfsFile hdfs_file);
+  /// Update counters with HDFS read statistics from 'hdfs_file'. If 'log_stats' is
+  /// true, the statistics are logged.
+  void GetHdfsStatistics(hdfsFile hdfs_file, bool log_stats);
 
   /// Hadoop filesystem that contains the file being read.
   hdfsFS const hdfs_fs_;
diff --git a/be/src/util/runtime-profile-counters.h b/be/src/util/runtime-profile-counters.h
index fe612ce..2777032 100644
--- a/be/src/util/runtime-profile-counters.h
+++ b/be/src/util/runtime-profile-counters.h
@@ -659,6 +659,9 @@ class ScopedTimer {
 
   bool IsCancelled() { return is_cancelled_ != nullptr && *is_cancelled_; }
 
+  /// Return the total elapsed time accumulated by this timer so far.
+  int64_t ElapsedTime() { return sw_.ElapsedTime(); }
+
   /// Update counter when object is destroyed
   ~ScopedTimer() {
     sw_.Stop();