You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@impala.apache.org by tm...@apache.org on 2019/02/12 04:06:54 UTC

[impala] branch master updated (8b8d935 -> 6fc05a6)

This is an automated email from the ASF dual-hosted git repository.

tmarshall pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git.


    from 8b8d935  IMPALA-5031: `uint8_t & int` type is int
     new fa604fb  IMPALA-8177: Log DDL failures in coordinator logs
     new 29df1d5  Additions to .gitignore
     new 697a15b  IMPALA-7214: [DOCS] Update Impala docs to decouple Impala and DataNodes
     new 8b5cb57  IMPALA-8173: Fix KeyError in run-workload.py
     new 6fc05a6  IMPALA-5031: signed overflow is undefined behavior

The 5 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .gitignore                                |  10 ++
 be/src/common/status.h                    |   9 +
 be/src/service/client-request-state.cc    |   3 +-
 be/src/util/string-parser.h               |   3 +-
 docs/shared/impala_common.xml             |   7 +-
 docs/topics/impala_components.xml         | 156 ++++++++----------
 docs/topics/impala_concepts.xml           | 266 ------------------------------
 docs/topics/impala_connecting.xml         |  65 +++-----
 docs/topics/impala_impala_shell.xml       |   7 +-
 docs/topics/impala_kudu.xml               |   8 +-
 docs/topics/impala_order_by.xml           | 115 ++++++-------
 docs/topics/impala_runtime_filtering.xml  |  14 +-
 docs/topics/impala_scalability.xml        |   4 +-
 tests/performance/query_exec_functions.py |   4 +-
 tests/performance/scheduler.py            |  22 ++-
 15 files changed, 208 insertions(+), 485 deletions(-)

[impala] 03/05: IMPALA-7214: [DOCS] Update Impala docs to decouple Impala and DataNodes

Posted by tm...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

tmarshall pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 697a15b341186046d8fae3a2139f1ad13d304734
Author: Alex Rodoni <ar...@cloudera.com>
AuthorDate: Thu Feb 7 17:06:47 2019 -0800

    IMPALA-7214: [DOCS] Update Impala docs to decouple Impala and DataNodes
    
    - Take 1: Let's review these docs before we go clean up many more.
    
    Change-Id: I1c91f7975c09dae9908591eeeac0d55e5355b2d4
    Reviewed-on: http://gerrit.cloudera.org:8080/12400
    Reviewed-by: Alex Rodoni <ar...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 docs/shared/impala_common.xml            |   7 +-
 docs/topics/impala_components.xml        | 156 +++++++++---------
 docs/topics/impala_concepts.xml          | 266 -------------------------------
 docs/topics/impala_connecting.xml        |  65 ++++----
 docs/topics/impala_impala_shell.xml      |   7 +-
 docs/topics/impala_kudu.xml              |   8 +-
 docs/topics/impala_order_by.xml          | 115 ++++++-------
 docs/topics/impala_runtime_filtering.xml |  14 +-
 docs/topics/impala_scalability.xml       |   4 +-
 9 files changed, 169 insertions(+), 473 deletions(-)

diff --git a/docs/shared/impala_common.xml b/docs/shared/impala_common.xml
index 3c11f68..929af52 100644
--- a/docs/shared/impala_common.xml
+++ b/docs/shared/impala_common.xml
@@ -2079,7 +2079,7 @@ show functions in _impala_builtins like '*<varname>substring</varname>*';
         <b>Sorting considerations:</b> Although you can specify an <codeph>ORDER BY</codeph> clause in an
         <codeph>INSERT ... SELECT</codeph> statement, any <codeph>ORDER BY</codeph> clause is ignored and the
         results are not necessarily sorted. An <codeph>INSERT ... SELECT</codeph> operation potentially creates
-        many different data files, prepared on different data nodes, and therefore the notion of the data being
+        many different data files, prepared by different executor Impala daemons, and therefore the notion of the data being
         stored in sorted order is impractical.
       </p>
 
@@ -3185,8 +3185,9 @@ select max(height), avg(height) from census_data where age &gt; 20;
         <codeph><xref href="../topics/impala_order_by.xml#order_by">ORDER BY</xref></codeph> clause to also use a
         <codeph><xref href="../topics/impala_limit.xml#limit">LIMIT</xref></codeph> clause. In Impala 1.4.0 and
         higher, the <codeph>LIMIT</codeph> clause is optional for <codeph>ORDER BY</codeph> queries. In cases where
-        sorting a huge result set requires enough memory to exceed the Impala memory limit for a particular node,
-        Impala automatically uses a temporary disk work area to perform the sort operation.
+        sorting a huge result set requires enough memory to exceed the Impala
+        memory limit for a particular executor Impala daemon, Impala automatically uses a
+        temporary disk work area to perform the sort operation.
       </p>
 
       <p rev="1.2" id="limit_and_offset">
diff --git a/docs/topics/impala_components.xml b/docs/topics/impala_components.xml
index 8909536..8f5f7f3 100644
--- a/docs/topics/impala_components.xml
+++ b/docs/topics/impala_components.xml
@@ -34,10 +34,9 @@ under the License.
 
   <conbody>
 
-    <p>
-      The Impala server is a distributed, massively parallel processing (MPP) database engine. It consists of
-      different daemon processes that run on specific hosts within your <keyword keyref="distro"/> cluster.
-    </p>
+    <p> The Impala server is a distributed, massively parallel processing (MPP)
+      database engine. It consists of different daemon processes that run on
+      specific hosts within your cluster. </p>
 
     <p outputclass="toc inpage"/>
   </conbody>
@@ -48,35 +47,35 @@ under the License.
 
     <conbody>
 
-      <p>
-        The core Impala component is a daemon process that runs on each DataNode of the cluster, physically represented
-        by the <codeph>impalad</codeph> process. It reads and writes to data files; accepts queries transmitted
-        from the <codeph>impala-shell</codeph> command, Hue, JDBC, or ODBC; parallelizes the queries and
-        distributes work across the cluster; and transmits intermediate query results back to the
-        central coordinator node.
-      </p>
-
-      <p>
-        You can submit a query to the Impala daemon running on any DataNode, and that instance of the daemon serves as the
-        <term>coordinator node</term> for that query. The other nodes transmit partial results back to the
-        coordinator, which constructs the final result set for a query. When running experiments with functionality
-        through the <codeph>impala-shell</codeph> command, you might always connect to the same Impala daemon for
-        convenience. For clusters running production workloads, you might load-balance by
-        submitting each query to a different Impala daemon in round-robin style, using the JDBC or ODBC interfaces.
-      </p>
-
-      <p>
-        The Impala daemons are in constant communication with the <term>statestore</term>, to confirm which nodes
-        are healthy and can accept new work.
-      </p>
-
-      <p rev="1.2">
-        They also receive broadcast messages from the <cmdname>catalogd</cmdname> daemon (introduced in Impala 1.2)
-        whenever any Impala node in the cluster creates, alters, or drops any type of object, or when an
-        <codeph>INSERT</codeph> or <codeph>LOAD DATA</codeph> statement is processed through Impala. This
-        background communication minimizes the need for <codeph>REFRESH</codeph> or <codeph>INVALIDATE
-        METADATA</codeph> statements that were needed to coordinate metadata across nodes prior to Impala 1.2.
-      </p>
+      <p> The core Impala component is the Impala daemon, physically represented
+        by the <codeph>impalad</codeph> process. A few of the key functions that
+        an Impala daemon performs are:<ul>
+          <li>Reads and writes to data files.</li>
+          <li>Accepts queries transmitted from the <codeph>impala-shell</codeph>
+            command, Hue, JDBC, or ODBC.</li>
+          <li>Parallelizes the queries and distributes work across the
+            cluster.</li>
+          <li>Transmits intermediate query results back to the central
+            coordinator. </li>
+        </ul></p>
+      <p>Impala daemons can be deployed in one of the following ways:<ul>
+          <li>HDFS and Impala are co-located, and each Impala daemon runs on the
+            same host as a DataNode.</li>
+          <li>Impala is deployed separately in a compute cluster and reads
+            remotely from HDFS, S3, ADLS, etc.</li>
+        </ul></p>
+
+      <p> The Impala daemons are in constant communication with the StateStore, to
+        confirm which daemons are healthy and can accept new work. </p>
+
+      <p rev="1.2"> They also receive broadcast messages from the
+          <cmdname>catalogd</cmdname> daemon (introduced in Impala 1.2) whenever
+        any Impala daemon in the cluster creates, alters, or drops any type of
+        object, or when an <codeph>INSERT</codeph> or <codeph>LOAD DATA</codeph>
+        statement is processed through Impala. This background communication
+        minimizes the need for <codeph>REFRESH</codeph> or <codeph>INVALIDATE
+          METADATA</codeph> statements that were needed to coordinate metadata
+        across Impala daemons prior to Impala 1.2. </p>
 
       <p rev="2.9.0 IMPALA-3807 IMPALA-5147 IMPALA-5503">
         In <keyword keyref="impala29_full"/> and higher, you can control which hosts act as query coordinators
@@ -98,32 +97,28 @@ under the License.
 
     <conbody>
 
-      <p>
-        The Impala component known as the <term>statestore</term> checks on the health of Impala daemons on all the
-        DataNodes in a cluster, and continuously relays its findings to each of those daemons. It is physically
-        represented by a daemon process named <codeph>statestored</codeph>; you only need such a process on one
-        host in the cluster. If an Impala daemon goes offline due to hardware failure, network error, software issue,
-        or other reason, the statestore informs all the other Impala daemons so that future queries can avoid making
-        requests to the unreachable node.
-      </p>
+      <p> The Impala component known as the StateStore checks on the health of
+        all Impala daemons in a cluster, and continuously relays its findings to
+        each of those daemons. It is physically represented by a daemon process
+        named <codeph>statestored</codeph>. You only need such a process on one
+        host in a cluster. If an Impala daemon goes offline due to hardware
+        failure, network error, software issue, or other reason, the StateStore
+        informs all the other Impala daemons so that future queries can avoid
+        making requests to the unreachable Impala daemon. </p>
 
-      <p>
-        Because the statestore's purpose is to help when things go wrong and
+      <p> Because the StateStore's purpose is to help when things go wrong and
         to broadcast metadata to coordinators, it is not always critical to the
-        normal operation of an Impala cluster. If the statestore is not running
+        normal operation of an Impala cluster. If the StateStore is not running
         or becomes unreachable, the Impala daemons continue running and
         distributing work among themselves as usual when working with the data
         known to Impala. The cluster just becomes less robust if other Impala
         daemons fail, and metadata becomes less consistent as it changes while
-        the statestore is offline. When the statestore comes back online, it
+        the StateStore is offline. When the StateStore comes back online, it
         re-establishes communication with the Impala daemons and resumes its
-        monitoring and broadcasting functions.
-      </p>
+        monitoring and broadcasting functions. </p>
 
-      <p>
-        If you issue a DDL statement while the statestore is down, the queries
-        that access the new object the DDL created will fail.
-      </p>
+      <p> If you issue a DDL statement while the StateStore is down, the queries
+        that access the new object the DDL created will fail. </p>
 
       <p conref="../shared/impala_common.xml#common/statestored_catalogd_ha_blurb"/>
 
@@ -145,35 +140,25 @@ under the License.
 
     <conbody>
 
-      <p>
-        The Impala component known as the <term>catalog service</term> relays the metadata changes from Impala SQL
-        statements to all the Impala daemons in a cluster. It is physically represented by a daemon process named
-        <codeph>catalogd</codeph>; you only need such a process on one host in the cluster. Because the requests
-        are passed through the statestore daemon, it makes sense to run the <cmdname>statestored</cmdname> and
-        <cmdname>catalogd</cmdname> services on the same host.
-      </p>
-
-      <p>
-        The catalog service avoids the need to issue
-        <codeph>REFRESH</codeph> and <codeph>INVALIDATE METADATA</codeph> statements when the metadata changes are
-        performed by statements issued through Impala. When you create a table, load data, and so on through Hive,
-        you do need to issue <codeph>REFRESH</codeph> or <codeph>INVALIDATE METADATA</codeph> on an Impala node
-        before executing a query there.
-      </p>
+      <p> The Impala component known as the Catalog Service relays the metadata
+        changes from Impala SQL statements to all the Impala daemons in a
+        cluster. It is physically represented by a daemon process named
+          <codeph>catalogd</codeph>. You only need such a process on one host in
+        a cluster. Because the requests are passed through the StateStore
+        daemon, it makes sense to run the <cmdname>statestored</cmdname> and
+          <cmdname>catalogd</cmdname> services on the same host. </p>
+
+      <p> The Catalog Service avoids the need to issue <codeph>REFRESH</codeph>
+        and <codeph>INVALIDATE METADATA</codeph> statements when the metadata
+        changes are performed by statements issued through Impala. When you
+        create a table, load data, and so on through Hive, you do need to issue
+          <codeph>REFRESH</codeph> or <codeph>INVALIDATE METADATA</codeph> on an
+        Impala daemon before executing a query there. </p>
 
       <p>
         This feature touches a number of aspects of Impala:
       </p>
 
-<!-- This was formerly a conref, but since the list of links also included a link
-     to this same topic, materializing the list here and removing that
-     circular link. (The conref is still used in Incompatible Changes.)
-
-      <ul conref="../shared/impala_common.xml#common/catalogd_xrefs">
-        <li/>
-      </ul>
--->
-
       <ul id="catalogd_xrefs">
         <li>
           <p>
@@ -184,16 +169,17 @@ under the License.
         </li>
 
         <li>
-          <p>
-            The <codeph>REFRESH</codeph> and <codeph>INVALIDATE METADATA</codeph> statements are not needed
-            when the <codeph>CREATE TABLE</codeph>, <codeph>INSERT</codeph>, or other table-changing or
-            data-changing operation is performed through Impala. These statements are still needed if such
-            operations are done through Hive or by manipulating data files directly in HDFS, but in those cases the
-            statements only need to be issued on one Impala node rather than on all nodes. See
-            <xref href="impala_refresh.xml#refresh"/> and
-            <xref href="impala_invalidate_metadata.xml#invalidate_metadata"/> for the latest usage information for
-            those statements.
-          </p>
+          <p> The <codeph>REFRESH</codeph> and <codeph>INVALIDATE
+              METADATA</codeph> statements are not needed when the
+              <codeph>CREATE TABLE</codeph>, <codeph>INSERT</codeph>, or other
+            table-changing or data-changing operation is performed through
+            Impala. These statements are still needed if such operations are
+            done through Hive or by manipulating data files directly in HDFS,
+            but in those cases the statements only need to be issued on one
+            Impala daemon rather than on all daemons. See <xref
+              href="impala_refresh.xml#refresh"/> and <xref
+              href="impala_invalidate_metadata.xml#invalidate_metadata"/> for
+            the latest usage information for those statements. </p>
         </li>
       </ul>
 
diff --git a/docs/topics/impala_concepts.xml b/docs/topics/impala_concepts.xml
index 45143f5..2bb96b6 100644
--- a/docs/topics/impala_concepts.xml
+++ b/docs/topics/impala_concepts.xml
@@ -43,270 +43,4 @@ under the License.
 
     <p outputclass="toc"/>
   </conbody>
-
-<!-- These other topics are waiting to be filled in. Could become subtopics or top-level topics depending on the depth of coverage in each case. -->
-
-  <concept id="intro_data_lifecycle" audience="hidden">
-
-    <title>Overview of the Data Lifecycle for Impala</title>
-
-    <conbody/>
-  </concept>
-
-  <concept id="intro_etl" audience="hidden">
-
-    <title>Overview of the Extract, Transform, Load (ETL) Process for Impala</title>
-  <prolog>
-    <metadata>
-      <data name="Category" value="ETL"/>
-      <data name="Category" value="Ingest"/>
-      <data name="Category" value="Concepts"/>
-    </metadata>
-  </prolog>
-
-    <conbody/>
-  </concept>
-
-  <concept id="intro_hadoop_data" audience="hidden">
-
-    <title>How Impala Works with Hadoop Data Files</title>
-
-    <conbody/>
-  </concept>
-
-  <concept id="intro_web_ui" audience="hidden">
-
-    <title>Overview of the Impala Web Interface</title>
-
-    <conbody/>
-  </concept>
-
-  <concept id="intro_bi" audience="hidden">
-
-    <title>Using Impala with Business Intelligence Tools</title>
-
-    <conbody/>
-  </concept>
-
-  <concept id="intro_ha" audience="hidden">
-
-    <title>Overview of Impala Availability and Fault Tolerance</title>
-
-    <conbody/>
-  </concept>
-
-<!-- This is pretty much ready to go. Decide if it should go under "Concepts" or "Performance",
-     and if it should be split out into a separate file, and then take out the audience= attribute
-     to make it visible.
--->
-
-  <concept id="intro_llvm" audience="hidden">
-
-    <title>Overview of Impala Runtime Code Generation</title>
-
-    <conbody>
-
-<!-- Adapted from the CIDR15 paper written by the Impala team. -->
-
-      <p>
-        Impala uses <term>LLVM</term> (a compiler library and collection of related tools) to perform just-in-time
-        (JIT) compilation within the running <cmdname>impalad</cmdname> process. This runtime code generation
-        technique improves query execution times by generating native code optimized for the architecture of each
-        host in your particular cluster. Performance gains of 5 times or more are typical for representative
-        workloads.
-      </p>
-
-      <p>
-        Impala uses runtime code generation to produce query-specific versions of functions that are critical to
-        performance. In particular, code generation is applied to <term>inner loop</term> functions, that is, those
-        that are executed many times (for every tuple) in a given query, and thus constitute a large portion of the
-        total time the query takes to execute. For example, when Impala scans a data file, it calls a function to
-        parse each record into Impala’s in-memory tuple format. For queries scanning large tables, billions of
-        records could result in billions of function calls. This function must therefore be extremely efficient for
-        good query performance, and removing even a few instructions from each function call can result in large
-        query speedups.
-      </p>
-
-      <p>
-        Overall, JIT compilation has an effect similar to writing custom code to process a query. For example, it
-        eliminates branches, unrolls loops, propagates constants, offsets and pointers, and inlines functions.
-        Inlining is especially valuable for functions used internally to evaluate expressions, where the function
-        call itself is more expensive than the function body (for example, a function that adds two numbers).
-        Inlining functions also increases instruction-level parallelism, and allows the compiler to make further
-        optimizations such as subexpression elimination across expressions.
-      </p>
-
-      <p>
-        Impala generates runtime query code automatically, so you do not need to do anything special to get this
-        performance benefit. This technique is most effective for complex and long-running queries that process
-        large numbers of rows. If you need to issue a series of short, small queries, you might turn off this
-        feature to avoid the overhead of compilation time for each query. In this case, issue the statement
-        <codeph>SET DISABLE_CODEGEN=true</codeph> to turn off runtime code generation for the duration of the
-        current session.
-      </p>
-
-<!--
-      <p>
-        Without code generation,
-        functions tend to be suboptimal
-        to handle situations that cannot be predicted in advance.
-        For example,
-        a record-parsing function that
-        only handles integer types will be faster at parsing an integer-only file
-        than a function that handles other data types
-        such as strings and floating-point numbers.
-        However, the schemas of the files to
-        be scanned are unknown at compile time,
-        and so a general-purpose function must be used, even if at runtime
-        it is known that more limited functionality is sufficient.
-      </p>
-
-      <p>
-        A source of large runtime overheads are virtual functions. Virtual function calls incur a large performance
-        penalty, particularly when the called function is very simple, as the calls cannot be inlined.
-        If the type of the object instance is known at runtime, we can use code generation to replace the virtual
-        function call with a call directly to the correct function, which can then be inlined. This is especially
-        valuable when evaluating expression trees. In Impala (as in many systems), expressions are composed of a
-        tree of individual operators and functions.
-      </p>
-
-      <p>
-        Each type of expression that can appear in a query is implemented internally by overriding a virtual function.
-        Many of these expression functions are quite simple, for example, adding two numbers.
-        The virtual function call can be more expensive than the function body itself. By resolving the virtual
-        function calls with code generation and then inlining the resulting function calls, Impala can evaluate expressions
-        directly with no function call overhead. Inlining functions also increases
-        instruction-level parallelism, and allows the compiler to make further optimizations such as subexpression
-        elimination across expressions.
-      </p>
--->
-    </conbody>
-  </concept>
-
-<!-- Same as the previous section: adapted from CIDR paper, ready to externalize after deciding where to go. -->
-
-  <concept audience="hidden" id="intro_io">
-
-    <title>Overview of Impala I/O</title>
-
-    <conbody>
-
-      <p>
-        Efficiently retrieving data from HDFS is a challenge for all SQL-on-Hadoop systems. To perform
-        data scans from both disk and memory at or near hardware speed, Impala uses an HDFS feature called
-        <term>short-circuit local reads</term> to bypass the DataNode protocol when reading from local disk. Impala
-        can read at almost disk bandwidth (approximately 100 MB/s per disk) and is typically able to saturate all
-        available disks. For example, with 12 disks, Impala is typically capable of sustaining I/O at 1.2 GB/sec.
-        Furthermore, <term>HDFS caching</term> allows Impala to access memory-resident data at memory bus speed,
-        and saves CPU cycles as there is no need to copy or checksum data blocks within memory.
-      </p>
-
-      <p>
-        The I/O manager component interfaces with storage devices to read and write data. I/O manager assigns a
-        fixed number of worker threads per physical disk (currently one thread per rotational disk and eight per
-        SSD), providing an asynchronous interface to clients (<term>scanner threads</term>).
-      </p>
-    </conbody>
-  </concept>
-
-<!-- Same as the previous section: adapted from CIDR paper, ready to externalize after deciding where to go. -->
-
-<!-- Although good idea to get some answers from Henry first. -->
-
-  <concept audience="hidden" id="intro_state_distribution">
-
-    <title>State distribution</title>
-
-    <conbody>
-
-      <p>
-        As a massively parallel database that can run on hundreds of nodes, Impala must coordinate and synchronize
-        its metadata across the entire cluster. Impala's symmetric-node architecture means that any node can accept
-        and execute queries, and thus each node needs up-to-date versions of the system catalog and a knowledge of
-        which hosts the <cmdname>impalad</cmdname> daemons run on. To avoid the overhead of TCP connections and
-        remote procedure calls to retrieve metadata during query planning, Impala implements a simple
-        publish-subscribe service called the <term>statestore</term> to push metadata changes to a set of
-        subscribers (the <cmdname>impalad</cmdname> daemons running on all the DataNodes).
-      </p>
-
-      <p>
-        The statestore maintains a set of topics, which are arrays of <codeph>(<varname>key</varname>,
-        <varname>value</varname>, <varname>version</varname>)</codeph> triplets called <term>entries</term> where
-        <varname>key</varname> and <varname>value</varname> are byte arrays, and <varname>version</varname> is a
-        64-bit integer. A topic is defined by an application, and so the statestore has no understanding of the
-        contents of any topic entry. Topics are persistent through the lifetime of the statestore, but are not
-        persisted across service restarts. Processes that receive updates to any topic are called
-        <term>subscribers</term>, and express their interest by registering with the statestore at startup and
-        providing a list of topics. The statestore responds to registration by sending the subscriber an initial
-        topic update for each registered topic, which consists of all the entries currently in that topic.
-      </p>
-
-<!-- Henry: OK, but in practice, what is in these topic messages for Impala? -->
-
-      <p>
-        After registration, the statestore periodically sends two kinds of messages to each subscriber. The first
-        kind of message is a topic update, and consists of all changes to a topic (new entries, modified entries
-        and deletions) since the last update was successfully sent to the subscriber. Each subscriber maintains a
-        per-topic most-recent-version identifier which allows the statestore to only send the delta between
-        updates. In response to a topic update, each subscriber sends a list of changes it intends to make to its
-        subscribed topics. Those changes are guaranteed to have been applied by the time the next update is
-        received.
-      </p>
-
-      <p>
-        The second kind of statestore message is a <term>heartbeat</term>, formerly sometimes called
-        <term>keepalive</term>. The statestore uses heartbeat messages to maintain the connection to each
-        subscriber, which would otherwise time out its subscription and attempt to re-register.
-      </p>
-
-      <p>
-        Prior to Impala 2.0, both kinds of communication were combined in a single kind of message. Because these
-        messages could be very large in instances with thousands of tables, partitions, data files, and so on,
-        Impala 2.0 and higher divides the types of messages so that the small heartbeat pings can be transmitted
-        and acknowledged quickly, increasing the reliability of the statestore mechanism that detects when Impala
-        nodes become unavailable.
-      </p>
-
-      <p>
-        If the statestore detects a failed subscriber (for example, by repeated failed heartbeat deliveries), it
-        stops sending updates to that node.
-<!-- Henry: what are examples of these transient topic entries? -->
-        Some topic entries are marked as transient, meaning that if their owning subscriber fails, they are
-        removed.
-      </p>
-
-      <p>
-        Although the asynchronous nature of this mechanism means that metadata updates might take some time to
-        propagate across the entire cluster, that does not affect the consistency of query planning or results.
-        Each query is planned and coordinated by a particular node, so as long as the coordinator node is aware of
-        the existence of the relevant tables, data files, and so on, it can distribute the query work to other
-        nodes even if those other nodes have not received the latest metadata updates.
-<!-- Henry: need another example here of what's in a topic, e.g. is it the list of available tables? -->
-<!--
-        For example, query planning is performed on a single node based on the
-        catalog metadata topic, and once a full plan has been computed, all information required to execute that
-        plan is distributed directly to the executing nodes.
-        There is no requirement that an executing node should
-        know about the same version of the catalog metadata topic.
--->
-      </p>
-
-      <p>
-        We have found that the statestore process with default settings scales well to medium sized clusters, and
-        can serve our largest deployments with some configuration changes.
-<!-- Henry: elaborate on the configuration changes. -->
-      </p>
-
-      <p>
-<!-- Henry: other examples like load information? How is load information used? -->
-        The statestore does not persist any metadata to disk: all current metadata is pushed to the statestore by
-        its subscribers (for example, load information). Therefore, should a statestore restart, its state can be
-        recovered during the initial subscriber registration phase. Or if the machine that the statestore is
-        running on fails, a new statestore process can be started elsewhere, and subscribers can fail over to it.
-        There is no built-in failover mechanism in Impala, instead deployments commonly use a retargetable DNS
-        entry to force subscribers to automatically move to the new process instance.
-<!-- Henry: translate that last sentence into instructions / guidelines. -->
-      </p>
-    </conbody>
-  </concept>
 </concept>
diff --git a/docs/topics/impala_connecting.xml b/docs/topics/impala_connecting.xml
index b6b4928..84d7ba0 100644
--- a/docs/topics/impala_connecting.xml
+++ b/docs/topics/impala_connecting.xml
@@ -57,33 +57,30 @@ Lots of nuances to illustrate through sample code.
       See <xref href="impala_shell_options.xml"/> for the command-line and configuration file options you can use.
     </p>
 
-    <p>
-      You can connect to any DataNode where an instance of <cmdname>impalad</cmdname> is running,
-      and that host coordinates the execution of all queries sent to it.
-    </p>
+    <p> You can connect to any Impala daemon (<cmdname>impalad</cmdname>), and
+      that daemon coordinates the execution of all queries sent to it. </p>
 
     <p>
       For simplicity during development, you might always connect to the same host, perhaps running <cmdname>impala-shell</cmdname> on
       the same host as <cmdname>impalad</cmdname> and specifying the hostname as <codeph>localhost</codeph>.
     </p>
 
-    <p>
-      In a production environment, you might enable load balancing, in which you connect to specific host/port combination
-      but queries are forwarded to arbitrary hosts. This technique spreads the overhead of acting as the coordinator
-      node among all the DataNodes in the cluster. See <xref href="impala_proxy.xml"/> for details.
-    </p>
+    <p> In a production environment, you might enable load balancing, in which
+      you connect to specific host/port combination but queries are forwarded to
+      arbitrary hosts. This technique spreads the overhead of acting as the
+      coordinator node among all the Impala daemons in the cluster. See <xref
+        href="impala_proxy.xml"/> for details. </p>
 
     <p>
       <b>To connect the Impala shell during shell startup:</b>
     </p>
 
     <ol>
-      <li>
-        Locate the hostname of a DataNode within the cluster that is running an instance of the
-        <cmdname>impalad</cmdname> daemon. If that DataNode uses a non-default port (something
-        other than port 21000) for <cmdname>impala-shell</cmdname> connections, find out the
-        port number also.
-      </li>
+      <li> Locate the hostname that is running an instance of the
+          <cmdname>impalad</cmdname> daemon. If that <cmdname>impalad</cmdname>
+        uses a non-default port (something other than port 21000) for
+          <cmdname>impala-shell</cmdname> connections, find out the port number
+        also. </li>
 
       <li>
         Use the <codeph>-i</codeph> option to the
@@ -126,21 +123,17 @@ $ impala-shell -i <varname>some.other.hostname</varname>:<varname>port_number</v
 [Not connected] &gt; </codeblock>
       </li>
 
-      <li>
-        Locate the hostname of a DataNode within the cluster that is running an instance of the
-        <cmdname>impalad</cmdname> daemon. If that DataNode uses a non-default port (something
-        other than port 21000) for <cmdname>impala-shell</cmdname> connections, find out the
-        port number also.
-      </li>
+      <li> Locate the hostname that is running the <cmdname>impalad</cmdname>
+        daemon. If that <cmdname>impalad</cmdname> uses a non-default port
+        (something other than port 21000) for <cmdname>impala-shell</cmdname>
+        connections, find out the port number also. </li>
 
-      <li>
-        Use the <codeph>connect</codeph> command to connect to an Impala instance. Enter a command of the form:
-<codeblock>[Not connected] &gt; connect <varname>impalad-host</varname>
+      <li> Use the <codeph>connect</codeph> command to connect to an Impala
+        instance. Enter a command of the form: <codeblock>[Not connected] &gt; connect <varname>impalad-host</varname>
 [<varname>impalad-host</varname>:21000] &gt;</codeblock>
-        <note>
-          Replace <varname>impalad-host</varname> with the hostname you have configured for any DataNode running
-          Impala in your environment. The changed prompt indicates a successful connection.
-        </note>
+        <note> Replace <varname>impalad-host</varname> with the hostname you
+          have configured to run Impala in your environment. The changed prompt
+          indicates a successful connection. </note>
       </li>
     </ol>
 
@@ -148,11 +141,9 @@ $ impala-shell -i <varname>some.other.hostname</varname>:<varname>port_number</v
       <b>To start <cmdname>impala-shell</cmdname> in a specific database:</b>
     </p>
 
-    <p>
-      You can use all the same connection options as in previous examples.
-      For simplicity, these examples assume that you are logged into one of
-      the DataNodes that is running the <cmdname>impalad</cmdname> daemon.
-    </p>
+    <p> You can use all the same connection options as in previous examples. For
+      simplicity, these examples assume that you are logged into one of the
+      Impala daemons. </p>
 
     <ol>
       <li>
@@ -182,11 +173,9 @@ $ impala-shell -i localhost -d parquet_gzip_compression
       <b>To run one or several statements in non-interactive mode:</b>
     </p>
 
-    <p>
-      You can use all the same connection options as in previous examples.
-      For simplicity, these examples assume that you are logged into one of
-      the DataNodes that is running the <cmdname>impalad</cmdname> daemon.
-    </p>
+    <p> You can use all the same connection options as in previous examples. For
+      simplicity, these examples assume that you are logged into one of the
+      Impala daemons. </p>
 
     <ol>
       <li>
diff --git a/docs/topics/impala_impala_shell.xml b/docs/topics/impala_impala_shell.xml
index 6f74a41..ccbee95 100644
--- a/docs/topics/impala_impala_shell.xml
+++ b/docs/topics/impala_impala_shell.xml
@@ -98,10 +98,9 @@ under the License.
       For information on installing the Impala shell, see <xref href="impala_install.xml#install"/>.
     </p>
 
-    <p>
-      For information about establishing a connection to a DataNode running the <codeph>impalad</codeph> daemon
-      through the <codeph>impala-shell</codeph> command, see <xref href="impala_connecting.xml#connecting"/>.
-    </p>
+    <p> For information about establishing a connection to a coordinator Impala
+      daemon through the <codeph>impala-shell</codeph> command, see <xref
+        href="impala_connecting.xml#connecting"/>. </p>
 
     <p>
       For a list of the <codeph>impala-shell</codeph> command-line options, see
diff --git a/docs/topics/impala_kudu.xml b/docs/topics/impala_kudu.xml
index c308c37..2e0f364 100644
--- a/docs/topics/impala_kudu.xml
+++ b/docs/topics/impala_kudu.xml
@@ -143,15 +143,13 @@ under the License.
           </li>
 
           <li>
-            <p>
-              Data is physically divided based on units of storage called
+            <p> Data is physically divided based on units of storage called
                 <term>tablets</term>. Tablets are stored by <term>tablet
                 servers</term>. Each tablet server can store multiple tablets,
               and each tablet is replicated across multiple tablet servers,
               managed automatically by Kudu. Where practical, co-locate the
-              tablet servers on the same hosts as the DataNodes, although that
-              is not required.
-            </p>
+              tablet servers on the same hosts as the Impala daemons, although
+              that is not required. </p>
           </li>
         </ul>
 
diff --git a/docs/topics/impala_order_by.xml b/docs/topics/impala_order_by.xml
index d600d1a..2b4d043 100644
--- a/docs/topics/impala_order_by.xml
+++ b/docs/topics/impala_order_by.xml
@@ -49,15 +49,6 @@ under the License.
         <codeph>LIMIT</codeph> clause, which reduces network overhead and the
       memory requirement on the coordinator node. </p>
 
-    <note>
-      <p rev="1.4.0 obwl">
-        In Impala 1.4.0 and higher, the <codeph>LIMIT</codeph> clause is now optional (rather than required) for
-        queries that use the <codeph>ORDER BY</codeph> clause. Impala automatically uses a temporary disk work area
-        to perform the sort if the sort operation would otherwise exceed the Impala memory limit for a particular
-        DataNode.
-      </p>
-    </note>
-
     <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
 
     <p>
@@ -105,8 +96,6 @@ col_ref ::= <varname>column_name</varname> | <varname>integer_literal</varname>
     </p>
 
     <p rev="obwl" conref="../shared/impala_common.xml#common/order_by_limit"/>
-
-    <!-- Good to show an example of cases where ORDER BY does and doesn't work with complex types. -->
     <p conref="../shared/impala_common.xml#common/complex_types_blurb"/>
 
     <p rev="2.3.0">
@@ -131,13 +120,12 @@ ERROR: AnalysisException: ORDER BY expression 'score' with complex type 'ARRAY&l
 
     <p conref="../shared/impala_common.xml#common/example_blurb"/>
 
-    <p>
-      The following query retrieves the user ID and score, only for scores greater than one million,
-      with the highest scores for each user listed first.
-      Because the individual array elements are now represented as separate rows in the result set,
-      they can be used in the <codeph>ORDER BY</codeph> clause, referenced using the <codeph>ITEM</codeph>
-      pseudocolumn that represents each array element.
-    </p>
+    <p> The following query retrieves the user ID and score, only for scores
+      greater than one million, with the highest scores for each user listed
+      first. Because the individual array elements are now represented as
+      separate rows in the result set, they can be used in the <codeph>ORDER
+        BY</codeph> clause, referenced using the <codeph>ITEM</codeph>
+      pseudo-column that represents each array element. </p>
 
 <codeblock>SELECT id, item FROM games, games.score
   WHERE item &gt; 1000000
@@ -168,13 +156,14 @@ ORDER BY id, info.value desc;
 
     <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
 
-    <p>
-      Although the <codeph>LIMIT</codeph> clause is now optional on <codeph>ORDER BY</codeph> queries, if your
-      query only needs some number of rows that you can predict in advance, use the <codeph>LIMIT</codeph> clause
-      to reduce unnecessary processing. For example, if the query has a clause <codeph>LIMIT 10</codeph>, each data
-      node sorts its portion of the relevant result set and only returns 10 rows to the coordinator node. The
-      coordinator node picks the 10 highest or lowest row values out of this small intermediate result set.
-    </p>
+    <p> Although the <codeph>LIMIT</codeph> clause is now optional on
+        <codeph>ORDER BY</codeph> queries, if your query only needs some number
+      of rows that you can predict in advance, use the <codeph>LIMIT</codeph>
+      clause to reduce unnecessary processing. For example, if the query has a
+      clause <codeph>LIMIT 10</codeph>, each executor Impala daemon sorts its
+      portion of the relevant result set and only returns 10 rows to the
+      coordinator node. The coordinator node picks the 10 highest or lowest row
+      values out of this small intermediate result set. </p>
 
     <p>
       If an <codeph>ORDER BY</codeph> clause is applied to an early phase of query processing, such as a subquery
@@ -212,45 +201,46 @@ SELECT page_title AS "Page 3 of search results", page_url FROM search_content
 
     <p conref="../shared/impala_common.xml#common/internals_blurb"/>
 
-    <p>
-      Impala sorts the intermediate results of an <codeph>ORDER BY</codeph> clause in memory whenever practical. In
-      a cluster of N DataNodes, each node sorts roughly 1/Nth of the result set, the exact proportion varying
-      depending on how the data matching the query is distributed in HDFS.
-    </p>
-
-    <p>
-      If the size of the sorted intermediate result set on any DataNode would cause the query to exceed the Impala
-      memory limit, Impala sorts as much as practical in memory, then writes partially sorted data to disk. (This
-      technique is known in industry terminology as <q>external sorting</q> and <q>spilling to disk</q>.) As each
-      8 MB batch of data is written to disk, Impala frees the corresponding memory to sort a new 8 MB batch of
-      data. When all the data has been processed, a final merge sort operation is performed to correctly order the
-      in-memory and on-disk results as the result set is transmitted back to the coordinator node. When external
-      sorting becomes necessary, Impala requires approximately 60 MB of RAM at a minimum for the buffers needed to
-      read, write, and sort the intermediate results. If more RAM is available on the DataNode, Impala will use
-      the additional RAM to minimize the amount of disk I/O for sorting.
-    </p>
-
-    <p>
-      This external sort technique is used as appropriate on each DataNode (possibly including the coordinator
-      node) to sort the portion of the result set that is processed on that node. When the sorted intermediate
-      results are sent back to the coordinator node to produce the final result set, the coordinator node uses a
-      merge sort technique to produce a final sorted result set without using any extra resources on the
-      coordinator node.
-    </p>
+    <p> Impala sorts the intermediate results of an <codeph>ORDER BY</codeph>
+      clause in memory whenever practical. In a cluster of N executor Impala
+      daemons, each daemon sorts roughly 1/Nth of the result set, the exact
+      proportion varying depending on how the data matching the query is
+      distributed in HDFS. </p>
+
+    <p> If the size of the sorted intermediate result set on any executor Impala
+      daemon would cause the query to exceed the Impala memory limit, Impala
+      sorts as much as practical in memory, then writes partially sorted data to
+      disk. (This technique is known in industry terminology as <q>external
+        sorting</q> and <q>spilling to disk</q>.) As each 8 MB batch of data is
+      written to disk, Impala frees the corresponding memory to sort a new 8 MB
+      batch of data. When all the data has been processed, a final merge sort
+      operation is performed to correctly order the in-memory and on-disk
+      results as the result set is transmitted back to the coordinator node.
+      When external sorting becomes necessary, Impala requires approximately 60
+      MB of RAM at a minimum for the buffers needed to read, write, and sort the
+      intermediate results. If more RAM is available on the Impala daemon,
+      Impala will use the additional RAM to minimize the amount of disk I/O for
+      sorting. </p>
+
+    <p> This external sort technique is used as appropriate on each Impala
+      daemon (possibly including the coordinator node) to sort the portion of
+      the result set that is processed on that node. When the sorted
+      intermediate results are sent back to the coordinator node to produce the
+      final result set, the coordinator node uses a merge sort technique to
+      produce a final sorted result set without using any extra resources on the
+      coordinator node. </p>
 
     <p rev="obwl">
       <b>Configuration for disk usage:</b>
     </p>
 
-    <p rev="obwl" conref="../shared/impala_common.xml#common/order_by_scratch_dir"/>
-
-<!-- Here is actually the more logical place to collect all those examples, move them from SELECT and cross-reference to here. -->
-
-<!--     <p rev="obwl" conref="../shared/impala_common.xml#common/restrictions_blurb"/> -->
+    <p rev="obwl"
+      conref="../shared/impala_common.xml#common/order_by_scratch_dir"/>
 
     <p rev="obwl" conref="../shared/impala_common.xml#common/insert_sort_blurb"/>
 
-    <p rev="obwl" conref="../shared/impala_common.xml#common/order_by_view_restriction"/>
+    <p rev="obwl"
+      conref="../shared/impala_common.xml#common/order_by_view_restriction"/>
 
     <p>
       With the lifting of the requirement to include a <codeph>LIMIT</codeph> clause in every <codeph>ORDER
@@ -259,16 +249,17 @@ SELECT page_title AS "Page 3 of search results", page_url FROM search_content
 
     <ul>
       <li>
-        <p>
-          Now the use of scratch disk space raises the possibility of an <q>out of disk space</q> error on a
-          particular DataNode, as opposed to the previous possibility of an <q>out of memory</q> error. Make sure
-          to keep at least 1 GB free on the filesystem used for temporary sorting work.
-        </p>
+        <p> Now the use of scratch disk space raises the possibility of an
+            <q>out of disk space</q> error on a particular Impala daemon, as
+          opposed to the previous possibility of an <q>out of memory</q> error.
+          Make sure to keep at least 1 GB free on the filesystem used for
+          temporary sorting work. </p>
       </li>
 
     </ul>
 
-    <p rev="obwl" conref="../shared/impala_common.xml#common/null_sorting_change"/>
+    <p rev="obwl"
+      conref="../shared/impala_common.xml#common/null_sorting_change"/>
 <codeblock>[localhost:21000] > create table numbers (x int);
 [localhost:21000] > insert into numbers values (1), (null), (2), (null), (3);
 [localhost:21000] > select x from numbers order by x nulls first;
diff --git a/docs/topics/impala_runtime_filtering.xml b/docs/topics/impala_runtime_filtering.xml
index d496098..ec519d0 100644
--- a/docs/topics/impala_runtime_filtering.xml
+++ b/docs/topics/impala_runtime_filtering.xml
@@ -82,13 +82,13 @@ under the License.
       </p>
       <ul>
         <li>
-          <p>
-            What a <term>plan fragment</term> is.
-            Impala decomposes each query into smaller units of work that are distributed across the cluster.
-            Wherever possible, a data block is read, filtered, and aggregated by plan fragments executing
-            on the same host. For some operations, such as joins and combining intermediate results into
-            a final result set, data is transmitted across the network from one DataNode to another.
-          </p>
+          <p> What a <term>plan fragment</term> is. Impala decomposes each query
+            into smaller units of work that are distributed across the cluster.
+            Wherever possible, a data block is read, filtered, and aggregated by
+            plan fragments executing on the same host. For some operations, such
+            as joins and combining intermediate results into a final result set,
+            data is transmitted across the network from one Impala daemon to
+            another. </p>
         </li>
         <li>
           <p>
diff --git a/docs/topics/impala_scalability.xml b/docs/topics/impala_scalability.xml
index 71b8425..c1264fa 100644
--- a/docs/topics/impala_scalability.xml
+++ b/docs/topics/impala_scalability.xml
@@ -601,9 +601,7 @@ Memory Usage: Additional Notes
               available to Impala and reduce the amount of memory required on each node.
             </li>
 
-            <li>
-              Increase the overall memory capacity of each DataNode at the hardware level.
-            </li>
+            <li> Add more memory to the hosts running Impala daemons. </li>
 
             <li>
               On a cluster with resources shared between Impala and other Hadoop components, use

[impala] 01/05: IMPALA-8177: Log DDL failures in coordinator logs

Posted by tm...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

tmarshall pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit fa604fb5059307321a174db02469b4f5fb78410f
Author: Bharath Vissapragada <bh...@cloudera.com>
AuthorDate: Fri Feb 8 11:29:55 2019 -0800

    IMPALA-8177: Log DDL failures in coordinator logs
    
    If a DDL fails for some reason, it helps to log the failure message in
    the coordinator logs so that we can differentiate between failed and
    successful DDL queries.
    
    For example:
    
    [0d66cd6004b4:21000] default> drop database foo;
    Query: drop database foo
    ERROR: ImpalaRuntimeException: Error making 'dropDatabase' RPC to Hive Metastore:
    CAUSED BY: NoSuchObjectException: foo
    ----- coordinator logs -------
    I0208 12:13:26.251695 25474 Frontend.java:1242] 704e86fff482c0b5:f0501f9400000000] Analyzing query: drop database foo
    I0208 12:13:26.253773 25474 Frontend.java:1282] 704e86fff482c0b5:f0501f9400000000] Analysis finished.
    I0208 12:13:26.419946 25474 client-request-state.cc:176] 704e86fff482c0b5:f0501f9400000000] ImpalaRuntimeException: Error making 'dropDatabase' RPC to Hive Metastore:
    CAUSED BY: NoSuchObjectException: foo
    I0208 12:13:26.419992 25474 impala-server.cc:1142] 704e86fff482c0b5:f0501f9400000000] UnregisterQuery(): query_id=704e86fff482c0b5:f0501f9400000000
    I0208 12:13:26.419997 25474 impala-server.cc:1249] 704e86fff482c0b5:f0501f9400000000] Cancel(): query_id=704e86fff482c0b5:f0501f9400000000
    -------------------------------
    
    Testing: Verified manually by running a few DDLs that fail and then inspecting
    the coordinator log file.
    
    Change-Id: Ie89291ee27156c701e07cea44ad3ee07ec54ab42
    Reviewed-on: http://gerrit.cloudera.org:8080/12414
    Reviewed-by: Bharath Vissapragada <bh...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/common/status.h                 | 9 +++++++++
 be/src/service/client-request-state.cc | 3 ++-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/be/src/common/status.h b/be/src/common/status.h
index 3fd5b5b..41f7663 100644
--- a/be/src/common/status.h
+++ b/be/src/common/status.h
@@ -359,6 +359,15 @@ std::ostream& operator<<(std::ostream& os, const Status& status);
     if (UNLIKELY(!_status.ok())) return _status; \
   } while (false)
 
+#define LOG_AND_RETURN_IF_ERROR(stmt) \
+  do { \
+    const ::impala::Status& _status = (stmt); \
+    if (UNLIKELY(!_status.ok()))  { \
+      LOG(INFO) << _status.GetDetail(); \
+      return _status; \
+    } \
+  } while (false)
+
 #define RETURN_VOID_IF_ERROR(stmt)                     \
   do {                                                 \
     if (UNLIKELY(!(stmt).ok())) return;                \
diff --git a/be/src/service/client-request-state.cc b/be/src/service/client-request-state.cc
index bac6c1c..b1070bd 100644
--- a/be/src/service/client-request-state.cc
+++ b/be/src/service/client-request-state.cc
@@ -21,6 +21,7 @@
 #include <limits>
 #include <gutil/strings/substitute.h>
 
+#include "common/status.h"
 #include "exec/kudu-util.h"
 #include "kudu/rpc/rpc_controller.h"
 #include "runtime/backend-client.h"
@@ -183,7 +184,7 @@ Status ClientRequestState::Exec(TExecRequest* exec_request) {
     }
     case TStmtType::DDL: {
       DCHECK(exec_request_.__isset.catalog_op_request);
-      RETURN_IF_ERROR(ExecDdlRequest());
+      LOG_AND_RETURN_IF_ERROR(ExecDdlRequest());
       break;
     }
     case TStmtType::LOAD: {

[impala] 02/05: Additions to .gitignore

Posted by tm...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

tmarshall pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 29df1d55f668d18450b86428844308c0f536f374
Author: Paul Rogers <pr...@cloudera.com>
AuthorDate: Sun Feb 10 18:33:39 2019 -0800

    Additions to .gitignore
    
    Adds entries for Eclipse-created files and for a couple of temporary
    files commonly created during front-end debugging.
    
    Change-Id: Ia8ea436f5e108cc08389f43d639a4cb7315271c1
    Reviewed-on: http://gerrit.cloudera.org:8080/12430
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 .gitignore | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/.gitignore b/.gitignore
index 5bd1ce6..704f03d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -85,3 +85,13 @@ gdb.txt
 
 # OS X Artifacts
 .DS_Store
+
+# Eclipse
+.cproject
+.project
+.classpath
+.settings
+
+# Debugging
+ad-hoc.test
+nohup.out

[impala] 05/05: IMPALA-5031: signed overflow is undefined behavior

Posted by tm...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

tmarshall pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 6fc05a6f7385bf21de1c01a3071b9a91efc85b07
Author: Jim Apple <jb...@apache.org>
AuthorDate: Sat Feb 9 21:17:51 2019 -0800

    IMPALA-5031: signed overflow is undefined behavior
    
    This patch fixes a signed overflow in the test
    StringToDecimal.LargeDecimals. The interesting part of the backtrace
    is:
    
    util/string-parser.h:397:14: runtime error: signed integer overflow:
      0x4b3b4ca85a86c47a098a223fffffffff * 10 cannot be represented in
      type '__int128'
        #0 void StringParser::ApplyExponent<__int128>(int, int,
          signed char, __int128*, int*, int*) util/string-parser.h:397:14
        #1 DecimalValue<__int128> StringParser::StringToDecimal<__int128>
          (char const*, int, int, int, bool, StringParser::ParseResult*)
          util/string-parser.h:221:5
        #2 void VerifyParse<__int128>(string const&, int, int, bool,
          DecimalValue<__int128> const&, StringParser::ParseResult)
          runtime/decimal-test.cc:53:25
        #3 void VerifyParse<__int128>(string const&, int, int,
          DecimalValue<__int128> const&, StringParser::ParseResult)
          runtime/decimal-test.cc:65:3
        #4 StringToDecimal_LargeDecimals_Test::TestBody()
          runtime/decimal-test.cc:443:3
    
    Change-Id: Ifb4effa291e1e4dac62b84251c74c483d99b06e7
    Reviewed-on: http://gerrit.cloudera.org:8080/12426
    Reviewed-by: Jim Apple <jb...@apache.org>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/util/string-parser.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/be/src/util/string-parser.h b/be/src/util/string-parser.h
index 606e5b8..7f3a191 100644
--- a/be/src/util/string-parser.h
+++ b/be/src/util/string-parser.h
@@ -394,7 +394,8 @@ class StringParser {
     if (exponent > digits_after_dot_count) {
       // Ex: 0.1e3 (which at this point would have precision == 1 and scale == 1), the
       //     scale must be set to 0 and the value set to 100 which means a precision of 3.
-      *value *= DecimalUtil::GetScaleMultiplier<T>(exponent - digits_after_dot_count);
+      *value = ArithmeticUtil::AsUnsigned<std::multiplies>(
+          *value, DecimalUtil::GetScaleMultiplier<T>(exponent - digits_after_dot_count));
       *precision = total_digits_count + (exponent - digits_after_dot_count);
       *scale = 0;
     } else {

[impala] 04/05: IMPALA-8173: Fix KeyError in run-workload.py

Posted by tm...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

tmarshall pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 8b5cb576d23d771456ed571ee0d362d96af6ae13
Author: Thomas Tauber-Marshall <tm...@cloudera.com>
AuthorDate: Thu Feb 7 14:09:38 2019 -0800

    IMPALA-8173: Fix KeyError in run-workload.py
    
    A recent change (IMPALA-7694) causes run-workload.py to fail with a
    KeyError due to trying to construct an ImpalaBeeswaxResult without a
    query id.
    
    This patch fixes that issue and two related issues:
    - Fixes an issue where queries that fail to produce results are
      silently ignored, which is why automated testing did not catch this
      problem even though run-all-tests.sh runs run-workload.py.
    - Fixes an issue where queries that fail to produce results would
      confusingly produce the error:
      'NoneType' object has no attribute 'time_taken'
    
    Testing:
    - Ran run-workload.py locally and demonstrated that it works now.
    - Ran run-all-tests.sh locally and demonstrated that it fails now when
      the KeyError issue isn't fixed.
    
    Change-Id: I5b8a3c3dd7499335b9290d5667c194e8c0eabd12
    Reviewed-on: http://gerrit.cloudera.org:8080/12397
    Reviewed-by: Thomas Marshall <th...@cmu.edu>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 tests/performance/query_exec_functions.py |  4 ++--
 tests/performance/scheduler.py            | 22 ++++++++++++++--------
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/tests/performance/query_exec_functions.py b/tests/performance/query_exec_functions.py
index 1c144bd..352c9a7 100644
--- a/tests/performance/query_exec_functions.py
+++ b/tests/performance/query_exec_functions.py
@@ -192,7 +192,7 @@ def execute_using_impala_beeswax(query, query_config):
   # create a map for query options and the query names to send to the plugin
   context = build_context(query, query_config)
   if plugin_runner: plugin_runner.run_plugins_pre(context=context, scope="Query")
-  result = ImpalaBeeswaxResult()
+  result = None
   try:
     result = client.execute(query.query_str)
   except Exception, e:
@@ -232,7 +232,7 @@ def construct_exec_result(result, exec_result):
   """
 
   # Return immedietely if the query failed.
-  if not result.success: return exec_result
+  if result is None or not result.success: return exec_result
   exec_result.success = True
   attrs = ['data', 'runtime_profile', 'start_time',
       'time_taken', 'summary', 'exec_summary']
diff --git a/tests/performance/scheduler.py b/tests/performance/scheduler.py
index 760b331..e612454 100644
--- a/tests/performance/scheduler.py
+++ b/tests/performance/scheduler.py
@@ -112,19 +112,20 @@ class Scheduler(object):
           try:
             query_executor.prepare(self._get_next_impalad())
             query_executor.execute(plan_first=self.plan_first)
-          # QueryExecutor only throws an exception if the query fails and abort_on_error
-          # is set to True. If abort_on_error is False, then the exception is logged on
+          # QueryExecutor only throws an exception if the query fails and exit_on_error
+          # is set to True. If exit_on_error is False, then the exception is logged on
           # the console and execution moves on to the next query.
           except Exception as e:
             LOG.error("Query %s Failed: %s" % (query_name, str(e)))
             self._exit.set()
           finally:
-            LOG.info("%s query iteration %d finished in %.2f seconds" % (query_name, i+1,
-              query_executor.result.time_taken))
-            result = query_executor.result
-            result.client_name = thread_num + 1
-            self._results.append(result)
-          workload_time_sec += query_executor.result.time_taken
+            if query_executor.result:
+              LOG.info("%s query iteration %d finished in %.2f seconds" %
+                       (query_name, i + 1, query_executor.result.time_taken))
+              result = query_executor.result
+              result.client_name = thread_num + 1
+              self._results.append(result)
+              workload_time_sec += query_executor.result.time_taken
       if self.query_iterations == 1:
         LOG.info("Workload iteration %d finished in %s seconds" % (j+1, workload_time_sec))
       cursor = getattr(threading.current_thread(), 'cursor', None)
@@ -139,3 +140,8 @@ class Scheduler(object):
     for thread_num,t in enumerate(self._threads):
       t.join()
       LOG.info("Finished %s" % self._thread_name % thread_num)
+    num_expected_results = len(self._threads) * self.iterations * \
+        self.query_iterations * len(self.query_executors)
+    if len(self._results) != num_expected_results:
+      raise RuntimeError("Unexpected number of results generated (%s vs. %s)." %
+          (len(self._results), num_expected_results))