You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2019/06/25 15:45:12 UTC

[impala] branch 2.x updated (943357f -> dc888ba)

This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a change to branch 2.x
in repository https://gitbox.apache.org/repos/asf/impala.git.


    from 943357f  IMPALA-8669 Support giving a terminal commit in compare_branches.py
     new c8b03a7  IMPALA-7314: Doc generation should fail on error
     new 2940d2b  IMPALA-7252: Backport rate limiting of fadvise calls into toolchain glog
     new c4b9c98  IMPALA-7298: Stop passing IP address as hostname in Kerberos principal
     new 7562635  IMPALA-7315: fix test_update_with_clear_entries_flag race
     new eee1034  IMPALA-7259: Improve Impala shell performance
     new 9e089ff  test_recover_partitions.py had asserts that were always true.
     new cb2064e  KUDU-2492: Make the use of SO_REUSEPORT conditional on it being defined
     new 081d6de  IMPALA-6677: [DOCS] Document the next_day function
     new 8304591  IMPALA-7256: Aggregator mem usage isn't reflected in summary
     new 2a1f2fb  IMPALA-7173: [DOCS] Added check options in the load balancer examples
     new efe7510  IMPALA-3675: part 1: -Werror for ASAN
     new 58ae370  IMPALA-5031: Fix undefined behavior: memset NULL
     new 4479442  IMPALA-6299: Use LlvmCodeGen's internal list of white-listed CPU attributes for handcrafting IRs
     new 3d6a895  IMPALA-7291: [DOCS] Note about no codegen support for CHAR
     new 45d2ce7  IMPALA-3040: Remove cache directives if a partition is dropped externally
     new 798cfef  IMPALA-6174: [DOCS] Fixed the seed data type for RAND and RANDOM functions
     new ba241d6  IMPALA-5826 IMPALA-7162: [DOCS] Documented the IDLE_SESSION_TIMEOUT query option
     new cfbffac  IMPALA-7330. After LOAD DATA, only refresh affected partition
     new 446939f  IMPALA-5031: Fix undefined behavior: memset NULL
     new dc888ba  IMPALA-7218: [DOCS] Support column list in ALTER VIEW

The 20 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .clang-tidy                                        |   1 -
 be/CMakeLists.txt                                  |  12 +-
 be/src/benchmarks/bloom-filter-benchmark.cc        |   2 +
 be/src/codegen/llvm-codegen-test.cc                |  49 ++++-
 be/src/codegen/llvm-codegen.cc                     |  39 +++-
 be/src/codegen/llvm-codegen.h                      |  24 ++-
 be/src/exec/aggregator.cc                          |   3 +-
 be/src/exec/aggregator.h                           |   1 +
 be/src/exec/data-source-scan-node.cc               |   3 +-
 be/src/exec/exec-node.cc                           |   3 +-
 be/src/exec/filter-context.cc                      |   2 +-
 be/src/exec/grouping-aggregator.cc                 |  11 +-
 be/src/exec/grouping-aggregator.h                  |   1 +
 be/src/exec/hdfs-table-writer.h                    |   2 +-
 be/src/exprs/expr.h                                |   2 +-
 be/src/exprs/scalar-expr.h                         |   2 +-
 be/src/kudu/util/net/socket.cc                     |  12 +-
 be/src/rpc/rpc-mgr-kerberized-test.cc              |   2 +-
 be/src/rpc/rpc-mgr-test-base.h                     |   6 +-
 be/src/rpc/rpc-mgr-test.cc                         |   5 +-
 be/src/rpc/rpc-mgr.h                               |  10 +-
 be/src/rpc/rpc-mgr.inline.h                        |   5 +-
 be/src/runtime/data-stream-sender.cc               |   2 +-
 be/src/runtime/data-stream-test.cc                 |   6 +-
 be/src/runtime/krpc-data-stream-recvr.h            |   2 +-
 be/src/runtime/krpc-data-stream-sender.cc          |  14 +-
 be/src/runtime/mem-tracker.h                       |   2 +-
 be/src/runtime/reservation-manager.cc              |   9 +-
 be/src/runtime/reservation-manager.h               |   5 +-
 be/src/scheduling/scheduler.cc                     |   4 +-
 be/src/service/client-request-state.cc             |   4 +
 be/src/util/bitmap.h                               |   3 +-
 be/src/util/cpu-info.cc                            |   8 +
 be/src/{runtime/string-value.cc => util/ubsan.h}   |  30 +--
 bin/impala-config.sh                               |   4 +-
 common/thrift/ImpalaInternalService.thrift         |   6 +-
 docs/.gitignore                                    |   1 +
 docs/Makefile                                      |   4 +-
 bin/jenkins/all-tests.sh => docs/build-doc.sh      |  21 +-
 docs/impala.ditamap                                |   1 +
 docs/shared/impala_common.xml                      |  19 +-
 docs/topics/impala_alter_view.xml                  |  87 ++++----
 docs/topics/impala_char.xml                        | 223 ++++++++-------------
 docs/topics/impala_datetime_functions.xml          |  73 +++++--
 docs/topics/impala_idle_session_timeout.xml        | 100 +++++++++
 docs/topics/impala_math_functions.xml              |   4 +-
 docs/topics/impala_proxy.xml                       |   8 +-
 docs/topics/impala_timeouts.xml                    |  90 ++++++---
 .../java/org/apache/impala/catalog/HdfsTable.java  |  40 +++-
 .../apache/impala/service/CatalogOpExecutor.java   |   3 -
 shell/impala_shell.py                              |  47 +++--
 tests/metadata/test_recover_partitions.py          |  11 +-
 tests/query_test/test_hdfs_caching.py              |  19 ++
 tests/shell/test_shell_commandline.py              |  38 +++-
 tests/statestore/test_statestore.py                |   2 +-
 55 files changed, 717 insertions(+), 370 deletions(-)
 copy be/src/{runtime/string-value.cc => util/ubsan.h} (69%)
 copy bin/jenkins/all-tests.sh => docs/build-doc.sh (70%)
 mode change 100644 => 100755
 create mode 100644 docs/topics/impala_idle_session_timeout.xml


[impala] 08/20: IMPALA-6677: [DOCS] Document the next_day function

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 081d6deed342f3978cb7daef3bcb78593788fe8a
Author: Alex Rodoni <ar...@cloudera.com>
AuthorDate: Mon Jul 9 11:48:04 2018 -0700

    IMPALA-6677: [DOCS] Document the next_day function
    
    Change-Id: I2dacc86ff69a1016b1863d9db66dd29fd832b715
    Reviewed-on: http://gerrit.cloudera.org:8080/10893
    Reviewed-by: Vuk Ercegovac <ve...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 docs/topics/impala_datetime_functions.xml | 73 ++++++++++++++++++++++++-------
 1 file changed, 58 insertions(+), 15 deletions(-)

diff --git a/docs/topics/impala_datetime_functions.xml b/docs/topics/impala_datetime_functions.xml
index cb2f03f..7c8d5e1 100644
--- a/docs/topics/impala_datetime_functions.xml
+++ b/docs/topics/impala_datetime_functions.xml
@@ -1913,6 +1913,42 @@ select now() as right_now, nanoseconds_sub(now(), 1e9) as 1_second_earlier;
         </dd>
 
       </dlentry>
+      <dlentry id="next_day">
+        <dt>
+          <codeph>next_day(timestamp date, string weekday)</codeph>
+        </dt>
+        <dd>
+          <b>Purpose:</b> Returns the date of the <varname>weekday</varname>
+          that follows the specified <varname>date</varname>. <p>
+            <b>Return type:</b>
+            <codeph>timestamp</codeph>
+          </p>
+          <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+          <p> The <varname>weekday</varname> parameter is case-insensitive. The
+            following values are accepted for <varname>weekday</varname>:
+              <codeph>"Sunday"</codeph>/<codeph>"Sun"</codeph>,
+              <codeph>"Monday"</codeph>/<codeph>"Mon"</codeph>,
+              <codeph>"Tuesday"</codeph>/<codeph>"Tue"</codeph>,
+              <codeph>"Wednesday"</codeph>/<codeph>"Wed"</codeph>,
+              <codeph>"Thursday"</codeph>/<codeph>"Thu"</codeph>,
+              <codeph>"Friday"</codeph>/<codeph>"Fri"</codeph>,
+              <codeph>"Saturday"</codeph>/<codeph>"Sat"</codeph>
+          </p>
+          <p> Calling the function with the current date and weekday returns the
+            date that is one week later.</p><p
+            conref="../shared/impala_common.xml#common/example_blurb"/>
+          <p>
+            <codeblock>select next_day('2013-12-25','Saturday');
+-- Returns '2013-12-28 00:00:00' the first Saturday after December 25, 2013.
+select next_day(to_timestamp('08-1987-21', 'mm-yyyy-dd'), 'Friday');
+-- Returns '1987-08-28 00:00:00' the first Friday after August 21, 1987.
+
+select next_day(now(), 'Thu');
+-- Executed on 2018-07-12, a Thursday, the function returns '2018-07-19 00:00:00', one week
+-- after the current date.</codeblock>
+          </p>
+        </dd>
+      </dlentry>
 
       <dlentry id="now">
 
@@ -1922,34 +1958,41 @@ select now() as right_now, nanoseconds_sub(now(), 1e9) as 1_second_earlier;
 
         <dd>
           <indexterm audience="hidden">now() function</indexterm>
-          <b>Purpose:</b> Returns the current date and time (in the local time zone) as a
-          <codeph>TIMESTAMP</codeph> value.
+          <b>Purpose:</b> Returns the current date and time (in the local time
+          zone) as a <codeph>TIMESTAMP</codeph> value.
+
           <p>
             <b>Return type:</b> <codeph>timestamp</codeph>
           </p>
           <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
           <p>
-            To find a date/time value in the future or the past relative to the current date
-            and time, add or subtract an <codeph>INTERVAL</codeph> expression to the return value of
-            <codeph>now()</codeph>. See <xref href="impala_timestamp.xml#timestamp"/> for examples.
+            To find a date/time value in the future or the past relative to the
+            current date and time, add or subtract an <codeph>INTERVAL</codeph>
+            expression to the return value of <codeph>now()</codeph>. See <xref
+              href="impala_timestamp.xml#timestamp"/> for examples.
           </p>
           <p>
-            To produce a <codeph>TIMESTAMP</codeph> representing the current date and time that can be
-            shared or stored without interoperability problems due to time zone differences, use the
-            <codeph>to_utc_timestamp()</codeph> function and specify the time zone of the server.
-            When <codeph>TIMESTAMP</codeph> data is stored in UTC form, any application that queries
-            those values can convert them to the appropriate local time zone by calling the inverse
+            To produce a <codeph>TIMESTAMP</codeph> representing the current
+            date and time that can be shared or stored without interoperability
+            problems due to time zone differences, use the
+              <codeph>to_utc_timestamp()</codeph> function and specify the time
+            zone of the server. When <codeph>TIMESTAMP</codeph> data is stored
+            in UTC form, any application that queries those values can convert
+            them to the appropriate local time zone by calling the inverse
             function, <codeph>from_utc_timestamp()</codeph>.
           </p>
+
           <p conref="../shared/impala_common.xml#common/current_timezone_tip"/>
-          <p>
-            Any references to the <codeph>now()</codeph> function are evaluated at the start of a query.
-            All calls to <codeph>now()</codeph> within the same query return the same value,
-            and the value does not depend on how long the query takes.
+
+          <p> Any references to the <codeph>now()</codeph> function are
+            evaluated at the start of a query. All calls to
+              <codeph>now()</codeph> within the same query return the same
+            value, and the value does not depend on how long the query takes.
           </p>
 
           <p conref="../shared/impala_common.xml#common/example_blurb"/>
-<codeblock>
+          <codeblock>
 select now() as 'Current time in California USA',
   to_utc_timestamp(now(), 'PDT') as 'Current time in Greenwich UK';
 +--------------------------------+-------------------------------+


[impala] 02/20: IMPALA-7252: Backport rate limiting of fadvise calls into toolchain glog

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 2940d2b8bcb3b04fe437ecd11c6c457b9ecd4d4c
Author: Tianyi Wang <tw...@cloudera.com>
AuthorDate: Tue Jul 17 15:18:05 2018 -0700

    IMPALA-7252: Backport rate limiting of fadvise calls into toolchain glog
    
    This patch bumps glog version to 0.3.4-p3 to include the patch limiting
    fadvise calls.
    
    Change-Id: I41fd855fbf0e9ec58845ac0d2eb96a87b0172152
    Reviewed-on: http://gerrit.cloudera.org:8080/10965
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 bin/impala-config.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bin/impala-config.sh b/bin/impala-config.sh
index d6ea425..a6c28eb 100755
--- a/bin/impala-config.sh
+++ b/bin/impala-config.sh
@@ -68,7 +68,7 @@ fi
 # moving to a different build of the toolchain, e.g. when a version is bumped or a
 # compile option is changed. The build id can be found in the output of the toolchain
 # build jobs, it is constructed from the build number and toolchain git hash prefix.
-export IMPALA_TOOLCHAIN_BUILD_ID=146-f2d5380be6
+export IMPALA_TOOLCHAIN_BUILD_ID=155-3c4a3251df
 # Versions of toolchain dependencies.
 # -----------------------------------
 export IMPALA_AVRO_VERSION=1.7.4-p4
@@ -93,7 +93,7 @@ export IMPALA_GCC_VERSION=4.9.2
 unset IMPALA_GCC_URL
 export IMPALA_GFLAGS_VERSION=2.2.0-p1
 unset IMPALA_GFLAGS_URL
-export IMPALA_GLOG_VERSION=0.3.4-p2
+export IMPALA_GLOG_VERSION=0.3.4-p3
 unset IMPALA_GLOG_URL
 export IMPALA_GPERFTOOLS_VERSION=2.5
 unset IMPALA_GPERFTOOLS_URL


[impala] 17/20: IMPALA-5826 IMPALA-7162: [DOCS] Documented the IDLE_SESSION_TIMEOUT query option

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/impala.git

commit ba241d6822133e38eaa2f9c590684881a0d30201
Author: Alex Rodoni <ar...@cloudera.com>
AuthorDate: Fri Jul 20 15:09:13 2018 -0700

    IMPALA-5826 IMPALA-7162: [DOCS] Documented the IDLE_SESSION_TIMEOUT query option
    
    Also, clarified cancelled queries vs closed queries
    
    Change-Id: I37182a3c5cf19fdcbb5f247ed71d43f963143510
    Reviewed-on: http://gerrit.cloudera.org:8080/11004
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 docs/impala.ditamap                         |   1 +
 docs/shared/impala_common.xml               |  19 ++++--
 docs/topics/impala_idle_session_timeout.xml | 100 ++++++++++++++++++++++++++++
 docs/topics/impala_timeouts.xml             |  90 +++++++++++++++++--------
 4 files changed, 177 insertions(+), 33 deletions(-)

diff --git a/docs/impala.ditamap b/docs/impala.ditamap
index 8872fa1..02eb9a7 100644
--- a/docs/impala.ditamap
+++ b/docs/impala.ditamap
@@ -195,6 +195,7 @@ under the License.
           <topicref href="topics/impala_explain_level.xml"/>
           <topicref href="topics/impala_hbase_cache_blocks.xml"/>
           <topicref href="topics/impala_hbase_caching.xml"/>
+          <topicref href="topics/impala_idle_session_timeout.xml"/>
           <topicref href="topics/impala_kudu_read_mode.xml"/>
           <topicref href="topics/impala_live_progress.xml"/>
           <topicref href="topics/impala_live_summary.xml"/>
diff --git a/docs/shared/impala_common.xml b/docs/shared/impala_common.xml
index f96e3d7..314d157 100644
--- a/docs/shared/impala_common.xml
+++ b/docs/shared/impala_common.xml
@@ -3708,13 +3708,20 @@ sudo pip-python install ssl</codeblock>
 
       <note id="timeout_clock_blurb">
         <p>
-          The timeout clock for queries and sessions only starts ticking when the query or session is idle.
-          For queries, this means the query has results ready but is waiting for a client to fetch the data. A
-          query can run for an arbitrary time without triggering a timeout, because the query is computing results
-          rather than sitting idle waiting for the results to be fetched. The timeout period is intended to prevent
-          unclosed queries from consuming resources and taking up slots in the admission count of running queries,
-          potentially preventing other queries from starting.
+          The timeout clock for queries and sessions only starts ticking when
+          the query or session is idle.
         </p>
+
+        <p>
+          For queries, this means the query has results ready but is waiting
+          for a client to fetch the data. A query can run for an arbitrary time
+          without triggering a timeout, because the query is computing results
+          rather than sitting idle waiting for the results to be fetched. The
+          timeout period is intended to prevent unclosed queries from consuming
+          resources and taking up slots in the admission count of running
+          queries, potentially preventing other queries from starting.
+        </p>
+
         <p>
           For sessions, this means that no query has been submitted for some period of time.
         </p>
diff --git a/docs/topics/impala_idle_session_timeout.xml b/docs/topics/impala_idle_session_timeout.xml
new file mode 100644
index 0000000..ac718ed
--- /dev/null
+++ b/docs/topics/impala_idle_session_timeout.xml
@@ -0,0 +1,100 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="2.12.0" id="idle_session_timeout">
+
+  <title>IDLE_SESSION_TIMEOUT Query Option (<keyword keyref="impala212_full"/> or higher only)</title>
+
+  <titlealts audience="PDF">
+
+    <navtitle>IDLE_SESSION_TIMEOUT</navtitle>
+
+  </titlealts>
+
+  <prolog>
+    <metadata>
+      <data name="Category" value="Impala"/>
+      <data name="Category" value="Impala Query Options"/>
+      <data name="Category" value="Querying"/>
+      <data name="Category" value="Developers"/>
+      <data name="Category" value="Data Analysts"/>
+    </metadata>
+  </prolog>
+
+  <conbody>
+
+    <p rev="2.12.0">
+      The <codeph>IDLE_SESSION_TIMEOUT</codeph> query option sets the time in seconds after
+      which an idle session expires. A session is idle when no activity is occurring for
+      any of the queries in that session, and the session has not started any new queries. Once
+      a session is expired, you cannot issue any new query requests to it. The session remains
+      open, but the only operation you can perform is to close it.
+    </p>
+
+    <p rev="2.12.0">
+      The <codeph>IDLE_SESSION_TIMEOUT</codeph> query option overrides the
+      <codeph>--idle_session_timeout</codeph> startup option. See
+      <xref href="impala_timeouts.xml#timeouts"/> for the
+      <codeph>--idle_session_timeout</codeph> startup option.
+    </p>
+
+    <p>
+      The <codeph>IDLE_SESSION_TIMEOUT</codeph> query option allows JDBC/ODBC connections to set
+      the session timeout as a query option with the <codeph>SET</codeph> statement.
+    </p>
+
+    <p>
+      <b>Syntax:</b>
+    </p>
+
+<codeblock>SET IDLE_SESSION_TIMEOUT=<varname>seconds</varname>;</codeblock>
+
+    <p>
+      <b>Type:</b> numeric
+    </p>
+
+    <p>
+      <b>Default:</b> 0
+      <ul>
+        <li>
+          If <codeph>--idle_session_timeout</codeph> is not set, the session never expires.
+        </li>
+
+        <li>
+          If <codeph>--idle_session_timeout</codeph> is set, use that timeout value.
+        </li>
+      </ul>
+    </p>
+
+    <p>
+      <b>Added in:</b> <keyword keyref="impala212_full"/>
+    </p>
+
+    <p>
+      <b>Related information:</b>
+    </p>
+
+    <p>
+      <xref href="impala_timeouts.xml#timeouts"/>
+    </p>
+
+  </conbody>
+
+</concept>
diff --git a/docs/topics/impala_timeouts.xml b/docs/topics/impala_timeouts.xml
index 80e5c9b..3389f5a 100644
--- a/docs/topics/impala_timeouts.xml
+++ b/docs/topics/impala_timeouts.xml
@@ -96,38 +96,59 @@ Trying to re-register with state-store</codeblock>
       <note conref="../shared/impala_common.xml#common/timeout_clock_blurb"/>
 
       <p>
-        Specify the following startup options for the <cmdname>impalad</cmdname> daemon:
+        Use the following startup options for the <cmdname>impalad</cmdname>
+        daemon to specify timeout values:
       </p>
 
       <ul>
-        <li>
+        <li><codeph>--idle_query_timeout</codeph>
           <p>
-            The <codeph>--idle_query_timeout</codeph> option specifies the time in seconds after
-            which an idle query is cancelled. This could be a query whose results were all fetched
-            but was never closed, or one whose results were partially fetched and then the client
-            program stopped requesting further results. This condition is most likely to occur in
-            a client program using the JDBC or ODBC interfaces, rather than in the interactive
-            <cmdname>impala-shell</cmdname> interpreter. Once the query is cancelled, the client
-            program cannot retrieve any further results.
+            Specifies the time in
+            seconds after which an idle query is cancelled. This could be a
+            query whose results were all fetched but was never closed, or one
+            whose results were partially fetched and then the client program
+            stopped requesting further results. This condition is most likely to
+            occur in a client program using the JDBC or ODBC interfaces, rather
+            than in the interactive <cmdname>impala-shell</cmdname> interpreter.
+            Once a query is cancelled, the client program cannot retrieve any
+            further results from the query.
           </p>
-
           <p rev="2.0.0">
-            You can reduce the idle query timeout by using the <codeph>QUERY_TIMEOUT_S</codeph>
-            query option. Any non-zero value specified for the <codeph>--idle_query_timeout</codeph> startup
-            option serves as an upper limit for the <codeph>QUERY_TIMEOUT_S</codeph> query option.
-            A zero value for <codeph>--idle_query_timeout</codeph> disables query timeouts.
-            See <xref href="impala_query_timeout_s.xml#query_timeout_s"/> for details.
+            You can reduce
+            the idle query timeout by using the <codeph>QUERY_TIMEOUT_S</codeph>
+            query option. Any non-zero value specified for the
+              <codeph>--idle_query_timeout</codeph> startup option serves as an
+            upper limit for the <codeph>QUERY_TIMEOUT_S</codeph> query option.
+            See <xref href="impala_query_timeout_s.xml#query_timeout_s"/> about
+            the query option.
+          </p>
+          <p rev="2.0.0">A zero value for
+              <codeph>--idle_query_timeout</codeph> disables query timeouts.
+          </p>
+          <p>
+            Cancelled queries remain in the open state but use only the
+            minimal resources.
           </p>
         </li>
 
-        <li>
+        <li><codeph>--idle_session_timeout</codeph>
+          <p>
+            Specifies the time in
+            seconds after which an idle session expires. A session is idle when
+            no activity is occurring for any of the queries in that session, and
+            the session has not started any new queries. Once a session is
+            expired, you cannot issue any new query requests to it. The session
+            remains open, but the only operation you can perform is to close it.
+          </p>
           <p>
-            The <codeph>--idle_session_timeout</codeph> option specifies the time in seconds after
-            which an idle session is expired. A session is idle when no activity is occurring for
-            any of the queries in that session, and the session has not started any new queries.
-            Once a session is expired, you cannot issue any new query requests to it. The session
-            remains open, but the only operation you can perform is to close it. The default value
-            of 0 means that sessions never expire.
+            The default value of 0 specifies that sessions never
+            expire.
+          </p>
+          <p rev="2.12.0">
+            You can override the
+              <codeph>--idle_session_timeout</codeph> value with the <xref
+              href="impala_idle_session_timeout.xml#idle_session_timeout"/> at
+            the session level.
           </p>
         </li>
       </ul>
@@ -185,11 +206,26 @@ Trying to re-register with state-store</codeblock>
     <conbody>
 
       <p>
-        Sometimes, an Impala query might run for an unexpectedly long time, tying up resources
-        in the cluster. You can cancel the query explicitly, independent of the timeout period,
-        by going into the web UI for the <cmdname>impalad</cmdname> host (on port 25000 by
-        default), and using the link on the <codeph>/queries</codeph> tab to cancel the running
-        query. For example, press <codeph>^C</codeph> in <cmdname>impala-shell</cmdname>.
+        Sometimes, an Impala query might run for an unexpectedly long time,
+        tying up resources in the cluster. You can cancel the query explicitly,
+        independent of the timeout period, by going into the web UI for the
+          <cmdname>impalad</cmdname> host (on port 25000 by default), and using
+        the link on the <codeph>/queries</codeph> tab to cancel the running
+        query.
+      </p>
+
+      <p>
+        Various client applications let you interactively cancel queries
+        submitted or monitored through those applications. For example:
+        <ul>
+          <li>
+            Press <systemoutput>^C</systemoutput> in
+            <cmdname>impala-shell</cmdname>.
+          </li>
+          <li>
+            Click <b>Cancel</b> from the <b>Watch</b> page in Hue.
+          </li>
+        </ul>
       </p>
 
     </conbody>


[impala] 07/20: KUDU-2492: Make the use of SO_REUSEPORT conditional on it being defined

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/impala.git

commit cb2064eb374806d58331731974e826b7b0fd14b6
Author: Thomas Tauber-Marshall <tm...@cloudera.com>
AuthorDate: Thu Jul 5 11:45:00 2018 -0700

    KUDU-2492: Make the use of SO_REUSEPORT conditional on it being defined
    
    A recent commit to Kudu, "rpc: add experimental rpc_reuseport flag",
    added the use of the socket option SO_REUSEPORT. This flag is not
    available with older versions of Linux, resulting in a compiler error.
    
    This patch avoids the compiler error with a macro that checks if
    SO_REUSEPORT is defined, and if it's not attempting to set it will
    return an error.
    
    --------------------------
    
    IMPALA-7302: This is cherry-picked to fix builds breaking on CentOS 6.4.
    Since some of our Jenkins machines are CentOS 6.4, and upgrading them
    to our new minimum supported OS of CentOS 6.8 is non-trivial, we cherry-
    pick this patch to temporarily unblock these builds until the Jenkins
    AMIs are upgraded.
    
    Change-Id: I042125cdaedafa5ebbc09e5a3c12112dfeec59a1
    Reviewed-on: http://gerrit.cloudera.org:8080/10994
    Reviewed-by: Thomas Marshall <th...@cmu.edu>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/kudu/util/net/socket.cc | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/be/src/kudu/util/net/socket.cc b/be/src/kudu/util/net/socket.cc
index cc14702..bc3b525 100644
--- a/be/src/kudu/util/net/socket.cc
+++ b/be/src/kudu/util/net/socket.cc
@@ -245,10 +245,14 @@ Status Socket::SetReuseAddr(bool flag) {
 }
 
 Status Socket::SetReusePort(bool flag) {
-  int int_flag = flag ? 1 : 0;
-  RETURN_NOT_OK_PREPEND(SetSockOpt(SOL_SOCKET, SO_REUSEPORT, int_flag),
-                        "failed to set SO_REUSEPORT");
-  return Status::OK();
+  #ifdef SO_REUSEPORT
+    int int_flag = flag ? 1 : 0;
+    RETURN_NOT_OK_PREPEND(SetSockOpt(SOL_SOCKET, SO_REUSEPORT, int_flag),
+                          "failed to set SO_REUSEPORT");
+    return Status::OK();
+  #else
+    return Status::NotSupported("failed to set SO_REUSEPORT: protocol not available");
+  #endif
 }
 
 Status Socket::BindAndListen(const Sockaddr &sockaddr,


[impala] 04/20: IMPALA-7315: fix test_update_with_clear_entries_flag race

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 7562635bbbd2d5516d909f972b2b3b844b88fdeb
Author: Tim Armstrong <ta...@cloudera.com>
AuthorDate: Wed Jul 18 14:36:37 2018 -0700

    IMPALA-7315: fix test_update_with_clear_entries_flag race
    
    We need to wait for the subscriber to process the second update in order
    to guarantee that the first update for that subscriber has been applied.
    Otherwise there is a race window where the second subscriber may see the
    older version of the statestore topic.
    
    Change-Id: I2be2b61b6deb0228fbc5a242e43076beb8871454
    Reviewed-on: http://gerrit.cloudera.org:8080/10986
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 tests/statestore/test_statestore.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/statestore/test_statestore.py b/tests/statestore/test_statestore.py
index 698a2a6..682a306 100644
--- a/tests/statestore/test_statestore.py
+++ b/tests/statestore/test_statestore.py
@@ -558,7 +558,7 @@ class TestStatestore():
         .wait_for_failure()
         .start()
         .register(topics=reg)
-        .wait_for_update(topic_name, 1)
+        .wait_for_update(topic_name, 2)
     )
 
     sub2 = StatestoreSubscriber(update_cb=check_entries)


[impala] 05/20: IMPALA-7259: Improve Impala shell performance

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/impala.git

commit eee10343558c790cdb20238c3d132d486b190a9c
Author: Fredy Wijaya <fw...@cloudera.com>
AuthorDate: Fri Jul 13 03:23:37 2018 -0500

    IMPALA-7259: Improve Impala shell performance
    
    This patch fixes the slow performance in Impala shell, especially for
    large queries by replacing all calls to sqlparse.format(sql_string,
    strip_comments=True) with the custom implementation of strip comments
    that does not use grouping. The code to strip leading comments was also
    refactored to not use grouping.
    
    * Benchmark running a query with 12K columns *
    
    Before the patch:
    $ time impala-shell.sh -f large.sql --quiet
    real    2m4.154s
    user    2m0.536s
    sys     0m0.088s
    
    After the patch:
    $ time impala-shell.sh -f large.sql --quiet
    real    0m3.885s
    user    0m1.516s
    sys     0m0.048s
    
    Testing:
    - Added a new test to test the Impala shell performance
    - Ran all shell tests on Python 2.6 and Python 2.7
    
    Change-Id: Idac9f3caed7c44846a8c922dbe5ca3bf3b095b81
    Reviewed-on: http://gerrit.cloudera.org:8080/10939
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 shell/impala_shell.py                 | 47 ++++++++++++++++++++++++-----------
 tests/shell/test_shell_commandline.py | 38 +++++++++++++++++++++++++++-
 2 files changed, 70 insertions(+), 15 deletions(-)

diff --git a/shell/impala_shell.py b/shell/impala_shell.py
index a6ab277..0446f69 100755
--- a/shell/impala_shell.py
+++ b/shell/impala_shell.py
@@ -59,6 +59,18 @@ try:
 except Exception:
   pass
 
+def strip_comments(sql):
+  """sqlparse default implementation of strip comments has a bad performance when parsing
+  very large SQL due to the grouping. This is because the default implementation tries to
+  format the SQL for pretty-printing. Impala shell use of strip comments is mostly for
+  checking and not for altering the actual SQL, so having a pretty-formatted SQL is
+  irrelevant in Impala shell. Removing the grouping gives a significant performance boost.
+  """
+  stack = sqlparse.engine.FilterStack()
+  stack.stmtprocess.append(sqlparse.filters.StripCommentsFilter())
+  stack.postprocess.append(sqlparse.filters.SerializerUnicode())
+  return ''.join(stack.run(sql, 'utf-8')).strip()
+
 class CmdStatus:
   """Values indicate the execution status of a command to the cmd shell driver module
   SUCCESS and ERROR continue running the shell and ABORT exits the shell
@@ -401,13 +413,13 @@ class ImpalaShell(object, cmd.Cmd):
     # Strip any comments to make a statement such as the following be considered as
     # ending with a delimiter:
     # select 1 + 1; -- this is a comment
-    line = sqlparse.format(line, strip_comments=True).encode('utf-8').rstrip()
+    line = strip_comments(line).encode('utf-8').rstrip()
     if line.endswith(ImpalaShell.CMD_DELIM):
       try:
         # Look for an open quotation in the entire command, and not just the
         # current line.
         if self.partial_cmd:
-          line = sqlparse.format('%s %s' % (self.partial_cmd, line), strip_comments=True)
+          line = strip_comments('%s %s' % (self.partial_cmd, line))
         self._shlex_split(line)
         return True
       # If the command ends with a delimiter, check if it has an open quotation.
@@ -1142,7 +1154,7 @@ class ImpalaShell(object, cmd.Cmd):
     query = self._create_beeswax_query(args)
     # Set posix=True and add "'" to escaped quotes
     # to deal with escaped quotes in string literals
-    lexer = shlex.shlex(sqlparse.format(query.query.lstrip(), strip_comments=True)
+    lexer = shlex.shlex(strip_comments(query.query.lstrip())
                         .encode('utf-8'), posix=True)
     lexer.escapedquotes += "'"
     try:
@@ -1345,24 +1357,31 @@ class ImpalaShell(object, cmd.Cmd):
       def _process(self, tlist):
         token = tlist.token_first()
         if self._is_comment(token):
-          self.comment = token.value
-          tidx = tlist.token_index(token)
-          tlist.tokens.pop(tidx)
+          self.comment = ''
+          while token:
+            if self._is_comment(token) or self._is_whitespace(token):
+              tidx = tlist.token_index(token)
+              self.comment += token.value
+              tlist.tokens.pop(tidx)
+              tidx -= 1
+              token = tlist.token_next(tidx, False)
+            else:
+              break
 
       def _is_comment(self, token):
-        if isinstance(token, sqlparse.sql.Comment):
-          return True
-        for comment in sqlparse.tokens.Comment:
-          if token.ttype == comment:
-            return True
-        return False
+        return isinstance(token, sqlparse.sql.Comment) or \
+               token.ttype == sqlparse.tokens.Comment.Single or \
+               token.ttype == sqlparse.tokens.Comment.Multiline
+
+      def _is_whitespace(self, token):
+        return token.ttype == sqlparse.tokens.Whitespace or \
+               token.ttype == sqlparse.tokens.Newline
 
       def process(self, stack, stmt):
         [self.process(stack, sgroup) for sgroup in stmt.get_sublists()]
         self._process(stmt)
 
     stack = sqlparse.engine.FilterStack()
-    stack.enable_grouping()
     strip_leading_comment_filter = StripLeadingCommentFilter()
     stack.stmtprocess.append(strip_leading_comment_filter)
     stack.postprocess.append(sqlparse.filters.SerializerUnicode())
@@ -1492,7 +1511,7 @@ def parse_query_text(query_text, utf8_encode_policy='strict'):
   # "--comment2" is sent as is. Impala's parser however doesn't consider it a valid SQL
   # and throws an exception. We identify such trailing comments and ignore them (do not
   # send them to Impala).
-  if query_list and not sqlparse.format(query_list[-1], strip_comments=True).strip("\n"):
+  if query_list and not strip_comments(query_list[-1]).strip("\n"):
     query_list.pop()
   return query_list
 
diff --git a/tests/shell/test_shell_commandline.py b/tests/shell/test_shell_commandline.py
index 858e46b..6fc34fe 100644
--- a/tests/shell/test_shell_commandline.py
+++ b/tests/shell/test_shell_commandline.py
@@ -23,12 +23,13 @@ import re
 import signal
 import shlex
 import socket
+import tempfile
 
 from subprocess import call, Popen
 from tests.common.impala_service import ImpaladService
 from tests.common.impala_test_suite import ImpalaTestSuite
 from tests.common.skip import SkipIf
-from time import sleep
+from time import sleep, time
 from util import IMPALAD, SHELL_CMD
 from util import assert_var_substitution, run_impala_shell_cmd, ImpalaShell
 
@@ -667,3 +668,38 @@ class TestImpalaShell(ImpalaTestSuite):
     args = "-q \"with v as (select 1) \;\""
     result = run_impala_shell_cmd(args, expect_success=False)
     assert "Encountered: Unexpected character" in result.stderr
+
+  def test_large_sql(self, unique_database):
+    sql_file, sql_path = tempfile.mkstemp()
+    os.write(sql_file, "create table large_table (\n")
+    num_cols = 4000
+    for i in xrange(num_cols):
+      os.write(sql_file, "col_{0} int".format(i))
+      if i < num_cols - 1:
+        os.write(sql_file, ",")
+      os.write(sql_file, "\n")
+    os.write(sql_file, ");")
+    os.write(sql_file, "select \n")
+
+    for i in xrange(num_cols):
+      if i < num_cols:
+        os.write(sql_file, 'col_{0} as a{1},\n'.format(i, i))
+        os.write(sql_file, 'col_{0} as b{1},\n'.format(i, i))
+        os.write(sql_file, 'col_{0} as c{1}{2}\n'.format(i, i,
+                                                     ',' if i < num_cols - 1 else ''))
+    os.write(sql_file, 'from large_table;')
+    os.close(sql_file)
+
+    try:
+      args = "-f {0} -d {1}".format(sql_path, unique_database)
+      start_time = time()
+      result = run_impala_shell_cmd(args)
+      end_time = time()
+      assert "Fetched 0 row(s)" in result.stderr
+      time_limit_s = 30
+      actual_time_s = end_time - start_time
+      assert actual_time_s <= time_limit_s, (
+          "It took {0} seconds to execute the query. Time limit is {1} seconds.".format(
+              actual_time_s, time_limit_s))
+    finally:
+      os.remove(sql_path)


[impala] 14/20: IMPALA-7291: [DOCS] Note about no codegen support for CHAR

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 3d6a8959f8077a53194dd5a7b9e339b09046fa49
Author: Alex Rodoni <ar...@cloudera.com>
AuthorDate: Mon Jul 23 15:22:42 2018 -0700

    IMPALA-7291: [DOCS] Note about no codegen support for CHAR
    
    Also, cleaned up confusing examples.
    
    Change-Id: Id89dcf44e31f1bc56d888527585b3ec90229981a
    Reviewed-on: http://gerrit.cloudera.org:8080/11022
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 docs/topics/impala_char.xml | 223 +++++++++++++++++---------------------------
 1 file changed, 87 insertions(+), 136 deletions(-)

diff --git a/docs/topics/impala_char.xml b/docs/topics/impala_char.xml
index 9204812..5286a3c 100644
--- a/docs/topics/impala_char.xml
+++ b/docs/topics/impala_char.xml
@@ -21,7 +21,13 @@ under the License.
 <concept id="char" rev="2.0.0">
 
   <title>CHAR Data Type (<keyword keyref="impala20"/> or higher only)</title>
-  <titlealts audience="PDF"><navtitle>CHAR</navtitle></titlealts>
+
+  <titlealts audience="PDF">
+
+    <navtitle>CHAR</navtitle>
+
+  </titlealts>
+
   <prolog>
     <metadata>
       <data name="Category" value="Impala"/>
@@ -36,9 +42,9 @@ under the License.
   <conbody>
 
     <p rev="2.0.0">
-      <indexterm audience="hidden">CHAR data type</indexterm>
-      A fixed-length character type, padded with trailing spaces if necessary to achieve the specified length. If
-      values are longer than the specified length, Impala truncates any trailing characters.
+      A fixed-length character type, padded with trailing spaces if necessary to achieve the
+      specified length. If values are longer than the specified length, Impala truncates any
+      trailing characters.
     </p>
 
     <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
@@ -50,7 +56,7 @@ under the License.
 <codeblock><varname>column_name</varname> CHAR(<varname>length</varname>)</codeblock>
 
     <p>
-      The maximum length you can specify is 255.
+      The maximum <varname>length</varname> you can specify is 255.
     </p>
 
     <p>
@@ -59,21 +65,54 @@ under the License.
 
     <ul>
       <li>
-        When you store a <codeph>CHAR</codeph> value shorter than the specified length in a table, queries return
-        the value padded with trailing spaces if necessary; the resulting value has the same length as specified in
-        the column definition.
+        When you store a <codeph>CHAR</codeph> value shorter than the specified length in a
+        table, queries return the value padded with trailing spaces if necessary; the resulting
+        value has the same length as specified in the column definition.
+      </li>
+
+      <li>
+        Leading spaces in <codeph>CHAR</codeph> are preserved within the data file.
+      </li>
+
+      <li>
+        If you store a <codeph>CHAR</codeph> value containing trailing spaces in a table, those
+        trailing spaces are not stored in the data file. When the value is retrieved by a query,
+        the result could have a different number of trailing spaces. That is, the value includes
+        however many spaces are needed to pad it to the specified length of the column.
       </li>
 
       <li>
-        If you store a <codeph>CHAR</codeph> value containing trailing spaces in a table, those trailing spaces are
-        not stored in the data file. When the value is retrieved by a query, the result could have a different
-        number of trailing spaces. That is, the value includes however many spaces are needed to pad it to the
-        specified length of the column.
+        If you compare two <codeph>CHAR</codeph> values that differ only in the number of
+        trailing spaces, those values are considered identical.
       </li>
 
       <li>
-        If you compare two <codeph>CHAR</codeph> values that differ only in the number of trailing spaces, those
-        values are considered identical.
+        When comparing or processing <codeph>CHAR</codeph> values:
+        <ul>
+          <li>
+            <codeph>CAST()</codeph> truncates any longer string to fit within
+            the defined length. For example:
+<codeblock>SELECT CAST('x' AS CHAR(4)) = CAST('x        ' AS CHAR(4)); -- Returns TRUE.
+</codeblock>
+          </li>
+          <li>
+            If a <codeph>CHAR</codeph> value is shorter than the specified
+            length, it is padded on the right with spaces until it matches the
+            specified length.
+          </li>
+          <li>
+            <codeph>CHAR_LENGTH()</codeph> returns the length including any
+            trailing spaces.
+          </li>
+          <li>
+            <codeph>LENGTH()</codeph> returns the length excluding trailing
+            spaces.
+          </li>
+          <li>
+            <codeph>CONCAT()</codeph> returns the length including trailing
+            spaces.
+          </li>
+        </ul>
       </li>
     </ul>
 
@@ -93,18 +132,19 @@ under the License.
       </li>
 
       <li>
-        Parquet files generated by Impala and containing this type can be freely interchanged with other components
-        such as Hive and MapReduce.
+        Parquet files generated by Impala and containing this type can be freely interchanged
+        with other components such as Hive and MapReduce.
       </li>
 
       <li>
-        Any trailing spaces, whether implicitly or explicitly specified, are not written to the Parquet data files.
+        Any trailing spaces, whether implicitly or explicitly specified, are not written to the
+        Parquet data files.
       </li>
 
       <li>
         Parquet data files might contain values that are longer than allowed by the
-        <codeph>CHAR(<varname>n</varname>)</codeph> length limit. Impala ignores any extra trailing characters when
-        it processes those values during a query.
+        <codeph>CHAR(<varname>n</varname>)</codeph> length limit. Impala ignores any extra
+        trailing characters when it processes those values during a query.
       </li>
     </ul>
 
@@ -112,14 +152,18 @@ under the License.
 
     <p>
       Text data files might contain values that are longer than allowed for a particular
-      <codeph>CHAR(<varname>n</varname>)</codeph> column. Any extra trailing characters are ignored when Impala
-      processes those values during a query. Text data files can also contain values that are shorter than the
-      defined length limit, and Impala pads them with trailing spaces up to the specified length. Any text data
-      files produced by Impala <codeph>INSERT</codeph> statements do not include any trailing blanks for
+      <codeph>CHAR(<varname>n</varname>)</codeph> column. Any extra trailing characters are
+      ignored when Impala processes those values during a query. Text data files can also
+      contain values that are shorter than the defined length limit, and Impala pads them with
+      trailing spaces up to the specified length. Any text data files produced by Impala
+      <codeph>INSERT</codeph> statements do not include any trailing blanks for
       <codeph>CHAR</codeph> columns.
     </p>
 
-    <p><b>Avro considerations:</b></p>
+    <p>
+      <b>Avro considerations:</b>
+    </p>
+
     <p conref="../shared/impala_common.xml#common/avro_2gb_strings"/>
 
     <p conref="../shared/impala_common.xml#common/compatibility_blurb"/>
@@ -129,7 +173,8 @@ under the License.
     </p>
 
     <p>
-      Some other database systems make the length specification optional. For Impala, the length is required.
+      Some other database systems make the length specification optional. For Impala, the length
+      is required.
     </p>
 
 <!--
@@ -146,142 +191,46 @@ it silently treats the value as length 255.
 
     <p conref="../shared/impala_common.xml#common/column_stats_constant"/>
 
-<!-- Seems like a logical design decision but don't think it's currently implemented like this.
-<p>
-Because both the maximum and average length are always known and always the same for
-any given <codeph>CHAR(<varname>n</varname>)</codeph> column, those fields are always filled
-in for <codeph>SHOW COLUMN STATS</codeph> output, even before you run
-<codeph>COMPUTE STATS</codeph> on the table.
-</p>
--->
-
     <p conref="../shared/impala_common.xml#common/udf_blurb_no"/>
 
-    <p conref="../shared/impala_common.xml#common/example_blurb"/>
-
-    <p>
-      These examples show how trailing spaces are not considered significant when comparing or processing
-      <codeph>CHAR</codeph> values. <codeph>CAST()</codeph> truncates any longer string to fit within the defined
-      length. If a <codeph>CHAR</codeph> value is shorter than the specified length, it is padded on the right with
-      spaces until it matches the specified length. Therefore, <codeph>LENGTH()</codeph> represents the length
-      including any trailing spaces, and <codeph>CONCAT()</codeph> also treats the column value as if it has
-      trailing spaces.
-    </p>
-
-<codeblock>select cast('x' as char(4)) = cast('x   ' as char(4)) as "unpadded equal to padded";
-+--------------------------+
-| unpadded equal to padded |
-+--------------------------+
-| true                     |
-+--------------------------+
-
-create table char_length(c char(3));
-insert into char_length values (cast('1' as char(3))), (cast('12' as char(3))), (cast('123' as char(3))), (cast('123456' as char(3)));
-select concat("[",c,"]") as c, length(c) from char_length;
-+-------+-----------+
-| c     | length(c) |
-+-------+-----------+
-| [1  ] | 3         |
-| [12 ] | 3         |
-| [123] | 3         |
-| [123] | 3         |
-+-------+-----------+
-</codeblock>
-
-    <p>
-      This example shows a case where data values are known to have a specific length, where <codeph>CHAR</codeph>
-      is a logical data type to use.
-<!--
-Because all the <codeph>CHAR</codeph> values have a constant predictable length,
-Impala can efficiently analyze how best to use these values in join queries,
-aggregation queries, and other contexts where column length is significant.
--->
-    </p>
+    <p conref="../shared/impala_common.xml#common/kudu_blurb"/>
 
-<codeblock>create table addresses
-  (id bigint,
-   street_name string,
-   state_abbreviation char(2),
-   country_abbreviation char(2));
-</codeblock>
+    <p conref="../shared/impala_common.xml#common/kudu_unsupported_data_type"/>
 
     <p>
-      The following example shows how values written by Impala do not physically include the trailing spaces. It
-      creates a table using text format, with <codeph>CHAR</codeph> values much shorter than the declared length,
-      and then prints the resulting data file to show that the delimited values are not separated by spaces. The
-      same behavior applies to binary-format Parquet data files.
+      <b>Performance consideration:</b>
     </p>
 
-<codeblock>create table char_in_text (a char(20), b char(30), c char(40))
-  row format delimited fields terminated by ',';
-
-insert into char_in_text values (cast('foo' as char(20)), cast('bar' as char(30)), cast('baz' as char(40))), (cast('hello' as char(20)), cast('goodbye' as char(30)), cast('aloha' as char(40)));
-
--- Running this Linux command inside impala-shell using the ! shortcut.
-!hdfs dfs -cat 'hdfs://127.0.0.1:8020/user/hive/warehouse/impala_doc_testing.db/char_in_text/*.*';
-foo,bar,baz
-hello,goodbye,aloha
-</codeblock>
-
     <p>
-      The following example further illustrates the treatment of spaces. It replaces the contents of the previous
-      table with some values including leading spaces, trailing spaces, or both. Any leading spaces are preserved
-      within the data file, but trailing spaces are discarded. Then when the values are retrieved by a query, the
-      leading spaces are retrieved verbatim while any necessary trailing spaces are supplied by Impala.
+      The <codeph>CHAR</codeph> type currently does not have the Impala Codegen support, and we
+      recommend using <codeph>VARCHAR</codeph> or <codeph>STRING</codeph> over
+      <codeph>CHAR</codeph> as the performance gain of Codegen outweighs the benefits of fixed
+      width <codeph>CHAR</codeph>.
     </p>
 
-<codeblock>insert overwrite char_in_text values (cast('trailing   ' as char(20)), cast('   leading and trailing   ' as char(30)), cast('   leading' as char(40)));
-!hdfs dfs -cat 'hdfs://127.0.0.1:8020/user/hive/warehouse/impala_doc_testing.db/char_in_text/*.*';
-trailing,   leading and trailing,   leading
-
-select concat('[',a,']') as a, concat('[',b,']') as b, concat('[',c,']') as c from char_in_text;
-+------------------------+----------------------------------+--------------------------------------------+
-| a                      | b                                | c                                          |
-+------------------------+----------------------------------+--------------------------------------------+
-| [trailing            ] | [   leading and trailing       ] | [   leading                              ] |
-+------------------------+----------------------------------+--------------------------------------------+
-</codeblock>
-
-    <p conref="../shared/impala_common.xml#common/kudu_blurb"/>
-    <p conref="../shared/impala_common.xml#common/kudu_unsupported_data_type"/>
-
     <p conref="../shared/impala_common.xml#common/restrictions_blurb"/>
 
     <p>
-      Because the blank-padding behavior requires allocating the maximum length for each value in memory, for
-      scalability reasons avoid declaring <codeph>CHAR</codeph> columns that are much longer than typical values in
-      that column.
+      Because the blank-padding behavior requires allocating the maximum length for each value
+      in memory, for scalability reasons, you should avoid declaring <codeph>CHAR</codeph>
+      columns that are much longer than typical values in that column.
     </p>
 
     <p conref="../shared/impala_common.xml#common/blobs_are_strings"/>
 
     <p>
       When an expression compares a <codeph>CHAR</codeph> with a <codeph>STRING</codeph> or
-      <codeph>VARCHAR</codeph>, the <codeph>CHAR</codeph> value is implicitly converted to <codeph>STRING</codeph>
-      first, with trailing spaces preserved.
+      <codeph>VARCHAR</codeph>, the <codeph>CHAR</codeph> value is implicitly converted to
+      <codeph>STRING</codeph> first, with trailing spaces preserved.
     </p>
 
-<codeblock>select cast("foo  " as char(5)) = 'foo' as "char equal to string";
-+----------------------+
-| char equal to string |
-+----------------------+
-| false                |
-+----------------------+
-</codeblock>
-
     <p>
       This behavior differs from other popular database systems. To get the expected result of
-      <codeph>TRUE</codeph>, cast the expressions on both sides to <codeph>CHAR</codeph> values of the appropriate
-      length:
+      <codeph>TRUE</codeph>, cast the expressions on both sides to <codeph>CHAR</codeph> values
+      of the appropriate length. For example:
     </p>
 
-<codeblock>select cast("foo  " as char(5)) = cast('foo' as char(3)) as "char equal to string";
-+----------------------+
-| char equal to string |
-+----------------------+
-| true                 |
-+----------------------+
-</codeblock>
+<codeblock>SELECT CAST("foo  " AS CHAR(5)) = CAST('foo' AS CHAR(3)); -- Returns TRUE.</codeblock>
 
     <p>
       This behavior is subject to change in future releases.
@@ -294,5 +243,7 @@ select concat('[',a,']') as a, concat('[',b,']') as b, concat('[',c,']') as c fr
       <xref href="impala_literals.xml#string_literals"/>,
       <xref href="impala_string_functions.xml#string_functions"/>
     </p>
+
   </conbody>
+
 </concept>


[impala] 06/20: test_recover_partitions.py had asserts that were always true.

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 9e089ffe5b06467ade26b9063a1fff0ae7a83ba5
Author: Philip Zeyliger <ph...@cloudera.com>
AuthorDate: Thu Jul 19 10:09:11 2018 -0700

    test_recover_partitions.py had asserts that were always true.
    
    Running "python -m compileall" discovered some assertions that
    were always true. I've re-instated them to their true spirit.
    
    Change-Id: Id49171304b853f15c43c8cfca066b6694c4a669f
    Reviewed-on: http://gerrit.cloudera.org:8080/10993
    Reviewed-by: Vuk Ercegovac <ve...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 tests/metadata/test_recover_partitions.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/tests/metadata/test_recover_partitions.py b/tests/metadata/test_recover_partitions.py
index 1894676..9ba4164 100644
--- a/tests/metadata/test_recover_partitions.py
+++ b/tests/metadata/test_recover_partitions.py
@@ -346,12 +346,11 @@ class TestRecoverPartitions(ImpalaTestSuite):
         self.client, "ALTER TABLE %s RECOVER PARTITIONS" % FQ_TBL_NAME)
       result = self.execute_query_expect_success(
         self.client, "SHOW PARTITIONS %s" % FQ_TBL_NAME)
-      assert (self.count_partition(result.data) == 1,
-        "ALTER TABLE %s RECOVER PARTITIONS produced more than 1 partitions" %
-        FQ_TBL_NAME)
-      assert (self.count_value('p=100%25', result.data) == 1,
-        "ALTER TABLE %s RECOVER PARTITIONS failed to handle encoded partitioned value" %
-        FQ_TBL_NAME)
+      assert self.count_partition(result.data) == 1, \
+        "ALTER TABLE %s RECOVER PARTITIONS produced more than 1 partitions" % FQ_TBL_NAME
+      assert self.count_value('p=100%25', result.data) == 1, \
+        "ALTER TABLE %s RECOVER PARTITIONS failed to handle encoded partitioned value" % \
+        FQ_TBL_NAME
 
   @SkipIfLocal.hdfs_client
   @SkipIfS3.empty_directory


[impala] 03/20: IMPALA-7298: Stop passing IP address as hostname in Kerberos principal

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/impala.git

commit c4b9c9868b067705594a2065bae7777056708f5c
Author: Michael Ho <kw...@cloudera.com>
AuthorDate: Tue Jul 17 16:46:55 2018 -0700

    IMPALA-7298: Stop passing IP address as hostname in Kerberos principal
    
    Previously, we pass the resolved IP address of a KRPC destination
    host as the hostname when creating a proxy for making KRPC calls.
    This may lead to connection negotiation failure in KRPC when Kerberos
    is enabled. In particular, if reversed DNS isn't enabled in Kerberos,
    KDC may fail to look up the principal of the destination host if the
    principal includes the hostname instead of resolved IP address.
    
    This change fixes the problem above by passing the actual hostname
    of the destination host when calling RpcMgr::GetProxy().
    
    rpc-mgr-kerberized-test.cc is also updated to use hostname
    instead of the resolved IP address as Kerberos principal.
    
    Change-Id: I3e3e978746cf03766eee151835aad5877d9ed63e
    Reviewed-on: http://gerrit.cloudera.org:8080/10980
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/rpc/rpc-mgr-kerberized-test.cc      |  2 +-
 be/src/rpc/rpc-mgr-test-base.h             |  6 ++++--
 be/src/rpc/rpc-mgr-test.cc                 |  5 +++--
 be/src/rpc/rpc-mgr.h                       | 10 +++++-----
 be/src/rpc/rpc-mgr.inline.h                |  5 +++--
 be/src/runtime/data-stream-sender.cc       |  2 +-
 be/src/runtime/data-stream-test.cc         |  6 +++---
 be/src/runtime/krpc-data-stream-sender.cc  | 14 ++++++++------
 be/src/scheduling/scheduler.cc             |  4 ++--
 common/thrift/ImpalaInternalService.thrift |  6 +++---
 10 files changed, 33 insertions(+), 27 deletions(-)

diff --git a/be/src/rpc/rpc-mgr-kerberized-test.cc b/be/src/rpc/rpc-mgr-kerberized-test.cc
index 141f359..3c8d045 100644
--- a/be/src/rpc/rpc-mgr-kerberized-test.cc
+++ b/be/src/rpc/rpc-mgr-kerberized-test.cc
@@ -93,7 +93,7 @@ int main(int argc, char** argv) {
   impala::IpAddr ip;
   impala::Status status = impala::HostnameToIpAddr(FLAGS_hostname, &ip);
   DCHECK(status.ok());
-  kdc_principal = Substitute("impala-test/$0", ip);
+  kdc_principal = Substitute("impala-test/$0", FLAGS_hostname);
   kdc_realm = "KRBTEST.COM";
 
   int port = impala::FindUnusedEphemeralPort(nullptr);
diff --git a/be/src/rpc/rpc-mgr-test-base.h b/be/src/rpc/rpc-mgr-test-base.h
index 872e408..0ff2bb7 100644
--- a/be/src/rpc/rpc-mgr-test-base.h
+++ b/be/src/rpc/rpc-mgr-test-base.h
@@ -246,10 +246,12 @@ Status RunMultipleServicesTestTemplate(RpcMgrTestBase<T>* test_base,
   RETURN_IF_ERROR(rpc_mgr->StartServices(krpc_address));
 
   unique_ptr<PingServiceProxy> ping_proxy;
-  RETURN_IF_ERROR(rpc_mgr->GetProxy<PingServiceProxy>(krpc_address, &ping_proxy));
+  RETURN_IF_ERROR(rpc_mgr->GetProxy<PingServiceProxy>(krpc_address, FLAGS_hostname,
+      &ping_proxy));
 
   unique_ptr<ScanMemServiceProxy> scan_mem_proxy;
-  RETURN_IF_ERROR(rpc_mgr->GetProxy<ScanMemServiceProxy>(krpc_address, &scan_mem_proxy));
+  RETURN_IF_ERROR(rpc_mgr->GetProxy<ScanMemServiceProxy>(krpc_address, FLAGS_hostname,
+      &scan_mem_proxy));
 
   RpcController controller;
   srand(0);
diff --git a/be/src/rpc/rpc-mgr-test.cc b/be/src/rpc/rpc-mgr-test.cc
index 880cb45..6de5c6f 100644
--- a/be/src/rpc/rpc-mgr-test.cc
+++ b/be/src/rpc/rpc-mgr-test.cc
@@ -191,7 +191,7 @@ TEST_F(RpcMgrTest, SlowCallback) {
   ASSERT_OK(rpc_mgr_.StartServices(krpc_address_));
 
   unique_ptr<PingServiceProxy> proxy;
-  ASSERT_OK(rpc_mgr_.GetProxy<PingServiceProxy>(krpc_address_, &proxy));
+  ASSERT_OK(rpc_mgr_.GetProxy<PingServiceProxy>(krpc_address_, FLAGS_hostname, &proxy));
 
   PingRequestPB request;
   PingResponsePB response;
@@ -211,7 +211,8 @@ TEST_F(RpcMgrTest, AsyncCall) {
       static_cast<ScanMemServiceImpl*>(scan_mem_impl)->mem_tracker()));
 
   unique_ptr<ScanMemServiceProxy> scan_mem_proxy;
-  ASSERT_OK(rpc_mgr_.GetProxy<ScanMemServiceProxy>(krpc_address_, &scan_mem_proxy));
+  ASSERT_OK(rpc_mgr_.GetProxy<ScanMemServiceProxy>(krpc_address_, FLAGS_hostname,
+      &scan_mem_proxy));
 
   FLAGS_num_acceptor_threads = 2;
   FLAGS_num_reactor_threads = 10;
diff --git a/be/src/rpc/rpc-mgr.h b/be/src/rpc/rpc-mgr.h
index c7107d2..c25f754 100644
--- a/be/src/rpc/rpc-mgr.h
+++ b/be/src/rpc/rpc-mgr.h
@@ -133,12 +133,12 @@ class RpcMgr {
       kudu::rpc::GeneratedServiceIf* service_ptr, MemTracker* service_mem_tracker)
       WARN_UNUSED_RESULT;
 
-  /// Creates a new proxy for a remote service of type P at location 'address', and places
-  /// it in 'proxy'. 'P' must descend from kudu::rpc::ServiceIf. Note that 'address' must
-  /// be a resolved IP address.
+  /// Creates a new proxy for a remote service of type P at location 'address' with
+  /// hostname 'hostname' and places it in 'proxy'. 'P' must descend from
+  /// kudu::rpc::ServiceIf. Note that 'address' must be a resolved IP address.
   template <typename P>
-  Status GetProxy(const TNetworkAddress& address, std::unique_ptr<P>* proxy)
-      WARN_UNUSED_RESULT;
+  Status GetProxy(const TNetworkAddress& address, const std::string& hostname,
+      std::unique_ptr<P>* proxy) WARN_UNUSED_RESULT;
 
   /// Shut down all previously registered services. All service pools are shut down.
   /// All acceptor and reactor threads within the messenger are also shut down.
diff --git a/be/src/rpc/rpc-mgr.inline.h b/be/src/rpc/rpc-mgr.inline.h
index 474ac45..934b28b 100644
--- a/be/src/rpc/rpc-mgr.inline.h
+++ b/be/src/rpc/rpc-mgr.inline.h
@@ -30,13 +30,14 @@ namespace impala {
 
 /// Always inline to avoid having to provide a definition for each use type P.
 template <typename P>
-Status RpcMgr::GetProxy(const TNetworkAddress& address, std::unique_ptr<P>* proxy) {
+Status RpcMgr::GetProxy(const TNetworkAddress& address, const std::string& hostname,
+    std::unique_ptr<P>* proxy) {
   DCHECK(proxy != nullptr);
   DCHECK(is_inited()) << "Must call Init() before GetProxy()";
   DCHECK(IsResolvedAddress(address));
   kudu::Sockaddr sockaddr;
   RETURN_IF_ERROR(TNetworkAddressToSockaddr(address, &sockaddr));
-  proxy->reset(new P(messenger_, sockaddr, address.hostname));
+  proxy->reset(new P(messenger_, sockaddr, hostname));
   return Status::OK();
 }
 
diff --git a/be/src/runtime/data-stream-sender.cc b/be/src/runtime/data-stream-sender.cc
index a37c3c1..ad500e9 100644
--- a/be/src/runtime/data-stream-sender.cc
+++ b/be/src/runtime/data-stream-sender.cc
@@ -353,7 +353,7 @@ DataStreamSender::DataStreamSender(int sender_id, const RowDescriptor* row_desc,
   // TODO: use something like google3's linked_ptr here (scoped_ptr isn't copyable)
   for (int i = 0; i < destinations.size(); ++i) {
     channels_.push_back(
-        new Channel(this, row_desc, destinations[i].server,
+        new Channel(this, row_desc, destinations[i].thrift_backend,
             destinations[i].fragment_instance_id, sink.dest_node_id,
             per_channel_buffer_size));
   }
diff --git a/be/src/runtime/data-stream-test.cc b/be/src/runtime/data-stream-test.cc
index 8851d1a..9f866ba 100644
--- a/be/src/runtime/data-stream-test.cc
+++ b/be/src/runtime/data-stream-test.cc
@@ -358,10 +358,10 @@ class DataStreamTest : public DataStreamTestBase<testing::TestWithParam<KrpcSwit
     dest_.push_back(TPlanFragmentDestination());
     TPlanFragmentDestination& dest = dest_.back();
     dest.fragment_instance_id = next_instance_id_;
-    dest.server.hostname = "127.0.0.1";
-    dest.server.port = FLAGS_port;
+    dest.thrift_backend.hostname = "localhost";
+    dest.thrift_backend.port = FLAGS_port;
     if (GetParam() == USE_KRPC) {
-      dest.__set_krpc_server(krpc_address_);
+      dest.__set_krpc_backend(krpc_address_);
     }
     *instance_id = next_instance_id_;
     ++next_instance_id_.lo;
diff --git a/be/src/runtime/krpc-data-stream-sender.cc b/be/src/runtime/krpc-data-stream-sender.cc
index b362758..e6cea1f 100644
--- a/be/src/runtime/krpc-data-stream-sender.cc
+++ b/be/src/runtime/krpc-data-stream-sender.cc
@@ -105,10 +105,11 @@ class KrpcDataStreamSender::Channel : public CacheLineAligned {
   // data is getting accumulated before being sent; it only applies when data is added via
   // AddRow() and not sent directly via SendBatch().
   Channel(KrpcDataStreamSender* parent, const RowDescriptor* row_desc,
-      const TNetworkAddress& destination, const TUniqueId& fragment_instance_id,
-      PlanNodeId dest_node_id, int buffer_size)
+      const std::string& hostname, const TNetworkAddress& destination,
+      const TUniqueId& fragment_instance_id, PlanNodeId dest_node_id, int buffer_size)
     : parent_(parent),
       row_desc_(row_desc),
+      hostname_(hostname),
       address_(destination),
       fragment_instance_id_(fragment_instance_id),
       dest_node_id_(dest_node_id) {
@@ -160,6 +161,7 @@ class KrpcDataStreamSender::Channel : public CacheLineAligned {
   const RowDescriptor* row_desc_;
 
   // The triplet of IP-address:port/finst-id/node-id uniquely identifies the receiver.
+  const std::string hostname_;
   const TNetworkAddress address_;
   const TUniqueId fragment_instance_id_;
   const PlanNodeId dest_node_id_;
@@ -305,7 +307,7 @@ Status KrpcDataStreamSender::Channel::Init(RuntimeState* state) {
 
   // Create a DataStreamService proxy to the destination.
   RpcMgr* rpc_mgr = ExecEnv::GetInstance()->rpc_mgr();
-  RETURN_IF_ERROR(rpc_mgr->GetProxy(address_, &proxy_));
+  RETURN_IF_ERROR(rpc_mgr->GetProxy(address_, hostname_, &proxy_));
   return Status::OK();
 }
 
@@ -593,9 +595,9 @@ KrpcDataStreamSender::KrpcDataStreamSender(int sender_id, const RowDescriptor* r
 
   for (int i = 0; i < destinations.size(); ++i) {
     channels_.push_back(
-        new Channel(this, row_desc, destinations[i].krpc_server,
-            destinations[i].fragment_instance_id, sink.dest_node_id,
-            per_channel_buffer_size));
+        new Channel(this, row_desc, destinations[i].thrift_backend.hostname,
+            destinations[i].krpc_backend, destinations[i].fragment_instance_id,
+            sink.dest_node_id, per_channel_buffer_size));
   }
 
   if (partition_type_ == TPartitionType::UNPARTITIONED ||
diff --git a/be/src/scheduling/scheduler.cc b/be/src/scheduling/scheduler.cc
index 950d218..af00116 100644
--- a/be/src/scheduling/scheduler.cc
+++ b/be/src/scheduling/scheduler.cc
@@ -341,12 +341,12 @@ void Scheduler::ComputeFragmentExecParams(
         TPlanFragmentDestination& dest = src_params->destinations[i];
         dest.__set_fragment_instance_id(dest_params->instance_exec_params[i].instance_id);
         const TNetworkAddress& host = dest_params->instance_exec_params[i].host;
-        dest.__set_server(host);
+        dest.__set_thrift_backend(host);
         if (FLAGS_use_krpc) {
           const TBackendDescriptor& desc = LookUpBackendDesc(executor_config, host);
           DCHECK(desc.__isset.krpc_address);
           DCHECK(IsResolvedAddress(desc.krpc_address));
-          dest.__set_krpc_server(desc.krpc_address);
+          dest.__set_krpc_backend(desc.krpc_address);
         }
       }
 
diff --git a/common/thrift/ImpalaInternalService.thrift b/common/thrift/ImpalaInternalService.thrift
index ae5e139..0c87fa4 100644
--- a/common/thrift/ImpalaInternalService.thrift
+++ b/common/thrift/ImpalaInternalService.thrift
@@ -452,11 +452,11 @@ struct TPlanFragmentDestination {
   // the globally unique fragment instance id
   1: required Types.TUniqueId fragment_instance_id
 
-  // IP address + port of the thrift based ImpalaInteralService on the destination
-  2: required Types.TNetworkAddress server
+  // hostname + port of the Thrift based ImpalaInteralService on the destination
+  2: required Types.TNetworkAddress thrift_backend
 
   // IP address + port of the KRPC based ImpalaInternalService on the destination
-  3: optional Types.TNetworkAddress krpc_server
+  3: optional Types.TNetworkAddress krpc_backend
 }
 
 // Context to collect information, which is shared among all instances of that plan


[impala] 16/20: IMPALA-6174: [DOCS] Fixed the seed data type for RAND and RANDOM functions

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 798cfef29a345d513b86a254ab09fce60e6d9974
Author: Alex Rodoni <ar...@cloudera.com>
AuthorDate: Mon Jul 23 15:37:18 2018 -0700

    IMPALA-6174: [DOCS] Fixed the seed data type for RAND and RANDOM functions
    
    Change-Id: If6393bd618a26148dd668b3323c32af263637e14
    Reviewed-on: http://gerrit.cloudera.org:8080/11023
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 docs/topics/impala_math_functions.xml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/topics/impala_math_functions.xml b/docs/topics/impala_math_functions.xml
index 6bf0921..13c9a1c 100644
--- a/docs/topics/impala_math_functions.xml
+++ b/docs/topics/impala_math_functions.xml
@@ -1149,9 +1149,9 @@ select pmod(5,-2);
       <dlentry id="rand">
 
         <dt>
-          <codeph>rand()</codeph>, <codeph>rand(int seed)</codeph>,
+          <codeph>rand()</codeph>, <codeph>rand(bigint seed)</codeph>,
           <codeph rev="2.3.0" id="random">random()</codeph>,
-          <codeph rev="2.3.0">random(int seed)</codeph>
+          <codeph rev="2.3.0">random(bigint seed)</codeph>
         </dt>
 
         <dd>


[impala] 13/20: IMPALA-6299: Use LlvmCodeGen's internal list of white-listed CPU attributes for handcrafting IRs

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 44794423bada8a44a2b7b05879764c6bc13b26fd
Author: poojanilangekar <po...@cloudera.com>
AuthorDate: Thu Jul 12 15:09:17 2018 -0700

    IMPALA-6299: Use LlvmCodeGen's internal list of white-listed CPU attributes for handcrafting IRs
    
    Previously, the IRBuilder of the LlvmCodeGen class used CpuInfo's
    list of enabled features to determine the validity of certain
    instructions to emit intrinsics. It did not consider the whitelist
    which passed while initializing the LlvmCodeGen class. Now, the
    IRBuilder inspects its own CPU attributes before emitting
    instruction. This change also adds functionality to modify the CPU
    attributes of the LlvmCodeGen class for testing.
    
    Testing: Verified that the current tests which use and modify
    CpuInfo produce expected results.
    
    Change-Id: Ifece8949c143146d2a1b38d72d21c2d733bed90f
    Reviewed-on: http://gerrit.cloudera.org:8080/10979
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/benchmarks/bloom-filter-benchmark.cc |  2 ++
 be/src/codegen/llvm-codegen-test.cc         | 49 ++++++++++++++++++++++-------
 be/src/codegen/llvm-codegen.cc              | 39 ++++++++++++++++-------
 be/src/codegen/llvm-codegen.h               | 24 ++++++++++----
 be/src/exec/filter-context.cc               |  2 +-
 be/src/util/cpu-info.cc                     |  8 +++++
 6 files changed, 95 insertions(+), 29 deletions(-)

diff --git a/be/src/benchmarks/bloom-filter-benchmark.cc b/be/src/benchmarks/bloom-filter-benchmark.cc
index 4657a18..f216911 100644
--- a/be/src/benchmarks/bloom-filter-benchmark.cc
+++ b/be/src/benchmarks/bloom-filter-benchmark.cc
@@ -48,6 +48,8 @@ using namespace impala;
 // As in bloom-filter.h, ndv refers to the number of unique items inserted into a filter
 // and fpp is the probability of false positives.
 //
+// This benchmark must be executed only in RELEASE mode. Since it executes some codepath
+// which would not occur in Impala's execution, it crashes due to a DCHECK in DEBUG mode.
 //
 // Machine Info: Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz
 //
diff --git a/be/src/codegen/llvm-codegen-test.cc b/be/src/codegen/llvm-codegen-test.cc
index 9f668e0..3748ced 100644
--- a/be/src/codegen/llvm-codegen-test.cc
+++ b/be/src/codegen/llvm-codegen-test.cc
@@ -111,6 +111,31 @@ class LlvmCodeGenTest : public testing:: Test {
     const auto& hf = codegen->handcrafted_functions_;
     return find(hf.begin(), hf.end(), function) != hf.end();
   }
+
+  // Helper function to enable and disable LlvmCodeGen's cpu_attrs_.
+  static void EnableCodeGenCPUFeature(int64_t flag, bool enable) {
+    DCHECK(LlvmCodeGen::llvm_initialized_);
+    auto enable_flag_it = LlvmCodeGen::cpu_flag_mappings_.find(flag);
+    auto disable_flag_it = LlvmCodeGen::cpu_flag_mappings_.find(~flag);
+    DCHECK(enable_flag_it != LlvmCodeGen::cpu_flag_mappings_.end());
+    DCHECK(disable_flag_it != LlvmCodeGen::cpu_flag_mappings_.end());
+    const std::string& enable_feature = enable_flag_it->second;
+    const std::string& disable_feature = disable_flag_it->second;
+    if (enable) {
+      auto attr_it = LlvmCodeGen::cpu_attrs_.find(disable_feature);
+      if (attr_it != LlvmCodeGen::cpu_attrs_.end()) {
+        LlvmCodeGen::cpu_attrs_.erase(attr_it);
+      }
+      LlvmCodeGen::cpu_attrs_.insert(enable_feature);
+    } else {
+      auto attr_it = LlvmCodeGen::cpu_attrs_.find(enable_feature);
+      // Disable feature if currently enabled.
+      if (attr_it != LlvmCodeGen::cpu_attrs_.end()) {
+        LlvmCodeGen::cpu_attrs_.erase(attr_it);
+        LlvmCodeGen::cpu_attrs_.insert(disable_feature);
+      }
+    }
+  }
 };
 
 // Simple test to just make and destroy llvmcodegen objects.  LLVM
@@ -460,9 +485,12 @@ TEST_F(LlvmCodeGenTest, HashTest) {
     uint32_t result = test_fn();
 
     // Validate that the hashes are identical
-    EXPECT_EQ(result, expected_hash) << CpuInfo::IsSupported(CpuInfo::SSE4_2);
+    EXPECT_EQ(result, expected_hash) << LlvmCodeGen::IsCPUFeatureEnabled(CpuInfo::SSE4_2);
 
-    if (i == 0 && CpuInfo::IsSupported(CpuInfo::SSE4_2)) {
+    if (i == 0 && LlvmCodeGen::IsCPUFeatureEnabled(CpuInfo::SSE4_2)) {
+      // Modify both CpuInfo and LlvmCodeGen::cpu_attrs_ to ensure that they have the
+      // same view of the underlying hardware while generating the hash.
+      EnableCodeGenCPUFeature(CpuInfo::SSE4_2, false);
       CpuInfo::EnableFeature(CpuInfo::SSE4_2, false);
       restore_sse_support = true;
       LlvmCodeGenTest::ClearHashFns(codegen.get());
@@ -473,6 +501,7 @@ TEST_F(LlvmCodeGenTest, HashTest) {
   }
 
   // Restore hardware feature for next test
+  EnableCodeGenCPUFeature(CpuInfo::SSE4_2, restore_sse_support);
   CpuInfo::EnableFeature(CpuInfo::SSE4_2, restore_sse_support);
 }
 
@@ -495,19 +524,17 @@ TEST_F(LlvmCodeGenTest, CpuAttrWhitelist) {
   // Non-existent attributes should be disabled regardless of initial states.
   // Whitelisted attributes like sse2 and lzcnt should retain their initial
   // state.
-  EXPECT_EQ(vector<string>(
-          {"-dummy1", "-dummy2", "-dummy3", "-dummy4", "+sse2", "-lzcnt"}),
+  EXPECT_EQ(std::unordered_set<string>(
+                {"-dummy1", "-dummy2", "-dummy3", "-dummy4", "+sse2", "-lzcnt"}),
       LlvmCodeGen::ApplyCpuAttrWhitelist(
-          {"+dummy1", "+dummy2", "-dummy3", "+dummy4", "+sse2", "-lzcnt"}));
+                {"+dummy1", "+dummy2", "-dummy3", "+dummy4", "+sse2", "-lzcnt"}));
 
   // IMPALA-6291: Test that all AVX512 attributes are disabled.
   vector<string> avx512_attrs;
-  EXPECT_EQ(vector<string>(
-        {"-avx512ifma", "-avx512dqavx512er", "-avx512f", "-avx512bw", "-avx512vl",
-         "-avx512cd", "-avx512vbmi", "-avx512pf"}),
-      LlvmCodeGen::ApplyCpuAttrWhitelist(
-        {"+avx512ifma", "+avx512dqavx512er", "+avx512f", "+avx512bw", "+avx512vl",
-         "+avx512cd", "+avx512vbmi", "+avx512pf"}));
+  EXPECT_EQ(std::unordered_set<string>({"-avx512ifma", "-avx512dqavx512er", "-avx512f",
+                "-avx512bw", "-avx512vl", "-avx512cd", "-avx512vbmi", "-avx512pf"}),
+      LlvmCodeGen::ApplyCpuAttrWhitelist({"+avx512ifma", "+avx512dqavx512er", "+avx512f",
+          "+avx512bw", "+avx512vl", "+avx512cd", "+avx512vbmi", "+avx512pf"}));
 }
 
 // Test that exercises the code path that deletes non-finalized methods before it
diff --git a/be/src/codegen/llvm-codegen.cc b/be/src/codegen/llvm-codegen.cc
index 173dbb2..d6f89a3 100644
--- a/be/src/codegen/llvm-codegen.cc
+++ b/be/src/codegen/llvm-codegen.cc
@@ -119,10 +119,19 @@ namespace impala {
 
 bool LlvmCodeGen::llvm_initialized_ = false;
 string LlvmCodeGen::cpu_name_;
-vector<string> LlvmCodeGen::cpu_attrs_;
+std::unordered_set<string> LlvmCodeGen::cpu_attrs_;
 string LlvmCodeGen::target_features_attr_;
 CodegenCallGraph LlvmCodeGen::shared_call_graph_;
 
+const map<int64_t, std::string> LlvmCodeGen::cpu_flag_mappings_{
+    {CpuInfo::SSSE3, "+ssse3"}, {CpuInfo::SSE4_1, "+sse4.1"},
+    {CpuInfo::SSE4_2, "+sse4.2"}, {CpuInfo::POPCNT, "+popcnt"}, {CpuInfo::AVX, "+avx"},
+    {CpuInfo::AVX2, "+avx2"}, {CpuInfo::PCLMULQDQ, "+pclmul"},
+    {~(CpuInfo::SSSE3), "-ssse3"}, {~(CpuInfo::SSE4_1), "-sse4.1"},
+    {~(CpuInfo::SSE4_2), "-sse4.2"}, {~(CpuInfo::POPCNT), "-popcnt"},
+    {~(CpuInfo::AVX), "-avx"}, {~(CpuInfo::AVX2), "-avx2"},
+    {~(CpuInfo::PCLMULQDQ), "-pclmul"}};
+
 [[noreturn]] static void LlvmCodegenHandleError(
     void* user_data, const string& reason, bool gen_crash_diag) {
   LOG(FATAL) << "LLVM hit fatal error: " << reason.c_str();
@@ -238,7 +247,7 @@ Status LlvmCodeGen::CreateFromMemory(RuntimeState* state, ObjectPool* pool,
   // a machine without SSE4.2 support.
   llvm::StringRef module_ir;
   string module_name;
-  if (CpuInfo::IsSupported(CpuInfo::SSE4_2)) {
+  if (IsCPUFeatureEnabled(CpuInfo::SSE4_2)) {
     module_ir = llvm::StringRef(
         reinterpret_cast<const char*>(impala_sse_llvm_ir), impala_sse_llvm_ir_len);
     module_name = "Impala IR with SSE 4.2 support";
@@ -471,17 +480,24 @@ void LlvmCodeGen::EnableOptimizations(bool enable) {
   optimizations_enabled_ = enable;
 }
 
-void LlvmCodeGen::GetHostCPUAttrs(vector<string>* attrs) {
+void LlvmCodeGen::GetHostCPUAttrs(std::unordered_set<string>* attrs) {
   // LLVM's ExecutionEngine expects features to be enabled or disabled with a list
   // of strings like ["+feature1", "-feature2"].
   llvm::StringMap<bool> cpu_features;
   llvm::sys::getHostCPUFeatures(cpu_features);
   for (const llvm::StringMapEntry<bool>& entry : cpu_features) {
-    attrs->emplace_back(
-        Substitute("$0$1", entry.second ? "+" : "-", entry.first().data()));
+    attrs->emplace(Substitute("$0$1", entry.second ? "+" : "-", entry.first().data()));
   }
 }
 
+bool LlvmCodeGen::IsCPUFeatureEnabled(int64_t flag) {
+  DCHECK(llvm_initialized_);
+  auto enable_flag_it = cpu_flag_mappings_.find(flag);
+  DCHECK(enable_flag_it != cpu_flag_mappings_.end());
+  const std::string& enabled_feature = enable_flag_it->second;
+  return cpu_attrs_.find(enabled_feature) != cpu_attrs_.end();
+}
+
 string LlvmCodeGen::GetIR(bool full_module) const {
   string str;
   llvm::raw_string_ostream stream(str);
@@ -1526,7 +1542,7 @@ void LlvmCodeGen::ClearHashFns() {
 //   ret i32 %12
 // }
 llvm::Function* LlvmCodeGen::GetHashFunction(int num_bytes) {
-  if (CpuInfo::IsSupported(CpuInfo::SSE4_2)) {
+  if (IsCPUFeatureEnabled(CpuInfo::SSE4_2)) {
     if (num_bytes == -1) {
       // -1 indicates variable length, just return the generic loop based
       // hash fn.
@@ -1671,8 +1687,9 @@ llvm::Constant* LlvmCodeGen::ConstantsToGVArrayPtr(llvm::Type* element_type,
   return ConstantToGVPtr(array_type, array_const, name);
 }
 
-vector<string> LlvmCodeGen::ApplyCpuAttrWhitelist(const vector<string>& cpu_attrs) {
-  vector<string> result;
+std::unordered_set<string> LlvmCodeGen::ApplyCpuAttrWhitelist(
+    const std::unordered_set<string>& cpu_attrs) {
+  std::unordered_set<string> result;
   vector<string> attr_whitelist;
   boost::split(attr_whitelist, FLAGS_llvm_cpu_attr_whitelist, boost::is_any_of(","));
   for (const string& attr : cpu_attrs) {
@@ -1680,17 +1697,17 @@ vector<string> LlvmCodeGen::ApplyCpuAttrWhitelist(const vector<string>& cpu_attr
     DCHECK(attr[0] == '-' || attr[0] == '+') << attr;
     if (attr[0] == '-') {
       // Already disabled - copy it over unmodified.
-      result.push_back(attr);
+      result.insert(attr);
       continue;
     }
     const string attr_name = attr.substr(1);
     auto it = std::find(attr_whitelist.begin(), attr_whitelist.end(), attr_name);
     if (it != attr_whitelist.end()) {
       // In whitelist - copy it over unmodified.
-      result.push_back(attr);
+      result.insert(attr);
     } else {
       // Not in whitelist - disable it.
-      result.push_back("-" + attr_name);
+      result.insert("-" + attr_name);
     }
   }
   return result;
diff --git a/be/src/codegen/llvm-codegen.h b/be/src/codegen/llvm-codegen.h
index 7e9da26..9ede053 100644
--- a/be/src/codegen/llvm-codegen.h
+++ b/be/src/codegen/llvm-codegen.h
@@ -238,7 +238,10 @@ class LlvmCodeGen {
   };
 
   /// Get host cpu attributes in format expected by EngineBuilder.
-  static void GetHostCPUAttrs(std::vector<std::string>* attrs);
+  static void GetHostCPUAttrs(std::unordered_set<std::string>* attrs);
+
+  /// Returns whether or not this cpu feature is supported.
+  static bool IsCPUFeatureEnabled(int64_t flag);
 
   /// Return a pointer type to 'type'
   llvm::PointerType* GetPtrType(llvm::Type* type);
@@ -593,6 +596,7 @@ class LlvmCodeGen {
   friend class ExprCodegenTest;
   friend class LlvmCodeGenTest;
   friend class LlvmCodeGenTest_CpuAttrWhitelist_Test;
+  friend class LlvmCodeGenTest_HashTest_Test;
   friend class SubExprElimination;
 
   /// Top level codegen object. 'module_id' is used for debugging when outputting the IR.
@@ -714,21 +718,29 @@ class LlvmCodeGen {
   /// always present in the output, except "+" is flipped to "-" for the disabled
   /// attributes. E.g. if 'cpu_attrs' is {"+x", "+y", "-z"} and the whitelist is
   /// {"x", "z"}, returns {"+x", "-y", "-z"}.
-  static std::vector<std::string> ApplyCpuAttrWhitelist(
-      const std::vector<std::string>& cpu_attrs);
+  static std::unordered_set<std::string> ApplyCpuAttrWhitelist(
+      const std::unordered_set<std::string>& cpu_attrs);
 
   /// Whether InitializeLlvm() has been called.
   static bool llvm_initialized_;
 
   /// Host CPU name and attributes, filled in by InitializeLlvm().
   static std::string cpu_name_;
-  static std::vector<std::string> cpu_attrs_;
+  /// The cpu_attrs_ should not be modified during the execution except for tests.
+  static std::unordered_set<std::string> cpu_attrs_;
 
   /// Value of "target-features" attribute to be set on all IR functions. Derived from
-  /// 'cpu_attrs_'. Using a consistent value for this attribute among hand-crafted IR
-  /// and cross-compiled functions allow them to be inlined into each other.
+  /// 'cpu_attrs_'. Using a consistent value for this attribute among
+  /// hand-crafted IR and cross-compiled functions allow them to be inlined into each
+  /// other.
   static std::string target_features_attr_;
 
+  /// Mapping between CpuInfo flags and the corresponding strings.
+  /// The key is mapped to the string as follows:
+  /// CpuInfo flag -> enabled feature.
+  /// Bitwise negation of CpuInfo flag -> disabled feature.
+  const static std::map<int64_t, std::string> cpu_flag_mappings_;
+
   /// A global shared call graph for all IR functions in the main module.
   /// Used for determining dependencies when materializing IR functions.
   static CodegenCallGraph shared_call_graph_;
diff --git a/be/src/exec/filter-context.cc b/be/src/exec/filter-context.cc
index 5c39ff9..08cd666 100644
--- a/be/src/exec/filter-context.cc
+++ b/be/src/exec/filter-context.cc
@@ -388,7 +388,7 @@ Status FilterContext::CodegenInsert(LlvmCodeGen* codegen, ScalarExpr* filter_exp
 
     // Call Insert() on the bloom filter.
     llvm::Function* insert_bloom_filter_fn;
-    if (CpuInfo::IsSupported(CpuInfo::AVX2)) {
+    if (LlvmCodeGen::IsCPUFeatureEnabled(CpuInfo::AVX2)) {
       insert_bloom_filter_fn =
           codegen->GetFunction(IRFunction::BLOOM_FILTER_INSERT_AVX2, false);
     } else {
diff --git a/be/src/util/cpu-info.cc b/be/src/util/cpu-info.cc
index 815e0d5..e486479 100644
--- a/be/src/util/cpu-info.cc
+++ b/be/src/util/cpu-info.cc
@@ -68,6 +68,14 @@ void WarnIfFileNotEqual(
 
 namespace impala {
 
+const int64_t CpuInfo::SSSE3;
+const int64_t CpuInfo::SSE4_1;
+const int64_t CpuInfo::SSE4_2;
+const int64_t CpuInfo::POPCNT;
+const int64_t CpuInfo::AVX;
+const int64_t CpuInfo::AVX2;
+const int64_t CpuInfo::PCLMULQDQ;
+
 bool CpuInfo::initialized_ = false;
 int64_t CpuInfo::hardware_flags_ = 0;
 int64_t CpuInfo::original_hardware_flags_;


[impala] 01/20: IMPALA-7314: Doc generation should fail on error

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/impala.git

commit c8b03a7c1ad4046acd8710740bab37f34c6e1d9c
Author: Fredy Wijaya <fw...@cloudera.com>
AuthorDate: Tue Jul 17 21:34:46 2018 -0700

    IMPALA-7314: Doc generation should fail on error
    
    This patch updates the doc generation to fail when there is an error.
    dita does not exit with non-zero exit code when there is an error. The
    patch checks for [ERROR] in the dita output and fails if it encounters
    one.
    
    Testing:
    - Manually tested by injecting failures
    
    Change-Id: Ic452aa282a3f2a761e3b04a7460e0d86bc51d721
    Reviewed-on: http://gerrit.cloudera.org:8080/10976
    Reviewed-by: Alex Rodoni <ar...@cloudera.com>
    Reviewed-by: Michael Brown <mi...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 docs/.gitignore                 |  1 +
 docs/Makefile                   |  4 ++--
 docs/{Makefile => build-doc.sh} | 26 +++++++++++++++-----------
 3 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/docs/.gitignore b/docs/.gitignore
index b6e5611..cf8132f 100644
--- a/docs/.gitignore
+++ b/docs/.gitignore
@@ -1,2 +1,3 @@
 !Makefile
 build/
+*.log
diff --git a/docs/Makefile b/docs/Makefile
index 99a0e09..fc2f8e9 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -26,7 +26,7 @@ pdf: build/impala.pdf
 ALL_DEPS=Makefile impala.ditamap shared/*.xml images/* topics/*.xml
 
 build/html/index.html: impala_html.ditaval ${ALL_DEPS}
-	dita -i impala.ditamap -f html5 -o $(dir $@) -filter $<
+	./build-doc.sh html5 $(dir $@) $< gen-html.log
 
 build/impala.pdf: impala_pdf.ditaval ${ALL_DEPS}
-	dita -i impala.ditamap -f pdf -o $(dir $@) -filter $<
+	./build-doc.sh pdf $(dir $@) $< gen-pdf.log
diff --git a/docs/Makefile b/docs/build-doc.sh
old mode 100644
new mode 100755
similarity index 68%
copy from docs/Makefile
copy to docs/build-doc.sh
index 99a0e09..6153e1d
--- a/docs/Makefile
+++ b/docs/build-doc.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
@@ -15,18 +17,20 @@
 # specific language governing permissions and limitations
 # under the License.
 
-.PHONY: all html pdf
-
-all: html pdf
-
-html: build/html/index.html
+set -euo pipefail
 
-pdf: build/impala.pdf
+function usage() {
+  echo "$0 <file_format> <output_file> <filter_file> <log_file>"
+  exit 1
+}
 
-ALL_DEPS=Makefile impala.ditamap shared/*.xml images/* topics/*.xml
+[[ $# -eq 4 ]] || usage
 
-build/html/index.html: impala_html.ditaval ${ALL_DEPS}
-	dita -i impala.ditamap -f html5 -o $(dir $@) -filter $<
+FILE_FORMAT=$1
+OUTPUT_FILE=$2
+FILTER_FILE=$3
+LOG_FILE=$4
 
-build/impala.pdf: impala_pdf.ditaval ${ALL_DEPS}
-	dita -i impala.ditamap -f pdf -o $(dir $@) -filter $<
+dita -i impala.ditamap -f ${FILE_FORMAT} -o ${OUTPUT_FILE} -filter ${FILTER_FILE} 2>&1 \
+    | tee ${LOG_FILE}
+[[ -z $(grep "\[ERROR\]" ${LOG_FILE}) ]] || exit 1


[impala] 09/20: IMPALA-7256: Aggregator mem usage isn't reflected in summary

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 8304591be3b056caa6acef54fa4ac629152e493e
Author: Thomas Tauber-Marshall <tm...@cloudera.com>
AuthorDate: Thu Jul 19 02:14:15 2018 +0000

    IMPALA-7256: Aggregator mem usage isn't reflected in summary
    
    This patch fixes a bug where memory used by an Aggregator wasn't being
    reflected in the exec summary for the corresponding aggregation node
    by ensuring that the Aggregator's MemTracker is a child of the node's
    MemTracker.
    
    Testing:
    - Manually ran a query and checked that all memory used by the
      Aggregator is accounted for in the exec summary.
    
    Change-Id: Iba6ef207bed47810fc742aec3481db5f313cf97f
    Reviewed-on: http://gerrit.cloudera.org:8080/10989
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/exec/aggregator.cc             |  3 ++-
 be/src/exec/aggregator.h              |  1 +
 be/src/exec/exec-node.cc              |  3 ++-
 be/src/exec/grouping-aggregator.cc    | 11 ++++++++---
 be/src/exec/grouping-aggregator.h     |  1 +
 be/src/runtime/reservation-manager.cc |  9 +++++----
 be/src/runtime/reservation-manager.h  |  5 ++++-
 7 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/be/src/exec/aggregator.cc b/be/src/exec/aggregator.cc
index 87abc52..9e79bf4 100644
--- a/be/src/exec/aggregator.cc
+++ b/be/src/exec/aggregator.cc
@@ -46,6 +46,7 @@ const char* Aggregator::LLVM_CLASS_NAME = "class.impala::Aggregator";
 Aggregator::Aggregator(ExecNode* exec_node, ObjectPool* pool, const TPlanNode& tnode,
     const DescriptorTbl& descs, const std::string& name)
   : id_(exec_node->id()),
+    exec_node_(exec_node),
     pool_(pool),
     intermediate_tuple_id_(tnode.agg_node.intermediate_tuple_id),
     intermediate_tuple_desc_(descs.GetTupleDescriptor(intermediate_tuple_id_)),
@@ -82,7 +83,7 @@ Status Aggregator::Init(const TPlanNode& tnode, RuntimeState* state) {
 
 Status Aggregator::Prepare(RuntimeState* state) {
   mem_tracker_.reset(new MemTracker(
-      runtime_profile_, -1, runtime_profile_->name(), state->instance_mem_tracker()));
+      runtime_profile_, -1, runtime_profile_->name(), exec_node_->mem_tracker()));
   expr_mem_tracker_.reset(new MemTracker(-1, "Exprs", mem_tracker_.get(), false));
   expr_perm_pool_.reset(new MemPool(expr_mem_tracker_.get()));
   expr_results_pool_.reset(new MemPool(expr_mem_tracker_.get()));
diff --git a/be/src/exec/aggregator.h b/be/src/exec/aggregator.h
index f415606..1ea3c47 100644
--- a/be/src/exec/aggregator.h
+++ b/be/src/exec/aggregator.h
@@ -93,6 +93,7 @@ class Aggregator {
  protected:
   /// The id of the ExecNode this Aggregator corresponds to.
   int id_;
+  ExecNode* exec_node_;
   ObjectPool* pool_;
 
   /// Account for peak memory used by this aggregator.
diff --git a/be/src/exec/exec-node.cc b/be/src/exec/exec-node.cc
index 5dca184..e187179 100644
--- a/be/src/exec/exec-node.cc
+++ b/be/src/exec/exec-node.cc
@@ -129,7 +129,8 @@ Status ExecNode::Prepare(RuntimeState* state) {
   }
   reservation_manager_.Init(
       Substitute("$0 id=$1 ptr=$2", PrintThriftEnum(type_), id_, this), runtime_profile_,
-      mem_tracker_.get(), resource_profile_, debug_options_);
+      state->instance_buffer_reservation(), mem_tracker_.get(), resource_profile_,
+      debug_options_);
   return Status::OK();
 }
 
diff --git a/be/src/exec/grouping-aggregator.cc b/be/src/exec/grouping-aggregator.cc
index 4f3e5cf..092ecfd 100644
--- a/be/src/exec/grouping-aggregator.cc
+++ b/be/src/exec/grouping-aggregator.cc
@@ -175,9 +175,13 @@ Status GroupingAggregator::Prepare(RuntimeState* state) {
       MAX_PARTITION_DEPTH, 1, expr_perm_pool_.get(), expr_results_pool_.get(),
       expr_results_pool_.get(), &ht_ctx_));
 
-  reservation_manager_.Init(
-      Substitute("GroupingAggregator id=$0 ptr=$1", id_, this), runtime_profile_,
-      mem_tracker_.get(), resource_profile_, debug_options_);
+  reservation_tracker_.reset(new ReservationTracker);
+  reservation_tracker_->InitChildTracker(runtime_profile_,
+      state->instance_buffer_reservation(), exec_node_->mem_tracker(),
+      numeric_limits<int64_t>::max());
+  reservation_manager_.Init(Substitute("GroupingAggregator id=$0 ptr=$1", id_, this),
+      runtime_profile_, reservation_tracker_.get(), mem_tracker_.get(),
+      resource_profile_, debug_options_);
   return Status::OK();
 }
 
@@ -400,6 +404,7 @@ void GroupingAggregator::Close(RuntimeState* state) {
   ScalarExpr::Close(build_exprs_);
 
   reservation_manager_.Close(state);
+  if (reservation_tracker_ != nullptr) reservation_tracker_->Close();
   // Must be called after tuple_pool_ is freed, so that mem_tracker_ can be closed.
   Aggregator::Close(state);
 }
diff --git a/be/src/exec/grouping-aggregator.h b/be/src/exec/grouping-aggregator.h
index b766a1e..3d64711 100644
--- a/be/src/exec/grouping-aggregator.h
+++ b/be/src/exec/grouping-aggregator.h
@@ -216,6 +216,7 @@ class GroupingAggregator : public Aggregator {
   /// Resource information sent from the frontend.
   const TBackendResourceProfile resource_profile_;
 
+  std::unique_ptr<ReservationTracker> reservation_tracker_;
   ReservationManager reservation_manager_;
   BufferPool::ClientHandle* buffer_pool_client();
 
diff --git a/be/src/runtime/reservation-manager.cc b/be/src/runtime/reservation-manager.cc
index f16b48f..1712e09 100644
--- a/be/src/runtime/reservation-manager.cc
+++ b/be/src/runtime/reservation-manager.cc
@@ -29,10 +29,11 @@ using strings::Substitute;
 namespace impala {
 
 void ReservationManager::Init(string name, RuntimeProfile* runtime_profile,
-    MemTracker* mem_tracker, const TBackendResourceProfile& resource_profile,
-    const TDebugOptions& debug_options) {
+    ReservationTracker* parent_reservation, MemTracker* mem_tracker,
+    const TBackendResourceProfile& resource_profile, const TDebugOptions& debug_options) {
   name_ = name;
   runtime_profile_ = runtime_profile;
+  parent_reservation_ = parent_reservation;
   mem_tracker_ = mem_tracker;
   resource_profile_ = resource_profile;
   debug_options_ = debug_options;
@@ -60,8 +61,8 @@ Status ReservationManager::ClaimBufferReservation(RuntimeState* state) {
   }
 
   RETURN_IF_ERROR(buffer_pool->RegisterClient(name_, state->query_state()->file_group(),
-      state->instance_buffer_reservation(), mem_tracker_,
-      resource_profile_.max_reservation, runtime_profile_, &buffer_pool_client_));
+      parent_reservation_, mem_tracker_, resource_profile_.max_reservation,
+      runtime_profile_, &buffer_pool_client_));
   VLOG_FILE << name_ << " claiming reservation " << resource_profile_.min_reservation;
   state->query_state()->initial_reservations()->Claim(
       &buffer_pool_client_, resource_profile_.min_reservation);
diff --git a/be/src/runtime/reservation-manager.h b/be/src/runtime/reservation-manager.h
index 0545548..10563df 100644
--- a/be/src/runtime/reservation-manager.h
+++ b/be/src/runtime/reservation-manager.h
@@ -40,7 +40,8 @@ class ReservationManager {
   /// and error messages, 'debug_options' is used for the SET_DENY_RESERVATION_PROBABILITY
   /// action. Does not take ownership of 'runtime_profile' and 'mem_tracker'. Must be
   /// called before ClaimBufferReservation().
-  void Init(std::string name, RuntimeProfile* runtime_profile, MemTracker* mem_tracker,
+  void Init(std::string name, RuntimeProfile* runtime_profile,
+      ReservationTracker* parent_reservation, MemTracker* mem_tracker,
       const TBackendResourceProfile& resource_profile,
       const TDebugOptions& debug_options);
   void Close(RuntimeState* state);
@@ -72,6 +73,8 @@ class ReservationManager {
 
   MemTracker* mem_tracker_;
 
+  ReservationTracker* parent_reservation_;
+
   /// Buffer pool client for this node. Initialized with the node's minimum reservation
   /// in ClaimBufferReservation(). After initialization, the client must hold onto at
   /// least the minimum reservation so that it can be returned to the initial


[impala] 20/20: IMPALA-7218: [DOCS] Support column list in ALTER VIEW

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/impala.git

commit dc888ba8247ea85e985d304f4d87273775ba711e
Author: Alex Rodoni <ar...@cloudera.com>
AuthorDate: Tue Jul 24 17:22:32 2018 -0700

    IMPALA-7218: [DOCS] Support column list in ALTER VIEW
    
    Change-Id: I19e5cf97302a46738fd832344415fb7ad4ca0e41
    Reviewed-on: http://gerrit.cloudera.org:8080/11043
    Reviewed-by: Fredy Wijaya <fw...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 docs/topics/impala_alter_view.xml | 87 +++++++++++++++++++++++----------------
 1 file changed, 52 insertions(+), 35 deletions(-)

diff --git a/docs/topics/impala_alter_view.xml b/docs/topics/impala_alter_view.xml
index e0d21d9..08d26bb 100644
--- a/docs/topics/impala_alter_view.xml
+++ b/docs/topics/impala_alter_view.xml
@@ -21,7 +21,13 @@ under the License.
 <concept rev="1.1" id="alter_view">
 
   <title>ALTER VIEW Statement</title>
-  <titlealts audience="PDF"><navtitle>ALTER VIEW</navtitle></titlealts>
+
+  <titlealts audience="PDF">
+
+    <navtitle>ALTER VIEW</navtitle>
+
+  </titlealts>
+
   <prolog>
     <metadata>
       <data name="Category" value="Impala"/>
@@ -38,67 +44,78 @@ under the License.
   <conbody>
 
     <p>
-      <indexterm audience="hidden">ALTER VIEW statement</indexterm>
-      Changes the characteristics of a view. The syntax has two forms:
+      The <codeph>ALTER VIEW</codeph> statement changes the characteristics of a view.
     </p>
 
-    <ul>
-      <li>
-        The <codeph>AS</codeph> clause associates the view with a different query.
-      </li>
-      <li>
-        The <codeph>RENAME TO</codeph> clause changes the name of the view, moves the view to
-        a different database, or both.
-      </li>
-    </ul>
-
     <p>
-      Because a view is purely a logical construct (an alias for a query) with no physical data behind it,
-      <codeph>ALTER VIEW</codeph> only involves changes to metadata in the metastore database, not any data files
-      in HDFS.
+      Because a view is a logical construct, an alias for a query, with no physical data behind
+      it, <codeph>ALTER VIEW</codeph> only involves changes to metadata in the metastore
+      database, not any data files in HDFS.
     </p>
 
-<!-- View _permissions_ don't rely on underlying table. -->
+    <p>
+      To see the definition of the updated view, issue a <codeph>DESCRIBE FORMATTED</codeph>
+      statement.
+    </p>
 
-<!-- Could use views to grant access only to certain columns. -->
+    <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
 
-<!-- Treated like a table for authorization. -->
+<codeblock>ALTER VIEW [<varname>database_name</varname>.]<varname>view_name</varname>
+   [(<varname>column_name</varname> [COMMENT '<varname>column_comment</varname>'][, ...])]
+   AS <varname>select_statement</varname>;
 
-<!-- ALTER VIEW that queries another view - possibly a runtime error. -->
+ALTER VIEW [<varname>database_name</varname>.]<varname>view_name</varname>
+   RENAME TO [<varname>database_name</varname>.]<varname>view_name</varname>;</codeblock>
 
-    <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+    <ul>
+      <li>
+        The <codeph>AS</codeph> clause associates the view with a different query.
+        <p>
+          An optional list of column names can be specified with or without the column-level
+          comments.
+        </p>
+
+        <p>
+          For example:
+<codeblock>
+ALTER VIEW v1 AS SELECT x, UPPER(s) s FROM t2;
+ALTER VIEW v1 (c1, c2, c3) AS SELECT x, UPPER(s) s FROM t2;
+ALTER VIEW v7 (c1 COMMENT 'Comment for c1', c2) AS SELECT t1.c1, t1.c2 FROM t1;
+</codeblock>
+        </p>
+      </li>
 
-<codeblock>ALTER VIEW [<varname>database_name</varname>.]<varname>view_name</varname> AS <varname>select_statement</varname>
-ALTER VIEW [<varname>database_name</varname>.]<varname>view_name</varname> RENAME TO [<varname>database_name</varname>.]<varname>view_name</varname></codeblock>
+      <li>
+        The <codeph>RENAME TO</codeph> clause changes the name of the view, moves the view to a
+        different database, or both.
+        <p>
+          For example:
+<codeblock>ALTER VIEW db1.v1 RENAME TO db2.v2; -- Move the view to a different database with a new name.
+ALTER VIEW db1.v1 RENAME TO db1.v2;  -- Rename the view in the same database.
+ALTER VIEW db1.v1 RENAME TO db2.v1; -- Move the view to a different database with the same view name.</codeblock>
+        </p>
+      </li>
+    </ul>
 
     <p conref="../shared/impala_common.xml#common/ddl_blurb"/>
 
     <p conref="../shared/impala_common.xml#common/sync_ddl_blurb"/>
 
     <p conref="../shared/impala_common.xml#common/security_blurb"/>
+
     <p conref="../shared/impala_common.xml#common/redaction_yes"/>
 
     <p conref="../shared/impala_common.xml#common/cancel_blurb_no"/>
 
     <p conref="../shared/impala_common.xml#common/permissions_blurb_no"/>
 
-    <p conref="../shared/impala_common.xml#common/example_blurb"/>
-
-<codeblock>create table t1 (x int, y int, s string);
-create table t2 like t1;
-create view v1 as select * from t1;
-alter view v1 as select * from t2;
-alter view v1 as select x, upper(s) s from t2;</codeblock>
-
-<!-- Repeat the same blurb + example to see the definition of a view, as in CREATE VIEW. -->
-
-    <p conref="../shared/impala_common.xml#common/describe_formatted_view"/>
-
     <p conref="../shared/impala_common.xml#common/related_info"/>
 
     <p>
       <xref href="impala_views.xml#views"/>, <xref href="impala_create_view.xml#create_view"/>,
       <xref href="impala_drop_view.xml#drop_view"/>
     </p>
+
   </conbody>
+
 </concept>


[impala] 12/20: IMPALA-5031: Fix undefined behavior: memset NULL

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 58ae370e93cead3930960a490bb3824a0e08ac72
Author: Jim Apple <jb...@apache.org>
AuthorDate: Sun Jul 15 18:32:52 2018 -0700

    IMPALA-5031: Fix undefined behavior: memset NULL
    
    memset has undefined behavior when its first argument is NULL. The
    instance fixed here was found by Clang's undefined behavior
    sanitizer.
    
    It was found in the end-to-end tests. The interesting part of the
    stack trace is:
    
    /exec/data-source-scan-node.cc:152:10: runtime error: null pointer passed as argument 1, which is declared to never be null
    /usr/include/string.h:62:79: note: nonnull attribute specified here
        #0 0x482fd8e in DataSourceScanNode::GetNextInputBatch() /exec/data-source-scan-node.cc:152:3
        #1 0x482fb40 in DataSourceScanNode::Open(RuntimeState*) /exec/data-source-scan-node.cc:124:10
        #2 0x47ef854 in AggregationNode::Open(RuntimeState*) /exec/aggregation-node.cc:71:49
        #3 0x23506a4 in FragmentInstanceState::Open() /runtime/fragment-instance-state.cc:266:53
        #4 0x234b6a8 in FragmentInstanceState::Exec() /runtime/fragment-instance-state.cc:81:12
        #5 0x236ee52 in QueryState::ExecFInstance(FragmentInstanceState*) /runtime/query-state.cc:401:24
        #6 0x237093e in QueryState::StartFInstances()::$_0::operator()() const /runtime/query-state.cc:341:44
    
    Change-Id: I18fa02dc887a42a94c6f81e4923d17568f2184f2
    Reviewed-on: http://gerrit.cloudera.org:8080/10948
    Reviewed-by: Jim Apple <jb...@apache.org>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/exec/data-source-scan-node.cc |  3 ++-
 be/src/util/ubsan.h                  | 37 ++++++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/be/src/exec/data-source-scan-node.cc b/be/src/exec/data-source-scan-node.cc
index 4b967bb..7d3bbf1 100644
--- a/be/src/exec/data-source-scan-node.cc
+++ b/be/src/exec/data-source-scan-node.cc
@@ -33,6 +33,7 @@
 #include "runtime/tuple-row.h"
 #include "util/jni-util.h"
 #include "util/periodic-counter-updater.h"
+#include "util/ubsan.h"
 #include "util/runtime-profile-counters.h"
 
 #include "common/names.h"
@@ -149,7 +150,7 @@ Status DataSourceScanNode::GetNextInputBatch() {
   input_batch_.reset(new TGetNextResult());
   next_row_idx_ = 0;
   // Reset all the indexes into the column value arrays to 0
-  memset(cols_next_val_idx_.data(), 0, sizeof(int) * cols_next_val_idx_.size());
+  Ubsan::MemSet(cols_next_val_idx_.data(), 0, sizeof(int) * cols_next_val_idx_.size());
   TGetNextParams params;
   params.__set_scan_handle(scan_handle_);
   RETURN_IF_ERROR(data_source_executor_->GetNext(params, input_batch_.get()));
diff --git a/be/src/util/ubsan.h b/be/src/util/ubsan.h
new file mode 100644
index 0000000..78f6bc1
--- /dev/null
+++ b/be/src/util/ubsan.h
@@ -0,0 +1,37 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef UTIL_UBSAN_H_
+#define UTIL_UBSAN_H_
+
+// Utilities mimicking parts of the standard library that are prone to accidental use in a
+// way causing undefined behavior.
+
+#include <cstring>
+
+class Ubsan {
+ public:
+  static void* MemSet(void* s, int c, size_t n) {
+    if (s == nullptr) {
+      DCHECK_EQ(n, 0);
+      return s;
+    }
+    return std::memset(s, c, n);
+  }
+};
+
+#endif // UTIL_UBSAN_H_


[impala] 15/20: IMPALA-3040: Remove cache directives if a partition is dropped externally

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 45d2ce7d3b953fbc325608bc378ee38a1b506a49
Author: Tianyi Wang <ti...@apache.org>
AuthorDate: Tue Jul 3 14:51:54 2018 -0700

    IMPALA-3040: Remove cache directives if a partition is dropped externally
    
    HdfsTable.dropPartition() doesn't uncache the partition right now. If
    the partition is dropped from Hive and refreshed in Impala, the
    partition will be removed from the catalog but the cache directive
    remains. Because Impala directly uses HMS client to drop a
    table/database, the cache directive won't be removed even if the table
    is dropped in Impala, if the background loading is run concurrently with
    the HMS client RPC call. This patch removes the cache directive in
    dropPartition() if the partition is removed from HMS.
    
    Change-Id: Id7701a499405e961456adea63f3592b43bd69170
    Reviewed-on: http://gerrit.cloudera.org:8080/10792
    Reviewed-by: Bharath Vissapragada <bh...@cloudera.com>
    Tested-by: Tianyi Wang <tw...@cloudera.com>
---
 .../java/org/apache/impala/catalog/HdfsTable.java  | 40 ++++++++++++++++------
 .../apache/impala/service/CatalogOpExecutor.java   |  3 --
 tests/query_test/test_hdfs_caching.py              | 19 ++++++++++
 3 files changed, 49 insertions(+), 13 deletions(-)

diff --git a/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java b/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
index 5f2fc05..0e472a3 100644
--- a/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
+++ b/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
@@ -59,6 +59,7 @@ import org.apache.impala.analysis.PartitionKeyValue;
 import org.apache.impala.catalog.HdfsPartition.FileBlock;
 import org.apache.impala.catalog.HdfsPartition.FileDescriptor;
 import org.apache.impala.common.FileSystemUtil;
+import org.apache.impala.common.ImpalaException;
 import org.apache.impala.common.Pair;
 import org.apache.impala.common.PrintUtils;
 import org.apache.impala.common.Reference;
@@ -1149,8 +1150,10 @@ public class HdfsTable extends Table implements FeFsTable {
   /**
    * Drops a partition and updates partition column statistics. Returns the
    * HdfsPartition that was dropped or null if the partition does not exist.
+   * If removeCacheDirective = true, any cache directive on the partition is removed.
    */
-  private HdfsPartition dropPartition(HdfsPartition partition) {
+  private HdfsPartition dropPartition(HdfsPartition partition,
+      boolean removeCacheDirective) {
     if (partition == null) return null;
     fileMetadataStats_.totalFileBytes -= partition.getSize();
     fileMetadataStats_.numFiles -= partition.getNumFileDescriptors();
@@ -1159,6 +1162,14 @@ public class HdfsTable extends Table implements FeFsTable {
     Long partitionId = partition.getId();
     partitionMap_.remove(partitionId);
     nameToPartitionMap_.remove(partition.getPartitionName());
+    if (removeCacheDirective && partition.isMarkedCached()) {
+      try {
+        HdfsCachingUtil.removePartitionCacheDirective(partition);
+      } catch (ImpalaException e) {
+        LOG.error("Unable to remove the cache directive on table " + getFullName() +
+            ", partition " + partition.getPartitionName() + ": ", e);
+      }
+    }
     if (!isStoredInImpaladCatalogCache()) return partition;
     for (int i = 0; i < partition.getPartitionValues().size(); ++i) {
       ColumnStats stats = getColumns().get(i).getStats();
@@ -1184,20 +1195,29 @@ public class HdfsTable extends Table implements FeFsTable {
     return partition;
   }
 
+  private HdfsPartition dropPartition(HdfsPartition partition) {
+    return dropPartition(partition, true);
+  }
+
   /**
    * Drops the given partitions from this table. Cleans up its metadata from all the
    * mappings used to speed up partition pruning/lookup. Also updates partitions column
    * statistics. Returns the list of partitions that were dropped.
    */
-  public List<HdfsPartition> dropPartitions(List<HdfsPartition> partitions) {
+  public List<HdfsPartition> dropPartitions(List<HdfsPartition> partitions,
+      boolean removeCacheDirective) {
     ArrayList<HdfsPartition> droppedPartitions = Lists.newArrayList();
     for (HdfsPartition partition: partitions) {
-      HdfsPartition hdfsPartition = dropPartition(partition);
+      HdfsPartition hdfsPartition = dropPartition(partition, removeCacheDirective);
       if (hdfsPartition != null) droppedPartitions.add(hdfsPartition);
     }
     return droppedPartitions;
   }
 
+  public List<HdfsPartition> dropPartitions(List<HdfsPartition> partitions) {
+    return dropPartitions(partitions, true);
+  }
+
   /**
    * Update the prototype partition used when creating new partitions for
    * this table. New partitions will inherit storage properties from the
@@ -1359,16 +1379,15 @@ public class HdfsTable extends Table implements FeFsTable {
     Map<Path, List<HdfsPartition>> partitionsToUpdateFileMdByPath = Maps.newHashMap();
     // Partitions that need to be dropped and recreated from scratch
     List<HdfsPartition> dirtyPartitions = Lists.newArrayList();
-    // Partitions that need to be removed from this table. That includes dirty
-    // partitions as well as partitions that were removed from the Hive Metastore.
-    List<HdfsPartition> partitionsToRemove = Lists.newArrayList();
+    // Partitions removed from the Hive Metastore.
+    List<HdfsPartition> removedPartitions = Lists.newArrayList();
     // Identify dirty partitions that need to be loaded from the Hive Metastore and
     // partitions that no longer exist in the Hive Metastore.
     for (HdfsPartition partition: partitionMap_.values()) {
       // Remove partitions that don't exist in the Hive Metastore. These are partitions
       // that were removed from HMS using some external process, e.g. Hive.
       if (!msPartitionNames.contains(partition.getPartitionName())) {
-        partitionsToRemove.add(partition);
+        removedPartitions.add(partition);
       }
       if (partition.isDirty()) {
         // Dirty partitions are updated by removing them from table's partition
@@ -1390,8 +1409,9 @@ public class HdfsTable extends Table implements FeFsTable {
       Preconditions.checkNotNull(partition.getCachedMsPartitionDescriptor());
       partitionNames.add(partition.getPartitionName());
     }
-    partitionsToRemove.addAll(dirtyPartitions);
-    dropPartitions(partitionsToRemove);
+    dropPartitions(removedPartitions);
+    // dirtyPartitions are reloaded and hence cache directives are not dropped.
+    dropPartitions(dirtyPartitions, false);
     // Load dirty partitions from Hive Metastore
     loadPartitionsFromMetastore(dirtyPartitions, client);
 
@@ -2145,7 +2165,7 @@ public class HdfsTable extends Table implements FeFsTable {
     refreshPartitionFileMetadata(refreshedPartition);
     Preconditions.checkArgument(oldPartition == null
         || HdfsPartition.KV_COMPARATOR.compare(oldPartition, refreshedPartition) == 0);
-    dropPartition(oldPartition);
+    dropPartition(oldPartition, false);
     addPartition(refreshedPartition);
   }
 
diff --git a/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java b/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java
index 39cc108..dc19e9f 100644
--- a/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java
+++ b/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java
@@ -2270,9 +2270,6 @@ public class CatalogOpExecutor {
           msClient.getHiveClient().dropPartition(tableName.getDb(), tableName.getTbl(),
               part.getPartitionValuesAsStrings(true), dropOptions);
           ++numTargetedPartitions;
-          if (part.isMarkedCached()) {
-            HdfsCachingUtil.removePartitionCacheDirective(part);
-          }
         } catch (NoSuchObjectException e) {
           if (!ifExists) {
             throw new ImpalaRuntimeException(
diff --git a/tests/query_test/test_hdfs_caching.py b/tests/query_test/test_hdfs_caching.py
index c013ed4..beedc7c 100644
--- a/tests/query_test/test_hdfs_caching.py
+++ b/tests/query_test/test_hdfs_caching.py
@@ -273,6 +273,25 @@ class TestHdfsCachingDdl(ImpalaTestSuite):
     drop_cache_directives_for_path(
         "/test-warehouse/cachedb.db/cached_tbl_reload_part/j=2")
 
+  @pytest.mark.execute_serially
+  def test_external_drop(self):
+    """IMPALA-3040: Tests that dropping a partition in Hive leads to the removal of the
+       cache directive after a refresh statement in Impala."""
+    num_entries_pre = get_num_cache_requests()
+    self.client.execute("use cachedb")
+    self.client.execute("create table test_external_drop_tbl (i int) partitioned by "
+                        "(j int) cached in 'testPool'")
+    self.client.execute("insert into test_external_drop_tbl (i,j) select 1, 2")
+    # 1 directive for the table and 1 directive for the partition.
+    assert num_entries_pre + 2 == get_num_cache_requests()
+    self.hive_client.drop_partition("cachedb", "test_external_drop_tbl", ["2"], True)
+    self.client.execute("refresh test_external_drop_tbl")
+    # The directive on the partition is removed.
+    assert num_entries_pre + 1 == get_num_cache_requests()
+    self.client.execute("drop table test_external_drop_tbl")
+    # We want to see the number of cached entities return to the original count.
+    assert num_entries_pre == get_num_cache_requests()
+
 def drop_cache_directives_for_path(path):
   """Drop the cache directive for a given path"""
   rc, stdout, stderr = exec_process("hdfs cacheadmin -removeDirectives -path %s" % path)


[impala] 10/20: IMPALA-7173: [DOCS] Added check options in the load balancer examples

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 2a1f2fbd040944eb59bcbd8197cc728adab35ff3
Author: Alex Rodoni <ar...@cloudera.com>
AuthorDate: Fri Jul 20 17:04:20 2018 -0700

    IMPALA-7173: [DOCS] Added check options in the load balancer examples
    
    Change-Id: Ic5148aecf4605373ea6a28bf032bcfdab4f822fd
    Reviewed-on: http://gerrit.cloudera.org:8080/11006
    Reviewed-by: Michael Brown <mi...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 docs/topics/impala_proxy.xml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/topics/impala_proxy.xml b/docs/topics/impala_proxy.xml
index ddf63c3..b172535 100644
--- a/docs/topics/impala_proxy.xml
+++ b/docs/topics/impala_proxy.xml
@@ -505,10 +505,10 @@ listen impala :25003
     option tcplog
     balance leastconn
 
-    server <varname>symbolic_name_1</varname> impala-host-1.example.com:21000
-    server <varname>symbolic_name_2</varname> impala-host-2.example.com:21000
-    server <varname>symbolic_name_3</varname> impala-host-3.example.com:21000
-    server <varname>symbolic_name_4</varname> impala-host-4.example.com:21000
+    server <varname>symbolic_name_1</varname> impala-host-1.example.com:21000 check
+    server <varname>symbolic_name_2</varname> impala-host-2.example.com:21000 check
+    server <varname>symbolic_name_3</varname> impala-host-3.example.com:21000 check
+    server <varname>symbolic_name_4</varname> impala-host-4.example.com:21000 check
 
 # Setup for Hue or other JDBC-enabled applications.
 # In particular, Hue requires sticky sessions.


[impala] 19/20: IMPALA-5031: Fix undefined behavior: memset NULL

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 446939f818d43a5be2967f1d8db26c42bc34cf54
Author: Jim Apple <jb...@apache.org>
AuthorDate: Tue Jul 24 15:40:56 2018 -0700

    IMPALA-5031: Fix undefined behavior: memset NULL
    
    memset has undefined behavior when its first argument is NULL. The
    instance fixed here was found by Clang's undefined behavior sanitizer.
    
    It was found in the end-to-end tests. The interesting part of the
    stack trace is:
    
    be/src/util/bitmap.h:78:12: runtime error: null pointer passed as argument 1, which is declared to never be null
    /usr/include/string.h:62:79: note: nonnull attribute specified here
        #0 0x2ccb59b in Bitmap::SetAllBits(bool) be/src/util/bitmap.h:78:5
        #1 0x2cb6b9e in NestedLoopJoinNode::ResetMatchingBuildRows(RuntimeState*, long) be/src/exec/nested-loop-join-node.cc:176:27
        #2 0x2cb5ad6 in NestedLoopJoinNode::Open(RuntimeState*) be/src/exec/nested-loop-join-node.cc:90:43
    
    Change-Id: I804f642f4be3b74c24f871f656c5147ee226d2c8
    Reviewed-on: http://gerrit.cloudera.org:8080/11042
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/util/bitmap.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/be/src/util/bitmap.h b/be/src/util/bitmap.h
index b2f7f72..ced824b 100644
--- a/be/src/util/bitmap.h
+++ b/be/src/util/bitmap.h
@@ -20,6 +20,7 @@
 #define IMPALA_UTIL_BITMAP_H
 
 #include "util/bit-util.h"
+#include "util/ubsan.h"
 
 namespace impala {
 
@@ -75,7 +76,7 @@ class Bitmap {
   }
 
   void SetAllBits(bool b) {
-    memset(buffer_.data(), 255 * b, buffer_.size() * sizeof(uint64_t));
+    Ubsan::MemSet(buffer_.data(), 255 * b, buffer_.size() * sizeof(uint64_t));
   }
 
   int64_t num_bits() const { return num_bits_; }


[impala] 18/20: IMPALA-7330. After LOAD DATA, only refresh affected partition

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/impala.git

commit cfbfface1728bedbdfdb26791a6e8babeb38ddb1
Author: Todd Lipcon <to...@cloudera.com>
AuthorDate: Mon Jul 23 12:28:43 2018 -0700

    IMPALA-7330. After LOAD DATA, only refresh affected partition
    
    This changes LOAD DATA so that, if a specific partition is provided,
    only the named partition will be refreshed upon completion of the
    statement.
    
    No new tests are added since this code path is covered by existing tests
    and this just optimizes the metadata reload.
    
    I did verify looking at the catalogd logs that only the single specified
    partition was refreshed when I issued a LOAD statement from the shell.
    
    Change-Id: I3b29846deac49a89abcd3495e4b757ef536ff331
    Reviewed-on: http://gerrit.cloudera.org:8080/11014
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
    Reviewed-by: Todd Lipcon <to...@apache.org>
---
 be/src/service/client-request-state.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/be/src/service/client-request-state.cc b/be/src/service/client-request-state.cc
index 2c4be43..829f211 100644
--- a/be/src/service/client-request-state.cc
+++ b/be/src/service/client-request-state.cc
@@ -180,6 +180,10 @@ Status ClientRequestState::Exec(TExecRequest* exec_request) {
       reset_req.reset_metadata_params.__set_is_refresh(true);
       reset_req.reset_metadata_params.__set_table_name(
           exec_request_.load_data_request.table_name);
+      if (exec_request_.load_data_request.__isset.partition_spec) {
+        reset_req.reset_metadata_params.__set_partition_spec(
+            exec_request_.load_data_request.partition_spec);
+      }
       reset_req.reset_metadata_params.__set_sync_ddl(
           exec_request_.query_options.sync_ddl);
       catalog_op_executor_.reset(


[impala] 11/20: IMPALA-3675: part 1: -Werror for ASAN

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/impala.git

commit efe751078f8ab674523ccfe15be50349332b16bf
Author: Tim Armstrong <ta...@cloudera.com>
AuthorDate: Fri Jul 20 09:58:47 2018 -0700

    IMPALA-3675: part 1: -Werror for ASAN
    
    This fixes some clang warnings and enables -Werror going forward for
    ASAN.
    
    It adds -Wno-return-type-c-linkage to suppress a warning that otherwise
    would fail the build.
    
    It also enables the mismatched tags check and fixes the various
    instances of it (this warning is irritating for YouCompleteMe users).
    
    Change-Id: If83ca41cde49fd6cfde479e45f9cfdd9e3a45bec
    Reviewed-on: http://gerrit.cloudera.org:8080/11002
    Reviewed-by: Tim Armstrong <ta...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 .clang-tidy                             |  1 -
 be/CMakeLists.txt                       | 12 +++++-------
 be/src/exec/hdfs-table-writer.h         |  2 +-
 be/src/exprs/expr.h                     |  2 +-
 be/src/exprs/scalar-expr.h              |  2 +-
 be/src/runtime/krpc-data-stream-recvr.h |  2 +-
 be/src/runtime/mem-tracker.h            |  2 +-
 7 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/.clang-tidy b/.clang-tidy
index 54f095c..bf37d7a 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -46,7 +46,6 @@ Checks: "-*,clang*,\
 -clang-diagnostic-gnu-anonymous-struct,\
 -clang-diagnostic-header-hygiene,\
 -clang-diagnostic-implicit-fallthrough,\
--clang-diagnostic-mismatched-tags,\
 -clang-diagnostic-missing-prototypes,\
 -clang-diagnostic-missing-variable-declarations,\
 -clang-diagnostic-nested-anon-types,\
diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt
index 83d932a..c29a225 100644
--- a/be/CMakeLists.txt
+++ b/be/CMakeLists.txt
@@ -63,8 +63,6 @@ SET(CXX_COVERAGE_FLAGS "-fprofile-arcs -ftest-coverage")
 #        makes extra calls to clang which may have extra includes (-I) that are unused.
 #   -fcolor-diagnostics: ensure clang generates colorized output, which is necessary
 #        when using ccache as clang thinks it is not called from a terminal.
-#   -Wno-mismatched-tags: ignore harmless class/struct mismatch for forward declarations.
-#        Disabling to be consistent with gcc, which doesn't have this warning.
 #   -Wno-zero-as-null-pointer-constant: We are slowly moving towards the use of nullptr,
 #        but till we switch to it completely, we will ignore the warnings due to use of
 #        NULL as a null pointer constant.
@@ -75,10 +73,10 @@ SET(CXX_COVERAGE_FLAGS "-fprofile-arcs -ftest-coverage")
 #        destructors with 'override' which is enforced by clang but not recommended by c++
 #        core guidelines (read C.128).
 SET(CXX_CLANG_FLAGS "-Qunused-arguments -fcolor-diagnostics -Wno-unused-local-typedef")
-SET(CXX_CLANG_FLAGS "${CXX_CLANG_FLAGS} -Wno-mismatched-tags")
 SET(CXX_CLANG_FLAGS "${CXX_CLANG_FLAGS} -Wno-zero-as-null-pointer-constant")
 SET(CXX_CLANG_FLAGS "${CXX_CLANG_FLAGS} -Wno-c++17-extensions")
 SET(CXX_CLANG_FLAGS "${CXX_CLANG_FLAGS} -Wno-inconsistent-missing-destructor-override")
+SET(CXX_CLANG_FLAGS "${CXX_CLANG_FLAGS} -Wno-return-type-c-linkage")
 # For any gcc builds:
 #   -g: Enable symbols for profiler tools
 #   -Wno-unused-local-typedefs: Do not warn for local typedefs that are unused.
@@ -104,15 +102,15 @@ endif()
 # Debug information is stored as dwarf2 to be as compatible as possible
 SET(CXX_FLAGS_DEBUG "${CXX_GCC_FLAGS} -ggdb -gdwarf-2")
 # -Werror: compile warnings should be errors when using the toolchain compiler.
-#   Only enable for debug builds because this is what we test in pre-commit tests.
+#   Enabled for DEBUG, ASAN, TSAN and UBSAN builds which are built pre-commit.
 SET(CXX_FLAGS_DEBUG "${CXX_FLAGS_DEBUG} -Werror")
 SET(CXX_FLAGS_RELEASE "${CXX_GCC_FLAGS} -O3 -DNDEBUG -gdwarf-2")
 SET(CXX_FLAGS_ADDRESS_SANITIZER
-  "${CXX_CLANG_FLAGS} -O1 -g -fsanitize=address -fno-omit-frame-pointer -DADDRESS_SANITIZER")
+  "${CXX_CLANG_FLAGS} -Werror -O1 -g -fsanitize=address -fno-omit-frame-pointer -DADDRESS_SANITIZER")
 
 # Set the flags to the undefined behavior sanitizer, also known as "ubsan"
 # Turn on sanitizer and debug symbols to get stack traces:
-SET(CXX_FLAGS_UBSAN "${CXX_CLANG_FLAGS} -ggdb3 -fno-omit-frame-pointer -fsanitize=undefined")
+SET(CXX_FLAGS_UBSAN "${CXX_CLANG_FLAGS} -Werror -ggdb3 -fno-omit-frame-pointer -fsanitize=undefined")
 # Set preprocessor macros to facilitate initialization the relevant configuration.
 SET(CXX_FLAGS_UBSAN "${CXX_FLAGS_UBSAN} -DUNDEFINED_SANITIZER")
 # Calling getenv() in __ubsan_default_options doesn't work, likely because of
@@ -131,7 +129,7 @@ SET(CXX_FLAGS_UBSAN "${CXX_FLAGS_UBSAN} -O0")
 
 # Set the flags to the thread sanitizer, also known as "tsan"
 # Turn on sanitizer and debug symbols to get stack traces:
-SET(CXX_FLAGS_TSAN "${CXX_CLANG_FLAGS} -O1 -ggdb3 -fno-omit-frame-pointer")
+SET(CXX_FLAGS_TSAN "${CXX_CLANG_FLAGS} -Werror -O1 -ggdb3 -fno-omit-frame-pointer")
 SET(CXX_FLAGS_TSAN "${CXX_FLAGS_TSAN} -fsanitize=thread -DTHREAD_SANITIZER")
 
 SET(CXX_FLAGS_TIDY "${CXX_CLANG_FLAGS}")
diff --git a/be/src/exec/hdfs-table-writer.h b/be/src/exec/hdfs-table-writer.h
index cc6c6cc..a6f06c0 100644
--- a/be/src/exec/hdfs-table-writer.h
+++ b/be/src/exec/hdfs-table-writer.h
@@ -30,7 +30,7 @@ namespace impala {
 class HdfsPartitionDescriptor;
 class HdfsTableDescriptor;
 class HdfsTableSink;
-class OutputPartition;
+struct OutputPartition;
 class RowBatch;
 class RuntimeState;
 class ScalarExprEvaluator;
diff --git a/be/src/exprs/expr.h b/be/src/exprs/expr.h
index 1a47db6..79f980d 100644
--- a/be/src/exprs/expr.h
+++ b/be/src/exprs/expr.h
@@ -32,7 +32,7 @@
 namespace impala {
 
 class IsNullExpr;
-class LibCacheEntry;
+struct LibCacheEntry;
 class LlvmCodeGen;
 class MemTracker;
 class ObjectPool;
diff --git a/be/src/exprs/scalar-expr.h b/be/src/exprs/scalar-expr.h
index e1954b7..6eb221e 100644
--- a/be/src/exprs/scalar-expr.h
+++ b/be/src/exprs/scalar-expr.h
@@ -55,7 +55,7 @@ using impala_udf::StringVal;
 using impala_udf::DecimalVal;
 using impala_udf::CollectionVal;
 
-class LibCacheEntry;
+struct LibCacheEntry;
 class LlvmCodeGen;
 class MemTracker;
 class ObjectPool;
diff --git a/be/src/runtime/krpc-data-stream-recvr.h b/be/src/runtime/krpc-data-stream-recvr.h
index 1435e44..18454d7 100644
--- a/be/src/runtime/krpc-data-stream-recvr.h
+++ b/be/src/runtime/krpc-data-stream-recvr.h
@@ -43,7 +43,7 @@ class MemTracker;
 class RowBatch;
 class RuntimeProfile;
 class SortedRunMerger;
-class TransmitDataCtx;
+struct TransmitDataCtx;
 class TransmitDataRequestPB;
 class TransmitDataResponsePB;
 
diff --git a/be/src/runtime/mem-tracker.h b/be/src/runtime/mem-tracker.h
index 9c41693..01fd331 100644
--- a/be/src/runtime/mem-tracker.h
+++ b/be/src/runtime/mem-tracker.h
@@ -41,7 +41,7 @@ namespace impala {
 
 class ObjectPool;
 class MemTracker;
-class ReservationTrackerCounters;
+struct ReservationTrackerCounters;
 class TQueryOptions;
 
 /// A MemTracker tracks memory consumption; it contains an optional limit