You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jr...@apache.org on 2017/07/10 23:09:09 UTC
[3/3] incubator-impala git commit: IMPALA-5583: [DOCS] Document
default_join_distribution_mode query option
IMPALA-5583: [DOCS] Document default_join_distribution_mode query option
New page for the query option.
Change-Id: I4ec6213efc46bce0fe07c590841d51c009fb5c84
Reviewed-on: http://gerrit.cloudera.org:8080/7300
Reviewed-by: Mostafa Mokhtar <mm...@cloudera.com>
Tested-by: Impala Public Jenkins
Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/801c32de
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/801c32de
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/801c32de
Branch: refs/heads/master
Commit: 801c32dec3914939c95c2cab07f8628dd627aef5
Parents: db3f323
Author: John Russell <jr...@cloudera.com>
Authored: Mon Jun 26 15:49:27 2017 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Mon Jul 10 23:08:12 2017 +0000
----------------------------------------------------------------------
docs/impala.ditamap | 1 +
docs/impala_keydefs.ditamap | 1 +
.../impala_default_join_distribution_mode.xml | 134 +++++++++++++++++++
3 files changed, 136 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/801c32de/docs/impala.ditamap
----------------------------------------------------------------------
diff --git a/docs/impala.ditamap b/docs/impala.ditamap
index 574602a..b10ddbf 100644
--- a/docs/impala.ditamap
+++ b/docs/impala.ditamap
@@ -176,6 +176,7 @@ under the License.
<topicref href="topics/impala_batch_size.xml"/>
<topicref href="topics/impala_compression_codec.xml"/>
<topicref href="topics/impala_debug_action.xml"/>
+ <topicref rev="2.9.0 IMPALA-5381" href="topics/impala_default_join_distribution_mode.xml"/>
<topicref href="topics/impala_default_order_by_limit.xml"/>
<topicref audience="hidden" href="topics/impala_disable_cached_reads.xml"/>
<topicref href="topics/impala_disable_codegen.xml"/>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/801c32de/docs/impala_keydefs.ditamap
----------------------------------------------------------------------
diff --git a/docs/impala_keydefs.ditamap b/docs/impala_keydefs.ditamap
index 7c9bb60..378a5bb 100644
--- a/docs/impala_keydefs.ditamap
+++ b/docs/impala_keydefs.ditamap
@@ -10749,6 +10749,7 @@ under the License.
<keydef href="topics/impala_batch_size.xml" keys="batch_size"/>
<keydef href="topics/impala_compression_codec.xml" keys="compression_codec"/>
<keydef href="topics/impala_debug_action.xml" keys="debug_action"/>
+ <keydef href="topics/impala_default_join_distribution_mode.xml" keys="default_join_distribution_mode"/>
<keydef href="topics/impala_default_order_by_limit.xml" keys="default_order_by_limit"/>
<keydef href="topics/impala_disable_cached_reads.xml" keys="disable_cached_reads"/>
<keydef href="topics/impala_disable_codegen.xml" keys="disable_codegen"/>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/801c32de/docs/topics/impala_default_join_distribution_mode.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_default_join_distribution_mode.xml b/docs/topics/impala_default_join_distribution_mode.xml
new file mode 100644
index 0000000..1b17d50
--- /dev/null
+++ b/docs/topics/impala_default_join_distribution_mode.xml
@@ -0,0 +1,134 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied. See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="default_join_distribution_mode" rev="2.9.0 IMPALA-5381 IMPALA-5583">
+
+ <title>DEFAULT_JOIN_DISTRIBUTION_MODE Query Option</title>
+ <titlealts audience="PDF"><navtitle>DEFAULT_JOIN_DISTRIBUTION_MODE</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Query Options"/>
+ <data name="Category" value="Performance"/>
+ <data name="Category" value="Querying"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="Data Analysts"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ <indexterm audience="hidden">DEFAULT_JOIN_DISTRIBUTION_MODE query option</indexterm>
+ This option determines the join distribution that Impala uses when any of the tables
+ involved in a join query is missing statistics.
+ </p>
+
+ <p>
+ Impala optimizes join queries based on the presence of table statistics,
+ which are produced by the Impala <codeph>COMPUTE STATS</codeph> statement.
+ By default, when a table involved in the join query does not have statistics,
+ Impala uses the <q>broadcast</q> technique that transmits the entire contents
+ of the table to all executor nodes participating in the query. If one table
+ involved in a join has statistics and the other does not, the table without
+ statistics is broadcast. If both tables are missing statistics, the table
+ that is referenced second in the join order is broadcast. This behavior
+ is appropriate when the table being broadcast is relatively small, but can lead to
+ excessive network, memory, and CPU overhead if that table is
+ large.
+ </p>
+
+ <p>
+ Because Impala queries frequently involve very large tables, and suboptimal
+ joins for such tables could result in spilling or out-of-memory errors,
+ the setting <codeph>DEFAULT_JOIN_DISTRIBUTION_MODE=SHUFFLE</codeph> lets you
+ override the default behavior. The shuffle join mechanism divides the corresponding rows
+ of each table involved in a join query using a hashing algorithm, and transmits
+ subsets of the rows to other nodes for processing. Typically, this kind of join is
+ more efficient for joins between large tables of similar size.
+ </p>
+
+ <p>
+ The setting <codeph>DEFAULT_JOIN_DISTRIBUTION_MODE=SHUFFLE</codeph> is
+ recommended when setting up and deploying new clusters, because it is less likely
+ to result in serious consequences such as spilling or out-of-memory errors if
+ the query plan is based on incomplete information. This setting is not the default,
+ to avoid changing the performance characteristics of join queries for clusters that
+ are already tuned for their existing workloads.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/type_integer"/>
+ <p>
+ The allowed values are <codeph>BROADCAST</codeph> (equivalent to 0, the default)
+ or <codeph>SHUFFLE</codeph> (equivalent to 1).
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+ <p>
+ The following examples demonstrate appropriate scenarios for each
+ setting of this query option.
+ </p>
+
+<codeblock>
+-- Create a billion-row table.
+create table big_table stored as parquet
+ as select * from huge_table limit 1e9;
+
+-- For a big table with no statistics, the
+-- shuffle join mechanism is appropriate.
+set default_join_distribution_mode=shuffle;
+
+-- ...join queries involving the big table...
+</codeblock>
+
+<codeblock>
+-- Create a hundred-row table.
+create table tiny_table stored as parquet
+ as select * from huge_table limit 100;
+
+-- For a tiny table with no statistics, the
+-- broadcast join mechanism is appropriate.
+set default_join_distribution_mode=broadcast;
+
+-- ...join queries involving the tiny table...
+</codeblock>
+
+<codeblock>
+compute stats tiny_table;
+compute stats big_table;
+
+-- Once the stats are computed, the query option has
+-- no effect on join queries involving these tables.
+-- Impala can determine the absolute and relative sizes
+-- of each side of the join query by examining the
+-- row size, cardinality, and so on of each table.
+
+-- ...join queries involving both of these tables...
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+ <p>
+ <xref keyref="compute_stats"/>,
+ <xref keyref="joins"/>,
+ <xref keyref="perf_joins"/>
+ </p>
+
+ </conbody>
+</concept>