You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2017/08/24 06:31:24 UTC
[1/2] incubator-impala git commit: IMPALA-5309: [DOCS] Add
TABLESAMPLE clause to SELECT statement
Repository: incubator-impala
Updated Branches:
refs/heads/master acaf8b9f0 -> bb6b0ce24
IMPALA-5309: [DOCS] Add TABLESAMPLE clause to SELECT statement
Change-Id: Idd7e5b7cfe11c986348bc6c8d1b11921f34df336
Reviewed-on: http://gerrit.cloudera.org:8080/7680
Reviewed-by: Alex Behm <al...@cloudera.com>
Tested-by: Impala Public Jenkins
Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/88d00b4b
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/88d00b4b
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/88d00b4b
Branch: refs/heads/master
Commit: 88d00b4b09ed4bf727083296dab2cfb843ae1c41
Parents: acaf8b9
Author: John Russell <jr...@cloudera.com>
Authored: Thu Jul 27 11:56:35 2017 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Thu Aug 24 05:54:01 2017 +0000
----------------------------------------------------------------------
docs/impala.ditamap | 1 +
docs/impala_keydefs.ditamap | 1 +
docs/shared/impala_common.xml | 8 +
docs/topics/impala_hbase.xml | 3 +
docs/topics/impala_kudu.xml | 2 +
docs/topics/impala_scalability.xml | 42 ++-
docs/topics/impala_select.xml | 9 +
docs/topics/impala_subqueries.xml | 3 +
docs/topics/impala_tablesample.xml | 556 ++++++++++++++++++++++++++++++++
docs/topics/impala_views.xml | 3 +
10 files changed, 611 insertions(+), 17 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/88d00b4b/docs/impala.ditamap
----------------------------------------------------------------------
diff --git a/docs/impala.ditamap b/docs/impala.ditamap
index b10ddbf..80514da 100644
--- a/docs/impala.ditamap
+++ b/docs/impala.ditamap
@@ -163,6 +163,7 @@ under the License.
<topicref href="topics/impala_offset.xml"/>
<topicref href="topics/impala_union.xml"/>
<topicref href="topics/impala_subqueries.xml"/>
+ <topicref rev="IMPALA-5309" href="topics/impala_tablesample.xml"/>
<topicref href="topics/impala_with.xml"/>
<topicref href="topics/impala_distinct.xml"/>
<topicref href="topics/impala_hints.xml"/>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/88d00b4b/docs/impala_keydefs.ditamap
----------------------------------------------------------------------
diff --git a/docs/impala_keydefs.ditamap b/docs/impala_keydefs.ditamap
index 8748d03..cdcaed6 100644
--- a/docs/impala_keydefs.ditamap
+++ b/docs/impala_keydefs.ditamap
@@ -10732,6 +10732,7 @@ under the License.
<keydef href="topics/impala_joins.xml" keys="joins"/>
<keydef href="topics/impala_order_by.xml" keys="order_by"/>
<keydef href="topics/impala_group_by.xml" keys="group_by"/>
+ <keydef rev="IMPALA-5309" href="topics/impala_tablesample.xml" keys="tablesample"/>
<keydef href="topics/impala_having.xml" keys="having"/>
<keydef href="topics/impala_limit.xml" keys="limit"/>
<keydef href="topics/impala_offset.xml" keys="offset"/>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/88d00b4b/docs/shared/impala_common.xml
----------------------------------------------------------------------
diff --git a/docs/shared/impala_common.xml b/docs/shared/impala_common.xml
index d4dfaab..5d79acc 100644
--- a/docs/shared/impala_common.xml
+++ b/docs/shared/impala_common.xml
@@ -765,6 +765,14 @@ select concat('abc','mno','xyz');</codeblock>
they are primarily used in new SQL syntax topics underneath that parent topic.
</p>
+ <p id="tablesample_caveat" rev="IMPALA-5309">
+ The <codeph>TABLESAMPLE</codeph> clause of the <codeph>SELECT</codeph>
+ statement does not apply to a table reference derived from a view, a subquery,
+ or anything other than a real base table. This clause only works for tables
+ backed by HDFS or HDFS-like data files, therefore it does not apply to Kudu or
+ HBase tables.
+ </p>
+
<codeblock id="parquet_fallback_schema_resolution_example"><![CDATA[
create database schema_evolution;
use schema_evolution;
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/88d00b4b/docs/topics/impala_hbase.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_hbase.xml b/docs/topics/impala_hbase.xml
index 2eae975..6c1822f 100644
--- a/docs/topics/impala_hbase.xml
+++ b/docs/topics/impala_hbase.xml
@@ -758,6 +758,9 @@ set hbase_caching=1000;
<li>
<p conref="../shared/impala_common.xml#common/hbase_no_load_data"/>
</li>
+ <li>
+ <p conref="../shared/impala_common.xml#common/tablesample_caveat"/>
+ </li>
</ul>
</conbody>
</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/88d00b4b/docs/topics/impala_kudu.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_kudu.xml b/docs/topics/impala_kudu.xml
index 3334d6b..08d3559 100644
--- a/docs/topics/impala_kudu.xml
+++ b/docs/topics/impala_kudu.xml
@@ -1378,6 +1378,8 @@ kudu.table_name | impala::some_database.table_name_demo
the predicate pushdown for a specific query against a Kudu table.
</p>
+ <p conref="../shared/impala_common.xml#common/tablesample_caveat"/>
+
<!-- Hide until subtopics are ready to display. -->
<p outputclass="toc inpage" audience="hidden"/>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/88d00b4b/docs/topics/impala_scalability.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_scalability.xml b/docs/topics/impala_scalability.xml
index 2ea7603..7c24fe2 100644
--- a/docs/topics/impala_scalability.xml
+++ b/docs/topics/impala_scalability.xml
@@ -843,23 +843,31 @@ Currently, there is no throttling mechanism for Impala I/O.
</conbody>
</concept>
-<concept id="big_tables">
-<title>Scalability Considerations for Table Layout</title>
-<conbody>
-<p>
-Due to the overhead of retrieving and updating table metadata
-in the metastore database, try to limit the number of columns
-in a table to a maximum of approximately 2000.
-Although Impala can handle wider tables than this, the metastore overhead
-can become significant, leading to query performance that is slower
-than expected based on the actual data volume.
-</p>
-<p>
-To minimize overhead related to the metastore database and Impala query planning,
-try to limit the number of partitions for any partitioned table to a few tens of thousands.
-</p>
-</conbody>
-</concept>
+ <concept id="big_tables">
+ <title>Scalability Considerations for Table Layout</title>
+ <conbody>
+ <p>
+ Due to the overhead of retrieving and updating table metadata
+ in the metastore database, try to limit the number of columns
+ in a table to a maximum of approximately 2000.
+ Although Impala can handle wider tables than this, the metastore overhead
+ can become significant, leading to query performance that is slower
+ than expected based on the actual data volume.
+ </p>
+ <p>
+ To minimize overhead related to the metastore database and Impala query planning,
+ try to limit the number of partitions for any partitioned table to a few tens of thousands.
+ </p>
+ <p rev="IMPALA-5309">
+ If the volume of data within a table makes it impractical to run exploratory
+ queries, consider using the <codeph>TABLESAMPLE</codeph> clause to limit query processing
+ to only a percentage of data within the table. This technique reduces the overhead
+ for query startup, I/O to read the data, and the amount of network, CPU, and memory
+ needed to process intermediate results during the query. See <xref keyref="tablesample"/>
+ for details.
+ </p>
+ </conbody>
+ </concept>
<concept rev="" id="kerberos_overhead_cluster_size">
<title>Kerberos-Related Network Overhead for Large Clusters</title>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/88d00b4b/docs/topics/impala_select.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_select.xml b/docs/topics/impala_select.xml
index 52954e7..0253712 100644
--- a/docs/topics/impala_select.xml
+++ b/docs/topics/impala_select.xml
@@ -65,6 +65,9 @@ HAVING <codeph>conditions</codeph>
ORDER BY { <i>column</i> | <i>expression</i> [ASC | DESC] [NULLS FIRST | NULLS LAST] [, ...] }
LIMIT <i>expression</i> [OFFSET <i>expression</i>]
[UNION [ALL] <i>select_statement</i>] ...]
+
+table_reference := { <varname>table_name</varname> | (<varname>subquery</varname>) }
+ <ph rev="IMPALA-5309">[ TABLESAMPLE SYSTEM(<varname>percentage</varname>) [REPEATABLE(<varname>seed</varname>)] ]</ph>
</codeblock>
<p>
@@ -166,6 +169,12 @@ LIMIT <i>expression</i> [OFFSET <i>expression</i>]
<codeph>LIKE</codeph>, <codeph>IN</codeph>, <codeph>BETWEEN</codeph>, and <codeph>COALESCE</codeph>. Impala
specifically supports built-ins described in <xref href="impala_functions.xml#builtins"/>.
</li>
+
+ <li rev="IMPALA-5309">
+ In <keyword keyref="impala29_full"/> and higher, an optional <codeph>TABLESAMPLE</codeph>
+ clause immediately after a table reference, to specify that the query only processes a
+ specified percentage of the table data. See <xref keyref="tablesample"/> for details.
+ </li>
</ul>
<p conref="../shared/impala_common.xml#common/ignore_file_extensions"/>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/88d00b4b/docs/topics/impala_subqueries.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_subqueries.xml b/docs/topics/impala_subqueries.xml
index d3d509c..b7dd5af 100644
--- a/docs/topics/impala_subqueries.xml
+++ b/docs/topics/impala_subqueries.xml
@@ -313,6 +313,9 @@ Therefore, this is not an efficient construct to use with Impala queries for HBa
when referring to any column from the outer query block within a subquery.
</p>
</li>
+ <li>
+ <p conref="../shared/impala_common.xml#common/tablesample_caveat"/>
+ </li>
</ul>
<p conref="../shared/impala_common.xml#common/complex_types_blurb"/>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/88d00b4b/docs/topics/impala_tablesample.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_tablesample.xml b/docs/topics/impala_tablesample.xml
new file mode 100644
index 0000000..f60c5be
--- /dev/null
+++ b/docs/topics/impala_tablesample.xml
@@ -0,0 +1,556 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied. See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="tablesample" rev="IMPALA-5309">
+
+ <title>TABLESAMPLE Clause</title>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="SQL"/>
+ <data name="Category" value="Querying"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="Data Analysts"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+
+ <p>
+ Specify the <codeph>TABLESAMPLE</codeph> clause in cases where you need
+ to explore the data distribution within the table, the table is very large,
+ and it is impractical or unnecessary to process all the data from the table
+ or selected partitions.
+ </p>
+
+ <p>
+ The clause makes the query process a randomized set of data files from the
+ table, so that the total volume of data is greater than or equal to the specified
+ percentage of data bytes within that table. (Or the data bytes within the set of
+ partitions that remain after partition pruning is performed.)
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>
+ <ph rev="IMPALA-5309">TABLESAMPLE SYSTEM(<varname>percentage</varname>) [REPEATABLE(<varname>seed</varname>)]</ph>
+</codeblock>
+
+ <p>
+ The <codeph>TABLESAMPLE</codeph> clause comes immediately after a table name or table alias.
+ </p>
+
+ <p>
+ The <codeph>SYSTEM</codeph> keyword represents the sampling method. Currently,
+ Impala only supports a single sampling method named <codeph>SYSTEM</codeph>.
+ </p>
+
+ <p>
+ The <varname>percentage</varname> argument is an integer literal from 0 to 100.
+ A percentage of 0 produces an empty result set for a particular table reference,
+ while a percentage of 100 uses the entire contents. Because the sampling works by
+ selecting a random set of data files, the proportion of sampled data from the
+ table may be greater than the specified percentage, based on the number and sizes
+ of the underlying data files. See the usage notes for details.
+ </p>
+
+ <p>
+ The optional <codeph>REPEATABLE</codeph> keyword lets you specify an arbitrary
+ positive integer seed value that ensures that when the query is run again, the
+ sampling selects the same set of data files each time. <codeph>REPEATABLE</codeph>
+ does not have a default value. If you omit the <codeph>REPEATABLE</codeph> keyword,
+ the random seed is derived from the current time.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/added_in_290"/>
+
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
+
+ <p>
+ You might use this clause with aggregation queries, such as finding
+ the approximate average, minimum, or maximum where exact precision
+ is not required. You can use these findings to plan the most effective
+ strategy for constructing queries against the full table or designing
+ a partitioning strategy for the data.
+ </p>
+
+ <p>
+ Some other database systems have a <codeph>TABLESAMPLE</codeph> clause.
+ The Impala syntax for this clause is modeled on the syntax for popular
+ relational databases, not the Hive <codeph>TABLESAMPLE</codeph> clause.
+ For example, there is no <codeph>BUCKETS</codeph> keyword as in HiveQL.
+ </p>
+
+ <p>
+ The precision of the <varname>percentage</varname> threshold depends on
+ the number and sizes of the underlying data files. Impala brings in
+ additional data files, one at a time, until the number of bytes exceeds
+ the specified percentage based on the total number of bytes for the
+ entire set of table data. The precision of the percentage threshold is higher
+ when the table contains many data files with consistent sizes. See the
+ code listings later in this section for examples.
+ </p>
+
+ <p>
+ When you estimate characteristics of the data distribution based on sampling
+ a percentage of the table data, be aware that the data might be unevenly distributed
+ between different files. Do not assume that the percentage figure reflects the
+ percentage of rows in the table. For example, one file might contain all blank values
+ for a <codeph>STRING</codeph> column, while another file contains long strings
+ in that column; therefore, one file could contain many more rows than another.
+ Likewise, a table created with the <codeph>SORT BY</codeph> clause might
+ contain narrow ranges of values for the sort columns, making it impractical to
+ extrapolate the number of distinct values for those columns based on sampling
+ only some of the data files.
+ </p>
+
+ <p>
+ Because a sample of the table data might not contain all values for a particular
+ column, if the <codeph>TABLESAMPLE</codeph> is used in a join query, the
+ key relationships between the tables might produce incomplete result sets
+ compared to joins using all the table data. For example, if you join 50%
+ of table A with 50% of table B, some values in the join columns might
+ not match between the two tables, even though overall there is a 1:1
+ relationship between the tables.
+ </p>
+
+ <p>
+ The <codeph>REPEATABLE</codeph> keyword makes identical queries use a
+ consistent set of data files when the query is repeated. You specify an
+ arbitrary integer key that acts as a seed value when Impala randomly
+ selects the set of data files to use in the query. This technique
+ lets you verify correctness, examine performance, and so on for queries
+ using the <codeph>TABLESAMPLE</codeph> clause without the sampled data
+ being different each time. The repeatable aspect is reset (that is, the
+ set of selected data files may change) any time the contents of the table
+ change. The statements or operations that can make sampling results
+ non-repeatable are:
+ </p>
+
+ <ul>
+ <li>
+ <codeph>INSERT</codeph>.
+ </li>
+ <li>
+ <codeph>TRUNCATE TABLE</codeph>.
+ </li>
+ <li>
+ <codeph>LOAD DATA</codeph>.
+ </li>
+ <li>
+ <codeph>REFRESH</codeph> or <codeph>INVALIDATE METADATA</codeph>
+ after files are added or removed by a non-Impala mechanism.
+ </li>
+ <li>
+ </li>
+ </ul>
+
+ <p>
+ This clause is similar in some ways to the <codeph>LIMIT</codeph> clause,
+ because both serve to limit the size of the intermediate data and final
+ result set. <codeph>LIMIT 0</codeph> is more efficient than
+ <codeph>TABLESAMPLE SYSTEM(0)</codeph> for verifying that a query can execute
+ without producing any results. <codeph>TABLESAMPLE SYSTEM(<varname>n</varname>)</codeph>
+ often makes query processing more efficient than using a <codeph>LIMIT</codeph> clause
+ by itself, because all phases of query execution use less data overall.
+ If the intent is to retrieve some representative values from the table
+ in an efficient way, you might combine <codeph>TABLESAMPLE</codeph>,
+ <codeph>ORDER BY</codeph>, and <codeph>LIMIT</codeph> clauses within a single query.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/partitioning_blurb"/>
+ <p>
+ When you query a partitioned table, any partition pruning happens
+ before Impala selects the data files to sample. For example, in a
+ table partitioned by year, a query with <codeph>WHERE year = 2017</codeph>
+ and a <codeph>TABLESAMPLE SYSTEM(10)</codeph> clause would sample
+ data files representing at least 10% of the bytes present in the
+ 2017 partition.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/s3_blurb"/>
+ <p>
+ This clause applies to S3 tables the same way as tables
+ with data files stored on HDFS.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/adls_blurb"/>
+ <p>
+ This clause applies to ADLS tables the same way as tables
+ with data files stored on HDFS.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/kudu_blurb"/>
+ <p>
+ This clause does not apply to Kudu tables.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/hbase_blurb"/>
+ <p>
+ This clause does not apply to HBase tables.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/performance_blurb"/>
+ <p>
+ From a performance perspective, the <codeph>TABLESAMPLE</codeph>
+ clause is especially valuable for exploratory queries on
+ text, Avro, or other file formats other than Parquet. Text-based
+ or row-oriented file formats must process substantial amounts of
+ redundant data for queries that derive aggregate results such as
+ <codeph>MAX()</codeph>, <codeph>MIN()</codeph>, or <codeph>AVG()</codeph>
+ for a single column. Therefore, you might use <codeph>TABLESAMPLE</codeph>
+ early in the ETL pipeline, when data is still in raw text format
+ and has not been converted to Parquet or moved into a partitioned
+ table.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/restrictions_blurb"/>
+
+ <p>
+ This clause applies only to tables that use a storage layer
+ with underlying raw data files, such as HDFS, Amazon S3,
+ or Microsoft ADLS.
+ </p>
+
+ <p>
+ This clause does not apply to table references that represent views.
+ A query that applies the <codeph>TABLESAMPLE</codeph> clause to a
+ view or a subquery fails with a semantic error.
+ </p>
+
+ <p>
+ Because the sampling works at the level of entire data files, it
+ is by nature coarse-grained. It is possible to specify a small
+ sample percentage but still process a substantial portion of the
+ table data if the table contains relatively few data files, if
+ each data file is very large, or if the data files vary substantially
+ in size. Be sure that you understand the data distribution and physical
+ file layout so that you can verify if the results are suitable for
+ extrapolation. For example, if the table contains only a single data file,
+ the <q>sample</q> will consist of all the table data regardless of
+ the percentage you specify. If the table contains data files of
+ 1 GiB, 1 GiB, and 1 KiB, when you specify a sampling percentage of
+ 50 you would either process slightly more than 50% of the table
+ (1 GiB + 1 KiB) or almost the entire table (1 GiB + 1 GiB),
+ depending on which data files were selected for sampling.
+ </p>
+
+ <p>
+ If data files are added by a non-Impala mechanism, and the
+ table metadata is not updated by a <codeph>REFRESH</codeph>
+ or <codeph>INVALIDATE METADATA</codeph> statement, the
+ <codeph>TABLESAMPLE</codeph> clause does not consider those
+ new files when computing the number of bytes in the table
+ or selecting which files to sample.
+ </p>
+
+ <p>
+ If data files are removed by a non-Impala mechanism, and the
+ table metadata is not updated by a <codeph>REFRESH</codeph>
+ or <codeph>INVALIDATE METADATA</codeph> statement, the
+ query fails if the <codeph>TABLESAMPLE</codeph> clause
+ attempts to reference any of the missing files.
+ </p>
+
+ <p conref="../shared/impala_common.xml#common/example_blurb"/>
+
+ <p>
+ The following examples demonstrate the <codeph>TABLESAMPLE</codeph> clause.
+ These examples intentionally use very small data sets to illustrate how
+ the number of files, size of each file, and overall size of data in the table
+ interact with the percentage specified in the clause.
+ </p>
+
+ <p>
+ These examples use an unpartitioned table, containing several files of roughly
+ the same size:
+ </p>
+
+<codeblock><![CDATA[
+create table sample_demo (x int, s string);
+
+insert into sample_demo values (1, 'one');
+insert into sample_demo values (2, 'two');
+insert into sample_demo values (3, 'three');
+insert into sample_demo values (4, 'four');
+insert into sample_demo values (5, 'five');
+
+show files in sample_demo;
++---------------------+------+-----------+
+| Path | Size | Partition |
++---------------------+------+-----------+
+| 991213608_data.0. | 7B | |
+| 982196806_data.0. | 6B | |
+| _2122096884_data.0. | 8B | |
+| _586325431_data.0. | 6B | |
+| 1894746258_data.0. | 7B | |
++---------------------+------+-----------+
+
+show table stats sample_demo;
++-------+--------+------+--------+-------------------------+
+| #Rows | #Files | Size | Format | Location |
++-------+--------+------+--------+-------------------------+
+| -1 | 5 | 34B | TEXT | /tsample.db/sample_demo |
++-------+--------+------+--------+-------------------------+
+</codeblock>
+
+ <p>
+ A query that samples 50% of the table must process at least
+ 17 bytes of data. Based on the sizes of the data files,
+ we can predict that each such query uses 3 arbitrary files.
+ Any 1 or 2 files are not enough to reach 50% of the total
+ data in the table (34 bytes), so the query adds more files
+ until it passes the 50% threshold:
+ </p>
+
+<codeblock><![CDATA[
+select distinct x from sample_demo tablesample system(50);
++---+
+| x |
++---+
+| 4 |
+| 1 |
+| 5 |
++---+
+
+select distinct x from sample_demo tablesample system(50);
++---+
+| x |
++---+
+| 5 |
+| 4 |
+| 2 |
++---+
+
+select distinct x from sample_demo tablesample system(50);
++---+
+| x |
++---+
+| 5 |
+| 3 |
+| 2 |
++---+
+</codeblock>
+
+ <p>
+ To help run reproducible experiments, the <codeph>REPEATABLE</codeph>
+ clause causes Impala to choose the same set of files for each query.
+ Although the data set being considered is deterministic, the order
+ of results varies (in the absence of an <codeph>ORDER BY</codeph>
+ clause) because of the way distributed queries are processed:
+ </p>
+
+<codeblock><![CDATA[
+select distinct x from sample_demo
+ tablesample system(50) repeatable (12345);
++---+
+| x |
++---+
+| 3 |
+| 2 |
+| 1 |
++---+
+
+select distinct x from sample_demo
+ tablesample system(50) repeatable (12345);
++---+
+| x |
++---+
+| 2 |
+| 1 |
+| 3 |
++---+
+</codeblock>
+
+ <p>
+ The following examples show how uneven data distribution affects
+ which data is sampled. Adding another data file containing a long
+ string value changes the threshold for 50% of the total data in
+ the table:
+ </p>
+
+<codeblock><![CDATA[
+insert into sample_demo values (1000, 'Boyhood is the longest time in li
+fe for a boy. The last term of the school-year is made of decades, not o
+f weeks, and living through them is like waiting for the millennium. Boo
+th Tarkington');
+
+show files in sample_demo;
++---------------------+------+-----------+
+| Path | Size | Partition |
++---------------------+------+-----------+
+| 991213608_data.0. | 7B | |
+| 982196806_data.0. | 6B | |
+| _253317650_data.0. | 196B | |
+| _2122096884_data.0. | 8B | |
+| _586325431_data.0. | 6B | |
+| 1894746258_data.0. | 7B | |
++---------------------+------+-----------+
+
+show table stats sample_demo;
++-------+--------+------+--------+-------------------------+
+| #Rows | #Files | Size | Format | Location |
++-------+--------+------+--------+-------------------------+
+| -1 | 6 | 230B | TEXT | /tsample.db/sample_demo |
++-------+--------+------+--------+-------------------------+
+</codeblock>
+
+ <p>
+ Even though the queries do not refer to the <codeph>S</codeph>
+ column containing the long value, all the sampling queries include
+ the data file containing the column value <codeph>X=1000</codeph>,
+ because the query cannot reach the 50% threshold (115 bytes) without
+ including that file. The large file might be considered first, in which
+ case it is the only file processed by the query. Or an arbitrary
+ set of other files might be considered first.
+ </p>
+
+<codeblock><![CDATA[
+select distinct x from sample_demo tablesample system(50);
++------+
+| x |
++------+
+| 1000 |
+| 3 |
+| 1 |
++------+
+
+select distinct x from sample_demo tablesample system(50);
++------+
+| x |
++------+
+| 1000 |
++------+
+
+select distinct x from sample_demo tablesample system(50);
++------+
+| x |
++------+
+| 1000 |
+| 4 |
+| 2 |
+| 1 |
++------+
+</codeblock>
+
+ <p>
+ The following examples demonstrate how the <codeph>TABLESAMPLE</codeph>
+ clause interacts with other table aspects, such as partitioning and file
+ format:
+ </p>
+
+<codeblock><![CDATA[
+create table sample_demo_partitions (x int, s string) partitioned by (n int) stored as parquet;
+
+insert into sample_demo_partitions partition (n = 1) select * from sample_demo;
+insert into sample_demo_partitions partition (n = 2) select * from sample_demo;
+insert into sample_demo_partitions partition (n = 3) select * from sample_demo;
+
+show files in sample_demo_partitions;
++--------------------------------+--------+-----------+
+| Path | Size | Partition |
++--------------------------------+--------+-----------+
+| 000000_364262785_data.0.parq | 1.24KB | n=1 |
+| 000001_973526736_data.0.parq | 566B | n=1 |
+| 0000000_1300598134_data.0.parq | 1.24KB | n=2 |
+| 0000001_689099063_data.0.parq | 568B | n=2 |
+| 0000000_1861371709_data.0.parq | 1.24KB | n=3 |
+| 0000001_1065507912_data.0.parq | 566B | n=3 |
++--------------------------------+--------+-----------+
+
+show table stats tablesample_demo_partitioned;
++-------+-------+--------+--------+---------+----------------------------------------------+
+| n | #Rows | #Files | Size | Format | Location |
++-------+-------+--------+--------+---------+----------------------------------------------+
+| 1 | -1 | 2 | 1.79KB | PARQUET | /tsample.db/tablesample_demo_partitioned/n=1 |
+| 2 | -1 | 2 | 1.80KB | PARQUET | /tsample.db/tablesample_demo_partitioned/n=2 |
+| 3 | -1 | 2 | 1.79KB | PARQUET | /tsample.db/tablesample_demo_partitioned/n=3 |
+| Total | -1 | 6 | 5.39KB | | |
++-------+-------+--------+--------+---------+----------------------------------------------+
+</codeblock>
+
+ <p>
+ If the query does not involve any partition pruning, the
+ sampling applies to the data volume of the entire table:
+ </p>
+
+<codeblock><![CDATA[
+-- 18 rows total.
+select count(*) from sample_demo_partitions;
++----------+
+| count(*) |
++----------+
+| 18 |
++----------+
+
+-- The number of rows per data file is not
+-- perfectly balanced, therefore the count
+-- is different depending on which set of files
+-- is considered.
+select count(*) from sample_demo_partitions
+ tablesample system(75);
++----------+
+| count(*) |
++----------+
+| 14 |
++----------+
+
+select count(*) from sample_demo_partitions
+ tablesample system(75);
++----------+
+| count(*) |
++----------+
+| 16 |
++----------+
+</codeblock>
+
+ <p>
+ If the query only processes certain partitions,
+ the query computes the sampling threshold based on
+ the data size and set of files only from the
+ relevant partitions:
+ </p>
+
+<codeblock><![CDATA[
+select count(*) from sample_demo_partitions
+ tablesample system(50) where n = 1;
++----------+
+| count(*) |
++----------+
+| 6 |
++----------+
+
+select count(*) from sample_demo_partitions
+ tablesample system(50) where n = 1;
++----------+
+| count(*) |
++----------+
+| 2 |
++----------+
+]]>
+</codeblock>
+
+ <p conref="../shared/impala_common.xml#common/related_info"/>
+ <p>
+ <xref href="impala_select.xml#select"/>
+ </p>
+
+ </conbody>
+</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/88d00b4b/docs/topics/impala_views.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_views.xml b/docs/topics/impala_views.xml
index 09693fe..678f681 100644
--- a/docs/topics/impala_views.xml
+++ b/docs/topics/impala_views.xml
@@ -192,6 +192,9 @@ select * from top_10_days;</codeblock>
<li rev="1.4.0">
<p conref="../shared/impala_common.xml#common/order_by_view_restriction"/>
</li>
+ <li>
+ <p conref="../shared/impala_common.xml#common/tablesample_caveat"/>
+ </li>
</ul>
<p>
[2/2] incubator-impala git commit: IMPALA-5780,
IMPALA-5779: extra spilling tests
Posted by ta...@apache.org.
IMPALA-5780,IMPALA-5779: extra spilling tests
* Test for disable_unsafe_spills
* Test for buffer size > I/O size (--read_size)
Change-Id: I03de00394bb6bbcf381250f816e22a4b987f1135
Reviewed-on: http://gerrit.cloudera.org:8080/7787
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Impala Public Jenkins
Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/bb6b0ce2
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/bb6b0ce2
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/bb6b0ce2
Branch: refs/heads/master
Commit: bb6b0ce249f561af04ea42752ca874cd93dd63c9
Parents: 88d00b4
Author: Tim Armstrong <ta...@cloudera.com>
Authored: Wed Aug 23 13:47:37 2017 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Thu Aug 24 06:27:05 2017 +0000
----------------------------------------------------------------------
.../QueryTest/disable-unsafe-spills.test | 11 +++++++
.../QueryTest/spilling-sorts-exhaustive.test | 31 +++++++++++++++++++-
tests/query_test/test_spilling.py | 4 +++
3 files changed, 45 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/bb6b0ce2/testdata/workloads/functional-query/queries/QueryTest/disable-unsafe-spills.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/disable-unsafe-spills.test b/testdata/workloads/functional-query/queries/QueryTest/disable-unsafe-spills.test
new file mode 100644
index 0000000..a9b8f35
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/QueryTest/disable-unsafe-spills.test
@@ -0,0 +1,11 @@
+====
+---- QUERY
+# tpch_avro does not have stats computed, so if we set disable_unsafe_spills we should
+# not spill to disk.
+set disable_unsafe_spills=true;
+set buffer_pool_limit=40m;
+select distinct *
+from tpch_avro.orders
+---- CATCH
+Could not free memory by spilling to disk: spilling was disabled by planner. Re-enable spilling by setting the query option DISABLE_UNSAFE_SPILLS=false
+====
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/bb6b0ce2/testdata/workloads/functional-query/queries/QueryTest/spilling-sorts-exhaustive.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/spilling-sorts-exhaustive.test b/testdata/workloads/functional-query/queries/QueryTest/spilling-sorts-exhaustive.test
index d391884..415fe5d 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/spilling-sorts-exhaustive.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/spilling-sorts-exhaustive.test
@@ -192,4 +192,33 @@ STRING
# Verify that the sort actually spilled
row_regex: .*SpilledRuns: .* \([1-9][0-9]*\)
row_regex: .*TotalMergesPerformed: .* \([1-9][0-9]*\)
-
+====
+---- QUERY
+# Test that we can spill with a buffer size larger than the default I/O size (8mb).
+set default_spillable_buffer_size=16m;
+set buffer_pool_limit=96m;
+select l_orderkey, l_partkey, l_suppkey, l_linenumber, l_comment
+from tpch_parquet.lineitem
+order by 1, 2, 3, 4, 5
+offset 6001200
+---- TYPES
+BIGINT,BIGINT,BIGINT,INT,STRING
+---- RESULTS
+5999971,97922,7923,5,'place of the slyly quick pla'
+5999971,132903,2904,4,'ckly above the boldly '
+5999971,144823,4824,2,'e blithely after the carefully pending '
+5999971,161882,6915,6,'luffy theodolites nag boldly bli'
+5999972,102942,473,1,'s maintain carefully among the'
+5999972,133109,8136,2,' the furiously express pearls. furi'
+5999972,152761,2762,3,'sual accounts al'
+5999973,176345,1380,1,'gular excuses. '
+5999974,10463,5466,2,'dolites wake'
+5999974,25360,5361,1,'express dependencies. express, pendi'
+5999975,6452,1453,2,'lar pinto beans aft'
+5999975,7272,2273,1,'tructions. excu'
+5999975,37131,2138,3,', quick deposits. ironic, unusual deposi'
+6000000,32255,2256,1,'carefully '
+6000000,96127,6128,2,'ooze furiously about the pe'
+---- RUNTIME_PROFILE
+row_regex: .*SpilledRuns: .* \([1-9][0-9]*\)
+====
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/bb6b0ce2/tests/query_test/test_spilling.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_spilling.py b/tests/query_test/test_spilling.py
index 437739c..c818b65 100644
--- a/tests/query_test/test_spilling.py
+++ b/tests/query_test/test_spilling.py
@@ -72,3 +72,7 @@ class TestSpilling(ImpalaTestSuite):
if self.exploration_strategy() != 'exhaustive':
pytest.skip("only run large sorts on exhaustive")
self.run_test_case('QueryTest/spilling-sorts-exhaustive', vector)
+
+ def test_disable_unsafe_spills(self, vector):
+ """Test that the disable_unsafe_spills query options works end-to-end."""
+ self.run_test_case('QueryTest/disable-unsafe-spills', vector)